feat: support nulls in the csv uploads (apache#10208)
* Support more table properties for the Hive upload

Refactor

Add tests and refactor them to be pytest-friendly

Use lowercase table names

Ignore isort

* Use SQL params

Co-authored-by: bogdan kyryliuk <[email protected]>
bkyryliuk and bogdan-dbx authored Jul 6, 2020
1 parent 318e534 commit 84f8a51
Showing 10 changed files with 337 additions and 172 deletions.
1 change: 0 additions & 1 deletion pytest.ini
@@ -15,7 +15,6 @@
# limitations under the License.
#
[pytest]
addopts = -ra -q
testpaths =
tests
python_files = *_test.py test_*.py *_tests.py
4 changes: 4 additions & 0 deletions superset/config.py
@@ -35,6 +35,7 @@
from dateutil import tz
from flask import Blueprint
from flask_appbuilder.security.manager import AUTH_DB
from pandas.io.parsers import STR_NA_VALUES

from superset.jinja_context import ( # pylint: disable=unused-import
BaseTemplateProcessor,
@@ -622,6 +623,9 @@ class CeleryConfig: # pylint: disable=too-few-public-methods
UPLOADED_CSV_HIVE_NAMESPACE
] if UPLOADED_CSV_HIVE_NAMESPACE else []

# Values that should be treated as nulls for the csv uploads.
CSV_DEFAULT_NA_NAMES = list(STR_NA_VALUES)

# A dictionary of items that gets merged into the Jinja context for
# SQL Lab. The existing context gets updated with this dictionary,
# meaning values for existing keys get overwritten by the content of this
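The new CSV_DEFAULT_NA_NAMES option defaults to pandas' built-in NA token set (STR_NA_VALUES, which includes values such as "", "N/A", "NaN", and "null"). A minimal sketch of how a deployment could narrow it in superset_config.py; the override values below are illustrative, not part of this commit:

# superset_config.py (hypothetical override)
# Treat only empty strings and the literal "NULL" as nulls for CSV uploads.
CSV_DEFAULT_NA_NAMES = ["", "NULL"]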
58 changes: 48 additions & 10 deletions superset/db_engine_specs/hive.py
@@ -106,6 +106,45 @@ def fetch_data(cls, cursor: Any, limit: int) -> List[Tuple[Any, ...]]:
except pyhive.exc.ProgrammingError:
return []

@classmethod
def get_create_table_stmt( # pylint: disable=too-many-arguments
cls,
table: Table,
schema_definition: str,
location: str,
delim: str,
header_line_count: Optional[int],
null_values: Optional[List[str]],
) -> text:
tblproperties = []
# available options:
# https://cwiki.apache.org/confluence/display/Hive/LanguageManual+DDL
# TODO(bkyryliuk): figure out what to do with the skip rows field.
params: Dict[str, str] = {
"delim": delim,
"location": location,
}
if header_line_count is not None and header_line_count >= 0:
header_line_count += 1
tblproperties.append("'skip.header.line.count'=':header_line_count'")
params["header_line_count"] = str(header_line_count)
if null_values:
# hive only supports 1 value for the null format
tblproperties.append("'serialization.null.format'=':null_value'")
params["null_value"] = null_values[0]

if tblproperties:
tblproperties_stmt = f"tblproperties ({', '.join(tblproperties)})"
sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location
{tblproperties_stmt}"""
else:
sql = f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location"""
return sql, params

@classmethod
def create_table_from_csv( # pylint: disable=too-many-arguments, too-many-locals
cls,
@@ -182,18 +221,17 @@ def convert_to_hive_type(col_type: str) -> str:
bucket_path,
os.path.join(upload_prefix, table.table, os.path.basename(filename)),
)
sql = text(
f"""CREATE TABLE {str(table)} ( {schema_definition} )
ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
STORED AS TEXTFILE LOCATION :location
tblproperties ('skip.header.line.count'='1')"""

sql, params = cls.get_create_table_stmt(
table,
schema_definition,
location,
csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
int(csv_to_df_kwargs.get("header", 0)),
csv_to_df_kwargs.get("na_values"),
)
engine = cls.get_engine(database)
engine.execute(
sql,
delim=csv_to_df_kwargs["sep"].encode().decode("unicode_escape"),
location=location,
)
engine.execute(text(sql), **params)

@classmethod
def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]:
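For reference, a rough sketch of what get_create_table_stmt returns when a header row and a single null token are supplied. The table name, schema, and S3 location are made up, and only the first entry of null_values is used because Hive's serialization.null.format accepts a single value:

from superset.db_engine_specs.hive import HiveEngineSpec
from superset.sql_parse import Table

# Hypothetical inputs, chosen only to illustrate the output shape.
sql, params = HiveEngineSpec.get_create_table_stmt(
    Table("mytable"),
    "a STRING, b BIGINT",
    "s3a://some-bucket/uploads/mytable/data.csv",
    ",",
    0,     # header on row 0 -> one header line to skip
    [""],  # treat empty strings as null
)
# sql resembles:
#   CREATE TABLE mytable ( a STRING, b BIGINT )
#   ROW FORMAT DELIMITED FIELDS TERMINATED BY :delim
#   STORED AS TEXTFILE LOCATION :location
#   tblproperties ('skip.header.line.count'=':header_line_count', 'serialization.null.format'=':null_value')
# params == {"delim": ",", "location": "s3a://some-bucket/uploads/mytable/data.csv",
#            "header_line_count": "1", "null_value": ""}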
15 changes: 15 additions & 0 deletions superset/forms.py
@@ -15,12 +15,27 @@
# specific language governing permissions and limitations
# under the License.
"""Contains the logic to create cohesive forms on the explore view"""
import json
from typing import Any, List, Optional

from flask_appbuilder.fieldwidgets import BS3TextFieldWidget
from wtforms import Field


class JsonListField(Field):
widget = BS3TextFieldWidget()
data: List[str] = []

def _value(self) -> str:
return json.dumps(self.data)

def process_formdata(self, valuelist: List[str]) -> None:
if valuelist and valuelist[0]:
self.data = json.loads(valuelist[0])
else:
self.data = []


class CommaSeparatedListField(Field):
widget = BS3TextFieldWidget()
data: List[str] = []
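A quick sketch of how the new JsonListField round-trips form input; the demo form and the sample values are hypothetical, not taken from the commit:

from wtforms import Form

from superset.forms import JsonListField


class NullValuesDemoForm(Form):  # illustration only
    null_values = JsonListField("Null values")


form = NullValuesDemoForm()
form.null_values.process_formdata(['["", "N/A"]'])  # raw text typed into the field
assert form.null_values.data == ["", "N/A"]
assert form.null_values._value() == '["", "N/A"]'

form.null_values.process_formdata([""])  # empty input falls back to an empty list
assert form.null_values.data == []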
26 changes: 25 additions & 1 deletion superset/views/database/forms.py
@@ -26,7 +26,11 @@
from wtforms.validators import DataRequired, Length, NumberRange, Optional

from superset import app, db, security_manager
from superset.forms import CommaSeparatedListField, filter_not_empty_values
from superset.forms import (
CommaSeparatedListField,
filter_not_empty_values,
JsonListField,
)
from superset.models.core import Database

config = app.config
@@ -210,6 +214,16 @@ def at_least_one_schema_is_allowed(database: Database) -> bool:
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
null_values = JsonListField(
_("Null values"),
default=config["CSV_DEFAULT_NA_NAMES"],
description=_(
"Json list of the values that should be treated as null. "
'Examples: [""], ["None", "N/A"], ["nan", "null"]. '
"Warning: Hive database supports only single value. "
'Use [""] for empty string.'
),
)


class ExcelToDatabaseForm(DynamicForm):
@@ -376,3 +390,13 @@ def at_least_one_schema_is_allowed(database: Database) -> bool:
validators=[Optional()],
widget=BS3TextFieldWidget(),
)
null_values = JsonListField(
_("Null values"),
default=config["CSV_DEFAULT_NA_NAMES"],
description=_(
"Json list of the values that should be treated as null. "
'Examples: [""], ["None", "N/A"], ["nan", "null"]. '
"Warning: Hive database supports only single value. "
'Use [""] for empty string.'
),
)
9 changes: 9 additions & 0 deletions superset/views/database/views.py
@@ -149,6 +149,9 @@ def form_post(self, form: CsvToDatabaseForm) -> Response:
database = (
db.session.query(models.Database).filter_by(id=con.data.get("id")).one()
)

# More can be found here:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.read_csv.html
csv_to_df_kwargs = {
"sep": form.sep.data,
"header": form.header.data if form.header.data else 0,
@@ -162,6 +165,12 @@ def form_post(self, form: CsvToDatabaseForm) -> Response:
"infer_datetime_format": form.infer_datetime_format.data,
"chunksize": 1000,
}
if form.null_values.data:
csv_to_df_kwargs["na_values"] = form.null_values.data
csv_to_df_kwargs["keep_default_na"] = False

# More can be found here:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.to_sql.html
df_to_sql_kwargs = {
"name": csv_table.table,
"if_exists": form.if_exists.data,
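Roughly what the two new pandas keyword arguments change during parsing; the inline CSV mirrors the fixture used in the tests below:

import io

import pandas as pd

csv_data = "b,c,d\njohn,1,x\npaul,2,"

# With user-supplied na_values and keep_default_na=False, only those tokens become NaN.
df = pd.read_csv(io.StringIO(csv_data), na_values=["", "john"], keep_default_na=False)
# df.values -> [[nan, 1, 'x'], ['paul', 2, nan]]

# Without the override, pandas falls back to its default NA token set,
# so only the empty trailing cell becomes NaN.
df_default = pd.read_csv(io.StringIO(csv_data))
# df_default.values -> [['john', 1, 'x'], ['paul', 2, nan]]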
2 changes: 1 addition & 1 deletion tests/base_tests.py
@@ -27,8 +27,8 @@
from flask_testing import TestCase
from sqlalchemy.orm import Session

from tests.test_app import app
from superset.sql_parse import CtasMethod
from tests.test_app import app # isort:skip
from superset import db, security_manager
from superset.connectors.base.models import BaseDatasource
from superset.connectors.druid.models import DruidCluster, DruidDatasource
21 changes: 19 additions & 2 deletions tests/core_tests.py
@@ -916,12 +916,12 @@ def test_import_csv_explore_database(self):

def test_import_csv(self):
self.login(username="admin")
table_name = "".join(random.choice(string.ascii_uppercase) for _ in range(5))
table_name = "".join(random.choice(string.ascii_lowercase) for _ in range(5))

f1 = "testCSV.csv"
self.create_sample_csvfile(f1, ["a,b", "john,1", "paul,2"])
f2 = "testCSV2.csv"
self.create_sample_csvfile(f2, ["b,c,d", "john,1,x", "paul,2,y"])
self.create_sample_csvfile(f2, ["b,c,d", "john,1,x", "paul,2,"])
self.enable_csv_upload(utils.get_example_database())

try:
@@ -957,6 +957,23 @@ def test_import_csv(self):
table = self.get_table_by_name(table_name)
# make sure the new column name is reflected in the table metadata
self.assertIn("d", table.column_names)

# null values are set
self.upload_csv(
f2,
table_name,
extra={"null_values": '["", "john"]', "if_exists": "replace"},
)
# make sure that john and empty string are replaced with None
data = db.session.execute(f"SELECT * from {table_name}").fetchall()
assert data == [(None, 1, "x"), ("paul", 2, None)]

# default null values
self.upload_csv(f2, table_name, extra={"if_exists": "replace"})
# make sure that john and empty string are replaced with None
data = db.session.execute(f"SELECT * from {table_name}").fetchall()
assert data == [("john", 1, "x"), ("paul", 2, None)]

finally:
os.remove(f1)
os.remove(f2)
6 changes: 3 additions & 3 deletions tests/db_engine_specs/base_tests.py
@@ -14,13 +14,13 @@
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
# isort:skip_file
from datetime import datetime

from tests.test_app import app
from tests.base_tests import SupersetTestCase
from superset.db_engine_specs.mysql import MySQLEngineSpec
from superset.models.core import Database
from tests.base_tests import SupersetTestCase

from tests.test_app import app # isort:skip


class TestDbEngineSpec(SupersetTestCase):