From 49ea232c3a692ba38769f0e0924e287036323887 Mon Sep 17 00:00:00 2001 From: Ville Brofeldt <33317356+villebro@users.noreply.github.com> Date: Thu, 7 Nov 2019 20:03:42 +0200 Subject: [PATCH] [fix] Improve csv upload functionality (#8457) * [fix] csv upload when table metadata present * Remove table from hive spec * Move upload before table metadata creation * Refine upload logic, dd unit tests and fix translations * Use ALLOWED_EXTENSIONS from config * Address review comments * Fix error message grammar * Add return type to hive csv upload and replace first with one_or_none --- superset/config.py | 2 +- superset/db_engine_specs/base.py | 18 ++-- superset/db_engine_specs/hive.py | 2 +- .../translations/en/LC_MESSAGES/messages.po | 3 +- .../translations/fr/LC_MESSAGES/messages.json | 2 +- .../translations/fr/LC_MESSAGES/messages.po | 4 +- .../translations/it/LC_MESSAGES/messages.json | 4 +- .../translations/it/LC_MESSAGES/messages.po | 3 +- .../translations/ko/LC_MESSAGES/messages.po | 2 +- superset/translations/messages.pot | 3 +- .../translations/ru/LC_MESSAGES/messages.json | 6 +- .../translations/ru/LC_MESSAGES/messages.po | 4 +- .../translations/zh/LC_MESSAGES/messages.json | 4 +- .../translations/zh/LC_MESSAGES/messages.po | 4 +- superset/views/core.py | 2 +- superset/views/database/forms.py | 12 ++- superset/views/database/views.py | 64 ++++++++++---- tests/core_tests.py | 86 ++++++++++++++++--- 18 files changed, 156 insertions(+), 69 deletions(-) diff --git a/superset/config.py b/superset/config.py index 6380988acedef..fe6a3e73ffae8 100644 --- a/superset/config.py +++ b/superset/config.py @@ -288,7 +288,7 @@ def _try_json_readfile(filepath): # Allowed format types for upload on Database view # TODO: Add processing of other spreadsheet formats (xls, xlsx etc) -ALLOWED_EXTENSIONS = set(["csv"]) +ALLOWED_EXTENSIONS = {"csv", "tsv"} # CSV Options: key/value pairs that will be passed as argument to DataFrame.to_csv # method. diff --git a/superset/db_engine_specs/base.py b/superset/db_engine_specs/base.py index 7d81d57d86f4c..3f17503d2866d 100644 --- a/superset/db_engine_specs/base.py +++ b/superset/db_engine_specs/base.py @@ -37,7 +37,7 @@ from sqlalchemy.types import TypeEngine from werkzeug.utils import secure_filename -from superset import app, db, sql_parse +from superset import app, sql_parse from superset.utils import core as utils if TYPE_CHECKING: @@ -388,15 +388,17 @@ def df_to_sql(cls, df: pd.DataFrame, **kwargs): # pylint: disable=invalid-name df.to_sql(**kwargs) @classmethod - def create_table_from_csv(cls, form, table): - """ Create table (including metadata in backend) from contents of a csv. + def create_table_from_csv(cls, form) -> None: + """ + Create table from contents of a csv. Note: this method does not create + metadata for the table. + :param form: Parameters defining how to process data - :param table: Metadata of new table to be created """ def _allowed_file(filename: str) -> bool: # Only allow specific file extensions as specified in the config - extension = os.path.splitext(filename)[1] + extension = os.path.splitext(filename)[1].lower() return ( extension is not None and extension[1:] in config["ALLOWED_EXTENSIONS"] ) @@ -432,12 +434,6 @@ def _allowed_file(filename: str) -> bool: } cls.df_to_sql(**df_to_sql_kwargs) - table.user_id = g.user.id - table.schema = form.schema.data - table.fetch_metadata() - db.session.add(table) - db.session.commit() - @classmethod def convert_dttm(cls, target_type: str, dttm: datetime) -> Optional[str]: """ diff --git a/superset/db_engine_specs/hive.py b/superset/db_engine_specs/hive.py index 94413fa14916d..650f62596a40c 100644 --- a/superset/db_engine_specs/hive.py +++ b/superset/db_engine_specs/hive.py @@ -98,7 +98,7 @@ def fetch_data(cls, cursor, limit: int) -> List[Tuple]: return [] @classmethod - def create_table_from_csv(cls, form, table): # pylint: disable=too-many-locals + def create_table_from_csv(cls, form) -> None: # pylint: disable=too-many-locals """Uploads a csv file and creates a superset datasource in Hive.""" def convert_to_hive_type(col_type): diff --git a/superset/translations/en/LC_MESSAGES/messages.po b/superset/translations/en/LC_MESSAGES/messages.po index 34c5787650490..6bdd3a1c921dc 100644 --- a/superset/translations/en/LC_MESSAGES/messages.po +++ b/superset/translations/en/LC_MESSAGES/messages.po @@ -4669,7 +4669,7 @@ msgid "CSV to Database configuration" msgstr "" #: superset/views/core.py:375 -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" msgstr "" #: superset/views/core.py:401 superset/views/core.py:667 @@ -7765,4 +7765,3 @@ msgstr "" #~ msgid "Saved Queries" #~ msgstr "" - diff --git a/superset/translations/fr/LC_MESSAGES/messages.json b/superset/translations/fr/LC_MESSAGES/messages.json index b49c73575f32f..4114301afe40b 100644 --- a/superset/translations/fr/LC_MESSAGES/messages.json +++ b/superset/translations/fr/LC_MESSAGES/messages.json @@ -3124,7 +3124,7 @@ "CSV to Database configuration": [ "" ], - "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"": [ + "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"": [ "" ], "User": [ diff --git a/superset/translations/fr/LC_MESSAGES/messages.po b/superset/translations/fr/LC_MESSAGES/messages.po index 180e0e7a23083..ed66cdcbc2ec8 100644 --- a/superset/translations/fr/LC_MESSAGES/messages.po +++ b/superset/translations/fr/LC_MESSAGES/messages.po @@ -4880,8 +4880,8 @@ msgid "CSV to Database configuration" msgstr "CSV vers la configuration de la base de données" #: superset/views/core.py:375 -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" -msgstr "Fichier CSV \"{0}\" chargé dans la table \"{1}\" de la base de données \"{2}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" +msgstr "Fichier CSV \"%(csv_filename)s\" chargé dans la table \"%(table_name)s\" de la base de données \"%(db_name)s\"" #: superset/views/core.py:401 superset/views/core.py:667 #: superset/views/sql_lab.py:23 superset/views/sql_lab.py:60 diff --git a/superset/translations/it/LC_MESSAGES/messages.json b/superset/translations/it/LC_MESSAGES/messages.json index 71378d40e3904..55836134da102 100644 --- a/superset/translations/it/LC_MESSAGES/messages.json +++ b/superset/translations/it/LC_MESSAGES/messages.json @@ -2923,7 +2923,7 @@ "CSV to Database configuration": [ "" ], - "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"": [ + "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"": [ "" ], "User": [ @@ -3132,4 +3132,4 @@ ] } } -} \ No newline at end of file +} diff --git a/superset/translations/it/LC_MESSAGES/messages.po b/superset/translations/it/LC_MESSAGES/messages.po index e3ca6e299f994..ba49fcb57c7a4 100644 --- a/superset/translations/it/LC_MESSAGES/messages.po +++ b/superset/translations/it/LC_MESSAGES/messages.po @@ -4432,7 +4432,7 @@ msgid "CSV to Database configuration" msgstr "" #: superset/views/core.py:363 -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" msgstr "" #: superset/views/core.py:389 superset/views/core.py:655 @@ -6461,4 +6461,3 @@ msgstr "Query salvate" #~ msgid "Slice" #~ msgstr "Slice" - diff --git a/superset/translations/ko/LC_MESSAGES/messages.po b/superset/translations/ko/LC_MESSAGES/messages.po index 2b52cb7314292..e998c294d50f9 100644 --- a/superset/translations/ko/LC_MESSAGES/messages.po +++ b/superset/translations/ko/LC_MESSAGES/messages.po @@ -3788,7 +3788,7 @@ msgstr "대시보드 가져오기" msgid "CSV to Database configuration" msgstr "" -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" msgstr "" msgid "User" diff --git a/superset/translations/messages.pot b/superset/translations/messages.pot index cd4a3f7a85d12..6c70ee1ebe912 100644 --- a/superset/translations/messages.pot +++ b/superset/translations/messages.pot @@ -4669,7 +4669,7 @@ msgid "CSV to Database configuration" msgstr "" #: superset/views/core.py:375 -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" msgstr "" #: superset/views/core.py:401 superset/views/core.py:667 @@ -4967,4 +4967,3 @@ msgstr "" #: superset/views/sql_lab.py:86 msgid "Saved Queries" msgstr "" - diff --git a/superset/translations/ru/LC_MESSAGES/messages.json b/superset/translations/ru/LC_MESSAGES/messages.json index f375364be15a6..f8bb3524e93e5 100644 --- a/superset/translations/ru/LC_MESSAGES/messages.json +++ b/superset/translations/ru/LC_MESSAGES/messages.json @@ -2923,8 +2923,8 @@ "CSV to Database configuration": [ "Настройка CSV для БД" ], - "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"": [ - "CSV-файл \"{0}\" загружен в таблицу \"{1}\" базы данных \"{2}\"" + "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"": [ + "CSV-файл \"%(csv_filename)s\" загружен в таблицу \"%(table_name)s\" базы данных \"%(db_name)s\"" ], "User": [ "Пользователь" @@ -3132,4 +3132,4 @@ ] } } -} \ No newline at end of file +} diff --git a/superset/translations/ru/LC_MESSAGES/messages.po b/superset/translations/ru/LC_MESSAGES/messages.po index ea89a332c9930..29c44f0fb5f6a 100644 --- a/superset/translations/ru/LC_MESSAGES/messages.po +++ b/superset/translations/ru/LC_MESSAGES/messages.po @@ -4544,8 +4544,8 @@ msgid "CSV to Database configuration" msgstr "Настройка CSV для БД" #: superset/views/core.py:363 -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" -msgstr "CSV-файл \"{0}\" загружен в таблицу \"{1}\" базы данных \"{2}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" +msgstr "CSV-файл \"%(csv_filename)s\" загружен в таблицу \"%(table_name)s\" базы данных \"%(db_name)s\"" #: superset/views/core.py:389 superset/views/core.py:655 #: superset/views/sql_lab.py:16 superset/views/sql_lab.py:53 diff --git a/superset/translations/zh/LC_MESSAGES/messages.json b/superset/translations/zh/LC_MESSAGES/messages.json index 76640df1bd458..05c1de1ce0eb2 100644 --- a/superset/translations/zh/LC_MESSAGES/messages.json +++ b/superset/translations/zh/LC_MESSAGES/messages.json @@ -3124,8 +3124,8 @@ "CSV to Database configuration": [ "csv 到数据库配置" ], - "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"": [ - "csv 文件 \"{0}\" 上传到数据库 \"{2}\" 中的表 \"{1}\"" + "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"": [ + "csv 文件 \"%(csv_filename)s\" 上传到数据库 \"%(db_name)s\" 中的表 \"%(table_name)s\"" ], "User": [ "用户" diff --git a/superset/translations/zh/LC_MESSAGES/messages.po b/superset/translations/zh/LC_MESSAGES/messages.po index 5077f767d06f5..6b43381f4167a 100644 --- a/superset/translations/zh/LC_MESSAGES/messages.po +++ b/superset/translations/zh/LC_MESSAGES/messages.po @@ -4746,8 +4746,8 @@ msgid "CSV to Database configuration" msgstr "csv 到数据库配置" #: superset/views/core.py:375 -msgid "CSV file \"{0}\" uploaded to table \"{1}\" in database \"{2}\"" -msgstr "csv 文件 \"{0}\" 上传到数据库 \"{2}\" 中的表 \"{1}\"" +msgid "CSV file \"%(csv_filename)s\" uploaded to table \"%(table_name)s\" in database \"%(db_name)s\"" +msgstr "csv 文件 \"%(csv_filename)s\" 上传到数据库 \"%(db_name)s\" 中的表 \"%(table_name)s\"" #: superset/views/core.py:401 superset/views/core.py:667 #: superset/views/sql_lab.py:23 superset/views/sql_lab.py:60 diff --git a/superset/views/core.py b/superset/views/core.py index d438f07e49071..0b38fcffdc413 100755 --- a/superset/views/core.py +++ b/superset/views/core.py @@ -3049,7 +3049,7 @@ def schemas_access_for_csv_upload(self): except Exception: return json_error_response( "Failed to fetch schemas allowed for csv upload in this database! " - "Please contact Superset Admin!", + "Please contact your Superset Admin!", stacktrace=utils.get_stacktrace(), ) diff --git a/superset/views/database/forms.py b/superset/views/database/forms.py index e705b57f7dea3..57144bc1c6059 100644 --- a/superset/views/database/forms.py +++ b/superset/views/database/forms.py @@ -90,7 +90,17 @@ def at_least_one_schema_is_allowed(database): csv_file = FileField( _("CSV File"), description=_("Select a CSV file to be uploaded to a database."), - validators=[FileRequired(), FileAllowed(["csv"], _("CSV Files Only!"))], + validators=[ + FileRequired(), + FileAllowed( + config["ALLOWED_EXTENSIONS"], + _( + "Only the following file extensions are allowed: " + "%(allowed_extensions)s", + allowed_extensions=", ".join(config["ALLOWED_EXTENSIONS"]), + ), + ), + ], ) con = QuerySelectField( _("Database"), diff --git a/superset/views/database/views.py b/superset/views/database/views.py index c7a97964bd616..be522a266afcd 100644 --- a/superset/views/database/views.py +++ b/superset/views/database/views.py @@ -17,7 +17,7 @@ # pylint: disable=C,R,W import os -from flask import flash, redirect +from flask import flash, g, redirect from flask_appbuilder import SimpleFormView from flask_appbuilder.forms import DynamicForm from flask_appbuilder.models.sqla.interface import SQLAInterface @@ -28,7 +28,7 @@ from wtforms.validators import ValidationError import superset.models.core as models -from superset import app, appbuilder, security_manager +from superset import app, appbuilder, db, security_manager from superset.connectors.sqla.models import SqlaTable from superset.utils import core as utils from superset.views.base import DeleteMixin, SupersetModelView, YamlExportMixin @@ -102,10 +102,10 @@ def form_post(self, form): if not self.is_schema_allowed(database, schema_name): message = _( - 'Database "{0}" Schema "{1}" is not allowed for csv uploads. ' - "Please contact Superset Admin".format( - database.database_name, schema_name - ) + 'Database "%(database_name)s" schema "%(schema_name)s" ' + "is not allowed for csv uploads. Please contact your Superset Admin.", + database_name=database.database_name, + schema_name=schema_name, ) flash(message, "danger") return redirect("/csvtodatabaseview/form") @@ -117,32 +117,58 @@ def form_post(self, form): try: utils.ensure_path_exists(config["UPLOAD_FOLDER"]) csv_file.save(path) - table = SqlaTable(table_name=form.name.data) - table.database = form.data.get("con") - table.database_id = table.database.id - table.database.db_engine_spec.create_table_from_csv(form, table) + table_name = form.name.data + database = form.data.get("con") + database.db_engine_spec.create_table_from_csv(form) + + table = ( + db.session.query(SqlaTable) + .filter_by( + table_name=table_name, + schema=form.schema.data, + database_id=database.id, + ) + .one_or_none() + ) + if table: + table.fetch_metadata() + if not table: + table = SqlaTable(table_name=table_name) + table.database = database + table.database_id = database.id + table.user_id = g.user.id + table.schema = form.schema.data + table.fetch_metadata() + db.session.add(table) + db.session.commit() except Exception as e: + db.session.rollback() try: os.remove(path) except OSError: pass - message = ( - "Table name {} already exists. Please pick another".format( - form.name.data - ) - if isinstance(e, IntegrityError) - else str(e) + message = _( + 'Unable to upload CSV file "%(filename)s" to table ' + '"%(table_name)s" in database "%(db_name)s". ' + "Error message: %(error_msg)s", + filename=csv_filename, + table_name=form.name.data, + db_name=database.database_name, + error_msg=str(e), ) + flash(message, "danger") stats_logger.incr("failed_csv_upload") return redirect("/csvtodatabaseview/form") os.remove(path) # Go back to welcome page / splash screen - db_name = table.database.database_name message = _( - 'CSV file "{0}" uploaded to table "{1}" in ' - 'database "{2}"'.format(csv_filename, form.name.data, db_name) + 'CSV file "%(csv_filename)s" uploaded to table "%(table_name)s" in ' + 'database "%(db_name)s"', + csv_filename=csv_filename, + table_name=form.name.data, + db_name=table.database.database_name, ) flash(message, "info") stats_logger.incr("successful_csv_upload") diff --git a/tests/core_tests.py b/tests/core_tests.py index e2cf010b6ae88..79df5d513fec6 100644 --- a/tests/core_tests.py +++ b/tests/core_tests.py @@ -598,42 +598,100 @@ def test_slice_query_endpoint(self): def test_import_csv(self): self.login(username="admin") - filename = "testCSV.csv" table_name = "".join(random.choice(string.ascii_uppercase) for _ in range(5)) - test_file = open(filename, "w+") - test_file.write("a,b\n") - test_file.write("john,1\n") - test_file.write("paul,2\n") - test_file.close() + filename_1 = "testCSV.csv" + test_file_1 = open(filename_1, "w+") + test_file_1.write("a,b\n") + test_file_1.write("john,1\n") + test_file_1.write("paul,2\n") + test_file_1.close() + + filename_2 = "testCSV2.csv" + test_file_2 = open(filename_2, "w+") + test_file_2.write("b,c,d\n") + test_file_2.write("john,1,x\n") + test_file_2.write("paul,2,y\n") + test_file_2.close() + example_db = utils.get_example_database() example_db.allow_csv_upload = True db_id = example_db.id db.session.commit() - test_file = open(filename, "rb") form_data = { - "csv_file": test_file, + "csv_file": open(filename_1, "rb"), "sep": ",", "name": table_name, "con": db_id, - "if_exists": "append", + "if_exists": "fail", "index_label": "test_label", "mangle_dupe_cols": False, } url = "/databaseview/list/" add_datasource_page = self.get_resp(url) - assert "Upload a CSV" in add_datasource_page + self.assertIn("Upload a CSV", add_datasource_page) url = "/csvtodatabaseview/form" form_get = self.get_resp(url) - assert "CSV to Database configuration" in form_get + self.assertIn("CSV to Database configuration", form_get) try: - # ensure uploaded successfully + # initial upload with fail mode + resp = self.get_resp(url, data=form_data) + self.assertIn( + f'CSV file "{filename_1}" uploaded to table "{table_name}"', resp + ) + + # upload again with fail mode; should fail + form_data["csv_file"] = open(filename_1, "rb") resp = self.get_resp(url, data=form_data) - assert 'CSV file "testCSV.csv" uploaded to table' in resp + self.assertIn( + f'Unable to upload CSV file "{filename_1}" to table "{table_name}"', + resp, + ) + + # upload again with append mode + form_data["csv_file"] = open(filename_1, "rb") + form_data["if_exists"] = "append" + resp = self.get_resp(url, data=form_data) + self.assertIn( + f'CSV file "{filename_1}" uploaded to table "{table_name}"', resp + ) + + # upload again with replace mode + form_data["csv_file"] = open(filename_1, "rb") + form_data["if_exists"] = "replace" + resp = self.get_resp(url, data=form_data) + self.assertIn( + f'CSV file "{filename_1}" uploaded to table "{table_name}"', resp + ) + + # try to append to table from file with different schema + form_data["csv_file"] = open(filename_2, "rb") + form_data["if_exists"] = "append" + resp = self.get_resp(url, data=form_data) + self.assertIn( + f'Unable to upload CSV file "{filename_2}" to table "{table_name}"', + resp, + ) + + # replace table from file with different schema + form_data["csv_file"] = open(filename_2, "rb") + form_data["if_exists"] = "replace" + resp = self.get_resp(url, data=form_data) + self.assertIn( + f'CSV file "{filename_2}" uploaded to table "{table_name}"', resp + ) + table = ( + db.session.query(SqlaTable) + .filter_by(table_name=table_name, database_id=db_id) + .first() + ) + # make sure the new column name is reflected in the table metadata + self.assertIn("d", table.column_names) finally: - os.remove(filename) + os.remove(filename_1) + os.remove(filename_2) def test_dataframe_timezone(self): tz = psycopg2.tz.FixedOffsetTimezone(offset=60, name=None)