From 687b92c4fd5384486d780faaec6633de182b50ec Mon Sep 17 00:00:00 2001 From: Trinh Quoc Anh Date: Fri, 12 Apr 2024 16:57:58 +0000 Subject: [PATCH 1/4] chore: update python3statement url --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f2bf68b51..914cf6521 100644 --- a/README.md +++ b/README.md @@ -220,7 +220,7 @@ pip install -e . The profiling report is written in HTML and CSS, which means a modern browser is required. -You need [Python 3](https://python3statement.org/) to run the package. Other dependencies can be found in the requirements files: +You need [Python 3](https://python3statement.github.io/) to run the package. Other dependencies can be found in the requirements files: | Filename | Requirements| |----------|-------------| From 12376b33441819c901aa567f69f8bb26dae176b9 Mon Sep 17 00:00:00 2001 From: Alp Aribal Date: Sat, 13 Apr 2024 09:31:02 +0100 Subject: [PATCH 2/4] use correct value in template --- .../flavours/html/templates/alerts/alert_constant.html | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html index cbe578dc7..2689418bd 100644 --- a/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html +++ b/src/ydata_profiling/report/presentation/flavours/html/templates/alerts/alert_constant.html @@ -1 +1 @@ -{{ alert.column_name }} has constant value "{{ alert.values['mode'] }}" +{{ alert.column_name }} has constant value "{{ alert.values['value_counts_without_nan'].index[0] }}" From 9cf9a085143f868e9a1f3394471e0d8f8646db9f Mon Sep 17 00:00:00 2001 From: frelion Date: Sat, 9 Dec 2023 14:01:37 +0800 Subject: [PATCH 3/4] Resolve the issue of conflicts between columns added during the analysis process and the original data columns in the Spark version. --- src/ydata_profiling/model/spark/summary_spark.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py index f9ce848f3..7d2ca0c2e 100644 --- a/src/ydata_profiling/model/spark/summary_spark.py +++ b/src/ydata_profiling/model/spark/summary_spark.py @@ -87,11 +87,17 @@ def multiprocess_1d(args: tuple) -> Tuple[str, dict]: column, df = args return column, describe_1d(config, df.select(column), summarizer, typeset) + # Rename the df column names to prevent potential conflicts + for col in df.columns: + df = df.withColumnRenamed(col, f"{col}_customer") + args = [(name, df) for name in df.columns] with multiprocessing.pool.ThreadPool(12) as executor: for i, (column, description) in enumerate( executor.imap_unordered(multiprocess_1d, args) ): + if column.endswith("_customer"): + column = column[:-9] pbar.set_postfix_str(f"Describe variable:{column}") # summary clean up for spark @@ -99,7 +105,7 @@ def multiprocess_1d(args: tuple) -> Tuple[str, dict]: series_description[column] = description pbar.update() - series_description = {k: series_description[k] for k in df.columns} + series_description = {k[:-9]: series_description[k[:-9]] for k in df.columns} # Mapping from column name to variable type series_description = sort_column_names(series_description, config.sort) From ddcb388839716daf7a33538f8bb152e2221c25a3 Mon Sep 17 00:00:00 2001 From: frelion Date: Sat, 9 Dec 2023 14:30:38 +0800 Subject: [PATCH 4/4] remove trailing whitespace --- src/ydata_profiling/model/spark/summary_spark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ydata_profiling/model/spark/summary_spark.py b/src/ydata_profiling/model/spark/summary_spark.py index 7d2ca0c2e..13a85f4c3 100644 --- a/src/ydata_profiling/model/spark/summary_spark.py +++ b/src/ydata_profiling/model/spark/summary_spark.py @@ -87,7 +87,7 @@ def multiprocess_1d(args: tuple) -> Tuple[str, dict]: column, df = args return column, describe_1d(config, df.select(column), summarizer, typeset) - # Rename the df column names to prevent potential conflicts + # Rename the df column names to prevent potential conflicts for col in df.columns: df = df.withColumnRenamed(col, f"{col}_customer")