chore: update logic for row metric

ydataai · Oct 28, 2024 · bb878f7
1 parent 7528e2c
commit bb878f7
Show file tree

Hide file tree

Showing 2 changed files with 20 additions and 17 deletions.
diff --git a/src/ydata_profiling/utils/common.py b/src/ydata_profiling/utils/common.py
@@ -148,18 +148,22 @@ def calculate_nrows(df):
 
     Returns: int, approximate number of rows
     """
-    try:
-        n_partitions = df.rdd.getNumPartitions()
-
-        nrows = (
-            df.rdd.mapPartitionsWithIndex(
-                lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
-            ).collect()[0]
-            * n_partitions
-        )
-    except:
-        nrows = (
-            0  # returns 0 in case it was not possible to compute it from the partition
-        )
+    if isinstance(df, pd.DataFrame):
+        if df is not None:
+            nrows = len(df)
+        else:
+            nrows = 0
+    else:
+        try:
+            n_partitions = df.rdd.getNumPartitions()
+
+            nrows = (
+                    df.rdd.mapPartitionsWithIndex(
+                        lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
+                    ).collect()[0]
+                    * n_partitions
+            )
+        except:
+            nrows = 0
 
     return nrows
diff --git a/src/ydata_profiling/utils/logger.py b/src/ydata_profiling/utils/logger.py
@@ -22,19 +22,18 @@ def info_def_report(self, df, timeseries: bool) -> None:  # noqa: ANN001
             ncols = len(df.columns)
         except AttributeError:
             ncols=0
-
+
+        nrows = calculate_nrows(df)
+
         if isinstance(df, pd.DataFrame):
             dataframe = "pandas"
             report_type = "regular"
-            nrows = len(df)
         elif df is None:
             dataframe = "pandas"
             report_type = "compare"
-            nrows = len(df)
         else:
             dataframe = "spark"
             report_type = "regular"
-            nrows = calculate_nrows(df)
 
         dbx = is_running_in_databricks()
         datatype = "timeseries" if timeseries else "tabular"