Skip to content

Commit

Permalink
chore: update logic for row metric
Browse files: browse the repository at this point in the history
  • Loading branch information
fabclmnt committed Oct 28, 2024
1 parent 7528e2c commit bb878f7
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 17 deletions.
30 changes: 17 additions & 13 deletions src/ydata_profiling/utils/common.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,18 +148,22 @@ def calculate_nrows(df):
Returns: int, approximate number of rows
"""
try:
n_partitions = df.rdd.getNumPartitions()

nrows = (
df.rdd.mapPartitionsWithIndex(
lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
).collect()[0]
* n_partitions
)
except:
nrows = (
0 # returns 0 in case it was not possible to compute it from the partition
)
if isinstance(df, pd.DataFrame):
if df is not None:
nrows = len(df)
else:
nrows = 0
else:
try:
n_partitions = df.rdd.getNumPartitions()

nrows = (
df.rdd.mapPartitionsWithIndex(
lambda idx, partition: [sum(1 for _ in partition)] if idx == 0 else [0]
).collect()[0]
* n_partitions
)
except:
nrows = 0

return nrows
7 changes: 3 additions & 4 deletions src/ydata_profiling/utils/logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,18 @@ def info_def_report(self, df, timeseries: bool) -> None: # noqa: ANN001
ncols = len(df.columns)
except AttributeError:
ncols=0


nrows = calculate_nrows(df)

if isinstance(df, pd.DataFrame):
dataframe = "pandas"
report_type = "regular"
nrows = len(df)
elif df is None:
dataframe = "pandas"
report_type = "compare"
nrows = len(df)
else:
dataframe = "spark"
report_type = "regular"
nrows = calculate_nrows(df)

dbx = is_running_in_databricks()
datatype = "timeseries" if timeseries else "tabular"
Expand Down

0 comments on commit bb878f7

Please sign in to comment.