From 2c616c6a009886dafd8b954cb1e16a7c24fd24b1 Mon Sep 17 00:00:00 2001 From: alexbarros Date: Thu, 4 Apr 2024 15:47:09 -0300 Subject: [PATCH] fix: boolean and date failures with empty data --- .../model/pandas/describe_boolean_pandas.py | 15 +++++++--- .../model/pandas/describe_date_pandas.py | 30 ++++++++++++------- .../report/structure/variables/render_date.py | 3 +- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py index 492172e52..66cfe6491 100644 --- a/src/ydata_profiling/model/pandas/describe_boolean_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_boolean_pandas.py @@ -1,6 +1,7 @@ from typing import Tuple import pandas as pd +import numpy as np from ydata_profiling.config import Settings from ydata_profiling.model.pandas.imbalance_pandas import column_imbalance_score @@ -26,9 +27,15 @@ def pandas_describe_boolean_1d( A dict containing calculated series description values. """ - value_counts = summary["value_counts_without_nan"] - summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]}) - - summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) + value_counts: pd.Series = summary["value_counts_without_nan"] + if not value_counts.empty: + summary.update({"top": value_counts.index[0], "freq": value_counts.iloc[0]}) + summary["imbalance"] = column_imbalance_score(value_counts, len(value_counts)) + else: + summary.update({ + "top": np.nan, + "freq": 0, + "imbalance": 0, + }) return config, series, summary diff --git a/src/ydata_profiling/model/pandas/describe_date_pandas.py b/src/ydata_profiling/model/pandas/describe_date_pandas.py index b8df2e3ad..1ff64a50f 100644 --- a/src/ydata_profiling/model/pandas/describe_date_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_date_pandas.py @@ -29,16 +29,26 @@ def pandas_describe_date_1d( Returns: A dict containing calculated series description values. """ - summary.update( - { - "min": pd.Timestamp.to_pydatetime(series.min()), - "max": pd.Timestamp.to_pydatetime(series.max()), - } - ) - - summary["range"] = summary["max"] - summary["min"] - - values = series.values.astype(np.int64) // 10**9 + if summary["value_counts_without_nan"].empty: + values = series.values + summary.update( + { + "min": pd.NaT, + "max": pd.NaT, + "range": 0, + } + ) + else: + summary.update( + { + "min": pd.Timestamp.to_pydatetime(series.min()), + "max": pd.Timestamp.to_pydatetime(series.max()), + } + ) + + summary["range"] = summary["max"] - summary["min"] + + values = series.values.astype(np.int64) // 10**9 if config.vars.num.chi_squared_threshold > 0.0: summary["chi_squared"] = chi_square(values) diff --git a/src/ydata_profiling/report/structure/variables/render_date.py b/src/ydata_profiling/report/structure/variables/render_date.py index aa6850516..4cab3e5b9 100644 --- a/src/ydata_profiling/report/structure/variables/render_date.py +++ b/src/ydata_profiling/report/structure/variables/render_date.py @@ -103,13 +103,14 @@ def render_date(config: Settings, summary: Dict[str, Any]) -> Dict[str, Any]: ) # Bottom + n_bins = len(summary['histogram'][1]) - 1 if summary['histogram'] else 0 bottom = Container( [ Image( hist_data, image_format=image_format, alt="Histogram", - caption=f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})", + caption=f"Histogram with fixed size bins (bins={n_bins})", name="Histogram", anchor_id=f"{varid}histogram", )