From 60f648f3237f8e57da9528a4f395f87caf01d2ef Mon Sep 17 00:00:00 2001 From: Alexandre Quemy Date: Tue, 5 Dec 2023 15:01:49 +0100 Subject: [PATCH 1/2] fix: consider all nan series as numerical --- .../model/pandas/describe_numeric_pandas.py | 52 ++++++++++++------- .../report/structure/variables/render_real.py | 15 +++--- src/ydata_profiling/visualisation/plot.py | 17 +++--- 3 files changed, 49 insertions(+), 35 deletions(-) diff --git a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py index 5dc8baee2..e4e13f156 100644 --- a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py @@ -50,19 +50,30 @@ def numeric_stats_numpy( index_values = vc.index.values # FIXME: can be performance optimized by using weights in std, var, kurt and skew... - - return { - "mean": np.average(index_values, weights=vc.values), - "std": np.std(present_values, ddof=1), - "variance": np.var(present_values, ddof=1), - "min": np.min(index_values), - "max": np.max(index_values), - # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1. - "kurtosis": series.kurt(), - # Unbiased skew normalized by N-1 - "skewness": series.skew(), - "sum": np.dot(index_values, vc.values), - } + if len(index_values): + return { + "mean": np.average(index_values, weights=vc.values), + "std": np.std(present_values, ddof=1), + "variance": np.var(present_values, ddof=1), + "min": np.min(index_values), + "max": np.max(index_values), + # Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1. + "kurtosis": series.kurt(), + # Unbiased skew normalized by N-1 + "skewness": series.skew(), + "sum": np.dot(index_values, vc.values), + } + else: # Empty numerical series + return { + "mean": np.nan, + "std": 0., + "variance": 0., + "min": np.nan, + "max": np.nan, + "kurtosis": 0., + "skewness": 0., + "sum": 0, + } @describe_numeric_1d.register @@ -151,13 +162,14 @@ def pandas_describe_numeric_1d( else: stats["monotonic"] = 0 - stats.update( - histogram_compute( - config, - value_counts[~infinity_index].index.values, - summary["n_distinct"], - weights=value_counts[~infinity_index].values, + if len(value_counts[~infinity_index].index.values) > 0: + stats.update( + histogram_compute( + config, + value_counts[~infinity_index].index.values, + summary["n_distinct"], + weights=value_counts[~infinity_index].values, + ) ) - ) return config, series, stats diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py index 471ee5844..607b4e064 100644 --- a/src/ydata_profiling/report/structure/variables/render_real.py +++ b/src/ydata_profiling/report/structure/variables/render_real.py @@ -118,12 +118,12 @@ def render_real(config: Settings, summary: dict) -> dict: style=config.html.style, ) - if isinstance(summary["histogram"], list): + if isinstance(summary.get("histogram", []), list): mini_histo = Image( mini_histogram( config, - [x[0] for x in summary["histogram"]], - [x[1] for x in summary["histogram"]], + [x[0] for x in summary.get("histogram", [])], + [x[1] for x in summary.get("histogram", [])], ), image_format=image_format, alt="Mini histogram", @@ -243,13 +243,14 @@ def render_real(config: Settings, summary: dict) -> dict: sequence_type="grid", ) - if isinstance(summary["histogram"], list): + if isinstance(summary.get("histogram", []), list): hist_data = histogram( config, - [x[0] for x in summary["histogram"]], - [x[1] for x in summary["histogram"]], + [x[0] for x in summary.get("histogram", [])], + [x[1] for x in summary.get("histogram", [])], ) - hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][0][1]) - 1})" + bins = len(summary['histogram'][0][1]) - 1 if 'histogram' in summary else 0 + hist_caption = f"Histogram with fixed size bins (bins={bins})" else: hist_data = histogram(config, *summary["histogram"]) hist_caption = f"Histogram with fixed size bins (bins={len(summary['histogram'][1]) - 1})" diff --git a/src/ydata_profiling/visualisation/plot.py b/src/ydata_profiling/visualisation/plot.py index 34ed7d133..88c38b98d 100644 --- a/src/ydata_profiling/visualisation/plot.py +++ b/src/ydata_profiling/visualisation/plot.py @@ -73,14 +73,15 @@ def _plot_histogram( plot = fig.add_subplot(111) for idx in reversed(list(range(n_labels))): - diff = np.diff(bins[idx]) - plot.bar( - bins[idx][:-1] + diff / 2, # type: ignore - series[idx], - diff, - facecolor=config.html.style.primary_colors[idx], - alpha=0.6, - ) + if len(bins): + diff = np.diff(bins[idx]) + plot.bar( + bins[idx][:-1] + diff / 2, # type: ignore + series[idx], + diff, + facecolor=config.html.style.primary_colors[idx], + alpha=0.6, + ) if date: plot.xaxis.set_major_formatter(FuncFormatter(format_fn)) From 5f4c910efb2f0131869a4f24041d232893fa2c8a Mon Sep 17 00:00:00 2001 From: Azory YData Bot Date: Tue, 5 Dec 2023 14:20:19 +0000 Subject: [PATCH 2/2] fix(linting): code formatting --- .../model/pandas/describe_numeric_pandas.py | 10 +++++----- .../report/structure/variables/render_real.py | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py index e4e13f156..fa3ffd6cf 100644 --- a/src/ydata_profiling/model/pandas/describe_numeric_pandas.py +++ b/src/ydata_profiling/model/pandas/describe_numeric_pandas.py @@ -63,15 +63,15 @@ def numeric_stats_numpy( "skewness": series.skew(), "sum": np.dot(index_values, vc.values), } - else: # Empty numerical series + else: # Empty numerical series return { "mean": np.nan, - "std": 0., - "variance": 0., + "std": 0.0, + "variance": 0.0, "min": np.nan, "max": np.nan, - "kurtosis": 0., - "skewness": 0., + "kurtosis": 0.0, + "skewness": 0.0, "sum": 0, } diff --git a/src/ydata_profiling/report/structure/variables/render_real.py b/src/ydata_profiling/report/structure/variables/render_real.py index 607b4e064..227200c27 100644 --- a/src/ydata_profiling/report/structure/variables/render_real.py +++ b/src/ydata_profiling/report/structure/variables/render_real.py @@ -249,7 +249,7 @@ def render_real(config: Settings, summary: dict) -> dict: [x[0] for x in summary.get("histogram", [])], [x[1] for x in summary.get("histogram", [])], ) - bins = len(summary['histogram'][0][1]) - 1 if 'histogram' in summary else 0 + bins = len(summary["histogram"][0][1]) - 1 if "histogram" in summary else 0 hist_caption = f"Histogram with fixed size bins (bins={bins})" else: hist_data = histogram(config, *summary["histogram"])