Skip to content

Commit

Permalink
fix: consider all nan series as numerical
Browse files (browse the repository at this point in the history)
  • Loading branch information
aquemy committed Dec 5, 2023
1 parent 93e7430 commit f466efa
Show file tree
Hide file tree
Showing 3 changed files with 49 additions and 35 deletions.
52 changes: 32 additions & 20 deletions src/ydata_profiling/model/pandas/describe_numeric_pandas.py
Original file line number Diff line number Diff line change
@@ -50,19 +50,30 @@ def numeric_stats_numpy(
index_values = vc.index.values

# FIXME: can be performance optimized by using weights in std, var, kurt and skew...

return {
"mean": np.average(index_values, weights=vc.values),
"std": np.std(present_values, ddof=1),
"variance": np.var(present_values, ddof=1),
"min": np.min(index_values),
"max": np.max(index_values),
# Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
"kurtosis": series.kurt(),
# Unbiased skew normalized by N-1
"skewness": series.skew(),
"sum": np.dot(index_values, vc.values),
}
if len(index_values):
return {
"mean": np.average(index_values, weights=vc.values),
"std": np.std(present_values, ddof=1),
"variance": np.var(present_values, ddof=1),
"min": np.min(index_values),
"max": np.max(index_values),
# Unbiased kurtosis obtained using Fisher's definition (kurtosis of normal == 0.0). Normalized by N-1.
"kurtosis": series.kurt(),
# Unbiased skew normalized by N-1
"skewness": series.skew(),
"sum": np.dot(index_values, vc.values),
}
else: # Empty numerical series
return {
"mean": np.nan,
"std": 0.,
"variance": 0.,
"min": np.nan,
"max": np.nan,
"kurtosis": 0.,
"skewness": 0.,
"sum": 0,
}


@describe_numeric_1d.register
@@ -151,13 +162,14 @@ def pandas_describe_numeric_1d(
else:
stats["monotonic"] = 0

stats.update(
histogram_compute(
config,
value_counts[~infinity_index].index.values,
summary["n_distinct"],
weights=value_counts[~infinity_index].values,
if len(value_counts[~infinity_index].index.values) > 0:
stats.update(
histogram_compute(
config,
value_counts[~infinity_index].index.values,
summary["n_distinct"],
weights=value_counts[~infinity_index].values,
)
)
)

return config, series, stats
15 changes: 8 additions & 7 deletions src/ydata_profiling/report/structure/variables/render_real.py
Original file line number Diff line number Diff line change
@@ -118,12 +118,12 @@ def render_real(config: Settings, summary: dict) -> dict:
style=config.html.style,
)

if isinstance(summary["histogram"], list):
if isinstance(summary.get("histogram", []), list):
mini_histo = Image(
mini_histogram(
config,
[x[0] for x in summary["histogram"]],
[x[1] for x in summary["histogram"]],
[x[0] for x in summary.get("histogram", [])],
[x[1] for x in summary.get("histogram", [])],
),
image_format=image_format,
alt="Mini histogram",
@@ -243,13 +243,14 @@ def render_real(config: Settings, summary: dict) -> dict:
sequence_type="grid",
)

if isinstance(summary["histogram"], list):
if isinstance(summary.get("histogram", []), list):
hist_data = histogram(
config,
[x[0] for x in summary["histogram"]],
[x[1] for x in summary["histogram"]],
[x[0] for x in summary.get("histogram", [])],
[x[1] for x in summary.get("histogram", [])],
)
hist_caption = f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][0][1]) - 1})"
bins = len(summary['histogram'][0][1]) - 1 if 'histogram' in summary else 0
hist_caption = f"<strong>Histogram with fixed size bins</strong> (bins={bins})"
else:
hist_data = histogram(config, *summary["histogram"])
hist_caption = f"<strong>Histogram with fixed size bins</strong> (bins={len(summary['histogram'][1]) - 1})"
Expand Down
17 changes: 9 additions & 8 deletions src/ydata_profiling/visualisation/plot.py
Original file line number Diff line number Diff line change
@@ -73,14 +73,15 @@ def _plot_histogram(
plot = fig.add_subplot(111)

for idx in reversed(list(range(n_labels))):
diff = np.diff(bins[idx])
plot.bar(
bins[idx][:-1] + diff / 2, # type: ignore
series[idx],
diff,
facecolor=config.html.style.primary_colors[idx],
alpha=0.6,
)
if len(bins):
diff = np.diff(bins[idx])
plot.bar(
bins[idx][:-1] + diff / 2, # type: ignore
series[idx],
diff,
facecolor=config.html.style.primary_colors[idx],
alpha=0.6,
)

if date:
plot.xaxis.set_major_formatter(FuncFormatter(format_fn))
Expand Down

0 comments on commit f466efa

Please sign in to comment.