"""Boxplot statistics distribution model.
Computes quartiles, whiskers and outliers from one or more raw data series,
following the standard Tukey fences convention (configurable whisker
coefficient *whis*, default 1.5 × IQR).
"""
from __future__ import annotations
from dataclasses import dataclass
from typing import Sequence
import numpy as np
@dataclass
class BoxplotStatistics:
    """Computed statistics for one data series.

    Instances are created by ``BoxplotDistributionModel.statistics()``, which
    passes every field by keyword. The fields are declared in the same order
    as that call.
    """

    label: str  # display name of the series
    values: np.ndarray  # sorted raw values
    q1: float  # 25th percentile
    q2: float  # 50th percentile (median)
    q3: float  # 75th percentile
    iqr: float  # interquartile range, q3 - q1
    whisker_low: float  # lower whisker end
    whisker_high: float  # upper whisker end
    outliers: np.ndarray  # values lying beyond either whisker
    mean: float  # arithmetic mean of the series
class BoxplotDistributionModel:
    """Compute and store boxplot statistics from raw series data.

    Every series is stored as a flat ``float64`` array. Quartiles come from
    ``np.percentile`` and whiskers follow the Tukey convention: each whisker
    ends at the most extreme data value lying within ``whis * IQR`` of the
    box, and never ends inside the box itself.
    """

    def __init__(
        self,
        series: Sequence[Sequence[float]],
        labels: Sequence[str] | None = None,
        whis: float = 1.5,
    ) -> None:
        """Create a model from raw data.

        Args:
            series: One iterable of numbers per data series.
            labels: One label per series; defaults to "Series 1", "Series 2", ...
            whis: Whisker coefficient applied to the IQR (must be >= 0).

        Raises:
            ValueError: If *whis* is NaN or negative, if no series is given,
                if a series is empty or holds non-finite values, or if the
                number of labels does not match the number of series.
        """
        self._whis = self._checked_whis(whis)
        self._set_series(series, labels)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    @property
    def n_series(self) -> int:
        """Number of stored data series."""
        return len(self._series)

    @property
    def labels(self) -> list[str]:
        """Copy of the series labels, in series order."""
        return list(self._labels)

    @property
    def whis(self) -> float:
        """Whisker coefficient applied to the IQR."""
        return self._whis

    def set_series(
        self,
        series: Sequence[Sequence[float]],
        labels: Sequence[str] | None = None,
        whis: float | None = None,
    ) -> None:
        """Replace the stored series, and optionally the whisker coefficient.

        The update is all-or-nothing: if any argument is rejected, the
        previously stored series, labels and coefficient are kept.

        Args:
            series: One iterable of numbers per data series.
            labels: One label per series; defaults to "Series 1", "Series 2", ...
            whis: New whisker coefficient, or ``None`` to keep the current one.

        Raises:
            ValueError: Under the same conditions as the constructor.
        """
        # Validate the coefficient up front but commit it only after the
        # series have been accepted, so a failure cannot leave a half update.
        new_whis = self._whis if whis is None else self._checked_whis(whis)
        self._set_series(series, labels)
        self._whis = new_whis

    def statistics(self) -> list[BoxplotStatistics]:
        """Return computed statistics for every series."""
        out: list[BoxplotStatistics] = []
        for arr, label in zip(self._series, self._labels):
            q1, q2, q3 = (float(q) for q in np.percentile(arr, (25, 50, 75)))
            wlo, whi = self._whisker_limits(arr, q1, q3, self._whis)
            outliers = arr[(arr < wlo) | (arr > whi)]
            out.append(BoxplotStatistics(
                label=label,
                values=np.sort(arr),
                q1=q1,
                q2=q2,
                q3=q3,
                iqr=q3 - q1,
                whisker_low=wlo,
                whisker_high=whi,
                outliers=outliers,
                mean=float(np.mean(arr)),
            ))
        return out

    def global_y_bounds(self) -> tuple[float, float]:
        """Return (y_min, y_max) encompassing all values including outliers."""
        # Whisker ends and outliers together always span the raw data, so the
        # bounds are simply the overall extremes; no percentiles are needed.
        lo = min(float(arr.min()) for arr in self._series)
        hi = max(float(arr.max()) for arr in self._series)
        return lo, hi

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------
    @staticmethod
    def _checked_whis(whis: float) -> float:
        """Return *whis* as a float, rejecting NaN and negative values."""
        value = float(whis)
        # ``not value >= 0`` is also true for NaN, which compares false to
        # every number.
        if not value >= 0.0:
            raise ValueError(
                f"whis must be a non-negative number, got {whis!r}"
            )
        return value

    @staticmethod
    def _whisker_limits(
        arr: np.ndarray,
        q1: float,
        q3: float,
        whis: float,
    ) -> tuple[float, float]:
        """Return the (low, high) whisker ends of *arr* for the given box."""
        if np.isinf(whis):
            # Unbounded fences: whiskers span the data. Handled explicitly
            # because inf * 0 is NaN when the IQR is zero.
            return float(arr.min()), float(arr.max())
        reach = whis * (q3 - q1)
        inside_lo = arr[arr >= q1 - reach]
        inside_hi = arr[arr <= q3 + reach]
        # A quartile can be interpolated between two samples, so the nearest
        # sample inside a fence may lie within the box; clamp the whisker to
        # the box edge so it never ends inside the box.
        wlo = min(float(inside_lo.min()), q1) if inside_lo.size else q1
        whi = max(float(inside_hi.max()), q3) if inside_hi.size else q3
        return wlo, whi

    def _set_series(
        self,
        series: Sequence[Sequence[float]],
        labels: Sequence[str] | None,
    ) -> None:
        """Validate *series* and *labels*, then store them.

        Nothing is stored unless every check passes.

        Raises:
            ValueError: If no series is given, a series is empty or contains
                non-finite values, or the label count does not match.
        """
        # Materialise first: truth-testing a multi-row ndarray raises an
        # unrelated error, and a generator is truthy even when it is empty.
        rows = [] if series is None else list(series)
        if not rows:
            raise ValueError("At least one data series is required")
        processed: list[np.ndarray] = []
        for i, s in enumerate(rows):
            arr = np.asarray(list(s), dtype=np.float64).ravel()
            if arr.size == 0:
                raise ValueError(f"Series {i} is empty")
            if not np.isfinite(arr).all():
                raise ValueError(f"Series {i} contains non-finite values")
            processed.append(arr)
        n = len(processed)
        if labels is None:
            lbls = [f"Series {i + 1}" for i in range(n)]
        else:
            lbls = list(labels)
            if len(lbls) != n:
                raise ValueError(
                    f"Number of labels ({len(lbls)}) must match number of"
                    f" series ({n})"
                )
        self._series = processed
        self._labels = lbls