# Source code for wolfhece.assets.boxplot.distribution

"""Boxplot statistics distribution model.

Computes quartiles, whiskers and outliers from one or more raw data series,
following the standard Tukey fences convention (configurable whisker
coefficient *whis*, default 1.5 × IQR).
"""

from __future__ import annotations

from dataclasses import dataclass
from typing import Sequence

import numpy as np


@dataclass
class BoxplotStatistics:
    """Computed statistics for one data series.

    Instances compare equal when they are of the same class, all scalar
    fields match, and both array fields hold the same elements. Equality is
    implemented explicitly because the tuple-based ``__eq__`` that
    ``@dataclass`` would generate truth-tests ``ndarray == ndarray`` and so
    raises ``ValueError`` for arrays with more than one element.

    Instances remain unhashable, as for any dataclass defining ``__eq__``
    without being frozen.
    """

    label: str  # display name of the series
    values: np.ndarray  # sorted raw values
    q1: float  # first quartile (25th percentile)
    q2: float  # median
    q3: float  # third quartile (75th percentile)
    iqr: float  # interquartile range
    whisker_low: float  # lower whisker end
    whisker_high: float  # upper whisker end
    outliers: np.ndarray  # values lying beyond the whisker ends
    mean: float  # arithmetic mean of the series

    def __eq__(self, other: object) -> bool:
        """Return whether *other* holds the same statistics, element-wise."""
        # Same strict class check the dataclass-generated __eq__ performs.
        if other.__class__ is not self.__class__:
            return NotImplemented
        scalars_self = (
            self.label, self.q1, self.q2, self.q3, self.iqr,
            self.whisker_low, self.whisker_high, self.mean,
        )
        scalars_other = (
            other.label, other.q1, other.q2, other.q3, other.iqr,
            other.whisker_low, other.whisker_high, other.mean,
        )
        return (
            scalars_self == scalars_other
            and np.array_equal(self.values, other.values)
            and np.array_equal(self.outliers, other.outliers)
        )
class BoxplotDistributionModel:
    """Compute and store boxplot statistics from raw series data.

    Every series is validated and kept as a flat ``float64`` array.
    Statistics are recomputed on each call to :meth:`statistics`.

    Args:
        series: One or more non-empty series of finite numbers.
        labels: One label per series; defaults to ``"Series 1"``,
            ``"Series 2"``, ...
        whis: Whisker coefficient; the fences lie at ``whis * IQR`` beyond
            the first and third quartiles.

    Raises:
        ValueError: If no series is given, a series is empty or contains
            non-finite values, or the number of labels does not match the
            number of series.
    """

    def __init__(
        self,
        series: Sequence[Sequence[float]],
        labels: Sequence[str] | None = None,
        whis: float = 1.5,
    ) -> None:
        self._whis = float(whis)
        self._set_series(series, labels)

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    @property
    def n_series(self) -> int:
        """Number of stored series."""
        return len(self._series)

    @property
    def labels(self) -> list[str]:
        """Copy of the series labels, in series order."""
        return list(self._labels)

    @property
    def whis(self) -> float:
        """Whisker coefficient applied to the IQR."""
        return self._whis

    def set_series(
        self,
        series: Sequence[Sequence[float]],
        labels: Sequence[str] | None = None,
        whis: float | None = None,
    ) -> None:
        """Replace the stored series and, optionally, the whisker coefficient.

        The update is all-or-nothing: if *series* or *labels* are rejected,
        the previous series, labels and coefficient are all left untouched.

        Args:
            series: New data series (same rules as the constructor).
            labels: New labels, or ``None`` for generated defaults.
            whis: New whisker coefficient, or ``None`` to keep the current one.

        Raises:
            ValueError: Under the same conditions as the constructor.
        """
        new_whis = self._whis if whis is None else float(whis)
        # Validate the data first so a rejected call cannot leave a new
        # coefficient paired with the old series.
        self._set_series(series, labels)
        self._whis = new_whis

    def statistics(self) -> list[BoxplotStatistics]:
        """Return computed statistics for every series.

        Whisker ends are the most extreme data values lying within the
        fences ``q1 - whis * IQR`` and ``q3 + whis * IQR``, clamped so that
        they never fall inside the box. Values beyond the whisker ends are
        reported as outliers.
        """
        out: list[BoxplotStatistics] = []
        for arr, label in zip(self._series, self._labels):
            q1, q2, q3 = (float(q) for q in np.percentile(arr, [25, 50, 75]))
            iqr = q3 - q1
            fence_lo = q1 - self._whis * iqr
            fence_hi = q3 + self._whis * iqr
            inside_lo = arr[arr >= fence_lo]
            inside_hi = arr[arr <= fence_hi]
            # Quartiles are interpolated, so the nearest datum within a fence
            # may lie on the box side of q1/q3 (e.g. [0, 10, 10, 10] gives
            # q1 = 7.5 while the lowest in-fence value is 10). Clamp so the
            # whisker never ends inside the box.
            wlo = min(float(inside_lo.min()), q1) if inside_lo.size else q1
            whi = max(float(inside_hi.max()), q3) if inside_hi.size else q3
            outliers = arr[(arr < wlo) | (arr > whi)]
            out.append(BoxplotStatistics(
                label=label,
                values=np.sort(arr),
                q1=q1,
                q2=q2,
                q3=q3,
                iqr=iqr,
                whisker_low=wlo,
                whisker_high=whi,
                outliers=outliers,
                mean=float(np.mean(arr)),
            ))
        return out

    def global_y_bounds(self) -> tuple[float, float]:
        """Return (y_min, y_max) encompassing all values including outliers."""
        stats = self.statistics()
        lo = min(s.whisker_low for s in stats)
        hi = max(s.whisker_high for s in stats)
        for s in stats:
            if s.outliers.size:
                lo = min(lo, float(s.outliers.min()))
                hi = max(hi, float(s.outliers.max()))
        return lo, hi

    # ------------------------------------------------------------------
    # Private helpers
    # ------------------------------------------------------------------

    def _set_series(
        self,
        series: Sequence[Sequence[float]],
        labels: Sequence[str] | None,
    ) -> None:
        """Validate *series* and *labels*, then store them together.

        Nothing is assigned to the instance until every check has passed.

        Raises:
            ValueError: If there are no series, a series is empty or holds
                non-finite values, or the label count does not match.
        """
        # Materialise before testing for emptiness: truth-testing a
        # multi-element NumPy array raises, and a generator is always truthy
        # even when it yields nothing.
        candidates = [] if series is None else list(series)
        if not candidates:
            raise ValueError("At least one data series is required")

        processed: list[np.ndarray] = []
        for i, s in enumerate(candidates):
            arr = np.asarray(list(s), dtype=np.float64).ravel()
            if arr.size == 0:
                raise ValueError(f"Series {i} is empty")
            if not np.isfinite(arr).all():
                raise ValueError(f"Series {i} contains non-finite values")
            processed.append(arr)

        n = len(processed)
        if labels is None:
            lbls = [f"Series {i + 1}" for i in range(n)]
        else:
            lbls = list(labels)
            if len(lbls) != n:
                raise ValueError(
                    f"Number of labels ({len(lbls)}) must match number of"
                    f" series ({n})"
                )

        self._series = processed
        self._labels = lbls