"""Cell 21 — Eval-curve renderer (4 plot panels for DESIGN.md §15 pitch).

Implements ``docs/modules/evaluation.md`` §2.1 ``render_plots``, §3.4
(per-language bars), §3.5 (drift-detection latency curve), §3.8 (2-min
budget), §5 ``PlotRenderError`` / ``WandBHistoryUnavailableWarning``,
§7 edge cases 2 (empty cohort), 3 (Stage-1 NaN), 6 (WandB purged).

Hard rules (evaluation.md §3.8, §6.3):
- ``matplotlib`` only; no seaborn.
- Canonical figsize ``(16, 9)`` inches at ``dpi=100`` → ``1600x900`` px PNGs.
- ``wandb_run_id is None`` → skip the two history-driven plots, render the
  other two; warn via ``WandBHistoryUnavailableWarning``.
- Wall-clock budget 2 minutes (``EvalBudgetExceededError``).
- No LLM-as-judge; static AST scan via ``_NO_LLM_JUDGE_FORBIDDEN_IMPORTS``.
"""

from __future__ import annotations

import math
import time
import warnings
from pathlib import Path
from typing import TYPE_CHECKING, Any

from cells.step_18_eval_baseline import (
    EvalBudgetExceededError,
    EvalReport,
    EvaluationError,
)

if TYPE_CHECKING:  # pragma: no cover - typing only
    from collections.abc import Callable


# Public surface of this module: the render entry point, its budget and
# canvas constants, and the error / warning types defined below.
__all__ = [
    "BUDGET_RENDER_PLOTS_SECONDS",
    "CANONICAL_FIGSIZE",
    "CANONICAL_DPI",
    "PlotRenderError",
    "WandBHistoryUnavailableWarning",
    "render_plots",
]


# ---------------------------------------------------------------------------
# Constants — evaluation.md §3.8
# ---------------------------------------------------------------------------


# Figure size in inches handed to ``plt.subplots`` by ``_new_figure``.
# 16 x 9 inches at CANONICAL_DPI (100) is a 1600x900 px canvas.
CANONICAL_FIGSIZE: tuple[float, float] = (16.0, 9.0)
"""evaluation.md integration §3.4 — every PNG is 1600x900 px at dpi=100."""

# Dots per inch, used both when figures are created and when they are saved.
CANONICAL_DPI: int = 100

# Wall-clock ceiling in seconds; the default for ``render_plots``'
# ``budget_seconds`` keyword.
BUDGET_RENDER_PLOTS_SECONDS: int = 120
"""evaluation.md §3.8 — 2-minute hard ceiling on ``render_plots``."""

# Import names of hosted-LLM SDKs. Not referenced anywhere else in the visible
# part of this module; the module docstring says a static AST scan uses this
# set to enforce the "no LLM-as-judge" rule.
_NO_LLM_JUDGE_FORBIDDEN_IMPORTS: frozenset[str] = frozenset(
    {"openai", "anthropic", "vertexai", "google.generativeai", "cohere"},
)


# ---------------------------------------------------------------------------
# Errors / warnings — evaluation.md §5
# ---------------------------------------------------------------------------


class PlotRenderError(EvaluationError):
    """Raised when a rendered figure cannot be written to disk.

    ``_save_figure`` raises this, chained to the underlying ``OSError``, when
    creating the output directory or saving the PNG fails (for example a full
    or unwriteable disk).
    """


class WandBHistoryUnavailableWarning(UserWarning):
    """Warning category for a missing or unusable WandB training history.

    Emitted instead of raising so that rendering can degrade gracefully: the
    two history-driven plots are skipped and the remaining plots are still
    produced.
    """


# ---------------------------------------------------------------------------
# Internal helpers
# ---------------------------------------------------------------------------


def _new_figure(title: str) -> Any:
    """Create a titled single-axes figure on the canonical canvas.

    The ``Agg`` backend is requested (without forcing) before ``pyplot`` is
    imported. The figure is created with ``CANONICAL_FIGSIZE`` and
    ``CANONICAL_DPI``.

    Args:
        title: Text set as the axes title.

    Returns:
        A ``(figure, axes)`` tuple as produced by ``pyplot.subplots``.
    """
    import matplotlib

    # Backend selection has to precede the pyplot import below.
    matplotlib.use("Agg", force=False)
    from matplotlib import pyplot

    figure, axes = pyplot.subplots(dpi=CANONICAL_DPI, figsize=CANONICAL_FIGSIZE)
    axes.set_title(title)
    return figure, axes


def _save_figure(fig: Any, out_path: Path) -> None:
    """Write ``fig`` to ``out_path`` at the canonical size, then close it.

    The parent directory is created when missing. The figure is saved at
    ``CANONICAL_DPI`` on its full canvas. ``bbox_inches="tight"`` is
    deliberately not used: a tight bounding box crops (or pads) the canvas to
    the drawn artists, so the PNG would no longer be ``figsize * dpi``
    = 1600x900 px as this module's hard rules require.

    Args:
        fig: The matplotlib figure to save.
        out_path: Destination file path.

    Raises:
        PlotRenderError: If creating the directory or writing the file fails
            with an ``OSError`` (disk full, unwriteable path, ...). The
            original error is chained as the cause.
    """
    try:
        out_path.parent.mkdir(parents=True, exist_ok=True)
        fig.savefig(out_path, dpi=CANONICAL_DPI)
    except OSError as exc:  # disk full, unwriteable
        raise PlotRenderError(
            f"failed to save plot to {out_path}: {exc}",
        ) from exc
    finally:
        # Release the figure on success and on failure so repeated renders do
        # not accumulate open pyplot figures.
        import matplotlib.pyplot as plt

        plt.close(fig)


def _wandb_curves(wandb_run_id: str | None) -> dict[str, list[tuple[int, float]]]:
    """Fetch WandB history as per-key ``(step, value)`` curves, best effort.

    Every way of ending up without curves emits exactly one
    ``WandBHistoryUnavailableWarning`` and returns ``{}``:

    - ``wandb_run_id`` is ``None``;
    - the ``wandb`` package cannot be imported;
    - fetching the run history fails;
    - the history was fetched but contained nothing that could be coerced
      into curves. This case used to return ``{}`` silently, so the
      history-driven plots disappeared without any diagnostic.

    Args:
        wandb_run_id: Run identifier passed to the WandB API, or ``None``.

    Returns:
        Mapping from history key to a list of ``(step, value)`` pairs; empty
        when no history is available.
    """
    if wandb_run_id is None:
        reason = "WandB run id is None — per_reward_stack and drift_latency_vs_step skipped."
    else:
        wandb = _try_import_wandb()
        history = None if wandb is None else _try_fetch_wandb_history(wandb, wandb_run_id)
        if wandb is None:
            reason = f"wandb import failed — history for {wandb_run_id!r} unavailable."
        elif history is None:
            reason = f"WandB fetch failed for run {wandb_run_id!r}."
        else:
            curves = _coerce_history(history)
            if curves:
                return curves
            reason = f"WandB history for run {wandb_run_id!r} contained no usable curves."

    # stacklevel=2 attributes the warning to the caller of this helper.
    warnings.warn(reason, WandBHistoryUnavailableWarning, stacklevel=2)
    return {}


def _try_import_wandb() -> Any:
    """Import ``wandb`` lazily and return the module, or ``None``.

    Only ``ImportError`` is treated as "not available"; any other exception
    raised while importing propagates to the caller.
    """
    from importlib import import_module

    try:
        module = import_module("wandb")
    except ImportError:
        module = None
    return module


def _try_fetch_wandb_history(wandb_mod: Any, run_id: str) -> Any:
    """Fetch a run's history through the WandB public API, best effort.

    Calls ``wandb_mod.Api().run(run_id).history()`` and returns the result
    unchanged.

    Args:
        wandb_mod: The imported ``wandb`` module (or a compatible object).
        run_id: Run identifier passed to ``Api.run``.

    Returns:
        Whatever ``run.history()`` returns, or ``None`` if any step raised.
    """
    try:
        return wandb_mod.Api().run(run_id).history()
    except Exception:  # noqa: BLE001 - deliberate best-effort boundary
        # The contract is "None on any failure". Client-library and transport
        # errors (communication errors, HTTP/connection errors, auth errors)
        # are not subclasses of a short list of builtin exception types, so a
        # narrow except clause lets them escape and abort rendering instead of
        # degrading to "no history". KeyboardInterrupt / SystemExit still
        # propagate because they are not Exception subclasses.
        return None


def _coerce_history(history: Any) -> dict[str, list[tuple[int, float]]]:
    """Coerce a WandB history into per-key ``(step, value)`` pairs.

    Three input shapes are understood:

    - ``dict`` mapping key -> list of ``(step, value)`` rows: each row is
      converted with ``int`` / ``float``; non-list values are dropped.
    - ``list`` of per-step record dicts (one dict per logged step).
    - Any object exposing ``to_dict(orient="records")`` — i.e. a
      DataFrame-like table — which is converted to such a record list first.

    For record input, the step is taken from the record's ``"_step"`` entry,
    falling back to the record's position when that entry is missing or not
    an integer-convertible number. Cells that are not numeric, or are NaN
    (a key that was not logged at that step), are skipped so they do not
    become points on a curve.

    Args:
        history: The object returned by the history fetch.

    Returns:
        Mapping from history key to its list of ``(step, value)`` pairs.
        ``{}`` for unsupported input types.
    """
    if isinstance(history, dict):
        return {
            key: [(int(r[0]), float(r[1])) for r in rows]
            for key, rows in history.items()
            if isinstance(rows, list)
        }

    if isinstance(history, list):
        records = history
    else:
        to_dict = getattr(history, "to_dict", None)
        if not callable(to_dict):
            return {}
        try:
            records = to_dict(orient="records")
        except (TypeError, ValueError):
            # Has a ``to_dict`` but not with DataFrame semantics.
            return {}

    out: dict[str, list[tuple[int, float]]] = {}
    for index, record in enumerate(records):
        if not isinstance(record, dict):
            continue
        try:
            step = int(record["_step"])
        except (KeyError, TypeError, ValueError, OverflowError):
            step = index
        for key, raw in record.items():
            if key == "_step":
                continue
            try:
                value = float(raw)
            except (TypeError, ValueError, OverflowError):
                continue  # non-numeric cell (string, None, nested object)
            if math.isnan(value):
                continue  # key not logged at this step
            out.setdefault(str(key), []).append((step, value))
    return out


# ---------------------------------------------------------------------------
# Plot 1 — per-reward stack — evaluation.md §3.5 (over training steps)
# ---------------------------------------------------------------------------


def _plot_per_reward_stack(curves: dict[str, list[tuple[int, float]]], out_path: Path) -> Path:
    """Plot the R1..R5 reward-mean curves against training step.

    Each reward is looked up under ``train/<key>`` first and under the bare
    ``<key>`` second; rewards with no rows are skipped. When nothing could be
    plotted, a centred placeholder message is drawn instead.

    Args:
        curves: History key -> list of ``(step, value)`` rows.
        out_path: Destination PNG path.

    Returns:
        The resolved ``out_path``.

    Raises:
        PlotRenderError: Propagated from ``_save_figure`` on write failure.
    """
    fig, ax = _new_figure("Per-reward means vs training step")
    plotted_any = False
    for key in ("R1_mean", "R2_mean", "R3_mean", "R4_mean", "R5_mean"):
        rows = curves.get(f"train/{key}") or curves.get(key)
        if not rows:
            continue
        ax.plot([r[0] for r in rows], [r[1] for r in rows], label=key)
        plotted_any = True

    ax.set_xlabel("training step")
    ax.set_ylabel("reward mean")
    if plotted_any:
        ax.legend(loc="best")
    else:
        # No labelled artists: skip the legend (matplotlib would log a
        # "no artists with labels" warning and draw an empty box) and pin the
        # placeholder to the axes centre regardless of the data limits.
        ax.text(
            0.5,
            0.5,
            "No WandB history available",
            ha="center",
            va="center",
            transform=ax.transAxes,
        )
    _save_figure(fig, out_path)
    return out_path.resolve()


# ---------------------------------------------------------------------------
# Plot 2 — drift-detection latency vs step — evaluation.md §3.5
# ---------------------------------------------------------------------------


def _plot_drift_latency_vs_step(
    curves: dict[str, list[tuple[int, float]]],
    final: EvalReport,
    out_path: Path,
) -> Path:
    """Plot drift-detection latency (p50 / p95) against training step.

    The p50 and p95 series come from ``eval/drift_latency_p50`` and
    ``eval/drift_latency_p95`` in ``curves``. If
    ``final.drift_detection_latency.stage3_median`` is not NaN and a p50
    series exists, it is added as a star marker to the right of the last p50
    row. When neither series has rows, a centred placeholder message is drawn
    instead.

    Args:
        curves: History key -> list of ``(step, value)`` rows.
        final: Final evaluation report supplying the held-out median latency.
        out_path: Destination PNG path.

    Returns:
        The resolved ``out_path``.

    Raises:
        PlotRenderError: Propagated from ``_save_figure`` on write failure.
    """
    fig, ax = _new_figure("Drift-detection latency vs training step")
    p50_rows = curves.get("eval/drift_latency_p50") or []
    p95_rows = curves.get("eval/drift_latency_p95") or []
    for label, rows in (("p50", p50_rows), ("p95", p95_rows)):
        if rows:
            ax.plot([r[0] for r in rows], [r[1] for r in rows], label=label)

    # Final point (rightmost) from the held-out 50 (evaluation.md §3.5 fusion).
    # NOTE(review): the +50 x-offset is a literal; presumably it just places
    # the marker to the right of the last logged step — confirm.
    p50_final = final.drift_detection_latency.stage3_median
    if not math.isnan(p50_final) and p50_rows:
        last_step = p50_rows[-1][0] + 50
        ax.scatter([last_step], [p50_final], label="final p50", marker="*", s=120)

    ax.set_xlabel("training step")
    ax.set_ylabel("turns to adapt")
    if p50_rows or p95_rows:
        ax.legend(loc="best")
    else:
        # Nothing labelled was drawn: skip the legend (matplotlib would log a
        # "no artists with labels" warning and draw an empty box) and pin the
        # placeholder to the axes centre regardless of the data limits.
        ax.text(
            0.5,
            0.5,
            "Stage 1 eval — no drift events",
            ha="center",
            va="center",
            transform=ax.transAxes,
        )
    _save_figure(fig, out_path)
    return out_path.resolve()


# ---------------------------------------------------------------------------
# Plot 3 — per-language bars — evaluation.md §3.4
# ---------------------------------------------------------------------------


def _plot_per_language_bars(final: EvalReport, out_path: Path) -> Path:
    """Draw grouped R1..R5 mean bars, one group per non-empty language cohort.

    Cohorts with ``n_episodes == 0`` are dropped. If none remain, only a
    centred placeholder message is drawn. Cohorts with 1 to 4 episodes get a
    small ``(low-n n=<count>)`` note under their group.

    Args:
        final: Final evaluation report; its ``per_language`` cohorts are read.
        out_path: Destination PNG path.

    Returns:
        The resolved ``out_path``.
    """
    fig, ax = _new_figure("Per-language reward breakdown (final)")
    populated = [cohort for cohort in final.per_language if cohort.n_episodes > 0]

    if populated:
        import numpy as np

        width = 0.15
        positions = np.arange(len(populated))
        # Five bars per group; the third bar (offset 2 * width) is the centre.
        group_centre = positions + 2 * width

        reward_attrs = ("r1_mean", "r2_mean", "r3_mean", "r4_mean", "r5_mean")
        for slot, attr in enumerate(reward_attrs):
            heights = [getattr(cohort, attr) for cohort in populated]
            ax.bar(positions + slot * width, heights, width, label=attr.upper())

        ax.set_xticks(group_centre)
        ax.set_xticklabels([cohort.language for cohort in populated])
        ax.set_xlabel("language")
        ax.set_ylabel("mean")
        ax.legend(loc="best")

        # Annotate low-n cohorts (1-4) with '(low-n)' suffix per evaluation.md §3.4.
        for cohort, pos in zip(populated, positions, strict=True):
            if not 1 <= cohort.n_episodes <= 4:
                continue
            ax.annotate(
                f"(low-n n={cohort.n_episodes})",
                xy=(pos + 2 * width, 0),
                xytext=(0, -20),
                textcoords="offset points",
                ha="center",
                fontsize=8,
            )
    else:
        ax.text(0.5, 0.5, "No non-empty per-language cohorts", ha="center", va="center")

    _save_figure(fig, out_path)
    return out_path.resolve()


# ---------------------------------------------------------------------------
# Plot 4 — before/after bars — evaluation.md §2.1
# ---------------------------------------------------------------------------


def _plot_before_after_bars(
    baseline: EvalReport,
    final: EvalReport,
    out_path: Path,
) -> Path:
    """Draw paired baseline / final bars per reward channel with CI whiskers.

    For each of ``reward`` and ``r1``..``r5`` the report attribute
    ``<key>_mean_ci`` is unpacked as ``(mean, lo, hi)``. Whisker lengths are
    ``mean - lo`` and ``hi - mean``, clamped at zero: matplotlib rejects
    negative ``yerr`` values with ``ValueError``, which would abort the whole
    render if an interval does not bracket its mean.

    If the baseline R1 mean is zero, a "0 of 50 successes" note is placed on
    the baseline R1 bar.

    Args:
        baseline: Baseline evaluation report.
        final: Final evaluation report.
        out_path: Destination PNG path.

    Returns:
        The resolved ``out_path``.

    Raises:
        PlotRenderError: Propagated from ``_save_figure`` on write failure.
    """
    fig, ax = _new_figure("Baseline vs Final — per-reward means with 95% CI")
    keys = ("reward", "r1", "r2", "r3", "r4", "r5")
    import numpy as np

    x = np.arange(len(keys))
    bar_w = 0.35
    for label, report, offset in (
        ("baseline", baseline, -bar_w / 2),
        ("final", final, bar_w / 2),
    ):
        means: list[float] = []
        errs: list[tuple[float, float]] = []
        for key in keys:
            mean, lo, hi = getattr(report, f"{key}_mean_ci")
            means.append(mean)
            # max(0.0, v) also turns a NaN half-width into 0.0; a NaN mean
            # draws no bar, so no whisker is lost.
            errs.append((max(0.0, mean - lo), max(0.0, hi - mean)))
        # yerr expects shape (2, N): row 0 = lower lengths, row 1 = upper.
        ax.bar(x + offset, means, bar_w, yerr=np.asarray(errs).T, label=label, capsize=4)

    ax.set_xticks(x)
    ax.set_xticklabels([k.upper() for k in keys])
    ax.set_xlabel("reward channel")
    ax.set_ylabel("mean (95% CI)")
    ax.legend(loc="best")

    # Zero-success-baseline annotation per evaluation.md §7.1.
    # NOTE(review): the "50" in the label is a literal, not derived from the
    # report — confirm it matches the evaluated cohort size.
    if math.isclose(baseline.r1_mean_ci[0], 0.0, abs_tol=1e-12):
        ax.annotate(
            "0 of 50 successes",
            xy=(1 - bar_w / 2, 0),  # x of the baseline bar in the R1 group
            xytext=(0, 12),
            textcoords="offset points",
            ha="center",
            fontsize=8,
        )
    _save_figure(fig, out_path)
    return out_path.resolve()


# ---------------------------------------------------------------------------
# Public entry point — evaluation.md §2.1
# ---------------------------------------------------------------------------


def render_plots(
    baseline: EvalReport,
    final: EvalReport,
    wandb_run_id: str | None,
    out_dir: Path,
    *,
    budget_seconds: int = BUDGET_RENDER_PLOTS_SECONDS,
    monotonic: Callable[[], float] | None = None,
) -> dict[str, Path]:
    """Render the evaluation plot panels into ``out_dir`` (evaluation.md §2.1, §3.5).

    ``per_language_bars`` and ``before_after_bars`` are always rendered.
    ``per_reward_stack`` and ``drift_latency_vs_step`` are rendered only when
    ``wandb_run_id`` is given and a non-empty history could be obtained;
    otherwise they are left out of the result.

    Args:
        baseline: Baseline evaluation report.
        final: Final evaluation report.
        wandb_run_id: WandB run whose history drives the two curve plots, or
            ``None`` to skip them.
        out_dir: Output directory (created if missing); must be a ``Path``.
        budget_seconds: Wall-clock ceiling for the whole call.
        monotonic: Clock override; defaults to ``time.monotonic``. It is read
            once before the history fetch and once after the last plot.

    Returns:
        Mapping from plot name to the path of the PNG that was written.

    Raises:
        EvaluationError: If ``out_dir`` is not a ``pathlib.Path``.
        EvalBudgetExceededError: If the elapsed time, measured after all
            plots have been written, exceeds ``budget_seconds``.
    """
    if not isinstance(out_dir, Path):
        raise EvaluationError(
            f"out_dir must be pathlib.Path; got {type(out_dir).__name__}",
        )
    out_dir.mkdir(parents=True, exist_ok=True)

    now = time.monotonic if monotonic is None else monotonic
    t_start = now()

    history_curves = _wandb_curves(wandb_run_id)
    have_history = wandb_run_id is not None and bool(history_curves)

    rendered: dict[str, Path] = {}
    if have_history:
        rendered["per_reward_stack"] = _plot_per_reward_stack(
            history_curves,
            out_dir / "per_reward_stack.png",
        )
        rendered["drift_latency_vs_step"] = _plot_drift_latency_vs_step(
            history_curves,
            final,
            out_dir / "drift_latency_vs_step.png",
        )

    # These two depend only on the reports, never on WandB.
    rendered["per_language_bars"] = _plot_per_language_bars(
        final,
        out_dir / "per_language_bars.png",
    )
    rendered["before_after_bars"] = _plot_before_after_bars(
        baseline,
        final,
        out_dir / "before_after_bars.png",
    )

    elapsed = now() - t_start
    if elapsed > budget_seconds:
        raise EvalBudgetExceededError(
            f"render_plots wall-clock {elapsed:.1f}s exceeded {budget_seconds}s "
            f"({budget_seconds // 60} min ceiling)",
        )
    return rendered