"""Benchmark baseline vs Siamese on CV-job pairs and propose decision thresholds.

Usage:
  source ../.venv-phase2/bin/activate
  python scripts/benchmark_models.py \
      --data ../data/training_pairs.csv \
      --baseline ../models/baseline_model.joblib \
      --siamese ../models/siamese_model_phase2_full \
      --out-json ../reports/model_comparison.json
"""

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

import joblib
import numpy as np
import pandas as pd
from sklearn.metrics import accuracy_score, f1_score, precision_recall_curve, precision_score, recall_score, roc_auc_score
from sklearn.model_selection import train_test_split

# Directory containing this script, and the directory two levels above it.
# `repo_root` is the base for the default --data/--baseline/--siamese/--out-json
# paths built in main().
script_dir = Path(__file__).resolve().parent
repo_root = script_dir.parent.parent
# Put <repo_root>/backend at the front of sys.path (only once) before the
# `app.*` imports that follow; presumably that is where the `app` package
# lives, so the script can be run directly without installing the backend.
if str(repo_root / "backend") not in sys.path:
    sys.path.insert(0, str(repo_root / "backend"))

from app.services.feature_engineering import PairFeatureMeta, build_pair_features  # noqa: E402
from app.services.normalization import normalize_text  # noqa: E402


def _load_baseline(path: Path) -> tuple[Any, dict[str, Any]]:
    """Read the baseline bundle from disk and split it into model and metadata.

    The file at *path* is deserialized with ``joblib.load`` and indexed like a
    mapping: ``"model"`` is mandatory, ``"meta"`` is optional.

    Args:
        path: Location of the serialized baseline bundle.

    Returns:
        A ``(model, meta)`` pair. *meta* is an empty dict when the bundle has
        no ``"meta"`` entry or the stored value is falsy.
    """
    # NOTE: joblib.load unpickles arbitrary objects; only point this at
    # bundles produced by a trusted training run.
    payload = joblib.load(path)
    estimator = payload["model"]
    metadata = payload.get("meta")
    if not metadata:
        metadata = {}
    return estimator, metadata


def _baseline_scores(model: Any, meta: dict[str, Any], cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
    if isinstance(meta, PairFeatureMeta):
        feature_meta = meta
    else:
        feature_meta = PairFeatureMeta(tfidf=meta["tf"], svd=meta["svd"])
    X = np.vstack([build_pair_features(c, j, feature_meta) for c, j in zip(cv_texts, job_texts)])

    try:
        scores = model.predict_proba(X)[:, 1]
    except Exception:
        try:
            raw = model.decision_function(X)
            scores = 1.0 / (1.0 + np.exp(-raw))
        except Exception:
            scores = model.predict(X).astype(float)
    return np.clip(scores, 0.0, 1.0)


def _siamese_scores(model_path: Path, cv_texts: list[str], job_texts: list[str]) -> np.ndarray:
    """Score CV/job pairs by embedding similarity from a SentenceTransformer.

    The encoder stored at *model_path* embeds both text lists with
    ``normalize_embeddings=True``. The row-wise dot product of the two
    embedding matrices is then taken as the pair similarity, and the result is
    clipped so that negative similarities become 0.

    Args:
        model_path: Directory or identifier passed to ``SentenceTransformer``.
        cv_texts: CV texts, aligned index-by-index with *job_texts*.
        job_texts: Job texts, aligned index-by-index with *cv_texts*.

    Returns:
        1-D array with one similarity per pair, clipped to ``[0.0, 1.0]``.
    """
    # Imported lazily so the heavy dependency is only loaded when needed.
    from sentence_transformers import SentenceTransformer

    encoder = SentenceTransformer(str(model_path))
    encode_options = {
        "convert_to_numpy": True,
        "normalize_embeddings": True,
        "show_progress_bar": False,
    }
    cv_vectors = encoder.encode(cv_texts, **encode_options)
    job_vectors = encoder.encode(job_texts, **encode_options)

    # Element-wise product summed per row == dot product of matching rows.
    similarity = np.sum(cv_vectors * job_vectors, axis=1)
    return np.clip(similarity, 0.0, 1.0)


def _metrics(y_true: np.ndarray, scores: np.ndarray, threshold: float = 0.5) -> dict[str, float]:
    """Compute classification metrics for *scores* cut at *threshold*.

    A pair is predicted positive when ``score >= threshold``. Precision,
    recall and F1 use ``zero_division=0``, so they report 0 instead of
    warning when a denominator is empty.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous model scores, one per label.
        threshold: Decision cut-off applied to *scores*.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall``, ``f1``,
        ``roc_auc`` and ``threshold`` (in that order), all as plain floats.
    """
    hard_labels = (scores >= threshold).astype(int)

    report: dict[str, float] = {"accuracy": float(accuracy_score(y_true, hard_labels))}
    label_scorers = (
        ("precision", precision_score),
        ("recall", recall_score),
        ("f1", f1_score),
    )
    for key, scorer in label_scorers:
        report[key] = float(scorer(y_true, hard_labels, zero_division=0))

    # ROC-AUC is computed from the continuous scores, not the thresholded
    # labels, so it is the same for every threshold.
    report["roc_auc"] = float(roc_auc_score(y_true, scores))
    report["threshold"] = float(threshold)
    return report


def _best_f1_threshold(y_true: np.ndarray, scores: np.ndarray) -> float:
    """Return the threshold on a 0.00..1.00 grid (step 0.01) that maximizes F1.

    Each of the 101 grid values is used as a ``score >= t`` cut-off and the
    resulting F1 (with ``zero_division=0``) is recorded. When several cut-offs
    tie for the best F1, the lowest one is returned.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous model scores, one per label.

    Returns:
        The winning threshold as a plain float.
    """
    grid = np.linspace(0.0, 1.0, 101)
    f1_per_cut = [
        f1_score(y_true, (scores >= cut).astype(int), zero_division=0)
        for cut in grid
    ]
    # argmax yields the first index of the maximum, i.e. the lowest threshold
    # among ties, which is what an ascending scan with strict ">" would keep.
    winner = int(np.argmax(f1_per_cut))
    return float(grid[winner])


def _threshold_for_precision(
    y_true: np.ndarray,
    scores: np.ndarray,
    target_precision: float,
    fallback: float,
    min_threshold: float,
) -> float:
    """Return the lowest score threshold that reaches *target_precision*.

    Thresholds come from ``precision_recall_curve``. Among those that are at
    least *min_threshold*, the smallest one whose precision is at least
    *target_precision* is returned.

    Args:
        y_true: Ground-truth binary labels.
        scores: Continuous model scores, one per label.
        target_precision: Minimum precision the threshold must achieve.
        fallback: Value returned when no threshold qualifies.
        min_threshold: Lower bound on acceptable thresholds.

    Returns:
        The selected threshold as a plain float, or *fallback* when the curve
        has no thresholds or none satisfies both constraints.
    """
    precision, _recall, thresholds = precision_recall_curve(y_true, scores)
    if len(thresholds) == 0:
        return fallback

    # `precision` has one more entry than `thresholds`: precision[i] is the
    # precision of predictions with score >= thresholds[i], and the final
    # entry is a fixed 1.0 that has no threshold. Dropping that last entry
    # aligns the two arrays. Indexing with i + 1 instead would credit every
    # threshold with the precision of the next, stricter one, and the top
    # threshold would always appear to reach precision 1.0.
    precision_at_threshold = precision[:-1]
    eligible = (precision_at_threshold >= target_precision) & (thresholds >= min_threshold)
    if not eligible.any():
        return fallback
    return float(thresholds[eligible].min())


def main() -> None:
    """Benchmark both models on a held-out split and write a JSON report.

    Steps:
      1. Parse the CLI options (data path, model paths, split size, seed,
         output path).
      2. Load the pair CSV, cast ``label`` to int and run ``normalize_text``
         over ``cv_text`` and ``job_text``.
      3. Take a stratified train/test split and score only the test rows with
         the baseline bundle and the Siamese encoder.
      4. Compute metrics at threshold 0.5 and at each model's best-F1
         threshold.
      5. Select the model with the higher ROC-AUC (baseline wins ties) and
         derive accept/review thresholds from precision targets of 0.90 and
         0.70.
      6. Write the report to ``--out-json`` and echo it to stdout.

    Raises:
        ValueError: if the CSV lacks ``label``, ``cv_text`` or ``job_text``.

    NOTE(review): the best-F1 and precision-target thresholds are tuned on
    the same test split they are reported on, so those figures are
    optimistic. Whether the loaded models were trained without these rows is
    not visible from this script; confirm the training split uses the same
    seed and test size.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--data", default=str(repo_root / "data" / "training_pairs.csv"))
    parser.add_argument("--baseline", default=str(repo_root / "models" / "baseline_model.joblib"))
    parser.add_argument("--siamese", default=str(repo_root / "models" / "siamese_model_phase2_full"))
    parser.add_argument("--test-size", type=float, default=0.15)
    parser.add_argument("--seed", type=int, default=42)
    parser.add_argument("--out-json", default=str(repo_root / "reports" / "model_comparison.json"))
    args = parser.parse_args()

    df = pd.read_csv(args.data)
    # Fail with a readable message instead of an opaque KeyError further down.
    missing_columns = [name for name in ("label", "cv_text", "job_text") if name not in df.columns]
    if missing_columns:
        raise ValueError(f"{args.data} is missing required column(s): {', '.join(missing_columns)}")

    df["label"] = df["label"].astype(int)
    for text_column in ("cv_text", "job_text"):
        df[text_column] = df[text_column].fillna("").astype(str).map(normalize_text)

    train_df, test_df = train_test_split(
        df,
        test_size=args.test_size,
        random_state=args.seed,
        stratify=df["label"],
    )

    y_test = test_df["label"].to_numpy()
    cv_texts = test_df["cv_text"].tolist()
    job_texts = test_df["job_text"].tolist()

    baseline_model, baseline_meta = _load_baseline(Path(args.baseline))
    baseline_scores = _baseline_scores(baseline_model, baseline_meta, cv_texts, job_texts)

    siamese_scores = _siamese_scores(Path(args.siamese), cv_texts, job_texts)

    baseline_default = _metrics(y_test, baseline_scores, threshold=0.5)
    siamese_default = _metrics(y_test, siamese_scores, threshold=0.5)

    baseline_best_t = _best_f1_threshold(y_test, baseline_scores)
    siamese_best_t = _best_f1_threshold(y_test, siamese_scores)

    baseline_best = _metrics(y_test, baseline_scores, threshold=baseline_best_t)
    siamese_best = _metrics(y_test, siamese_scores, threshold=siamese_best_t)

    # Business calibration uses the model with the higher ROC-AUC; ROC-AUC does
    # not depend on the threshold, and the baseline wins ties.
    selected_model = "baseline" if baseline_default["roc_auc"] >= siamese_default["roc_auc"] else "siamese"
    selected_scores = baseline_scores if selected_model == "baseline" else siamese_scores

    accept_t = _threshold_for_precision(
        y_test,
        selected_scores,
        target_precision=0.90,
        fallback=0.80,
        min_threshold=0.50,
    )
    review_t = _threshold_for_precision(
        y_test,
        selected_scores,
        target_precision=0.70,
        fallback=0.50,
        min_threshold=0.30,
    )
    # Keep the review threshold at least 0.05 below the accept threshold.
    # accept_t is always >= 0.50 here (min_threshold / fallback above), so the
    # lower clamp at 0.0 never triggers today; it only guarantees that
    # review <= accept still holds if those constants are changed later.
    review_t = float(max(0.0, min(review_t, accept_t - 0.05)))

    # Thresholds are exported on a 0-100 scale, rounded to two decimals.
    accept_pct = round(accept_t * 100.0, 2)
    review_pct = round(review_t * 100.0, 2)

    result = {
        "dataset": {
            "path": str(Path(args.data).resolve()),
            "rows_total": int(len(df)),
            "rows_train": int(len(train_df)),
            "rows_test": int(len(test_df)),
            "seed": int(args.seed),
            "test_size": float(args.test_size),
        },
        "baseline": {
            "default_threshold_0_5": baseline_default,
            "best_f1": baseline_best,
        },
        "siamese": {
            "default_threshold_0_5": siamese_default,
            "best_f1": siamese_best,
            "model_path": str(Path(args.siamese).resolve()),
        },
        "production_recommendation": {
            "model": selected_model,
            "accept_threshold_score_pct": accept_pct,
            "review_threshold_score_pct": review_pct,
            "env": {
                "MATCH_ACCEPT_THRESHOLD": str(accept_pct),
                "MATCH_REVIEW_THRESHOLD": str(review_pct),
            },
        },
    }

    report_json = json.dumps(result, indent=2, ensure_ascii=False)

    out_path = Path(args.out_json)
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(report_json, encoding="utf-8")

    print("=== Model comparison complete ===")
    print(report_json)
    print(f"Saved report to {out_path}")


if __name__ == "__main__":
    main()