// Contamination Prior (v0.7.3 anti-bullshit pack #4)
// Bayesian-ish prior on whether a benchmark score is contaminated, based on
// (model training cutoff date) × (benchmark release date) × (known leak status).
// Pure logic — no human strings. Open LLM Leaderboard v1 (MMLU/HellaSwag/etc)
// was killed for contamination; this lets a user calibrate trust per score.

// Benchmark registry, keyed by benchmark id. Every record carries:
//   id          — same string as the record's key
//   name        — display name
//   released    — release date as "YYYY-MM"
//   in_corpora  — whether the benchmark is known to be in common pretraining
//                 corpora (CommonCrawl etc)
//   leak_factor — 0..1 incident-driven base-rate adjustment (confirmed leaks,
//                 paraphrased copies in training data, etc)
//   category    — topical tag
//   paper       — short citation
//
// Sources: arxiv 2404.00699 (contamination survey), HF dataset cards,
// public reproductions / known leak reports.
export const BENCHMARK_DB = Object.fromEntries(
  [
    // [id, name, released, in_corpora, leak_factor, category, paper]
    ["mmlu",          "MMLU",                 "2020-09", true,  0.18, "knowledge",          "Hendrycks 2020"],
    ["mmlu_pro",      "MMLU-Pro",             "2024-06", false, 0.05, "knowledge",          "Wang 2024"],
    ["hellaswag",     "HellaSwag",            "2019-05", true,  0.20, "commonsense",        "Zellers 2019"],
    ["arc_challenge", "ARC Challenge",        "2018-04", true,  0.15, "knowledge",          "Clark 2018"],
    ["truthfulqa",    "TruthfulQA",           "2021-09", true,  0.10, "truthfulness",       "Lin 2021"],
    ["gsm8k",         "GSM8K",                "2021-10", true,  0.12, "math",               "Cobbe 2021"],
    ["math",          "MATH",                 "2021-03", true,  0.10, "math",               "Hendrycks 2021"],
    ["humaneval",     "HumanEval",            "2021-07", true,  0.18, "code",               "Chen 2021"],
    ["mbpp",          "MBPP",                 "2021-08", true,  0.12, "code",               "Austin 2021"],
    ["bbh",           "BIG-Bench Hard (BBH)", "2022-10", true,  0.08, "reasoning",          "Suzgun 2022"],
    ["ifeval",        "IFEval",               "2023-11", false, 0.05, "instruction",        "Zhou 2023"],
    ["musr",          "MuSR",                 "2023-10", false, 0.04, "reasoning",          "Sprague 2023"],
    ["gpqa",          "GPQA",                 "2023-11", false, 0.04, "graduate-knowledge", "Rein 2023"],
    ["math500",       "MATH-500",             "2023-11", false, 0.05, "math",               "Lightman 2023"],
    ["aime24",        "AIME 2024",            "2024-02", false, 0.02, "math",               "AIME 2024"],
    ["winogrande",    "Winogrande",           "2019-07", true,  0.15, "commonsense",        "Sakaguchi 2019"],
    ["boolq",         "BoolQ",                "2019-05", true,  0.15, "reading",            "Clark 2019"],
    ["drop",          "DROP",                 "2019-04", true,  0.12, "reading",            "Dua 2019"],
    ["triviaqa",      "TriviaQA",             "2017-05", true,  0.18, "knowledge",          "Joshi 2017"],
    ["squad",         "SQuAD",                "2016-06", true,  0.20, "reading",            "Rajpurkar 2016"],
  ].map(([id, name, released, in_corpora, leak_factor, category, paper]) => [
    id,
    { id, name, released, in_corpora, leak_factor, category, paper },
  ])
);

// Lenient date reader. Accepts text beginning with "YYYY", "YYYY-MM" or
// "YYYY-MM-DD" (surrounding whitespace is ignored) and produces a UTC Date.
// Gives null for falsy input or text that does not start with four digits.
// A missing month defaults to June and a missing day to the 15th; a supplied
// month is clamped into 1..12 and a supplied day into 1..28.
function parseLooseDate(s) {
  if (!s) return null;

  const parts = /^(\d{4})(?:-(\d{1,2}))?(?:-(\d{1,2}))?/.exec(String(s).trim());
  if (parts === null) return null;

  const [, yearText, monthText, dayText] = parts;
  const clamp = (value, lo, hi) => Math.min(hi, Math.max(lo, value));

  const year = Number.parseInt(yearText, 10);
  const month = monthText === undefined ? 6 : clamp(Number.parseInt(monthText, 10), 1, 12);
  const day = dayText === undefined ? 15 : clamp(Number.parseInt(dayText, 10), 1, 28);

  // Date.UTC takes a zero-based month index.
  return new Date(Date.UTC(year, month - 1, day));
}

// Base prior from timing alone: the chance that benchmark text made it into
// the training data, as a function of (cutoff - release) in months.
//
// Piecewise-linear heuristic:
//   negative gap (released after cutoff) → flat 0.02 (only via leaks)
//   0 to 3 months                        → 0.10 rising to 0.25
//   3 to 12 months                       → 0.25 rising to 0.55
//   12 to 24 months                      → 0.55 rising to 0.75
//   beyond 24 months                     → 0.75 rising until capped at 0.92
function timePrior(gapMonths) {
  if (gapMonths < 0) return 0.02;

  // Each ramp is [startMonth, endMonth, valueAtStart, riseAcrossRamp].
  const ramps = [
    [0, 3, 0.10, 0.15],
    [3, 12, 0.25, 0.30],
    [12, 24, 0.55, 0.20],
  ];
  for (const [start, end, floor, rise] of ramps) {
    if (gapMonths <= end) {
      return floor + ((gapMonths - start) / (end - start)) * rise;
    }
  }

  // Open-ended last ramp, capped.
  const tail = 0.75 + ((gapMonths - 24) / 36) * 0.17;
  return Math.min(0.92, tail);
}

// Per-benchmark contamination prior. The three components are ADDED:
//   prior = timePrior(gap) + corpora boost (0.10 when in_corpora) + leak_factor
// and the sum is clamped to [0.01, 0.97] (always some uncertainty).
//
//   modelCutoff — model training cutoff, in any form parseLooseDate accepts
//   benchmarkId — a key of BENCHMARK_DB
//
// Returns null when benchmarkId is not an own key of BENCHMARK_DB or when
// either date cannot be parsed. Otherwise returns a flat record holding the
// benchmark metadata, the cutoff as given, the gap in months (1 decimal), each
// component, the prior (2 decimals), its level ("high" at >= 0.65, "medium"
// at >= 0.30, otherwise "low") and the advice_code that goes with the level.
export function computeContaminationPrior(modelCutoff, benchmarkId) {
  // Own-key check so inherited names ("constructor", "__proto__", ...) are
  // rejected explicitly rather than by the date parse happening to fail.
  if (!Object.prototype.hasOwnProperty.call(BENCHMARK_DB, benchmarkId)) return null;
  const bench = BENCHMARK_DB[benchmarkId];
  if (!bench) return null;
  const cutoffDate = parseLooseDate(modelCutoff);
  const releaseDate = parseLooseDate(bench.released);
  if (!cutoffDate || !releaseDate) return null;

  // Positive gap: the benchmark was public before the cutoff.
  // 30.44 is used as the average number of days per month.
  const gapMs = cutoffDate.getTime() - releaseDate.getTime();
  const gapMonths = gapMs / (1000 * 60 * 60 * 24 * 30.44);
  const tp = timePrior(gapMonths);
  const corporaBoost = bench.in_corpora ? 0.10 : 0.0;
  const raw = tp + corporaBoost + bench.leak_factor;
  const prior = Math.max(0.01, Math.min(0.97, raw));

  // Classify on the same two-decimal value that is returned, so the level can
  // never contradict the reported prior (e.g. 0.647 is reported as 0.65 and
  // must therefore be "high") and float noise at a threshold cannot flip it.
  const reportedPrior = Math.round(prior * 100) / 100;

  let level;
  if (reportedPrior >= 0.65) level = "high";
  else if (reportedPrior >= 0.30) level = "medium";
  else level = "low";

  return {
    benchmark: bench.name,
    benchmark_id: bench.id,
    benchmark_released: bench.released,
    benchmark_category: bench.category,
    benchmark_in_corpora: bench.in_corpora,
    benchmark_paper: bench.paper,
    model_cutoff: modelCutoff,
    gap_months: Math.round(gapMonths * 10) / 10,
    time_prior: Math.round(tp * 100) / 100,
    corpora_boost: corporaBoost,
    leak_factor: bench.leak_factor,
    prior: reportedPrior,
    level,
    advice_code: level === "high" ? "treat_unreliable" :
                 level === "medium" ? "verify_alternate" : "score_likely_clean",
  };
}

// Batch helper: computes the contamination prior of every BENCHMARK_DB entry
// for one cutoff. Entries whose prior could not be computed are left out. The
// result is ordered by prior, largest first, so the most-contaminated
// benchmarks surface at the top.
export function rateAllBenchmarks(modelCutoff) {
  const ratings = [];
  for (const entry of Object.values(BENCHMARK_DB)) {
    const rating = computeContaminationPrior(modelCutoff, entry.id);
    if (rating) ratings.push(rating);
  }
  ratings.sort((left, right) => right.prior - left.prior);
  return ratings;
}

// Aggregate verdict for a list of { benchmark_id, score } rows: the user
// pastes leaderboard scores and gets back which ones are likely contaminated.
//
//   modelCutoff — passed through to computeContaminationPrior for every row
//   scoreSheet  — iterable of { benchmark_id, score }; null/undefined is
//                 treated as an empty sheet, and null/undefined rows are skipped
//
// Rows for which computeContaminationPrior yields nothing are left out.
// Returns { rows, counts, total, high_pct }:
//   rows     — prior records plus reported_score, sorted by prior descending
//   counts   — number of rows per level ({ high, medium, low })
//   total    — rows.length
//   high_pct — share of "high" rows as a whole-number percentage (0 if no rows)
export function aggregateScoreSheet(modelCutoff, scoreSheet) {
  const rows = [];
  // A missing sheet counts as empty instead of raising a TypeError.
  const entries = scoreSheet == null ? [] : scoreSheet;
  for (const entry of entries) {
    // Blank rows cannot be destructured; skip them.
    if (entry == null) continue;
    const { benchmark_id, score } = entry;
    const p = computeContaminationPrior(modelCutoff, benchmark_id);
    if (p) rows.push({ ...p, reported_score: score });
  }
  rows.sort((a, b) => b.prior - a.prior);
  const counts = { high: 0, medium: 0, low: 0 };
  for (const r of rows) counts[r.level]++;
  return {
    rows,
    counts,
    total: rows.length,
    high_pct: rows.length ? Math.round(counts.high / rows.length * 100) : 0,
  };
}