// Contamination Prior (v0.7.3 anti-bullshit pack #4) // Bayesian-ish prior on whether a benchmark score is contaminated, based on // (model training cutoff date) × (benchmark release date) × (known leak status). // Pure logic — no human strings. Open LLM Leaderboard v1 (MMLU/HellaSwag/etc) // was killed for contamination; this lets a user calibrate trust per score. // Benchmark database. Each entry tracks release date, whether it's known to // be in common pretraining corpora (CommonCrawl etc), and a base-rate adjustment // (incident-driven: confirmed leaks, paraphrased copies in training data, etc). // // Sources: arxiv 2404.00699 (contamination survey), HF dataset cards, // public reproductions / known leak reports. export const BENCHMARK_DB = { // Format: { id, name, released: "YYYY-MM", in_corpora: bool, leak_factor: 0..1, category, paper } "mmlu": { id: "mmlu", name: "MMLU", released: "2020-09", in_corpora: true, leak_factor: 0.18, category: "knowledge", paper: "Hendrycks 2020" }, "mmlu_pro": { id: "mmlu_pro", name: "MMLU-Pro", released: "2024-06", in_corpora: false, leak_factor: 0.05, category: "knowledge", paper: "Wang 2024" }, "hellaswag": { id: "hellaswag", name: "HellaSwag", released: "2019-05", in_corpora: true, leak_factor: 0.20, category: "commonsense", paper: "Zellers 2019" }, "arc_challenge": { id: "arc_challenge", name: "ARC Challenge", released: "2018-04", in_corpora: true, leak_factor: 0.15, category: "knowledge", paper: "Clark 2018" }, "truthfulqa": { id: "truthfulqa", name: "TruthfulQA", released: "2021-09", in_corpora: true, leak_factor: 0.10, category: "truthfulness", paper: "Lin 2021" }, "gsm8k": { id: "gsm8k", name: "GSM8K", released: "2021-10", in_corpora: true, leak_factor: 0.12, category: "math", paper: "Cobbe 2021" }, "math": { id: "math", name: "MATH", released: "2021-03", in_corpora: true, leak_factor: 0.10, category: "math", paper: "Hendrycks 2021" }, "humaneval": { id: "humaneval", name: "HumanEval", released: "2021-07", in_corpora: true, leak_factor: 0.18, category: "code", paper: "Chen 2021" }, "mbpp": { id: "mbpp", name: "MBPP", released: "2021-08", in_corpora: true, leak_factor: 0.12, category: "code", paper: "Austin 2021" }, "bbh": { id: "bbh", name: "BIG-Bench Hard (BBH)", released: "2022-10", in_corpora: true, leak_factor: 0.08, category: "reasoning", paper: "Suzgun 2022" }, "ifeval": { id: "ifeval", name: "IFEval", released: "2023-11", in_corpora: false, leak_factor: 0.05, category: "instruction", paper: "Zhou 2023" }, "musr": { id: "musr", name: "MuSR", released: "2023-10", in_corpora: false, leak_factor: 0.04, category: "reasoning", paper: "Sprague 2023" }, "gpqa": { id: "gpqa", name: "GPQA", released: "2023-11", in_corpora: false, leak_factor: 0.04, category: "graduate-knowledge", paper: "Rein 2023" }, "math500": { id: "math500", name: "MATH-500", released: "2023-11", in_corpora: false, leak_factor: 0.05, category: "math", paper: "Lightman 2023" }, "aime24": { id: "aime24", name: "AIME 2024", released: "2024-02", in_corpora: false, leak_factor: 0.02, category: "math", paper: "AIME 2024" }, "winogrande": { id: "winogrande", name: "Winogrande", released: "2019-07", in_corpora: true, leak_factor: 0.15, category: "commonsense", paper: "Sakaguchi 2019" }, "boolq": { id: "boolq", name: "BoolQ", released: "2019-05", in_corpora: true, leak_factor: 0.15, category: "reading", paper: "Clark 2019" }, "drop": { id: "drop", name: "DROP", released: "2019-04", in_corpora: true, leak_factor: 0.12, category: "reading", paper: "Dua 2019" }, "triviaqa": { id: "triviaqa", name: "TriviaQA", released: "2017-05", in_corpora: true, leak_factor: 0.18, category: "knowledge", paper: "Joshi 2017" }, "squad": { id: "squad", name: "SQuAD", released: "2016-06", in_corpora: true, leak_factor: 0.20, category: "reading", paper: "Rajpurkar 2016" }, }; // Parse "YYYY-MM" or "YYYY-MM-DD" or "YYYY". Returns Date or null. function parseLooseDate(s) { if (!s) return null; const m = String(s).trim().match(/^(\d{4})(?:-(\d{1,2}))?(?:-(\d{1,2}))?/); if (!m) return null; const y = parseInt(m[1], 10); const mo = m[2] ? Math.max(1, Math.min(12, parseInt(m[2], 10))) : 6; const d = m[3] ? Math.max(1, Math.min(28, parseInt(m[3], 10))) : 15; return new Date(Date.UTC(y, mo - 1, d)); } // Time-based base prior. Returns probability that benchmark text was in the // model's training data given (cutoff - release) gap. // // Heuristic curve: // gap < 0 (released after cutoff) → 0.02 (only via leaks) // gap 0-3 months → 0.10–0.25 // gap 3-12 months → 0.25–0.55 // gap 12-24 months → 0.55–0.75 // gap > 24 months (heavily reproduced) → 0.75–0.92 function timePrior(gapMonths) { if (gapMonths < 0) return 0.02; if (gapMonths === 0) return 0.10; if (gapMonths <= 3) return 0.10 + (gapMonths / 3) * 0.15; if (gapMonths <= 12) return 0.25 + ((gapMonths - 3) / 9) * 0.30; if (gapMonths <= 24) return 0.55 + ((gapMonths - 12) / 12) * 0.20; return Math.min(0.92, 0.75 + ((gapMonths - 24) / 36) * 0.17); } // Per-benchmark prior: time-prior × in_corpora boost + leak_factor. // Caps at 0.97 (always some uncertainty). export function computeContaminationPrior(modelCutoff, benchmarkId) { const bench = BENCHMARK_DB[benchmarkId]; if (!bench) return null; const cutoffDate = parseLooseDate(modelCutoff); const releaseDate = parseLooseDate(bench.released); if (!cutoffDate || !releaseDate) return null; const gapMs = cutoffDate.getTime() - releaseDate.getTime(); const gapMonths = gapMs / (1000 * 60 * 60 * 24 * 30.44); const tp = timePrior(gapMonths); const corporaBoost = bench.in_corpora ? 0.10 : 0.0; const raw = tp + corporaBoost + bench.leak_factor; const prior = Math.max(0.01, Math.min(0.97, raw)); let level; if (prior >= 0.65) level = "high"; else if (prior >= 0.30) level = "medium"; else level = "low"; return { benchmark: bench.name, benchmark_id: bench.id, benchmark_released: bench.released, benchmark_category: bench.category, benchmark_in_corpora: bench.in_corpora, benchmark_paper: bench.paper, model_cutoff: modelCutoff, gap_months: Math.round(gapMonths * 10) / 10, time_prior: Math.round(tp * 100) / 100, corpora_boost: corporaBoost, leak_factor: bench.leak_factor, prior: Math.round(prior * 100) / 100, level, advice_code: level === "high" ? "treat_unreliable" : level === "medium" ? "verify_alternate" : "score_likely_clean", }; } // Batch helper: rate all benchmarks for a given cutoff. Returns array sorted // by prior descending so the most-contaminated ones surface first. export function rateAllBenchmarks(modelCutoff) { return Object.values(BENCHMARK_DB) .map(b => computeContaminationPrior(modelCutoff, b.id)) .filter(Boolean) .sort((a, b) => b.prior - a.prior); } // Aggregate verdict for a list of (benchmark_id, reported_score) pairs. // User pastes their leaderboard scores → tool flags which are likely // contaminated and which aren't. export function aggregateScoreSheet(modelCutoff, scoreSheet) { const rows = []; for (const { benchmark_id, score } of scoreSheet) { const p = computeContaminationPrior(modelCutoff, benchmark_id); if (p) rows.push({ ...p, reported_score: score }); } rows.sort((a, b) => b.prior - a.prior); const counts = { high: 0, medium: 0, low: 0 }; for (const r of rows) counts[r.level]++; return { rows, counts, total: rows.length, high_pct: rows.length ? Math.round(counts.high / rows.length * 100) : 0, }; }