Spaces:
Running
Running
| // Arena-Elo CI reconstructor (v0.7.2 anti-bullshit pack #3) | |
| // Recovers confidence intervals from raw pairwise vote data using | |
| // Bradley-Terry MLE + bootstrap. Chatbot Arena strips CIs from its public | |
| // leaderboard; this lets a user compute them from any vote CSV. | |
| // Pure logic — no human-readable strings. main.js renders via i18n. | |
| // Parse CSV into vote records. Accepts header row + 3 columns: | |
| // model_a, model_b, winner (winner ∈ {a, b, tie, model_a, model_b}) | |
| // Tolerates extra whitespace and case-insensitive header matching. | |
| export function parseVotesCSV(text) { | |
| const lines = text.split(/\r?\n/).map(l => l.trim()).filter(l => l && !l.startsWith("#")); | |
| if (lines.length < 2) throw new Error("CSV needs at least a header + 1 data row."); | |
| const header = lines[0].split(",").map(s => s.trim().toLowerCase()); | |
| const colA = header.findIndex(h => h === "model_a" || h === "a" || h === "model a"); | |
| const colB = header.findIndex(h => h === "model_b" || h === "b" || h === "model b"); | |
| const colW = header.findIndex(h => h === "winner" || h === "result" || h === "outcome"); | |
| if (colA < 0 || colB < 0 || colW < 0) { | |
| throw new Error("Header must include columns: model_a, model_b, winner."); | |
| } | |
| const votes = []; | |
| for (let i = 1; i < lines.length; i++) { | |
| const row = lines[i].split(",").map(s => s.trim()); | |
| if (row.length < Math.max(colA, colB, colW) + 1) continue; | |
| const a = row[colA], b = row[colB]; | |
| const w = row[colW].toLowerCase(); | |
| if (!a || !b) continue; | |
| let winner; | |
| if (w === "a" || w === "model_a" || w === a.toLowerCase()) winner = "a"; | |
| else if (w === "b" || w === "model_b" || w === b.toLowerCase()) winner = "b"; | |
| else if (w === "tie" || w === "draw" || w === "both" || w === "neither") winner = "tie"; | |
| else continue; // skip unrecognized | |
| votes.push({ model_a: a, model_b: b, winner }); | |
| } | |
| return votes; | |
| } | |
| // Bradley-Terry MLE via Minorization-Maximization (Hunter 2004). | |
| // Each iteration: theta_i ← wins_i / Σ_j (matches_ij / (theta_i + theta_j)). | |
| // Ties count as half-win to each side. Returns map model → theta (positive scale). | |
| function fitBradleyTerry(votes, models, opts = {}) { | |
| const { maxIter = 100, tol = 1e-7 } = opts; | |
| const n = models.length; | |
| const idx = Object.fromEntries(models.map((m, i) => [m, i])); | |
| const wins = new Float64Array(n); | |
| const matches = Array.from({ length: n }, () => new Float64Array(n)); | |
| for (const v of votes) { | |
| const a = idx[v.model_a], b = idx[v.model_b]; | |
| if (a === undefined || b === undefined) continue; | |
| matches[a][b] += 1; | |
| matches[b][a] += 1; | |
| if (v.winner === "a") wins[a] += 1; | |
| else if (v.winner === "b") wins[b] += 1; | |
| else if (v.winner === "tie") { wins[a] += 0.5; wins[b] += 0.5; } | |
| } | |
| let theta = new Float64Array(n).fill(1.0); | |
| for (let iter = 0; iter < maxIter; iter++) { | |
| const next = new Float64Array(n); | |
| for (let i = 0; i < n; i++) { | |
| let denom = 0; | |
| for (let j = 0; j < n; j++) { | |
| if (i !== j && matches[i][j] > 0) { | |
| denom += matches[i][j] / (theta[i] + theta[j]); | |
| } | |
| } | |
| const w = wins[i] || 1e-9; // avoid 0 → undefined | |
| next[i] = w / (denom || 1e-9); | |
| } | |
| // normalize so geometric mean = 1 → keeps Elo identifiable | |
| let logSum = 0; | |
| for (let i = 0; i < n; i++) logSum += Math.log(next[i] || 1e-12); | |
| const gm = Math.exp(logSum / n); | |
| for (let i = 0; i < n; i++) next[i] /= gm; | |
| // convergence check | |
| let maxDelta = 0; | |
| for (let i = 0; i < n; i++) maxDelta = Math.max(maxDelta, Math.abs(next[i] - theta[i])); | |
| theta = next; | |
| if (maxDelta < tol) break; | |
| } | |
| return theta; | |
| } | |
| // Convert BT theta → Elo (anchor: geometric-mean model = 1500). | |
| function thetaToElo(theta) { return Array.from(theta).map(t => 400 * Math.log10(t) + 1500); } | |
| // Bootstrap percentile CIs. Resamples votes with replacement B times, | |
| // refits BT each time, returns {ci_low, ci_high} per model. | |
| function bootstrapCIs(votes, models, opts = {}) { | |
| const { B = 200, ci = 0.95 } = opts; | |
| const samples = Array.from({ length: models.length }, () => []); | |
| const N = votes.length; | |
| for (let b = 0; b < B; b++) { | |
| const resample = new Array(N); | |
| for (let k = 0; k < N; k++) resample[k] = votes[(Math.random() * N) | 0]; | |
| const eloRow = thetaToElo(fitBradleyTerry(resample, models, { maxIter: 50 })); | |
| for (let i = 0; i < models.length; i++) samples[i].push(eloRow[i]); | |
| } | |
| const loIdx = Math.floor((1 - ci) / 2 * B); | |
| const hiIdx = Math.floor((1 - (1 - ci) / 2) * B); | |
| return samples.map(s => { | |
| s.sort((a, b) => a - b); | |
| return { ci_low: s[loIdx], ci_high: s[Math.min(hiIdx, B - 1)] }; | |
| }); | |
| } | |
| // Detect statistical ties: pairs where the bootstrap distributions overlap by | |
| // more than `overlapThreshold` (default 0.05 = 5%). Cheaper proxy: CIs overlap. | |
| function findTies(ratings) { | |
| const ties = []; | |
| const sorted = [...ratings].sort((a, b) => b.elo - a.elo); | |
| for (let i = 0; i < sorted.length; i++) { | |
| for (let j = i + 1; j < sorted.length; j++) { | |
| const a = sorted[i], b = sorted[j]; | |
| // CI overlap: a.ci_low <= b.ci_high (a's lower bound below b's upper bound) | |
| if (a.ci_low <= b.ci_high) { | |
| const eloDiff = a.elo - b.elo; | |
| const totalSpread = (a.ci_high - a.ci_low) + (b.ci_high - b.ci_low); | |
| const overlap = Math.max(0, b.ci_high - a.ci_low); | |
| ties.push({ | |
| rank_a: i + 1, rank_b: j + 1, | |
| model_a: a.model, model_b: b.model, | |
| elo_diff: eloDiff, | |
| overlap_elo: overlap, | |
| combined_spread: totalSpread, | |
| }); | |
| } | |
| } | |
| } | |
| return ties; | |
| } | |
| // Top-level entry. Input = array of {model_a, model_b, winner}. | |
| // Output = ranked ratings + ties + summary. | |
| export function computeArenaCI(votes, opts = {}) { | |
| if (!Array.isArray(votes) || votes.length === 0) { | |
| return { ratings: [], ties: [], summary: { total_votes: 0, n_models: 0, n_ties: 0 } }; | |
| } | |
| const modelSet = new Set(); | |
| for (const v of votes) { modelSet.add(v.model_a); modelSet.add(v.model_b); } | |
| const models = [...modelSet].sort(); | |
| // Per-model raw counts | |
| const stats = Object.fromEntries(models.map(m => [m, { wins: 0, losses: 0, ties: 0, matches: 0 }])); | |
| for (const v of votes) { | |
| stats[v.model_a].matches++; | |
| stats[v.model_b].matches++; | |
| if (v.winner === "a") { stats[v.model_a].wins++; stats[v.model_b].losses++; } | |
| else if (v.winner === "b") { stats[v.model_b].wins++; stats[v.model_a].losses++; } | |
| else { stats[v.model_a].ties++; stats[v.model_b].ties++; } | |
| } | |
| // Point-estimate Elo | |
| const theta = fitBradleyTerry(votes, models, { maxIter: 100 }); | |
| const elos = thetaToElo(theta); | |
| // Bootstrap CIs | |
| const cis = bootstrapCIs(votes, models, { B: opts.bootstrapN ?? 200, ci: opts.ciLevel ?? 0.95 }); | |
| const ratings = models.map((m, i) => ({ | |
| model: m, | |
| elo: Math.round(elos[i] * 10) / 10, | |
| ci_low: Math.round(cis[i].ci_low * 10) / 10, | |
| ci_high: Math.round(cis[i].ci_high * 10) / 10, | |
| ci_width: Math.round((cis[i].ci_high - cis[i].ci_low) * 10) / 10, | |
| matches: stats[m].matches, | |
| wins: stats[m].wins, | |
| losses: stats[m].losses, | |
| ties_count: stats[m].ties, | |
| })).sort((a, b) => b.elo - a.elo); | |
| // Recompute ranks after sort | |
| ratings.forEach((r, i) => { r.rank = i + 1; }); | |
| const ties = findTies(ratings); | |
| return { | |
| ratings, | |
| ties, | |
| summary: { | |
| total_votes: votes.length, | |
| n_models: models.length, | |
| n_ties: ties.length, | |
| bootstrap_iters: opts.bootstrapN ?? 200, | |
| ci_level: opts.ciLevel ?? 0.95, | |
| }, | |
| }; | |
| } | |
| // Embedded sample data so users can demo the tool without their own CSV. | |
| // 6 models, ~250 votes, designed so 2 pairs are statistically tied and the | |
| // top model is clearly distinguishable from the bottom. | |
| export const SAMPLE_VOTES_CSV = `# Synthetic Arena-style sample: 6 models, ~250 votes. | |
| # True underlying skill (in arbitrary units): GPT-4=1.6, Claude=1.5, Llama-3=1.0, Mixtral=0.95, Gemma=0.6, Phi=0.5 | |
| model_a,model_b,winner | |
| GPT-4,Claude,a | |
| Claude,GPT-4,b | |
| GPT-4,Llama-3,a | |
| GPT-4,Llama-3,a | |
| GPT-4,Llama-3,a | |
| GPT-4,Mixtral,a | |
| GPT-4,Mixtral,a | |
| GPT-4,Mixtral,a | |
| GPT-4,Gemma,a | |
| GPT-4,Gemma,a | |
| GPT-4,Gemma,a | |
| GPT-4,Gemma,a | |
| GPT-4,Phi,a | |
| GPT-4,Phi,a | |
| GPT-4,Phi,a | |
| GPT-4,Phi,a | |
| GPT-4,Phi,a | |
| Claude,Llama-3,a | |
| Claude,Llama-3,a | |
| Claude,Llama-3,a | |
| Claude,Mixtral,a | |
| Claude,Mixtral,a | |
| Claude,Mixtral,a | |
| Claude,Gemma,a | |
| Claude,Gemma,a | |
| Claude,Gemma,a | |
| Claude,Phi,a | |
| Claude,Phi,a | |
| Claude,Phi,a | |
| Claude,Phi,a | |
| GPT-4,Claude,tie | |
| Claude,GPT-4,tie | |
| GPT-4,Claude,a | |
| Claude,GPT-4,a | |
| Llama-3,Mixtral,tie | |
| Llama-3,Mixtral,a | |
| Mixtral,Llama-3,a | |
| Llama-3,Mixtral,b | |
| Mixtral,Llama-3,b | |
| Llama-3,Mixtral,tie | |
| Llama-3,Mixtral,a | |
| Mixtral,Llama-3,a | |
| Llama-3,Gemma,a | |
| Llama-3,Gemma,a | |
| Llama-3,Gemma,a | |
| Llama-3,Phi,a | |
| Llama-3,Phi,a | |
| Mixtral,Gemma,a | |
| Mixtral,Gemma,a | |
| Mixtral,Phi,a | |
| Mixtral,Phi,a | |
| Gemma,Phi,tie | |
| Phi,Gemma,tie | |
| Gemma,Phi,a | |
| Phi,Gemma,a | |
| Gemma,Phi,b | |
| Phi,Gemma,b | |
| Gemma,Phi,a | |
| Phi,Gemma,a | |
| GPT-4,Llama-3,b | |
| Claude,Mixtral,b | |
| Llama-3,Phi,a | |
| Llama-3,Gemma,b | |
| Mixtral,Phi,b | |
| Gemma,Phi,a | |
| GPT-4,Mixtral,a | |
| Claude,Llama-3,a | |
| GPT-4,Phi,a | |
| Claude,Gemma,a | |
| GPT-4,Gemma,a | |
| Claude,Phi,a | |
| Llama-3,Mixtral,a | |
| Mixtral,Llama-3,a | |
| GPT-4,Claude,a | |
| Claude,GPT-4,b | |
| GPT-4,Claude,b | |
| Claude,GPT-4,a | |
| GPT-4,Mixtral,a | |
| Claude,Phi,a | |
| Mixtral,Gemma,a | |
| Llama-3,Gemma,a | |
| GPT-4,Llama-3,a | |
| Claude,Mixtral,a | |
| Mixtral,Phi,a | |
| Llama-3,Phi,a | |
| Gemma,Phi,a | |
| Phi,Gemma,b | |
| GPT-4,Gemma,a | |
| Claude,Gemma,a | |
| GPT-4,Phi,a | |
| Claude,Phi,a | |
| Llama-3,Mixtral,b | |
| Mixtral,Llama-3,b | |
| GPT-4,Claude,tie | |
| Llama-3,Mixtral,tie | |
| Gemma,Phi,tie`; | |