Spaces:

karlexmarin
/

taf-agent

Running

App Files Files Community

taf-agent / js /arena_ci.js

karlexmarin's picture

v0.7.2: Arena CI + Contamination Prior + v0.7 Help/Inventory documentation

d61ea0e about 1 month ago

9.45 kB

	// Arena-Elo CI reconstructor (v0.7.2 anti-bullshit pack #3)
	// Recovers confidence intervals from raw pairwise vote data using
	// Bradley-Terry MLE + bootstrap. Chatbot Arena strips CIs from its public
	// leaderboard; this lets a user compute them from any vote CSV.
	// Pure logic — no human-readable strings. main.js renders via i18n.

	/**
	 * Parse a pairwise-vote CSV into vote records.
	 *
	 * Blank lines and lines starting with `#` are dropped before parsing. The
	 * first remaining line is the header; it is matched case-insensitively and
	 * must contain a column for each of:
	 *   - model A: `model_a`, `a` or `model a`
	 *   - model B: `model_b`, `b` or `model b`
	 *   - winner:  `winner`, `result` or `outcome`
	 *
	 * The winner cell (case-insensitive) may be `a` / `model_a` / the name of
	 * model A, `b` / `model_b` / the name of model B, or one of `tie`, `draw`,
	 * `both`, `neither`. Rows that are too short, have an empty model name, or
	 * carry an unrecognized winner are skipped silently.
	 *
	 * Cells are split on plain commas; quoted fields are not supported.
	 *
	 * @param {string} text - Raw CSV text.
	 * @returns {Array<{model_a: string, model_b: string, winner: "a"|"b"|"tie"}>}
	 * @throws {Error} If fewer than two usable lines remain, or a required
	 *   header column is missing.
	 */
	export function parseVotesCSV(text) {
	  const lines = text
	    .split(/\r?\n/)
	    .map((line) => line.trim())
	    .filter((line) => line && !line.startsWith("#"));
	  if (lines.length < 2) throw new Error("CSV needs at least a header + 1 data row.");

	  const header = lines[0].split(",").map((cell) => cell.trim().toLowerCase());
	  const findColumn = (...aliases) => header.findIndex((h) => aliases.includes(h));
	  const colA = findColumn("model_a", "a", "model a");
	  const colB = findColumn("model_b", "b", "model b");
	  const colW = findColumn("winner", "result", "outcome");
	  if (colA < 0 || colB < 0 || colW < 0) {
	    throw new Error("Header must include columns: model_a, model_b, winner.");
	  }

	  // A row must reach the right-most required column to be usable.
	  const minCells = Math.max(colA, colB, colW) + 1;
	  const tieWords = ["tie", "draw", "both", "neither"];

	  const votes = [];
	  for (const line of lines.slice(1)) {
	    const row = line.split(",").map((cell) => cell.trim());
	    if (row.length < minCells) continue;
	    const a = row[colA];
	    const b = row[colB];
	    if (!a || !b) continue;
	    const w = row[colW].toLowerCase();
	    let winner;
	    if (w === "a" || w === "model_a" || w === a.toLowerCase()) winner = "a";
	    else if (w === "b" || w === "model_b" || w === b.toLowerCase()) winner = "b";
	    else if (tieWords.includes(w)) winner = "tie";
	    else continue; // unrecognized winner value: skip the row
	    votes.push({ model_a: a, model_b: b, winner });
	  }
	  return votes;
	}

	/**
	 * Fit Bradley-Terry strengths with the Minorization-Maximization update
	 * (Hunter 2004): theta_i <- wins_i / sum_j (matches_ij / (theta_i + theta_j)).
	 *
	 * A tie adds half a win to each side. Votes that reference a model not in
	 * `models`, and votes where both sides are the same model, are ignored (a
	 * self-pairing carries no information about relative strength). After every
	 * iteration the strengths are rescaled so their geometric mean is 1.
	 *
	 * @param {Array<{model_a: string, model_b: string, winner: string}>} votes
	 * @param {string[]} models - Model names; the output follows this order.
	 * @param {{maxIter?: number, tol?: number}} [opts] - Iteration cap and the
	 *   max absolute per-model change below which iteration stops.
	 * @returns {Float64Array} Positive strengths, one per entry of `models`.
	 */
	function fitBradleyTerry(votes, models, opts = {}) {
	  const { maxIter = 100, tol = 1e-7 } = opts;
	  const n = models.length;
	  // A Map (not a plain object) so names such as "toString" or "constructor"
	  // cannot resolve to inherited Object.prototype members.
	  const indexOf = new Map(models.map((m, i) => [m, i]));
	  const wins = new Float64Array(n);
	  const matches = Array.from({ length: n }, () => new Float64Array(n));

	  for (const v of votes) {
	    const a = indexOf.get(v.model_a);
	    const b = indexOf.get(v.model_b);
	    if (a === undefined || b === undefined) continue;
	    if (a === b) continue; // self-pairing: would add wins with no matching denominator term
	    matches[a][b] += 1;
	    matches[b][a] += 1;
	    if (v.winner === "a") wins[a] += 1;
	    else if (v.winner === "b") wins[b] += 1;
	    else if (v.winner === "tie") {
	      wins[a] += 0.5;
	      wins[b] += 0.5;
	    }
	  }

	  let theta = new Float64Array(n).fill(1.0);
	  for (let iter = 0; iter < maxIter; iter++) {
	    const next = new Float64Array(n);
	    for (let i = 0; i < n; i++) {
	      let denom = 0;
	      for (let j = 0; j < n; j++) {
	        if (i !== j && matches[i][j] > 0) {
	          denom += matches[i][j] / (theta[i] + theta[j]);
	        }
	      }
	      // Floor zero wins / zero denominators so the update never yields 0 or NaN.
	      const w = wins[i] || 1e-9;
	      next[i] = w / (denom || 1e-9);
	    }
	    // Normalize so the geometric mean is 1, which pins the otherwise free scale.
	    let logSum = 0;
	    for (let i = 0; i < n; i++) logSum += Math.log(next[i] || 1e-12);
	    const gm = Math.exp(logSum / n);
	    for (let i = 0; i < n; i++) next[i] /= gm;
	    // Convergence check on the largest absolute change.
	    let maxDelta = 0;
	    for (let i = 0; i < n; i++) {
	      maxDelta = Math.max(maxDelta, Math.abs(next[i] - theta[i]));
	    }
	    theta = next;
	    if (maxDelta < tol) break;
	  }
	  return theta;
	}

	// Map Bradley-Terry strengths onto an Elo-style scale: a strength of 1 lands
	// on 1500 and every tenfold increase in strength adds 400 points.
	// Accepts any iterable/array-like of numbers and returns a plain Array.
	function thetaToElo(theta) {
	  return Array.from(theta, (strength) => 400 * Math.log10(strength) + 1500);
	}

	/**
	 * Bootstrap percentile confidence intervals for each model's Elo.
	 *
	 * Draws `B` resamples of `votes` (same size, with replacement), refits
	 * Bradley-Terry on each (capped at 50 iterations), converts to Elo, and
	 * reads the lower/upper percentiles off the sorted per-model samples.
	 *
	 * @param {Array<object>} votes - Vote records as accepted by fitBradleyTerry.
	 * @param {string[]} models - Model names; the output follows this order.
	 * @param {{B?: number, ci?: number, rng?: () => number}} [opts]
	 *   `B` resample count (default 200), `ci` confidence level (default 0.95),
	 *   `rng` source of uniform numbers in [0, 1) (default Math.random); pass a
	 *   seeded generator for reproducible intervals.
	 * @returns {Array<{ci_low: number, ci_high: number}>} One interval per
	 *   model. With `B` = 0 there are no samples and both bounds are undefined.
	 */
	function bootstrapCIs(votes, models, opts = {}) {
	  const { B = 200, ci = 0.95, rng = Math.random } = opts;
	  const N = votes.length;
	  const samples = Array.from({ length: models.length }, () => []);

	  for (let b = 0; b < B; b++) {
	    const resample = new Array(N);
	    for (let k = 0; k < N; k++) {
	      // Math.floor (not `| 0`) stays correct past 2^31; the clamp guards a
	      // custom rng that returns exactly 1.
	      resample[k] = votes[Math.min(Math.floor(rng() * N), N - 1)];
	    }
	    const eloRow = thetaToElo(fitBradleyTerry(resample, models, { maxIter: 50 }));
	    for (let i = 0; i < models.length; i++) samples[i].push(eloRow[i]);
	  }

	  const tail = (1 - ci) / 2;
	  const loIdx = Math.floor(tail * B);
	  // Clamp so ci close to 1 cannot index past the last sample.
	  const hiIdx = Math.min(Math.floor((1 - tail) * B), B - 1);
	  return samples.map((s) => {
	    s.sort((x, y) => x - y);
	    return { ci_low: s[loIdx], ci_high: s[hiIdx] };
	  });
	}

	// Report statistical ties, using CI overlap as the criterion: after ordering
	// the ratings by Elo (highest first), a pair is a tie when the higher-rated
	// model's ci_low is at or below the lower-rated model's ci_high.
	// Works on a copy of `ratings`; the input array order is left alone.
	// Each result carries the 1-based ranks, both model names, the Elo gap, the
	// width of the overlapping region, and the sum of the two CI widths.
	function findTies(ratings) {
	  const ranked = ratings.slice().sort((x, y) => y.elo - x.elo);
	  const found = [];
	  ranked.forEach((upper, hi) => {
	    ranked.forEach((lower, lo) => {
	      if (lo <= hi) return; // visit each unordered pair once, higher-rated first
	      // Negated `<=` (rather than `>`) so missing/NaN bounds never count as a tie.
	      if (!(upper.ci_low <= lower.ci_high)) return;
	      const upperWidth = upper.ci_high - upper.ci_low;
	      const lowerWidth = lower.ci_high - lower.ci_low;
	      found.push({
	        rank_a: hi + 1,
	        rank_b: lo + 1,
	        model_a: upper.model,
	        model_b: lower.model,
	        elo_diff: upper.elo - lower.elo,
	        overlap_elo: Math.max(0, lower.ci_high - upper.ci_low),
	        combined_spread: upperWidth + lowerWidth,
	      });
	    });
	  });
	  return found;
	}

	/**
	 * Top-level entry: turn raw pairwise votes into ranked Elo ratings with
	 * bootstrap confidence intervals, the list of CI-overlap ties, and a summary.
	 *
	 * @param {Array<{model_a: string, model_b: string, winner: string}>} votes
	 *   Vote records. A non-array or empty value yields an empty result.
	 * @param {{bootstrapN?: number, ciLevel?: number}} [opts]
	 *   `bootstrapN` resample count (default 200), `ciLevel` confidence level
	 *   (default 0.95).
	 * @returns {{ratings: Array<object>, ties: Array<object>, summary: object}}
	 *   `ratings` is sorted by Elo (descending) and each entry carries `rank`,
	 *   `elo`, `ci_low`, `ci_high`, `ci_width` (all rounded to one decimal) plus
	 *   raw `matches` / `wins` / `losses` / `ties_count`. `summary` always holds
	 *   `total_votes`, `n_models`, `n_ties`, `bootstrap_iters` and `ci_level`.
	 */
	export function computeArenaCI(votes, opts = {}) {
	  // Resolve the options once so both return paths report the same values.
	  const bootstrapN = opts.bootstrapN ?? 200;
	  const ciLevel = opts.ciLevel ?? 0.95;

	  if (!Array.isArray(votes) || votes.length === 0) {
	    return {
	      ratings: [],
	      ties: [],
	      summary: {
	        total_votes: 0,
	        n_models: 0,
	        n_ties: 0,
	        bootstrap_iters: bootstrapN,
	        ci_level: ciLevel,
	      },
	    };
	  }

	  const modelSet = new Set();
	  for (const v of votes) {
	    modelSet.add(v.model_a);
	    modelSet.add(v.model_b);
	  }
	  const models = [...modelSet].sort();

	  // Per-model raw counts. Any winner other than "a" / "b" is tallied as a tie.
	  const stats = Object.fromEntries(
	    models.map((m) => [m, { wins: 0, losses: 0, ties: 0, matches: 0 }])
	  );
	  for (const v of votes) {
	    const sideA = stats[v.model_a];
	    const sideB = stats[v.model_b];
	    sideA.matches++;
	    sideB.matches++;
	    if (v.winner === "a") {
	      sideA.wins++;
	      sideB.losses++;
	    } else if (v.winner === "b") {
	      sideB.wins++;
	      sideA.losses++;
	    } else {
	      sideA.ties++;
	      sideB.ties++;
	    }
	  }

	  // Point estimate on the full data, then bootstrap intervals around it.
	  const elos = thetaToElo(fitBradleyTerry(votes, models, { maxIter: 100 }));
	  const cis = bootstrapCIs(votes, models, { B: bootstrapN, ci: ciLevel });

	  const round1 = (x) => Math.round(x * 10) / 10;
	  const ratings = models
	    .map((m, i) => ({
	      model: m,
	      elo: round1(elos[i]),
	      ci_low: round1(cis[i].ci_low),
	      ci_high: round1(cis[i].ci_high),
	      // Width is taken from the unrounded bounds, then rounded.
	      ci_width: round1(cis[i].ci_high - cis[i].ci_low),
	      matches: stats[m].matches,
	      wins: stats[m].wins,
	      losses: stats[m].losses,
	      ties_count: stats[m].ties,
	    }))
	    .sort((a, b) => b.elo - a.elo);

	  // Ranks are assigned only after sorting by Elo.
	  ratings.forEach((r, i) => {
	    r.rank = i + 1;
	  });

	  const ties = findTies(ratings);

	  return {
	    ratings,
	    ties,
	    summary: {
	      total_votes: votes.length,
	      n_models: models.length,
	      n_ties: ties.length,
	      bootstrap_iters: bootstrapN,
	      ci_level: ciLevel,
	    },
	  };
	}

	// Embedded sample data so users can demo the tool without their own CSV.
	// The literal holds two `#` comment lines, the header row
	// `model_a,model_b,winner`, and 96 data rows over 6 models (GPT-4, Claude,
	// Llama-3, Mixtral, Gemma, Phi). Winners are written as a / b / tie.
	// NOTE(review): the in-string header says "~250 votes", but only 96 data
	// rows are present — confirm which is intended before editing the string.
	// Presumably meant to leave closely matched models statistically tied while
	// the top model stays clearly separated from the bottom — not verified here.
	export const SAMPLE_VOTES_CSV = `# Synthetic Arena-style sample: 6 models, ~250 votes.
	# True underlying skill (in arbitrary units): GPT-4=1.6, Claude=1.5, Llama-3=1.0, Mixtral=0.95, Gemma=0.6, Phi=0.5
	model_a,model_b,winner
	GPT-4,Claude,a
	Claude,GPT-4,b
	GPT-4,Llama-3,a
	GPT-4,Llama-3,a
	GPT-4,Llama-3,a
	GPT-4,Mixtral,a
	GPT-4,Mixtral,a
	GPT-4,Mixtral,a
	GPT-4,Gemma,a
	GPT-4,Gemma,a
	GPT-4,Gemma,a
	GPT-4,Gemma,a
	GPT-4,Phi,a
	GPT-4,Phi,a
	GPT-4,Phi,a
	GPT-4,Phi,a
	GPT-4,Phi,a
	Claude,Llama-3,a
	Claude,Llama-3,a
	Claude,Llama-3,a
	Claude,Mixtral,a
	Claude,Mixtral,a
	Claude,Mixtral,a
	Claude,Gemma,a
	Claude,Gemma,a
	Claude,Gemma,a
	Claude,Phi,a
	Claude,Phi,a
	Claude,Phi,a
	Claude,Phi,a
	GPT-4,Claude,tie
	Claude,GPT-4,tie
	GPT-4,Claude,a
	Claude,GPT-4,a
	Llama-3,Mixtral,tie
	Llama-3,Mixtral,a
	Mixtral,Llama-3,a
	Llama-3,Mixtral,b
	Mixtral,Llama-3,b
	Llama-3,Mixtral,tie
	Llama-3,Mixtral,a
	Mixtral,Llama-3,a
	Llama-3,Gemma,a
	Llama-3,Gemma,a
	Llama-3,Gemma,a
	Llama-3,Phi,a
	Llama-3,Phi,a
	Mixtral,Gemma,a
	Mixtral,Gemma,a
	Mixtral,Phi,a
	Mixtral,Phi,a
	Gemma,Phi,tie
	Phi,Gemma,tie
	Gemma,Phi,a
	Phi,Gemma,a
	Gemma,Phi,b
	Phi,Gemma,b
	Gemma,Phi,a
	Phi,Gemma,a
	GPT-4,Llama-3,b
	Claude,Mixtral,b
	Llama-3,Phi,a
	Llama-3,Gemma,b
	Mixtral,Phi,b
	Gemma,Phi,a
	GPT-4,Mixtral,a
	Claude,Llama-3,a
	GPT-4,Phi,a
	Claude,Gemma,a
	GPT-4,Gemma,a
	Claude,Phi,a
	Llama-3,Mixtral,a
	Mixtral,Llama-3,a
	GPT-4,Claude,a
	Claude,GPT-4,b
	GPT-4,Claude,b
	Claude,GPT-4,a
	GPT-4,Mixtral,a
	Claude,Phi,a
	Mixtral,Gemma,a
	Llama-3,Gemma,a
	GPT-4,Llama-3,a
	Claude,Mixtral,a
	Mixtral,Phi,a
	Llama-3,Phi,a
	Gemma,Phi,a
	Phi,Gemma,b
	GPT-4,Gemma,a
	Claude,Gemma,a
	GPT-4,Phi,a
	Claude,Phi,a
	Llama-3,Mixtral,b
	Mixtral,Llama-3,b
	GPT-4,Claude,tie
	Llama-3,Mixtral,tie
	Gemma,Phi,tie`;