Spaces:

karlexmarin
/

taf-agent

Running

File size: 9,448 Bytes

d61ea0e

// Arena-Elo CI reconstructor (v0.7.2 anti-bullshit pack #3)
// Recovers confidence intervals from raw pairwise vote data using
// Bradley-Terry MLE + bootstrap. Chatbot Arena strips CIs from its public
// leaderboard; this lets a user compute them from any vote CSV.
// Pure logic — no human-readable strings. main.js renders via i18n.

// Parse CSV into vote records. Accepts header row + 3 columns:
//   model_a, model_b, winner   (winner ∈ {a, b, tie, model_a, model_b})
// Tolerates extra whitespace and case-insensitive header matching.
/**
 * Parse a pairwise-vote CSV into normalized vote records.
 *
 * Blank lines and lines starting with `#` are ignored. The first remaining
 * line is the header; it must name a model-A column (`model_a`, `a`,
 * `model a`), a model-B column (`model_b`, `b`, `model b`) and a winner
 * column (`winner`, `result`, `outcome`), matched case-insensitively.
 *
 * A winner cell is accepted when it is:
 *   - `a` / `model_a` / the row's model-A name  → "a"
 *   - `b` / `model_b` / the row's model-B name  → "b"
 *   - `tie` (optionally followed by a qualifier such as `tie (bothbad)`),
 *     `draw`, `both`, `neither`                 → "tie"
 * Positional tokens are checked before model names, so a model literally
 * named "a" or "b" is read as the positional token.
 * Rows that are too short, lack a model name, or carry any other winner
 * value are skipped.
 *
 * Fields may be wrapped in double quotes (with `""` as an escaped quote);
 * quoted fields may contain commas but not line breaks.
 *
 * @param {string} text - Raw CSV text.
 * @returns {{model_a: string, model_b: string, winner: "a"|"b"|"tie"}[]}
 * @throws {Error} If fewer than two usable lines exist or a required
 *   header column is missing.
 */
export function parseVotesCSV(text) {
  const lines = text
    .split(/\r?\n/)
    .map((l) => l.trim())
    .filter((l) => l && !l.startsWith("#"));
  if (lines.length < 2) throw new Error("CSV needs at least a header + 1 data row.");

  const header = splitCsvLine(lines[0]).map((s) => s.toLowerCase());
  const colA = header.findIndex((h) => h === "model_a" || h === "a" || h === "model a");
  const colB = header.findIndex((h) => h === "model_b" || h === "b" || h === "model b");
  const colW = header.findIndex((h) => h === "winner" || h === "result" || h === "outcome");
  if (colA < 0 || colB < 0 || colW < 0) {
    throw new Error("Header must include columns: model_a, model_b, winner.");
  }

  // A row must be long enough to reach the right-most required column.
  const minCols = Math.max(colA, colB, colW) + 1;
  const votes = [];
  for (const line of lines.slice(1)) {
    const row = splitCsvLine(line);
    if (row.length < minCols) continue;
    const a = row[colA];
    const b = row[colB];
    if (!a || !b) continue;
    const w = row[colW].toLowerCase();

    let winner;
    if (w === "a" || w === "model_a" || w === a.toLowerCase()) winner = "a";
    else if (w === "b" || w === "model_b" || w === b.toLowerCase()) winner = "b";
    // `^tie\b` also covers qualified ties such as "tie (bothbad)".
    else if (/^tie\b/.test(w) || w === "draw" || w === "both" || w === "neither") winner = "tie";
    else continue; // skip unrecognized
    votes.push({ model_a: a, model_b: b, winner });
  }
  return votes;
}

/**
 * Split a single CSV line into trimmed fields.
 * A field whose first non-blank character is `"` is read as quoted: commas
 * inside it are literal and `""` stands for one quote character.
 *
 * @param {string} line - One line of CSV (no line breaks).
 * @returns {string[]} Field values with surrounding whitespace removed.
 */
function splitCsvLine(line) {
  const fields = [];
  let field = "";
  let inQuotes = false;
  for (let i = 0; i < line.length; i++) {
    const ch = line[i];
    if (inQuotes) {
      if (ch !== '"') {
        field += ch;
      } else if (line[i + 1] === '"') {
        field += '"'; // escaped quote
        i++;
      } else {
        inQuotes = false;
      }
    } else if (ch === ",") {
      fields.push(field.trim());
      field = "";
    } else if (ch === '"' && field.trim() === "") {
      field = ""; // discard whitespace before the opening quote
      inQuotes = true;
    } else {
      field += ch;
    }
  }
  fields.push(field.trim());
  return fields;
}

// Bradley-Terry MLE via Minorization-Maximization (Hunter 2004).
// Each iteration: theta_i ← wins_i / Σ_j (matches_ij / (theta_i + theta_j)).
// Ties count as half-win to each side. Returns a Float64Array of theta values
// (positive scale, geometric mean 1), indexed in the same order as `models`.
/**
 * Fit Bradley-Terry strengths with the MM iteration
 *   theta_i ← wins_i / Σ_j (matches_ij / (theta_i + theta_j)).
 *
 * A tie credits half a win to each side. Votes are ignored when they name a
 * model not listed in `models`, pair a model with itself (no information
 * about relative strength), or carry a winner other than "a", "b" or "tie".
 *
 * @param {{model_a: *, model_b: *, winner: string}[]} votes - Vote records.
 * @param {Array} models - Model identifiers; fixes the output order.
 * @param {{maxIter?: number, tol?: number}} [opts] - Iteration cap
 *   (default 100) and max-absolute-change stopping tolerance (default 1e-7).
 * @returns {Float64Array} theta per model, aligned with `models`, rescaled
 *   after every iteration so the geometric mean is 1.
 */
function fitBradleyTerry(votes, models, opts = {}) {
  const { maxIter = 100, tol = 1e-7 } = opts;
  const n = models.length;
  // A Map (string-keyed, like the property lookup it replaces) has no
  // inherited entries, so names such as "constructor" cannot resolve by accident.
  const idx = new Map(models.map((m, i) => [String(m), i]));
  const wins = new Float64Array(n);
  const matches = Array.from({ length: n }, () => new Float64Array(n));

  for (const v of votes) {
    const a = idx.get(String(v.model_a));
    const b = idx.get(String(v.model_b));
    if (a === undefined || b === undefined || a === b) continue;

    // Share of the win credited to side A; unknown outcomes are dropped
    // before the match is counted so they cannot deflate either model.
    let shareA;
    if (v.winner === "a") shareA = 1;
    else if (v.winner === "b") shareA = 0;
    else if (v.winner === "tie") shareA = 0.5;
    else continue;

    matches[a][b] += 1;
    matches[b][a] += 1;
    wins[a] += shareA;
    wins[b] += 1 - shareA;
  }

  let theta = new Float64Array(n).fill(1.0);
  for (let iter = 0; iter < maxIter; iter++) {
    const next = new Float64Array(n);
    for (let i = 0; i < n; i++) {
      let denom = 0;
      for (let j = 0; j < n; j++) {
        if (i !== j && matches[i][j] > 0) {
          denom += matches[i][j] / (theta[i] + theta[j]);
        }
      }
      // Floor zero wins / zero matches at a tiny positive value so theta
      // stays strictly positive (its logarithm is taken below and by callers).
      const w = wins[i] || 1e-9;
      next[i] = w / (denom || 1e-9);
    }
    // normalize so geometric mean = 1 → keeps Elo identifiable
    let logSum = 0;
    for (let i = 0; i < n; i++) logSum += Math.log(next[i] || 1e-12);
    const gm = Math.exp(logSum / n);
    for (let i = 0; i < n; i++) next[i] /= gm;
    // convergence check: largest absolute change in any theta
    let maxDelta = 0;
    for (let i = 0; i < n; i++) maxDelta = Math.max(maxDelta, Math.abs(next[i] - theta[i]));
    theta = next;
    if (maxDelta < tol) break;
  }
  return theta;
}

// Convert BT theta → Elo (anchor: geometric-mean model = 1500).
/**
 * Map Bradley-Terry strengths onto the Elo scale: a strength of 1 becomes
 * 1500 and every tenfold increase in strength adds 400 points.
 *
 * @param {ArrayLike<number>} theta - Positive strengths.
 * @returns {number[]} Elo ratings in the same order as `theta`.
 */
function thetaToElo(theta) {
  return Array.from(theta, (strength) => 1500 + 400 * Math.log10(strength));
}

// Bootstrap percentile CIs. Resamples votes with replacement B times,
// refits BT each time, returns {ci_low, ci_high} per model.
/**
 * Bootstrap percentile confidence intervals for each model's Elo.
 *
 * Draws `B` resamples of `votes` (same size, with replacement, using
 * `Math.random`), refits via `fitBradleyTerry` (50 iterations max) and
 * `thetaToElo`, then reads an equal-tailed interval off the sorted
 * replicates: the same number of replicates is cut from each end.
 *
 * @param {Array} votes - Vote records to resample.
 * @param {Array} models - Model identifiers; fixes the output order.
 * @param {{B?: number, ci?: number}} [opts] - Number of resamples
 *   (default 200) and confidence level in [0, 1] (default 0.95).
 * @returns {{ci_low: number, ci_high: number}[]} One interval per model,
 *   aligned with `models`. Both bounds are NaN when nothing could be
 *   resampled (no votes, or `B` < 1).
 */
function bootstrapCIs(votes, models, opts = {}) {
  const { B = 200, ci = 0.95 } = opts;
  const N = votes.length;
  const requested = Number.isFinite(B) ? Math.max(0, Math.floor(B)) : 0;
  const draws = N > 0 ? requested : 0;
  if (draws === 0) return models.map(() => ({ ci_low: NaN, ci_high: NaN }));

  const samples = Array.from({ length: models.length }, () => []);
  for (let b = 0; b < draws; b++) {
    const resample = Array.from({ length: N }, () => votes[Math.floor(Math.random() * N)]);
    const eloRow = thetaToElo(fitBradleyTerry(resample, models, { maxIter: 50 }));
    for (let i = 0; i < models.length; i++) samples[i].push(eloRow[i]);
  }

  const level = Number.isFinite(ci) ? Math.min(Math.max(ci, 0), 1) : 0.95;
  const tail = (1 - level) / 2;
  // The epsilon absorbs floating-point error such as (1 - 0.9) / 2 * 200
  // evaluating to 9.999999999999998, which a bare floor would turn into 9.
  // The cap keeps the lower index from crossing the upper one.
  const loIdx = Math.min(Math.floor(tail * draws + 1e-9), Math.floor((draws - 1) / 2));
  // Mirror the lower index so both tails discard the same number of replicates.
  const hiIdx = draws - 1 - loIdx;

  return samples.map((s) => {
    s.sort((x, y) => x - y);
    return { ci_low: s[loIdx], ci_high: s[hiIdx] };
  });
}

// Detect statistical ties using a cheap proxy: two models are reported as tied
// when their bootstrap confidence intervals overlap (no tunable threshold).
/**
 * List every pair of models whose confidence intervals intersect.
 *
 * Ratings are ranked by `elo` (highest first, 1-based ranks) on a copy, so
 * the input array is not reordered. Intervals that merely touch count as
 * overlapping.
 *
 * @param {{model: *, elo: number, ci_low: number, ci_high: number}[]} ratings
 * @returns {{rank_a: number, rank_b: number, model_a: *, model_b: *,
 *   elo_diff: number, overlap_elo: number, combined_spread: number}[]}
 *   One entry per overlapping pair, with the higher-rated model as `a`.
 *   `overlap_elo` is the width of the intersection of the two intervals and
 *   `combined_spread` the sum of their widths.
 */
function findTies(ratings) {
  const ties = [];
  const sorted = [...ratings].sort((a, b) => b.elo - a.elo);
  for (let i = 0; i < sorted.length; i++) {
    const a = sorted[i];
    for (let j = i + 1; j < sorted.length; j++) {
      const b = sorted[j];
      // Width of the intersection; using min/max on both ends keeps it right
      // when one interval sits inside the other.
      const overlap = Math.min(a.ci_high, b.ci_high) - Math.max(a.ci_low, b.ci_low);
      if (!(overlap >= 0)) continue; // disjoint intervals, or NaN bounds
      ties.push({
        rank_a: i + 1,
        rank_b: j + 1,
        model_a: a.model,
        model_b: b.model,
        elo_diff: a.elo - b.elo,
        overlap_elo: overlap,
        combined_spread: (a.ci_high - a.ci_low) + (b.ci_high - b.ci_low),
      });
    }
  }
  return ties;
}

// Top-level entry. Input = array of {model_a, model_b, winner}.
// Output = ranked ratings + ties + summary.
/**
 * Rank models from pairwise votes and attach bootstrap confidence intervals.
 *
 * Only well-formed votes are used: both model names present and different
 * from each other, and `winner` equal to "a", "b" or "tie". Anything else
 * (including null entries) is skipped and counted in
 * `summary.skipped_votes`. The point estimate comes from `fitBradleyTerry` +
 * `thetaToElo`, the intervals from `bootstrapCIs`, and tie detection from
 * `findTies`; reported numbers are rounded to one decimal.
 *
 * @param {{model_a: *, model_b: *, winner: string}[]} votes - Vote records.
 * @param {{bootstrapN?: number, ciLevel?: number}} [opts] - Number of
 *   bootstrap resamples (default 200) and confidence level (default 0.95).
 * @returns {{ratings: object[], ties: object[], summary: object}}
 *   `ratings` is sorted by Elo, highest first, each entry carrying `model`,
 *   `elo`, `ci_low`, `ci_high`, `ci_width`, `matches`, `wins`, `losses`,
 *   `ties_count` and a 1-based `rank`. `summary` always has `total_votes`
 *   (votes actually used), `n_models`, `n_ties`, `bootstrap_iters`,
 *   `ci_level` and `skipped_votes`. With no usable votes, `ratings` and
 *   `ties` are empty.
 */
export function computeArenaCI(votes, opts = {}) {
  const bootstrapIters = opts?.bootstrapN ?? 200;
  const ciLevel = opts?.ciLevel ?? 0.95;

  const input = Array.isArray(votes) ? votes : [];
  const usable = input.filter(
    (v) =>
      v != null &&
      v.model_a != null &&
      v.model_b != null &&
      v.model_a !== v.model_b &&
      (v.winner === "a" || v.winner === "b" || v.winner === "tie"),
  );
  const skippedVotes = input.length - usable.length;

  if (usable.length === 0) {
    return {
      ratings: [],
      ties: [],
      summary: {
        total_votes: 0,
        n_models: 0,
        n_ties: 0,
        bootstrap_iters: bootstrapIters,
        ci_level: ciLevel,
        skipped_votes: skippedVotes,
      },
    };
  }

  const models = [...new Set(usable.flatMap((v) => [v.model_a, v.model_b]))].sort();

  // Per-model raw counts
  const stats = new Map(models.map((m) => [m, { wins: 0, losses: 0, ties: 0, matches: 0 }]));
  for (const v of usable) {
    const sideA = stats.get(v.model_a);
    const sideB = stats.get(v.model_b);
    sideA.matches++;
    sideB.matches++;
    if (v.winner === "a") {
      sideA.wins++;
      sideB.losses++;
    } else if (v.winner === "b") {
      sideB.wins++;
      sideA.losses++;
    } else {
      sideA.ties++;
      sideB.ties++;
    }
  }

  // Point-estimate Elo
  const elos = thetaToElo(fitBradleyTerry(usable, models, { maxIter: 100 }));
  // Bootstrap CIs
  const cis = bootstrapCIs(usable, models, { B: bootstrapIters, ci: ciLevel });

  const round1 = (x) => Math.round(x * 10) / 10;
  const ratings = models
    .map((m, i) => {
      const s = stats.get(m);
      return {
        model: m,
        elo: round1(elos[i]),
        ci_low: round1(cis[i].ci_low),
        ci_high: round1(cis[i].ci_high),
        ci_width: round1(cis[i].ci_high - cis[i].ci_low),
        matches: s.matches,
        wins: s.wins,
        losses: s.losses,
        ties_count: s.ties,
      };
    })
    .sort((a, b) => b.elo - a.elo);

  // Ranks follow the sorted order.
  ratings.forEach((r, i) => {
    r.rank = i + 1;
  });

  const ties = findTies(ratings);

  return {
    ratings,
    ties,
    summary: {
      total_votes: usable.length,
      n_models: models.length,
      n_ties: ties.length,
      bootstrap_iters: bootstrapIters,
      ci_level: ciLevel,
      skipped_votes: skippedVotes,
    },
  };
}

// Embedded sample data so users can demo the tool without their own CSV.
// 6 models, 96 votes, designed so 2 pairs are statistically tied and the
// top model is clearly distinguishable from the bottom.
/**
 * Demo vote CSV: two `#` comment lines, a `model_a,model_b,winner` header,
 * then 96 vote rows across 6 models (winner is `a`, `b` or `tie`).
 */
export const SAMPLE_VOTES_CSV = `# Synthetic Arena-style sample: 6 models, 96 votes.
# True underlying skill (in arbitrary units): GPT-4=1.6, Claude=1.5, Llama-3=1.0, Mixtral=0.95, Gemma=0.6, Phi=0.5
model_a,model_b,winner
GPT-4,Claude,a
Claude,GPT-4,b
GPT-4,Llama-3,a
GPT-4,Llama-3,a
GPT-4,Llama-3,a
GPT-4,Mixtral,a
GPT-4,Mixtral,a
GPT-4,Mixtral,a
GPT-4,Gemma,a
GPT-4,Gemma,a
GPT-4,Gemma,a
GPT-4,Gemma,a
GPT-4,Phi,a
GPT-4,Phi,a
GPT-4,Phi,a
GPT-4,Phi,a
GPT-4,Phi,a
Claude,Llama-3,a
Claude,Llama-3,a
Claude,Llama-3,a
Claude,Mixtral,a
Claude,Mixtral,a
Claude,Mixtral,a
Claude,Gemma,a
Claude,Gemma,a
Claude,Gemma,a
Claude,Phi,a
Claude,Phi,a
Claude,Phi,a
Claude,Phi,a
GPT-4,Claude,tie
Claude,GPT-4,tie
GPT-4,Claude,a
Claude,GPT-4,a
Llama-3,Mixtral,tie
Llama-3,Mixtral,a
Mixtral,Llama-3,a
Llama-3,Mixtral,b
Mixtral,Llama-3,b
Llama-3,Mixtral,tie
Llama-3,Mixtral,a
Mixtral,Llama-3,a
Llama-3,Gemma,a
Llama-3,Gemma,a
Llama-3,Gemma,a
Llama-3,Phi,a
Llama-3,Phi,a
Mixtral,Gemma,a
Mixtral,Gemma,a
Mixtral,Phi,a
Mixtral,Phi,a
Gemma,Phi,tie
Phi,Gemma,tie
Gemma,Phi,a
Phi,Gemma,a
Gemma,Phi,b
Phi,Gemma,b
Gemma,Phi,a
Phi,Gemma,a
GPT-4,Llama-3,b
Claude,Mixtral,b
Llama-3,Phi,a
Llama-3,Gemma,b
Mixtral,Phi,b
Gemma,Phi,a
GPT-4,Mixtral,a
Claude,Llama-3,a
GPT-4,Phi,a
Claude,Gemma,a
GPT-4,Gemma,a
Claude,Phi,a
Llama-3,Mixtral,a
Mixtral,Llama-3,a
GPT-4,Claude,a
Claude,GPT-4,b
GPT-4,Claude,b
Claude,GPT-4,a
GPT-4,Mixtral,a
Claude,Phi,a
Mixtral,Gemma,a
Llama-3,Gemma,a
GPT-4,Llama-3,a
Claude,Mixtral,a
Mixtral,Phi,a
Llama-3,Phi,a
Gemma,Phi,a
Phi,Gemma,b
GPT-4,Gemma,a
Claude,Gemma,a
GPT-4,Phi,a
Claude,Phi,a
Llama-3,Mixtral,b
Mixtral,Llama-3,b
GPT-4,Claude,tie
Llama-3,Mixtral,tie
Gemma,Phi,tie`;