"""
LLM Enzyme Kinetics Extraction Benchmark Leaderboard
Built with Gradio
"""
import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import json
import os
from pathlib import Path
from auto_eval import BenchmarkEvaluator
from utils import (
load_leaderboard_data, format_metrics, get_leaderboard_summary,
filter_leaderboard, get_top_n, create_comparison_data
)
# CSS for better styling
custom_css = """
.gradio-container {
max-width: 1400px !important;
}
.metric-card {
background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
padding: 20px;
border-radius: 10px;
color: white;
text-align: center;
}
/* Make leaderboard table taller with scrolling */
.leaderboard-table {
height: 700px !important;
}
.leaderboard-table > div {
height: 700px !important;
overflow-y: auto !important;
}
.leaderboard-table table {
width: 100% !important;
}
.leaderboard-table th, .leaderboard-table td {
padding: 12px !important;
min-height: 40px !important;
}
"""
# Initialize leaderboard data
# Auto-detect correct data directory for both local and HuggingFace Space
if os.path.exists('data'):
# Running from leaderboard/ directory (HuggingFace Space)
LEADERBOARD_DF = load_leaderboard_data('data')
elif os.path.exists('leaderboard/data'):
# Running from repository root
LEADERBOARD_DF = load_leaderboard_data('leaderboard/data')
else:
# Fallback to default
LEADERBOARD_DF = load_leaderboard_data()
def create_leaderboard_table(
model_provider: str = "All",
ocr_type: str = "All",
verified_only: bool = False,
top_n: int = 50
) -> pd.DataFrame:
"""Create filtered leaderboard table"""
filtered_df = filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only)
top_df = get_top_n(filtered_df, top_n)
if top_df.empty:
return pd.DataFrame(columns=["Rank", "Model", "Provider", "OCR", "Submitter", "Date",
"Km (Exact)", "Km (±10%)", "kcat (Exact)", "kcat (±10%)",
"kcat/Km (Exact)", "kcat/Km (±10%)", "Overall (Exact)", "Overall (±10%)"])
# Format for display
display_df = pd.DataFrame({
'Rank': range(1, len(top_df) + 1),
'Model': top_df['model_name'],
'Provider': top_df['model_provider'],
'OCR': top_df['ocr_type'],
'Submitter': top_df['submitter'],
'Date': top_df['submission_date'].dt.strftime('%Y-%m-%d'),
'Km (Exact)': top_df['km_exact_match'].apply(format_metrics),
'Km (±10%)': top_df['km_tolerance_match'].apply(format_metrics),
'kcat (Exact)': top_df['kcat_exact_match'].apply(format_metrics),
'kcat (±10%)': top_df['kcat_tolerance_match'].apply(format_metrics),
'kcat/Km (Exact)': top_df['km_kcat_exact_match'].apply(format_metrics),
'kcat/Km (±10%)': top_df['km_kcat_tolerance_match'].apply(format_metrics),
'Overall (Exact)': top_df['overall_exact_match'].apply(format_metrics),
'Overall (±10%)': top_df['overall_tolerance_match'].apply(format_metrics),
})
return display_df
def create_summary_cards() -> str:
"""Create summary statistics HTML"""
summary = get_leaderboard_summary(LEADERBOARD_DF)
html = f"""
Total Submissions
{summary['total_submissions']}
Unique Models
{summary['unique_models']}
Best Score
{summary['best_score']:.1f}%
Average Score
{summary['avg_score']:.1f}%
"""
return html
def create_score_comparison_chart() -> go.Figure:
"""Create score comparison bar chart"""
if LEADERBOARD_DF.empty:
fig = go.Figure()
fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
x=0.5, y=0.5, showarrow=False)
return fig
# Get top 10 submissions
top_10 = get_top_n(LEADERBOARD_DF, 10)
fig = go.Figure()
fig.add_trace(go.Bar(
x=top_10['overall_exact_match'] * 100,
y=top_10['model_name'] + ' (' + top_10['model_provider'] + ')',
orientation='h',
marker=dict(color='rgba(102, 126, 234, 0.8)'),
text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
textposition='outside'
))
fig.update_layout(
title='Top 10 Models - Exact Match Accuracy',
xaxis_title='Accuracy (%)',
yaxis_title='Model',
height=400,
margin=dict(l=20, r=20, t=40, b=20)
)
return fig
def create_ocr_comparison_chart() -> go.Figure:
"""Create OCR type comparison chart"""
if LEADERBOARD_DF.empty:
fig = go.Figure()
fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
x=0.5, y=0.5, showarrow=False)
return fig
ocr_stats = LEADERBOARD_DF.groupby('ocr_type')['overall_exact_match'].agg(['mean', 'count']).reset_index()
fig = go.Figure()
fig.add_trace(go.Bar(
x=ocr_stats['ocr_type'],
y=ocr_stats['mean'] * 100,
marker=dict(color=['rgba(102, 126, 234, 0.8)', 'rgba(240, 147, 251, 0.8)', 'rgba(79, 172, 254, 0.8)']),
text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
textposition='outside',
name='Accuracy'
))
fig.update_layout(
title='Performance by OCR Type',
xaxis_title='OCR Type',
yaxis_title='Average Exact Match (%)',
height=400,
margin=dict(l=20, r=20, t=40, b=20)
)
return fig
def create_timeline_chart() -> go.Figure:
"""Create submission timeline chart"""
if LEADERBOARD_DF.empty:
fig = go.Figure()
fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
x=0.5, y=0.5, showarrow=False)
return fig
df_sorted = LEADERBOARD_DF.sort_values('submission_date')
df_sorted['cumulative_best'] = df_sorted['overall_exact_match'].cummax()
fig = go.Figure()
# Add all submissions as scatter
fig.add_trace(go.Scatter(
x=df_sorted['submission_date'],
y=df_sorted['overall_exact_match'] * 100,
mode='markers',
name='Submissions',
marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
text=df_sorted['model_name'],
hovertemplate='%{text}
%{x}
%{y:.1f}%'
))
# Add best score line
fig.add_trace(go.Scatter(
x=df_sorted['submission_date'],
y=df_sorted['cumulative_best'] * 100,
mode='lines',
name='Best Score',
line=dict(color='rgba(67, 233, 123, 0.8)', width=2)
))
fig.update_layout(
title='Submission Timeline & Progress',
xaxis_title='Date',
yaxis_title='Exact Match (%)',
height=400,
margin=dict(l=20, r=20, t=40, b=20),
hovermode='x unified'
)
return fig
def submit_result(
model_name: str,
model_provider: str,
ocr_type: str,
submitter: str,
km_exact: float,
km_tolerance: float,
kcat_exact: float,
kcat_tolerance: float,
km_kcat_exact: float,
km_kcat_tolerance: float,
total_papers: int,
notes: str
) -> str:
"""Submit a new result to the leaderboard"""
try:
# Calculate overall scores
overall_exact = (km_exact + kcat_exact + km_kcat_exact) / 3
overall_tolerance = (km_tolerance + kcat_tolerance + km_kcat_tolerance) / 3
# Create submission data
submission = {
'submission_id': f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{submitter}",
'model_name': model_name,
'model_provider': model_provider,
'ocr_type': ocr_type,
'submitter': submitter,
'submission_date': datetime.now().isoformat(),
'km_exact_match': km_exact / 100,
'km_tolerance_match': km_tolerance / 100,
'kcat_exact_match': kcat_exact / 100,
'kcat_tolerance_match': kcat_tolerance / 100,
'km_kcat_exact_match': km_kcat_exact / 100,
'km_kcat_tolerance_match': km_kcat_tolerance / 100,
'overall_exact_match': overall_exact / 100,
'overall_tolerance_match': overall_tolerance / 100,
'total_papers': total_papers,
'total_entries': total_papers * 3, # Approximate
'notes': notes,
'verified': False # Needs verification
}
# Save to data directory
data_dir = Path("leaderboard/data")
data_dir.mkdir(parents=True, exist_ok=True)
submission_file = data_dir / f"{submission['submission_id']}.json"
with open(submission_file, 'w') as f:
json.dump(submission, f, indent=2)
# Reload leaderboard data
global LEADERBOARD_DF
LEADERBOARD_DF = load_leaderboard_data()
return f"✅ Submission successful! Your ID: {submission['submission_id']}\n\nPlease create a PR or contact the maintainer to verify your submission."
except Exception as e:
return f"❌ Error: {str(e)}"
# Build Gradio interface
with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
gr.Markdown(
"""
# 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard
Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!
This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km)
from scientific literature.
📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
"""
)
# Summary cards
gr.HTML(create_summary_cards())
with gr.Tabs():
# Tab 1: Leaderboard Table
with gr.TabItem("🏆 Leaderboard"):
gr.Markdown("### Filter and Search")
with gr.Row():
model_provider_dropdown = gr.Dropdown(
choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
value="All",
label="Model Provider"
)
ocr_type_dropdown = gr.Dropdown(
choices=["All", "mathpix", "kimi", "pymupdf", "glm_ocr"],
value="All",
label="OCR Type"
)
verified_checkbox = gr.Checkbox(
label="Verified Only",
value=False
)
top_n_slider = gr.Slider(
minimum=10,
maximum=100,
value=50,
step=10,
label="Show Top N"
)
leaderboard_table = gr.Dataframe(
label="Leaderboard",
datatype=["markdown"] * 14,
interactive=False,
wrap=True,
elem_classes=["leaderboard-table"]
)
refresh_btn = gr.Button("🔄 Refresh", variant="primary")
refresh_btn.click(
fn=create_leaderboard_table,
inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
outputs=leaderboard_table
)
# Initial load
demo.load(
fn=create_leaderboard_table,
inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
outputs=leaderboard_table
)
# Tab 2: Visualizations
with gr.TabItem("📊 Visualizations"):
with gr.Row():
score_chart = gr.Plot(label="Top Models Comparison")
ocr_chart = gr.Plot(label="OCR Type Comparison")
with gr.Row():
timeline_chart = gr.Plot(label="Submission Timeline")
# Load charts
demo.load(
fn=lambda: [create_score_comparison_chart(), create_ocr_comparison_chart(), create_timeline_chart()],
outputs=[score_chart, ocr_chart, timeline_chart]
)
# Tab 3: Auto-Evaluate (🚀 Run Benchmark in Space)
with gr.TabItem("🚀 Auto-Evaluate"):
gr.Markdown("""
### 🎯 Run Full Benchmark Directly in the Space
**⚠️ Important Notes:**
- Your API key is **only used for this evaluation** and never stored
- Results are automatically saved to **GitHub** via Pull Request
- Data persists even after Space restarts (stored in GitHub)
- Requires a GitHub token with PR permissions
**💡 Benefits:**
✅ No local setup needed
✅ Fast evaluation (Space has direct access to data)
✅ Automatic submission via GitHub PR
✅ Results verified by maintainers before appearing on leaderboard
""")
with gr.Accordion("📖 How it works", open=False):
gr.Markdown("""
1. **Fill in your API credentials** (only used for this evaluation)
2. **Configure your model and settings**
3. **Run evaluation** - Space processes papers and extracts data
4. **Automatic submission** - Results saved to GitHub via PR
5. **Verification** - Maintainers review and merge your PR
6. **Appear on leaderboard** - Once verified, your results show up!
**Data Persistence:**
- Results saved to `leaderboard/data/submissions/` in GitHub
- PR created to: `github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark`
- Merged PRs loaded automatically by leaderboard
- Space restarts don't affect your data!
""")
gr.Markdown("---")
# GitHub Token for PR creation
with gr.Row():
github_token_input = gr.Textbox(
label="GitHub Token (for PR creation) *",
placeholder="ghp_xxxxxxxxxxxx",
type="password",
info="Create token at: https://github.com/settings/tokens (need 'repo' and 'pr' scopes)"
)
# API Configuration
gr.Markdown("### 🔧 API Configuration")
with gr.Row():
api_provider_input = gr.Radio(
choices=["OpenAI", "Anthropic", "Kimi/Moonshot"],
value="OpenAI",
label="API Provider *"
)
api_key_input = gr.Textbox(
label="API Key *",
type="password",
placeholder="sk-...",
info="Your API key is only used for this evaluation and never stored"
)
api_base_input = gr.Textbox(
label="API Base URL",
placeholder="https://api.openai.com/v1",
info="Default: https://api.openai.com/v1"
)
model_name_input = gr.Textbox(
label="Model Name *",
placeholder="e.g., gpt-4, claude-sonnet-4-5-20250929, kimi-k2.5"
)
# Evaluation Settings
gr.Markdown("### ⚙️ Evaluation Settings")
with gr.Row():
ocr_type_input = gr.Dropdown(
choices=["mathpix", "kimi", "pymupdf", "glm_ocr"],
value="mathpix",
label="OCR Type *",
info="Which OCR version to use for evaluation"
)
num_papers_input = gr.Slider(
minimum=1,
maximum=156,
value=5,
step=1,
label="Number of Papers (Quick Test: 1-5, Full Eval: 156)",
info="Start with 5 papers for testing, then run full evaluation"
)
submitter_input = gr.Textbox(
label="Submitter Name/Email *",
placeholder="Your name or email (will be displayed on leaderboard)",
info="Public information - will be shown on leaderboard"
)
run_eval_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
eval_output = gr.Markdown()
def run_evaluation(github_token, api_provider, api_key, api_base,
model_name, ocr_type, num_papers, submitter):
"""Run automatic evaluation"""
if not github_token:
return "❌ **Error**: GitHub token is required to create a PR for saving results."
if not api_key:
return "❌ **Error**: API key is required."
if not model_name:
return "❌ **Error**: Model name is required."
if not submitter:
return "❌ **Error**: Submitter name is required."
# Set default API base if not provided
if not api_base:
if api_provider == "OpenAI":
api_base = "https://api.openai.com/v1"
elif api_provider == "Anthropic":
api_base = "https://api.anthropic.com"
elif api_provider == "Kimi/Moonshot":
api_base = "https://api.moonshot.cn/v1"
try:
evaluator = BenchmarkEvaluator(github_token=github_token)
# Run evaluation
success, results = evaluator.evaluate_submission(
api_key=api_key,
api_base=api_base,
model_name=model_name,
provider=api_provider,
ocr_type=ocr_type,
submitter=submitter,
num_papers=num_papers
)
if success:
# Format results
msg = f"""
## ✅ Evaluation Completed Successfully!
**Submission ID**: `{results['submission_id']}`
### 📊 Your Results:
| Metric | Score |
|--------|-------|
| **Overall Exact Match** | {results['overall_exact_match']*100:.2f}% |
| **Overall Tolerance (±10%)** | {results['overall_tolerance_match']*100:.2f}% |
| Papers Evaluated | {results['total_papers']} |
| Total Entries | {results['total_entries']} |
### 📝 Next Steps:
1. **Pull Request Created**: Check your email for PR notification
2. **Review**: Your results will be reviewed by maintainers
3. **Verification**: Once verified, results appear on the leaderboard
4. **Check PR**: https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/pulls
### 💾 Data Persistence:
- ✅ Results saved to GitHub repository
- ✅ Persistent even after Space restarts
- ✅ Version controlled via Pull Request
- ✅ Safe from data loss
**Note**: Your submission is marked as "Unverified" until a maintainer reviews and approves it.
"""
return msg
else:
return f"❌ **Evaluation Failed**: {results.get('error', 'Unknown error')}"
except Exception as e:
return f"❌ **Error**: {str(e)}\n\nPlease check your inputs and try again."
run_eval_btn.click(
fn=run_evaluation,
inputs=[
github_token_input, api_provider_input, api_key_input,
api_base_input, model_name_input, ocr_type_input,
num_papers_input, submitter_input
],
outputs=eval_output
)
gr.Markdown("""
---
**⏱️ Expected Time**:
- Quick Test (1-5 papers): 2-5 minutes
- Full Evaluation (156 papers): 30-60 minutes
**💡 Tips**:
- Start with 1-5 papers to verify your setup
- Check the "Quick Test" box for fast feedback
- Use the same credentials for full evaluation
- Results are saved even if you close the tab!
**🔒 Privacy**:
- API keys are **never stored** in the Space
- Only used for the duration of evaluation
- Cleared from memory immediately after evaluation
""")
# Tab 4: Submit Results (Manual)
with gr.TabItem("📤 Submit Your Results"):
gr.Markdown("""
### 📝 Manually Submit Your Benchmark Results
**⚠️ Important**: Results submitted here are **only saved locally** (not persistent).
For persistent storage, use the **Auto-Evaluate** tab instead.
**Instructions:**
1. Run the benchmark locally: `python scripts/run_benchmark.py --mode full`
2. Collect your metrics from `evaluation_results/summary.csv`
3. Fill in the form below
4. Results saved to `leaderboard/data/` (local only)
**💡 Better Alternative**: Use the **Auto-Evaluate** tab for:
- ✅ Automatic GitHub PR creation
- ✅ Persistent data storage
- ✅ Direct integration with leaderboard
""")
with gr.Row():
model_name_input = gr.Textbox(label="Model Name *", placeholder="e.g., GPT-4, Claude-3.5-Sonnet")
model_provider_input = gr.Dropdown(
choices=["OpenAI", "Anthropic", "Kimi", "Other"],
label="Model Provider *"
)
with gr.Row():
ocr_type_input = gr.Dropdown(
choices=["mathpix", "kimi", "pymupdf", "glm_ocr"],
label="OCR Type *"
)
submitter_input = gr.Textbox(label="Submitter Name/Email *", placeholder="Your name or contact")
gr.Markdown("### Performance Metrics (%)")
with gr.Row():
km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)
with gr.Row():
kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)
with gr.Row():
km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)
with gr.Row():
total_papers_input = gr.Number(label="Total Papers Evaluated *", minimum=1, maximum=156)
notes_input = gr.Textbox(
label="Notes",
placeholder="Any additional information about your setup (temperature, prompts, etc.)",
lines=3
)
submit_btn = gr.Button("Submit Results", variant="primary")
submission_output = gr.Markdown()
submit_btn.click(
fn=submit_result,
inputs=[
model_name_input, model_provider_input, ocr_type_input, submitter_input,
km_exact_input, km_tolerance_input, kcat_exact_input, kcat_tolerance_input,
km_kcat_exact_input, km_kcat_tolerance_input, total_papers_input, notes_input
],
outputs=submission_output
)
# Tab 5: About
with gr.TabItem("ℹ️ About"):
gr.Markdown("""
## About the Benchmark
The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models
to extract structured enzyme kinetic data from scientific literature.
### Dataset
- **Papers**: 156 peer-reviewed publications
- **Entries**: 4,244 manually curated enzyme kinetic entries
- **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
- **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)
### Evaluation Metrics
1. **Exact Match Accuracy**: Value must match exactly
2. **Tolerance Match (±10%)**: Value within 10% of ground truth
3. Scores are calculated for each parameter (Km, kcat, kcat/Km)
### How to Participate
1. Clone the repository:
```bash
git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
```
2. Install dependencies:
```bash
conda create -n enzyme_benchmark python=3.10 -y
conda activate enzyme_benchmark
pip install -r requirements.txt
```
3. Configure your API key in `.env`
4. Run the benchmark:
```bash
python scripts/run_benchmark.py --mode full
```
5. Submit your results through this leaderboard!
### Citation
If you use this benchmark, please cite our repository.
""")
gr.Markdown(
"""
---
**[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)**
| **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)**
| **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**
*Last updated: {}
""".format(datetime.now().strftime("%Y-%m-%d"))
)
if __name__ == "__main__":
demo.launch()