| """
|
| LLM Enzyme Kinetics Extraction Benchmark Leaderboard
|
| Built with Gradio
|
| """
|
|
|
| import gradio as gr
|
| import pandas as pd
|
| import plotly.graph_objects as go
|
| import plotly.express as px
|
| from datetime import datetime
|
| import json
|
| import os
|
| from pathlib import Path
|
| from auto_eval import BenchmarkEvaluator
|
| from utils import (
|
| load_leaderboard_data, format_metrics, get_leaderboard_summary,
|
| filter_leaderboard, get_top_n, create_comparison_data
|
| )
|
|
|
|
|
# Page-wide CSS handed to gr.Blocks(css=...) further down in this file.
#   .gradio-container  - caps the overall app width at 1400px.
#   .metric-card       - gradient card style; the first summary card built in
#                        create_summary_cards() repeats these declarations
#                        inline, and no element in the visible source uses
#                        the class itself.
#   .leaderboard-table - attached through elem_classes on the leaderboard
#                        Dataframe; fixes its height at 600px and lets the
#                        inner `.wrap` element scroll vertically.
custom_css = """
.gradio-container {
    max-width: 1400px !important;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    text-align: center;
}
/* Make leaderboard table taller with scrolling */
.leaderboard-table {
    min-height: 600px !important;
}
.leaderboard-table .wrap {
    height: 600px !important;
    overflow-y: auto !important;
}
"""
|
|
|
|
|
|
|
# Probe the known submission directories (relative to the current working
# directory) in priority order and load from the first one that exists.
# When neither is present, fall back to the loader's own default location.
_existing_data_dir = next(
    (candidate for candidate in ('data', 'leaderboard/data') if os.path.exists(candidate)),
    None,
)
if _existing_data_dir is None:
    LEADERBOARD_DF = load_leaderboard_data()
else:
    LEADERBOARD_DF = load_leaderboard_data(_existing_data_dir)
|
|
|
# (display header, LEADERBOARD_DF column) pairs for the metric columns, in the
# order they appear in the table. Single source of truth for both the empty
# and the populated result, so the two cannot drift apart.
_LEADERBOARD_METRIC_COLUMNS = (
    ("Km (Exact)", "km_exact_match"),
    ("Km (±10%)", "km_tolerance_match"),
    ("kcat (Exact)", "kcat_exact_match"),
    ("kcat (±10%)", "kcat_tolerance_match"),
    ("kcat/Km (Exact)", "km_kcat_exact_match"),
    ("kcat/Km (±10%)", "km_kcat_tolerance_match"),
    ("Overall (Exact)", "overall_exact_match"),
    ("Overall (±10%)", "overall_tolerance_match"),
)

# Full header list: 6 descriptive columns + 8 metric columns = 14, matching
# `datatype=["markdown"] * 14` on the leaderboard Dataframe component.
_LEADERBOARD_HEADERS = ["Rank", "Model", "Provider", "OCR", "Submitter", "Date"] + [
    header for header, _ in _LEADERBOARD_METRIC_COLUMNS
]


def create_leaderboard_table(
    model_provider: str = "All",
    ocr_type: str = "All",
    verified_only: bool = False,
    top_n: int = 50
) -> pd.DataFrame:
    """Build the display table for the leaderboard tab.

    Args:
        model_provider: Provider filter forwarded to ``filter_leaderboard``.
        ocr_type: OCR filter forwarded to ``filter_leaderboard``.
        verified_only: Forwarded to ``filter_leaderboard``.
        top_n: Maximum number of rows, forwarded to ``get_top_n``. Coerced to
            ``int`` because UI sliders may deliver the value as a float.

    Returns:
        A DataFrame with the 14 display columns. It has zero rows (but the
        same headers) when nothing survives the filters.
    """
    row_limit = int(top_n)

    filtered_df = filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only)
    top_df = get_top_n(filtered_df, row_limit)

    if top_df.empty:
        return pd.DataFrame(columns=_LEADERBOARD_HEADERS)

    # Rank follows the row order returned by get_top_n (first row = rank 1).
    columns = {
        'Rank': range(1, len(top_df) + 1),
        'Model': top_df['model_name'],
        'Provider': top_df['model_provider'],
        'OCR': top_df['ocr_type'],
        'Submitter': top_df['submitter'],
        # `.dt` requires submission_date to be a datetime column.
        'Date': top_df['submission_date'].dt.strftime('%Y-%m-%d'),
    }
    for header, source_column in _LEADERBOARD_METRIC_COLUMNS:
        columns[header] = top_df[source_column].apply(format_metrics)

    return pd.DataFrame(columns)
|
|
|
|
|
def _format_summary_percent(value) -> str:
    """Format a summary score as ``"12.3%"``, or ``"N/A"`` when it is missing (None/NaN)."""
    if value is None or pd.isna(value):
        return "N/A"
    return f"{value:.1f}%"


def create_summary_cards() -> str:
    """Render the four headline statistics as a row of HTML cards.

    Reads ``get_leaderboard_summary(LEADERBOARD_DF)`` and uses its
    ``total_submissions``, ``unique_models``, ``best_score`` and ``avg_score``
    entries.

    Returns:
        An HTML fragment containing a four-column grid of cards.
    """
    summary = get_leaderboard_summary(LEADERBOARD_DF)

    # An empty leaderboard can leave the score statistics undefined; show
    # "N/A" instead of raising on None or printing "nan%".
    # NOTE(review): the scores are printed as-is with a "%" suffix, which
    # assumes get_leaderboard_summary already returns percentages (the match
    # columns elsewhere in this file are fractions multiplied by 100) - confirm.
    best_score = _format_summary_percent(summary['best_score'])
    avg_score = _format_summary_percent(summary['avg_score'])

    html = f"""
    <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 15px; margin-bottom: 20px;">
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Total Submissions</div>
            <div style="font-size: 32px; font-weight: bold;">{summary['total_submissions']}</div>
        </div>
        <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Unique Models</div>
            <div style="font-size: 32px; font-weight: bold;">{summary['unique_models']}</div>
        </div>
        <div style="background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Best Score</div>
            <div style="font-size: 32px; font-weight: bold;">{best_score}</div>
        </div>
        <div style="background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Average Score</div>
            <div style="font-size: 32px; font-weight: bold;">{avg_score}</div>
        </div>
    </div>
    """
    return html
|
|
|
|
|
def create_score_comparison_chart() -> go.Figure:
    """Build a horizontal bar chart of the ten best submissions by exact match.

    Returns:
        A figure with one bar per row of ``get_top_n(LEADERBOARD_DF, 10)``,
        labelled ``"<model_name> (<model_provider>)"``, or a figure that only
        carries a "No submissions yet" annotation when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        fig = go.Figure()
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                           x=0.5, y=0.5, showarrow=False)
        return fig

    top_10 = get_top_n(LEADERBOARD_DF, 10)

    # NOTE(review): two submissions with the same model and provider (e.g.
    # different OCR types) produce identical labels and would presumably share
    # one row on the categorical axis - confirm whether that can occur.
    labels = top_10['model_name'] + ' (' + top_10['model_provider'] + ')'

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_10['overall_exact_match'] * 100,
        y=labels,
        orientation='h',
        marker=dict(color='rgba(102, 126, 234, 0.8)'),
        text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside'
    ))

    fig.update_layout(
        title='Top 10 Models - Exact Match Accuracy',
        xaxis_title='Accuracy (%)',
        yaxis_title='Model',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    # Horizontal bars are drawn bottom-up, so without this the first (best)
    # row of top_10 would end up at the bottom of the chart.
    fig.update_yaxes(autorange='reversed')

    return fig
|
|
|
|
|
def create_ocr_comparison_chart() -> go.Figure:
    """Build a bar chart of the mean exact-match score per OCR type.

    Returns:
        A figure with one bar per distinct ``ocr_type`` in ``LEADERBOARD_DF``
        (bar height = mean ``overall_exact_match`` * 100, hover also shows the
        number of submissions), or a figure that only carries a
        "No submissions yet" annotation when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        fig = go.Figure()
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                           x=0.5, y=0.5, showarrow=False)
        return fig

    ocr_stats = LEADERBOARD_DF.groupby('ocr_type')['overall_exact_match'].agg(['mean', 'count']).reset_index()

    # The original list held three colours, but the UI offers four OCR types.
    # Cycle through the palette so every bar gets an explicit colour however
    # many groups exist. The first three entries are unchanged.
    palette = [
        'rgba(102, 126, 234, 0.8)',
        'rgba(240, 147, 251, 0.8)',
        'rgba(79, 172, 254, 0.8)',
        'rgba(67, 233, 123, 0.8)',
    ]
    bar_colors = [palette[i % len(palette)] for i in range(len(ocr_stats))]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=ocr_stats['ocr_type'],
        y=ocr_stats['mean'] * 100,
        marker=dict(color=bar_colors),
        text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside',
        # Expose the per-group sample size, which was computed but never shown.
        customdata=ocr_stats['count'],
        hovertemplate='%{x}<br>%{y:.1f}%<br>%{customdata} submission(s)<extra></extra>',
        name='Accuracy'
    ))

    fig.update_layout(
        title='Performance by OCR Type',
        xaxis_title='OCR Type',
        yaxis_title='Average Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )

    return fig
|
|
|
|
|
def create_timeline_chart() -> go.Figure:
    """Plot every submission over time together with the running best score.

    Returns:
        A figure with a marker trace (one point per submission, exact match
        in percent) and a line trace of the cumulative maximum, both ordered
        by ``submission_date``. When the leaderboard is empty, the figure only
        carries a "No submissions yet" annotation.
    """
    fig = go.Figure()

    if LEADERBOARD_DF.empty:
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                           x=0.5, y=0.5, showarrow=False)
        return fig

    # Chronological order is what makes the running maximum meaningful.
    timeline = LEADERBOARD_DF.sort_values('submission_date')
    dates = timeline['submission_date']
    exact_match = timeline['overall_exact_match']
    best_so_far = exact_match.cummax()

    submissions_trace = go.Scatter(
        x=dates,
        y=exact_match * 100,
        mode='markers',
        name='Submissions',
        marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
        text=timeline['model_name'],
        hovertemplate='%{text}<br>%{x}<br>%{y:.1f}%'
    )
    best_trace = go.Scatter(
        x=dates,
        y=best_so_far * 100,
        mode='lines',
        name='Best Score',
        line=dict(color='rgba(67, 233, 123, 0.8)', width=2)
    )

    fig.add_trace(submissions_trace)
    fig.add_trace(best_trace)

    fig.update_layout(
        title='Submission Timeline & Progress',
        xaxis_title='Date',
        yaxis_title='Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        hovermode='x unified'
    )

    return fig
|
|
|
|
|
def _filename_safe(text: str) -> str:
    """Replace every character other than letters, digits and ``-_.@`` with ``_``."""
    return "".join(ch if ch.isalnum() or ch in "-_.@" else "_" for ch in text)


def submit_result(
    model_name: str,
    model_provider: str,
    ocr_type: str,
    submitter: str,
    km_exact: float,
    km_tolerance: float,
    kcat_exact: float,
    kcat_tolerance: float,
    km_kcat_exact: float,
    km_kcat_tolerance: float,
    total_papers: int,
    notes: str
) -> str:
    """Validate a manual submission, write it to disk and reload the leaderboard.

    Args:
        model_name, model_provider, ocr_type, submitter: Required text fields.
        km_exact ... km_kcat_tolerance: Scores in percent (0-100). They are
            stored as fractions (divided by 100).
        total_papers: Number of papers evaluated (at least 1).
        notes: Free-form remarks, stored unchanged.

    Returns:
        A user-facing status message: a success text containing the
        submission ID, or ``"❌ Error: ..."`` describing what went wrong.
        The function never raises.
    """
    global LEADERBOARD_DF

    try:
        # Reject incomplete forms up front, before anything touches the disk.
        text_fields = {
            'Model name': model_name,
            'Model provider': model_provider,
            'OCR type': ocr_type,
            'Submitter': submitter,
        }
        for label, value in text_fields.items():
            if not isinstance(value, str) or not value.strip():
                raise ValueError(f"{label} is required.")
        model_name = model_name.strip()
        submitter = submitter.strip()

        metric_fields = {
            'Km exact match': km_exact,
            'Km tolerance match': km_tolerance,
            'kcat exact match': kcat_exact,
            'kcat tolerance match': kcat_tolerance,
            'kcat/Km exact match': km_kcat_exact,
            'kcat/Km tolerance match': km_kcat_tolerance,
        }
        for label, value in metric_fields.items():
            if value is None:
                raise ValueError(f"{label} is required.")
            # The chained comparison is also False for NaN.
            if not 0 <= value <= 100:
                raise ValueError(f"{label} must be between 0 and 100.")

        if total_papers is None or total_papers < 1:
            raise ValueError("Total papers must be at least 1.")
        total_papers = int(total_papers)

        overall_exact = (km_exact + kcat_exact + km_kcat_exact) / 3
        overall_tolerance = (km_tolerance + kcat_tolerance + km_kcat_tolerance) / 3

        # One timestamp for both the ID and the stored date, so they cannot
        # straddle a second boundary.
        submitted_at = datetime.now()

        # The submitter text ends up in a file name: neutralise path
        # separators and other special characters so the file cannot be
        # written outside the data directory.
        submission_id = f"{submitted_at.strftime('%Y%m%d_%H%M%S')}_{_filename_safe(submitter)}"

        submission = {
            'submission_id': submission_id,
            'model_name': model_name,
            'model_provider': model_provider,
            'ocr_type': ocr_type,
            'submitter': submitter,
            'submission_date': submitted_at.isoformat(),
            'km_exact_match': km_exact / 100,
            'km_tolerance_match': km_tolerance / 100,
            'kcat_exact_match': kcat_exact / 100,
            'kcat_tolerance_match': kcat_tolerance / 100,
            'km_kcat_exact_match': km_kcat_exact / 100,
            'km_kcat_tolerance_match': km_kcat_tolerance / 100,
            'overall_exact_match': overall_exact / 100,
            'overall_tolerance_match': overall_tolerance / 100,
            'total_papers': total_papers,
            # NOTE(review): three entries per paper is kept from the original;
            # the page banner quotes 4,244 entries for 156 papers, so this is
            # not the real entry count - confirm the intent.
            'total_entries': total_papers * 3,
            'notes': notes,
            'verified': False
        }

        data_dir = Path("leaderboard/data")
        data_dir.mkdir(parents=True, exist_ok=True)

        submission_file = data_dir / f"{submission_id}.json"
        with open(submission_file, 'w', encoding='utf-8') as f:
            json.dump(submission, f, indent=2)

        # NOTE(review): this reloads from the loader's default location, while
        # the file above always goes to leaderboard/data and startup may have
        # loaded from ./data - confirm these resolve to the same directory.
        LEADERBOARD_DF = load_leaderboard_data()

        return f"✅ Submission successful! Your ID: {submission['submission_id']}\n\nPlease create a PR or contact the maintainer to verify your submission."

    except Exception as e:
        # UI boundary: report the problem to the user instead of crashing the handler.
        return f"❌ Error: {str(e)}"
|
|
|
|
|
|
|
| with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
|
# Page header: title, short description and dataset facts.
gr.Markdown(
    """
    # 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard

    Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!
    This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km)
    from scientific literature.

    📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
    """
)

# Pass the function itself rather than its result: Gradio then calls it on
# every page load, so the cards reflect the current LEADERBOARD_DF (e.g. after
# a manual submission) instead of the snapshot taken when the UI was built.
gr.HTML(create_summary_cards)
|
|
|
| with gr.Tabs():
|
|
|
with gr.TabItem("🏆 Leaderboard"):
    gr.Markdown("### Filter and Search")

    # Filter controls, laid out side by side.
    with gr.Row():
        model_provider_dropdown = gr.Dropdown(
            label="Model Provider",
            choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
            value="All"
        )
        ocr_type_dropdown = gr.Dropdown(
            label="OCR Type",
            choices=["All", "mathpix", "kimi", "pymupdf", "glm_ocr"],
            value="All"
        )
        verified_checkbox = gr.Checkbox(
            value=False,
            label="Verified Only"
        )
        top_n_slider = gr.Slider(
            label="Show Top N",
            minimum=10,
            maximum=100,
            step=10,
            value=50
        )

    # 14 markdown-typed columns, matching create_leaderboard_table's output.
    leaderboard_table = gr.Dataframe(
        label="Leaderboard",
        datatype=["markdown"] * 14,
        interactive=False,
        wrap=True,
        elem_classes=["leaderboard-table"]
    )

    refresh_btn = gr.Button("🔄 Refresh", variant="primary")

    # The refresh button and the initial page load rebuild the table from the
    # same four filter controls.
    leaderboard_filter_inputs = [
        model_provider_dropdown,
        ocr_type_dropdown,
        verified_checkbox,
        top_n_slider,
    ]
    refresh_btn.click(
        fn=create_leaderboard_table,
        inputs=leaderboard_filter_inputs,
        outputs=leaderboard_table
    )
    demo.load(
        fn=create_leaderboard_table,
        inputs=leaderboard_filter_inputs,
        outputs=leaderboard_table
    )
|
|
|
|
|
with gr.TabItem("📊 Visualizations"):
    # Top row: two charts side by side; bottom row: the timeline on its own.
    with gr.Row():
        score_chart = gr.Plot(label="Top Models Comparison")
        ocr_chart = gr.Plot(label="OCR Type Comparison")

    with gr.Row():
        timeline_chart = gr.Plot(label="Submission Timeline")

    def _render_all_charts():
        """Build the three figures in the order of the output components below."""
        return [
            create_score_comparison_chart(),
            create_ocr_comparison_chart(),
            create_timeline_chart(),
        ]

    # Figures are generated on page load.
    demo.load(
        fn=_render_all_charts,
        outputs=[score_chart, ocr_chart, timeline_chart]
    )
|
|
|
|
|
with gr.TabItem("🚀 Auto-Evaluate"):
    gr.Markdown("""
    ### 🎯 Run Full Benchmark Directly in the Space

    **⚠️ Important Notes:**
    - Your API key is **only used for this evaluation** and never stored
    - Results are automatically saved to **GitHub** via Pull Request
    - Data persists even after Space restarts (stored in GitHub)
    - Requires a GitHub token with PR permissions

    **💡 Benefits:**
    ✅ No local setup needed
    ✅ Fast evaluation (Space has direct access to data)
    ✅ Automatic submission via GitHub PR
    ✅ Results verified by maintainers before appearing on leaderboard
    """)

    with gr.Accordion("📖 How it works", open=False):
        gr.Markdown("""
        1. **Fill in your API credentials** (only used for this evaluation)
        2. **Configure your model and settings**
        3. **Run evaluation** - Space processes papers and extracts data
        4. **Automatic submission** - Results saved to GitHub via PR
        5. **Verification** - Maintainers review and merge your PR
        6. **Appear on leaderboard** - Once verified, your results show up!

        **Data Persistence:**
        - Results saved to `leaderboard/data/submissions/` in GitHub
        - PR created to: `github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark`
        - Merged PRs loaded automatically by leaderboard
        - Space restarts don't affect your data!
        """)

    gr.Markdown("---")

    # GitHub credentials used to open the pull request with the results.
    with gr.Row():
        github_token_input = gr.Textbox(
            label="GitHub Token (for PR creation) *",
            placeholder="ghp_xxxxxxxxxxxx",
            type="password",
            # Classic personal access tokens have no 'pr' scope; 'repo' is the
            # scope that covers pull requests.
            info="Create token at: https://github.com/settings/tokens (needs the 'repo' scope)"
        )

    gr.Markdown("### 🔧 API Configuration")

    with gr.Row():
        api_provider_input = gr.Radio(
            choices=["OpenAI", "Anthropic", "Kimi/Moonshot"],
            value="OpenAI",
            label="API Provider *"
        )
        api_key_input = gr.Textbox(
            label="API Key *",
            type="password",
            placeholder="sk-...",
            info="Your API key is only used for this evaluation and never stored"
        )
        api_base_input = gr.Textbox(
            label="API Base URL",
            placeholder="https://api.openai.com/v1",
            # The fallback depends on the provider (see run_evaluation below),
            # so do not advertise the OpenAI URL as the universal default.
            info="Leave blank to use the selected provider's default endpoint"
        )
        model_name_input = gr.Textbox(
            label="Model Name *",
            placeholder="e.g., gpt-4, claude-sonnet-4-5-20250929, kimi-k2.5"
        )

    gr.Markdown("### ⚙️ Evaluation Settings")

    with gr.Row():
        ocr_type_input = gr.Dropdown(
            choices=["mathpix", "kimi", "pymupdf", "glm_ocr"],
            value="mathpix",
            label="OCR Type *",
            info="Which OCR version to use for evaluation"
        )
        num_papers_input = gr.Slider(
            minimum=1,
            maximum=156,
            value=5,
            step=1,
            label="Number of Papers (Quick Test: 1-5, Full Eval: 156)",
            info="Start with 5 papers for testing, then run full evaluation"
        )

    submitter_input = gr.Textbox(
        label="Submitter Name/Email *",
        placeholder="Your name or email (will be displayed on leaderboard)",
        info="Public information - will be shown on leaderboard"
    )

    run_eval_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")
    eval_output = gr.Markdown()

    def run_evaluation(github_token, api_provider, api_key, api_base,
                       model_name, ocr_type, num_papers, submitter):
        """Validate the form, run the benchmark and describe the outcome.

        Args:
            github_token: Token handed to ``BenchmarkEvaluator``.
            api_provider: One of the radio choices above; selects the default
                base URL and is forwarded as ``provider``.
            api_key: Key for the model API.
            api_base: Optional endpoint override; blank means provider default.
            model_name: Model identifier forwarded to the evaluator.
            ocr_type: OCR variant forwarded to the evaluator.
            num_papers: Number of papers to evaluate (slider value).
            submitter: Public submitter name.

        Returns:
            A Markdown string: the result summary on success, otherwise an
            error message. The function never raises.
        """
        # Pasted secrets and names often carry stray whitespace or newlines;
        # strip them so they neither pass the "required" checks as blanks nor
        # reach the evaluator with the padding attached.
        github_token = (github_token or "").strip()
        api_key = (api_key or "").strip()
        api_base = (api_base or "").strip()
        model_name = (model_name or "").strip()
        submitter = (submitter or "").strip()

        if not github_token:
            return "❌ **Error**: GitHub token is required to create a PR for saving results."

        if not api_key:
            return "❌ **Error**: API key is required."

        if not model_name:
            return "❌ **Error**: Model name is required."

        if not submitter:
            return "❌ **Error**: Submitter name is required."

        # Provider-specific fallback endpoints, used when no base URL is given.
        default_api_bases = {
            "OpenAI": "https://api.openai.com/v1",
            "Anthropic": "https://api.anthropic.com",
            "Kimi/Moonshot": "https://api.moonshot.cn/v1",
        }
        if not api_base:
            api_base = default_api_bases.get(api_provider, api_base)

        try:
            evaluator = BenchmarkEvaluator(github_token=github_token)

            success, results = evaluator.evaluate_submission(
                api_key=api_key,
                api_base=api_base,
                model_name=model_name,
                provider=api_provider,
                ocr_type=ocr_type,
                submitter=submitter,
                # Sliders may deliver a float; a paper count must be an int.
                num_papers=int(num_papers)
            )

            if success:
                msg = f"""
                ## ✅ Evaluation Completed Successfully!

                **Submission ID**: `{results['submission_id']}`

                ### 📊 Your Results:
                | Metric | Score |
                |--------|-------|
                | **Overall Exact Match** | {results['overall_exact_match']*100:.2f}% |
                | **Overall Tolerance (±10%)** | {results['overall_tolerance_match']*100:.2f}% |
                | Papers Evaluated | {results['total_papers']} |
                | Total Entries | {results['total_entries']} |

                ### 📝 Next Steps:
                1. **Pull Request Created**: Check your email for PR notification
                2. **Review**: Your results will be reviewed by maintainers
                3. **Verification**: Once verified, results appear on the leaderboard
                4. **Check PR**: https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/pulls

                ### 💾 Data Persistence:
                - ✅ Results saved to GitHub repository
                - ✅ Persistent even after Space restarts
                - ✅ Version controlled via Pull Request
                - ✅ Safe from data loss

                **Note**: Your submission is marked as "Unverified" until a maintainer reviews and approves it.
                """
                return msg
            else:
                return f"❌ **Evaluation Failed**: {results.get('error', 'Unknown error')}"

        except Exception as e:
            # UI boundary: surface the failure as a message instead of crashing.
            return f"❌ **Error**: {str(e)}\n\nPlease check your inputs and try again."

    run_eval_btn.click(
        fn=run_evaluation,
        inputs=[
            github_token_input, api_provider_input, api_key_input,
            api_base_input, model_name_input, ocr_type_input,
            num_papers_input, submitter_input
        ],
        outputs=eval_output
    )

    gr.Markdown("""
    ---
    **⏱️ Expected Time**:
    - Quick Test (1-5 papers): 2-5 minutes
    - Full Evaluation (156 papers): 30-60 minutes

    **💡 Tips**:
    - Start with 1-5 papers to verify your setup
    - Use the "Number of Papers" slider to keep quick tests small
    - Use the same credentials for full evaluation
    - Results are saved even if you close the tab!

    **🔒 Privacy**:
    - API keys are **never stored** in the Space
    - Only used for the duration of evaluation
    - Cleared from memory immediately after evaluation
    """)
|
|
|
|
|
with gr.TabItem("📤 Submit Your Results"):
    gr.Markdown("""
    ### 📝 Manually Submit Your Benchmark Results

    **⚠️ Important**: Results submitted here are **only saved locally** (not persistent).
    For persistent storage, use the **Auto-Evaluate** tab instead.

    **Instructions:**
    1. Run the benchmark locally: `python scripts/run_benchmark.py --mode full`
    2. Collect your metrics from `evaluation_results/summary.csv`
    3. Fill in the form below
    4. Results saved to `leaderboard/data/` (local only)

    **💡 Better Alternative**: Use the **Auto-Evaluate** tab for:
    - ✅ Automatic GitHub PR creation
    - ✅ Persistent data storage
    - ✅ Direct integration with leaderboard
    """)

    # The form components use a `manual_` prefix so they do not rebind the
    # `model_name_input`, `ocr_type_input` and `submitter_input` names that the
    # Auto-Evaluate tab already assigned to its own components.
    with gr.Row():
        manual_model_name_input = gr.Textbox(label="Model Name *", placeholder="e.g., GPT-4, Claude-3.5-Sonnet")
        manual_model_provider_input = gr.Dropdown(
            choices=["OpenAI", "Anthropic", "Kimi", "Other"],
            label="Model Provider *"
        )

    with gr.Row():
        manual_ocr_type_input = gr.Dropdown(
            choices=["mathpix", "kimi", "pymupdf", "glm_ocr"],
            label="OCR Type *"
        )
        manual_submitter_input = gr.Textbox(label="Submitter Name/Email *", placeholder="Your name or contact")

    gr.Markdown("### Performance Metrics (%)")

    with gr.Row():
        km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
        km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)

    with gr.Row():
        kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
        kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)

    with gr.Row():
        km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
        km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)

    with gr.Row():
        # precision=0 makes the component deliver an integer, which is what
        # submit_result's `total_papers: int` parameter expects.
        total_papers_input = gr.Number(
            label="Total Papers Evaluated *",
            minimum=1,
            maximum=156,
            precision=0
        )
        notes_input = gr.Textbox(
            label="Notes",
            placeholder="Any additional information about your setup (temperature, prompts, etc.)",
            lines=3
        )

    submit_btn = gr.Button("Submit Results", variant="primary")
    submission_output = gr.Markdown()

    # Input order must match submit_result's positional parameters.
    submit_btn.click(
        fn=submit_result,
        inputs=[
            manual_model_name_input, manual_model_provider_input,
            manual_ocr_type_input, manual_submitter_input,
            km_exact_input, km_tolerance_input, kcat_exact_input, kcat_tolerance_input,
            km_kcat_exact_input, km_kcat_tolerance_input, total_papers_input, notes_input
        ],
        outputs=submission_output
    )
|
|
|
|
|
# Static "About" tab: one Markdown document describing the dataset, the
# metrics and the steps for running the benchmark locally.
with gr.TabItem("ℹ️ About"):
    # NOTE(review): the Dataset section below lists 3 OCR versions (Mathpix,
    # Kimi, PyMuPDF), while the OCR dropdowns in this file also offer
    # "glm_ocr" - confirm which is correct and update the text if needed.
    gr.Markdown("""
    ## About the Benchmark

    The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models
    to extract structured enzyme kinetic data from scientific literature.

    ### Dataset
    - **Papers**: 156 peer-reviewed publications
    - **Entries**: 4,244 manually curated enzyme kinetic entries
    - **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
    - **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)

    ### Evaluation Metrics
    1. **Exact Match Accuracy**: Value must match exactly
    2. **Tolerance Match (±10%)**: Value within 10% of ground truth
    3. Scores are calculated for each parameter (Km, kcat, kcat/Km)

    ### How to Participate
    1. Clone the repository:
    ```bash
    git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
    ```

    2. Install dependencies:
    ```bash
    conda create -n enzyme_benchmark python=3.10 -y
    conda activate enzyme_benchmark
    pip install -r requirements.txt
    ```

    3. Configure your API key in `.env`

    4. Run the benchmark:
    ```bash
    python scripts/run_benchmark.py --mode full
    ```

    5. Submit your results through this leaderboard!

    ### Citation
    If you use this benchmark, please cite our repository.
    """)
|
|
|
# Footer with project links and a date stamp.
# NOTE(review): the date is taken when this module builds the UI, i.e. it shows
# when the app process started, not when the leaderboard data last changed.
gr.Markdown(
    """
    ---
    **[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)**
    | **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)**
    | **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**

    *Last updated: {}*
    """.format(datetime.now().strftime("%Y-%m-%d"))
)
|
|
|
|
|
# Start the Gradio server only when this file is executed as a script;
# importing the module just builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()
|
|
|