"""
LLM Enzyme Kinetics Extraction Benchmark Leaderboard

Built with Gradio
"""

import json
import os
import re
from datetime import datetime
from itertools import cycle, islice
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from auto_eval import BenchmarkEvaluator
from utils import (
    load_leaderboard_data,
    format_metrics,
    get_leaderboard_summary,
    filter_leaderboard,
    get_top_n,
    create_comparison_data,
)

# CSS for better styling
custom_css = """
.gradio-container {
    max-width: 1400px !important;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    text-align: center;
}
/* Make leaderboard table taller with scrolling */
.leaderboard-table {
    min-height: 600px !important;
}
.leaderboard-table .wrap {
    height: 600px !important;
    overflow-y: auto !important;
}
"""

# Display column -> source column of the leaderboard frame, for every metric
# column that is rendered through format_metrics.
_METRIC_COLUMNS = {
    'Km (Exact)': 'km_exact_match',
    'Km (±10%)': 'km_tolerance_match',
    'kcat (Exact)': 'kcat_exact_match',
    'kcat (±10%)': 'kcat_tolerance_match',
    'kcat/Km (Exact)': 'km_kcat_exact_match',
    'kcat/Km (±10%)': 'km_kcat_tolerance_match',
    'Overall (Exact)': 'overall_exact_match',
    'Overall (±10%)': 'overall_tolerance_match',
}

# Column order of the table shown in the UI; the Dataframe component derives
# its number of columns from this list.
LEADERBOARD_COLUMNS = [
    "Rank", "Model", "Provider", "OCR", "Submitter", "Date",
    *_METRIC_COLUMNS,
]

# Bar colours for the OCR comparison chart; cycled when there are more OCR
# types than colours.
_OCR_PALETTE = (
    'rgba(102, 126, 234, 0.8)',
    'rgba(240, 147, 251, 0.8)',
    'rgba(79, 172, 254, 0.8)',
    'rgba(67, 233, 123, 0.8)',
)

# API base URL used when the user leaves the field empty.
_DEFAULT_API_BASES = {
    "OpenAI": "https://api.openai.com/v1",
    "Anthropic": "https://api.anthropic.com",
    "Kimi/Moonshot": "https://api.moonshot.cn/v1",
}


def _detect_data_dir():
    """Return the first existing data directory, or None if neither exists.

    'data' is found when running from the leaderboard/ directory (HuggingFace
    Space); 'leaderboard/data' when running from the repository root.
    """
    for candidate in ('data', 'leaderboard/data'):
        if os.path.exists(candidate):
            return candidate
    return None


# Resolved once so that loading, saving and reloading all use the same place.
DATA_DIR = _detect_data_dir()


def _load_leaderboard() -> pd.DataFrame:
    """Load leaderboard data from DATA_DIR, or the loader's default location."""
    if DATA_DIR is None:
        return load_leaderboard_data()
    return load_leaderboard_data(DATA_DIR)


# Initialize leaderboard data
LEADERBOARD_DF = _load_leaderboard()


def create_leaderboard_table(
    model_provider: str = "All",
    ocr_type: str = "All",
    verified_only: bool = False,
    top_n: int = 50
) -> pd.DataFrame:
    """Create the filtered leaderboard table shown in the UI.

    Args:
        model_provider: Provider filter passed to ``filter_leaderboard``.
        ocr_type: OCR filter passed to ``filter_leaderboard``.
        verified_only: Verification filter passed to ``filter_leaderboard``.
        top_n: Number of rows requested from ``get_top_n``.

    Returns:
        A DataFrame with the columns in ``LEADERBOARD_COLUMNS``; empty (but
        with those columns) when nothing matches.
    """
    filtered_df = filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only)
    # Gradio sliders may deliver floats; the row count must be an integer.
    top_df = get_top_n(filtered_df, int(top_n))

    if top_df.empty:
        return pd.DataFrame(columns=LEADERBOARD_COLUMNS)

    # A fresh 0..n-1 index keeps the positional Rank column aligned with the
    # index-aligned Series columns below.
    top_df = top_df.reset_index(drop=True)

    # Format for display
    columns = {
        'Rank': range(1, len(top_df) + 1),
        'Model': top_df['model_name'],
        'Provider': top_df['model_provider'],
        'OCR': top_df['ocr_type'],
        'Submitter': top_df['submitter'],
        'Date': top_df['submission_date'].dt.strftime('%Y-%m-%d'),
    }
    for display_name, source_name in _METRIC_COLUMNS.items():
        columns[display_name] = top_df[source_name].apply(format_metrics)

    return pd.DataFrame(columns)


def _metric_card(label: str, value: str) -> str:
    """Return the HTML for one summary card styled by the `.metric-card` class."""
    return (
        '<div class="metric-card" style="flex: 1; min-width: 180px;">'
        f'<h3>{label}</h3><h2>{value}</h2>'
        '</div>'
    )


def create_summary_cards() -> str:
    """Create the summary statistics HTML from ``get_leaderboard_summary``."""
    summary = get_leaderboard_summary(LEADERBOARD_DF)

    cards = [
        _metric_card("Total Submissions", f"{summary['total_submissions']}"),
        _metric_card("Unique Models", f"{summary['unique_models']}"),
        _metric_card("Best Score", f"{summary['best_score']:.1f}%"),
        _metric_card("Average Score", f"{summary['avg_score']:.1f}%"),
    ]
    return (
        '<div style="display: flex; gap: 20px; flex-wrap: wrap; margin: 20px 0;">'
        + "".join(cards)
        + '</div>'
    )


def _empty_figure() -> go.Figure:
    """Return a placeholder figure used when there are no submissions."""
    fig = go.Figure()
    fig.add_annotation(
        text="No submissions yet",
        xref="paper", yref="paper",
        x=0.5, y=0.5, showarrow=False
    )
    return fig


def create_score_comparison_chart() -> go.Figure:
    """Create a horizontal bar chart of the top 10 submissions by exact match."""
    if LEADERBOARD_DF.empty:
        return _empty_figure()

    # Get top 10 submissions
    top_10 = get_top_n(LEADERBOARD_DF, 10)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_10['overall_exact_match'] * 100,
        y=top_10['model_name'] + ' (' + top_10['model_provider'] + ')',
        orientation='h',
        marker=dict(color='rgba(102, 126, 234, 0.8)'),
        text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside'
    ))

    fig.update_layout(
        title='Top 10 Models - Exact Match Accuracy',
        xaxis_title='Accuracy (%)',
        yaxis_title='Model',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    return fig


def create_ocr_comparison_chart() -> go.Figure:
    """Create a bar chart of the mean exact-match score per OCR type."""
    if LEADERBOARD_DF.empty:
        return _empty_figure()

    ocr_stats = (
        LEADERBOARD_DF
        .groupby('ocr_type')['overall_exact_match']
        .agg(['mean', 'count'])
        .reset_index()
    )

    # One colour per bar: a fixed-length list would leave extra OCR types
    # without an assigned colour.
    bar_colors = list(islice(cycle(_OCR_PALETTE), len(ocr_stats)))

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=ocr_stats['ocr_type'],
        y=ocr_stats['mean'] * 100,
        marker=dict(color=bar_colors),
        text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside',
        name='Accuracy'
    ))

    fig.update_layout(
        title='Performance by OCR Type',
        xaxis_title='OCR Type',
        yaxis_title='Average Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    return fig


def create_timeline_chart() -> go.Figure:
    """Create the submission timeline: every score plus the running best."""
    if LEADERBOARD_DF.empty:
        return _empty_figure()

    # sort_values returns a new frame, so adding a column does not touch
    # LEADERBOARD_DF.
    df_sorted = LEADERBOARD_DF.sort_values('submission_date')
    df_sorted['cumulative_best'] = df_sorted['overall_exact_match'].cummax()

    fig = go.Figure()

    # Add all submissions as scatter
    fig.add_trace(go.Scatter(
        x=df_sorted['submission_date'],
        y=df_sorted['overall_exact_match'] * 100,
        mode='markers',
        name='Submissions',
        marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
        text=df_sorted['model_name'],
        # Plotly hover text uses <br> for line breaks.
        hovertemplate='%{text}<br>%{x}<br>%{y:.1f}%'
    ))

    # Add best score line
    fig.add_trace(go.Scatter(
        x=df_sorted['submission_date'],
        y=df_sorted['cumulative_best'] * 100,
        mode='lines',
        name='Best Score',
        line=dict(color='rgba(67, 233, 123, 0.8)', width=2)
    ))

    fig.update_layout(
        title='Submission Timeline & Progress',
        xaxis_title='Date',
        yaxis_title='Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        hovermode='x unified'
    )
    return fig


def _build_charts():
    """Build the three visualisation figures in the order the UI expects."""
    return [
        create_score_comparison_chart(),
        create_ocr_comparison_chart(),
        create_timeline_chart(),
    ]


def submit_result(
    model_name: str,
    model_provider: str,
    ocr_type: str,
    submitter: str,
    km_exact: float,
    km_tolerance: float,
    kcat_exact: float,
    kcat_tolerance: float,
    km_kcat_exact: float,
    km_kcat_tolerance: float,
    total_papers: int,
    notes: str
) -> str:
    """Submit a new (unverified) result to the leaderboard.

    The six metric arguments are percentages (0-100); they are stored as
    fractions. The submission is written as JSON into the data directory and
    the module-level ``LEADERBOARD_DF`` is reloaded afterwards.

    Returns:
        A user-facing status message; errors are reported in the message
        rather than raised.
    """
    global LEADERBOARD_DF

    text_fields = {
        "Model Name": model_name,
        "Model Provider": model_provider,
        "OCR Type": ocr_type,
        "Submitter": submitter,
    }
    metrics = {
        "Km Exact Match": km_exact,
        "Km Tolerance": km_tolerance,
        "kcat Exact Match": kcat_exact,
        "kcat Tolerance": kcat_tolerance,
        "kcat/Km Exact Match": km_kcat_exact,
        "kcat/Km Tolerance": km_kcat_tolerance,
    }

    # Empty form fields arrive as None / ""; report them explicitly instead
    # of letting the arithmetic below fail with an opaque TypeError.
    missing = [label for label, value in text_fields.items()
               if not value or not str(value).strip()]
    missing += [label for label, value in metrics.items() if value is None]
    if total_papers is None:
        missing.append("Total Papers Evaluated")
    if missing:
        return f"❌ Error: Missing required field(s): {', '.join(missing)}"

    out_of_range = [label for label, value in metrics.items()
                    if not 0 <= value <= 100]
    if out_of_range:
        return f"❌ Error: Value must be between 0 and 100 for: {', '.join(out_of_range)}"

    try:
        papers = int(total_papers)
        if papers < 1:
            return "❌ Error: Total Papers Evaluated must be at least 1"

        # Calculate overall scores
        overall_exact = (km_exact + kcat_exact + km_kcat_exact) / 3
        overall_tolerance = (km_tolerance + kcat_tolerance + km_kcat_tolerance) / 3

        now = datetime.now()
        # The submitter text ends up in a file name: replace anything that is
        # not a word character, dot, '@' or '-' so it cannot contain path
        # separators.
        safe_submitter = re.sub(r'[^\w.@-]+', '_', submitter.strip())

        # Create submission data
        submission = {
            'submission_id': f"{now.strftime('%Y%m%d_%H%M%S')}_{safe_submitter}",
            'model_name': model_name,
            'model_provider': model_provider,
            'ocr_type': ocr_type,
            'submitter': submitter,
            'submission_date': now.isoformat(),
            'km_exact_match': km_exact / 100,
            'km_tolerance_match': km_tolerance / 100,
            'kcat_exact_match': kcat_exact / 100,
            'kcat_tolerance_match': kcat_tolerance / 100,
            'km_kcat_exact_match': km_kcat_exact / 100,
            'km_kcat_tolerance_match': km_kcat_tolerance / 100,
            'overall_exact_match': overall_exact / 100,
            'overall_tolerance_match': overall_tolerance / 100,
            'total_papers': papers,
            'total_entries': papers * 3,  # Approximate
            'notes': notes,
            'verified': False  # Needs verification
        }

        # Save into the same directory the leaderboard was loaded from, so the
        # reload below can see the new file.
        data_dir = Path(DATA_DIR if DATA_DIR is not None else "leaderboard/data")
        data_dir.mkdir(parents=True, exist_ok=True)

        submission_file = data_dir / f"{submission['submission_id']}.json"
        with open(submission_file, 'w', encoding='utf-8') as f:
            json.dump(submission, f, indent=2)

        # Reload leaderboard data
        LEADERBOARD_DF = _load_leaderboard()

        return (
            f"✅ Submission successful! Your ID: {submission['submission_id']}"
            "\n\nPlease create a PR or contact the maintainer to verify your submission."
        )
    except Exception as e:
        # UI boundary: show the failure to the user instead of crashing the
        # Gradio callback.
        return f"❌ Error: {str(e)}"


def run_evaluation(github_token, api_provider, api_key, api_base,
                   model_name, ocr_type, num_papers, submitter):
    """Run the automatic evaluation and return a markdown status message.

    Validates the required inputs, fills in the provider's default API base
    when none is given, and delegates to
    ``BenchmarkEvaluator.evaluate_submission``.
    """
    if not github_token:
        return "❌ **Error**: GitHub token is required to create a PR for saving results."
    if not api_key:
        return "❌ **Error**: API key is required."
    if not model_name:
        return "❌ **Error**: Model name is required."
    if not submitter:
        return "❌ **Error**: Submitter name is required."

    # Set default API base if not provided
    if not api_base:
        api_base = _DEFAULT_API_BASES.get(api_provider, api_base)

    try:
        evaluator = BenchmarkEvaluator(github_token=github_token)

        # Run evaluation
        success, results = evaluator.evaluate_submission(
            api_key=api_key,
            api_base=api_base,
            model_name=model_name,
            provider=api_provider,
            ocr_type=ocr_type,
            submitter=submitter,
            # Sliders may deliver floats; a paper count is an integer.
            num_papers=int(num_papers)
        )

        if not success:
            return f"❌ **Evaluation Failed**: {results.get('error', 'Unknown error')}"

        # Format results. Built line by line so the markdown table is never
        # indented (indentation would turn it into a code block).
        lines = [
            "## ✅ Evaluation Completed Successfully!",
            "",
            f"**Submission ID**: `{results['submission_id']}`",
            "",
            "### 📊 Your Results:",
            "",
            "| Metric | Score |",
            "|--------|-------|",
            f"| **Overall Exact Match** | {results['overall_exact_match']*100:.2f}% |",
            f"| **Overall Tolerance (±10%)** | {results['overall_tolerance_match']*100:.2f}% |",
            f"| Papers Evaluated | {results['total_papers']} |",
            f"| Total Entries | {results['total_entries']} |",
            "",
            "### 📝 Next Steps:",
            "",
            "1. **Pull Request Created**: Check your email for PR notification",
            "2. **Review**: Your results will be reviewed by maintainers",
            "3. **Verification**: Once verified, results appear on the leaderboard",
            "4. **Check PR**: https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/pulls",
            "",
            "### 💾 Data Persistence:",
            "",
            "- ✅ Results saved to GitHub repository",
            "- ✅ Persistent even after Space restarts",
            "- ✅ Version controlled via Pull Request",
            "- ✅ Safe from data loss",
            "",
            '**Note**: Your submission is marked as "Unverified" until a maintainer '
            "reviews and approves it.",
        ]
        return "\n".join(lines)

    except Exception as e:
        # UI boundary: report the failure in the output panel.
        return f"❌ **Error**: {str(e)}\n\nPlease check your inputs and try again."


# Build Gradio interface
with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
    gr.Markdown(
        """
        # 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard

        Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!
        This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km) from scientific literature.

        📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
        """
    )

    # Summary cards: rendered once at build time and refreshed on every page
    # load so new submissions are reflected.
    summary_html = gr.HTML(create_summary_cards())
    demo.load(fn=create_summary_cards, outputs=summary_html)

    with gr.Tabs():
        # Tab 1: Leaderboard Table
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("### Filter and Search")

            with gr.Row():
                model_provider_dropdown = gr.Dropdown(
                    choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
                    value="All",
                    label="Model Provider"
                )
                ocr_type_dropdown = gr.Dropdown(
                    choices=["All", "mathpix", "kimi", "pymupdf", "glm_ocr"],
                    value="All",
                    label="OCR Type"
                )
                verified_checkbox = gr.Checkbox(
                    label="Verified Only",
                    value=False
                )
                top_n_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=10,
                    label="Show Top N"
                )

            leaderboard_table = gr.Dataframe(
                label="Leaderboard",
                datatype=["markdown"] * len(LEADERBOARD_COLUMNS),
                interactive=False,
                wrap=True,
                elem_classes=["leaderboard-table"]
            )

            leaderboard_filters = [
                model_provider_dropdown,
                ocr_type_dropdown,
                verified_checkbox,
                top_n_slider,
            ]

            refresh_btn = gr.Button("🔄 Refresh", variant="primary")
            refresh_btn.click(
                fn=create_leaderboard_table,
                inputs=leaderboard_filters,
                outputs=leaderboard_table
            )

            # Initial load
            demo.load(
                fn=create_leaderboard_table,
                inputs=leaderboard_filters,
                outputs=leaderboard_table
            )

        # Tab 2: Visualizations
        with gr.TabItem("📊 Visualizations"):
            with gr.Row():
                score_chart = gr.Plot(label="Top Models Comparison")
                ocr_chart = gr.Plot(label="OCR Type Comparison")

            with gr.Row():
                timeline_chart = gr.Plot(label="Submission Timeline")

            # Load charts
            demo.load(
                fn=_build_charts,
                outputs=[score_chart, ocr_chart, timeline_chart]
            )

        # Tab 3: Auto-Evaluate (🚀 Run Benchmark in Space)
        with gr.TabItem("🚀 Auto-Evaluate"):
            gr.Markdown("""
            ### 🎯 Run Full Benchmark Directly in the Space

            **⚠️ Important Notes:**
            - Your API key is **only used for this evaluation** and never stored
            - Results are automatically saved to **GitHub** via Pull Request
            - Data persists even after Space restarts (stored in GitHub)
            - Requires a GitHub token with PR permissions

            **💡 Benefits:**
            ✅ No local setup needed
            ✅ Fast evaluation (Space has direct access to data)
            ✅ Automatic submission via GitHub PR
            ✅ Results verified by maintainers before appearing on leaderboard
            """)

            with gr.Accordion("📖 How it works", open=False):
                gr.Markdown("""
                1. **Fill in your API credentials** (only used for this evaluation)
                2. **Configure your model and settings**
                3. **Run evaluation** - Space processes papers and extracts data
                4. **Automatic submission** - Results saved to GitHub via PR
                5. **Verification** - Maintainers review and merge your PR
                6. **Appear on leaderboard** - Once verified, your results show up!

                **Data Persistence:**
                - Results saved to `leaderboard/data/submissions/` in GitHub
                - PR created to: `github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark`
                - Merged PRs loaded automatically by leaderboard
                - Space restarts don't affect your data!
                """)

            gr.Markdown("---")

            # GitHub Token for PR creation
            with gr.Row():
                github_token_input = gr.Textbox(
                    label="GitHub Token (for PR creation) *",
                    placeholder="ghp_xxxxxxxxxxxx",
                    type="password",
                    info="Create token at: https://github.com/settings/tokens (need 'repo' and 'pr' scopes)"
                )

            # API Configuration
            gr.Markdown("### 🔧 API Configuration")

            with gr.Row():
                api_provider_input = gr.Radio(
                    choices=["OpenAI", "Anthropic", "Kimi/Moonshot"],
                    value="OpenAI",
                    label="API Provider *"
                )
                api_key_input = gr.Textbox(
                    label="API Key *",
                    type="password",
                    placeholder="sk-...",
                    info="Your API key is only used for this evaluation and never stored"
                )

            api_base_input = gr.Textbox(
                label="API Base URL",
                placeholder="https://api.openai.com/v1",
                info="Default: https://api.openai.com/v1"
            )

            model_name_input = gr.Textbox(
                label="Model Name *",
                placeholder="e.g., gpt-4, claude-sonnet-4-5-20250929, kimi-k2.5"
            )

            # Evaluation Settings
            gr.Markdown("### ⚙️ Evaluation Settings")

            with gr.Row():
                ocr_type_input = gr.Dropdown(
                    choices=["mathpix", "kimi", "pymupdf", "glm_ocr"],
                    value="mathpix",
                    label="OCR Type *",
                    info="Which OCR version to use for evaluation"
                )
                num_papers_input = gr.Slider(
                    minimum=1,
                    maximum=156,
                    value=5,
                    step=1,
                    label="Number of Papers (Quick Test: 1-5, Full Eval: 156)",
                    info="Start with 5 papers for testing, then run full evaluation"
                )

            submitter_input = gr.Textbox(
                label="Submitter Name/Email *",
                placeholder="Your name or email (will be displayed on leaderboard)",
                info="Public information - will be shown on leaderboard"
            )

            run_eval_btn = gr.Button("🚀 Run Evaluation", variant="primary", size="lg")

            eval_output = gr.Markdown()

            run_eval_btn.click(
                fn=run_evaluation,
                inputs=[
                    github_token_input,
                    api_provider_input,
                    api_key_input,
                    api_base_input,
                    model_name_input,
                    ocr_type_input,
                    num_papers_input,
                    submitter_input
                ],
                outputs=eval_output
            )

            gr.Markdown("""
            ---

            **⏱️ Expected Time**:
            - Quick Test (1-5 papers): 2-5 minutes
            - Full Evaluation (156 papers): 30-60 minutes

            **💡 Tips**:
            - Start with 1-5 papers to verify your setup
            - Check the "Quick Test" box for fast feedback
            - Use the same credentials for full evaluation
            - Results are saved even if you close the tab!

            **🔒 Privacy**:
            - API keys are **never stored** in the Space
            - Only used for the duration of evaluation
            - Cleared from memory immediately after evaluation
            """)

        # Tab 4: Submit Results (Manual)
        with gr.TabItem("📤 Submit Your Results"):
            gr.Markdown("""
            ### 📝 Manually Submit Your Benchmark Results

            **⚠️ Important**: Results submitted here are **only saved locally** (not persistent).
            For persistent storage, use the **Auto-Evaluate** tab instead.

            **Instructions:**
            1. Run the benchmark locally: `python scripts/run_benchmark.py --mode full`
            2. Collect your metrics from `evaluation_results/summary.csv`
            3. Fill in the form below
            4. Results saved to `leaderboard/data/` (local only)

            **💡 Better Alternative**: Use the **Auto-Evaluate** tab for:
            - ✅ Automatic GitHub PR creation
            - ✅ Persistent data storage
            - ✅ Direct integration with leaderboard
            """)

            # The manual form uses its own variable names so the Auto-Evaluate
            # components above are not shadowed.
            with gr.Row():
                manual_model_name_input = gr.Textbox(
                    label="Model Name *",
                    placeholder="e.g., GPT-4, Claude-3.5-Sonnet"
                )
                manual_model_provider_input = gr.Dropdown(
                    choices=["OpenAI", "Anthropic", "Kimi", "Other"],
                    label="Model Provider *"
                )

            with gr.Row():
                manual_ocr_type_input = gr.Dropdown(
                    choices=["mathpix", "kimi", "pymupdf", "glm_ocr"],
                    label="OCR Type *"
                )
                manual_submitter_input = gr.Textbox(
                    label="Submitter Name/Email *",
                    placeholder="Your name or contact"
                )

            gr.Markdown("### Performance Metrics (%)")

            with gr.Row():
                km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
                km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
                kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
                km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                total_papers_input = gr.Number(label="Total Papers Evaluated *", minimum=1, maximum=156)

            notes_input = gr.Textbox(
                label="Notes",
                placeholder="Any additional information about your setup (temperature, prompts, etc.)",
                lines=3
            )

            submit_btn = gr.Button("Submit Results", variant="primary")
            submission_output = gr.Markdown()

            submit_btn.click(
                fn=submit_result,
                inputs=[
                    manual_model_name_input,
                    manual_model_provider_input,
                    manual_ocr_type_input,
                    manual_submitter_input,
                    km_exact_input,
                    km_tolerance_input,
                    kcat_exact_input,
                    kcat_tolerance_input,
                    km_kcat_exact_input,
                    km_kcat_tolerance_input,
                    total_papers_input,
                    notes_input
                ],
                outputs=submission_output
            )

        # Tab 5: About
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ## About the Benchmark

            The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models
            to extract structured enzyme kinetic data from scientific literature.

            ### Dataset
            - **Papers**: 156 peer-reviewed publications
            - **Entries**: 4,244 manually curated enzyme kinetic entries
            - **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
            - **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)

            ### Evaluation Metrics
            1. **Exact Match Accuracy**: Value must match exactly
            2. **Tolerance Match (±10%)**: Value within 10% of ground truth
            3. Scores are calculated for each parameter (Km, kcat, kcat/Km)

            ### How to Participate
            1. Clone the repository:
               ```bash
               git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
               ```
            2. Install dependencies:
               ```bash
               conda create -n enzyme_benchmark python=3.10 -y
               conda activate enzyme_benchmark
               pip install -r requirements.txt
               ```
            3. Configure your API key in `.env`
            4. Run the benchmark:
               ```bash
               python scripts/run_benchmark.py --mode full
               ```
            5. Submit your results through this leaderboard!

            ### Citation
            If you use this benchmark, please cite our repository.
            """)

    gr.Markdown(
        """
        ---
        **[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)** |
        **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)** |
        **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**

        *Last updated: {}*
        """.format(datetime.now().strftime("%Y-%m-%d"))
    )


if __name__ == "__main__":
    demo.launch()