"""
LLM Enzyme Kinetics Extraction Benchmark Leaderboard
Built with Gradio
"""

import json
import re
from datetime import datetime
from pathlib import Path

import gradio as gr
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

from utils import (
    load_leaderboard_data,
    format_metrics,
    get_leaderboard_summary,
    filter_leaderboard,
    get_top_n,
    create_comparison_data
)

# CSS for better styling
custom_css = """
.gradio-container {
    max-width: 1400px !important;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    text-align: center;
}
.leaderboard-table {
    font-size: 14px;
}
"""

# Initialize leaderboard data
LEADERBOARD_DF = load_leaderboard_data()

# Leading (non-metric) columns of the displayed leaderboard table.
_INFO_COLUMNS = ["Rank", "Model", "Provider", "OCR", "Submitter", "Date"]

# Display label -> leaderboard column for every metric shown in the table.
# Kept in one place so the empty table and the populated table cannot drift.
_METRIC_COLUMNS = {
    'Km (Exact)': 'km_exact_match',
    'Km (±10%)': 'km_tolerance_match',
    'kcat (Exact)': 'kcat_exact_match',
    'kcat (±10%)': 'kcat_tolerance_match',
    'kcat/Km (Exact)': 'km_kcat_exact_match',
    'kcat/Km (±10%)': 'km_kcat_tolerance_match',
    'Overall (Exact)': 'overall_exact_match',
    'Overall (±10%)': 'overall_tolerance_match',
}

# Bar colours for the OCR comparison chart; cycled when there are more
# OCR types than colours.
_OCR_BAR_COLORS = [
    'rgba(102, 126, 234, 0.8)',
    'rgba(240, 147, 251, 0.8)',
    'rgba(79, 172, 254, 0.8)',
]

# Markdown shown in the UI. Defined flush-left at module level so that the
# rendered output does not depend on how the UI code below is indented.
_HEADER_MD = """
# 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard

Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!

This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km) from scientific literature.

📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
"""

_SUBMIT_MD = """
### Submit your benchmark results to the leaderboard!

**Instructions:**
1. Run the benchmark using the provided scripts
2. Collect your evaluation metrics
3. Fill in the form below
4. Your submission will be reviewed before appearing on the leaderboard

**Evaluation Scripts:**
```bash
python scripts/run_benchmark.py --mode full
```
"""

_ABOUT_MD = """
## About the Benchmark

The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models to extract structured enzyme kinetic data from scientific literature.

### Dataset
- **Papers**: 156 peer-reviewed publications
- **Entries**: 4,244 manually curated enzyme kinetic entries
- **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
- **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)

### Evaluation Metrics
1. **Exact Match Accuracy**: Value must match exactly
2. **Tolerance Match (±10%)**: Value within 10% of ground truth
3. Scores are calculated for each parameter (Km, kcat, kcat/Km)

### How to Participate
1. Clone the repository:
   ```bash
   git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
   ```

2. Install dependencies:
   ```bash
   conda create -n enzyme_benchmark python=3.10 -y
   conda activate enzyme_benchmark
   pip install -r requirements.txt
   ```

3. Configure your API key in `.env`

4. Run the benchmark:
   ```bash
   python scripts/run_benchmark.py --mode full
   ```

5. Submit your results through this leaderboard!

### Citation
If you use this benchmark, please cite our repository.
"""

_FOOTER_MD = """
---
**[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)** | **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)** | **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**

*Last updated: {}*
"""


def create_leaderboard_table(
    model_provider: str = "All",
    ocr_type: str = "All",
    verified_only: bool = False,
    top_n: int = 50
) -> pd.DataFrame:
    """Create filtered leaderboard table.

    Args:
        model_provider: Provider filter passed to ``filter_leaderboard``.
        ocr_type: OCR filter passed to ``filter_leaderboard``.
        verified_only: Verification filter passed to ``filter_leaderboard``.
        top_n: Maximum number of rows, passed to ``get_top_n``.

    Returns:
        A DataFrame with the six info columns followed by the eight metric
        columns. When nothing matches, the frame has those columns and no
        rows.
    """
    filtered_df = filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only)
    # The slider that feeds this callback may deliver a float; normalise it.
    top_df = get_top_n(filtered_df, int(top_n))

    if top_df.empty:
        return pd.DataFrame(columns=_INFO_COLUMNS + list(_METRIC_COLUMNS))

    # Format for display
    columns = {
        'Rank': range(1, len(top_df) + 1),
        'Model': top_df['model_name'],
        'Provider': top_df['model_provider'],
        'OCR': top_df['ocr_type'],
        'Submitter': top_df['submitter'],
        'Date': top_df['submission_date'].dt.strftime('%Y-%m-%d'),
    }
    for label, source_column in _METRIC_COLUMNS.items():
        columns[label] = top_df[source_column].apply(format_metrics)

    return pd.DataFrame(columns)


def create_summary_cards() -> str:
    """Create summary statistics HTML.

    Reads ``total_submissions``, ``unique_models``, ``best_score`` and
    ``avg_score`` from ``get_leaderboard_summary(LEADERBOARD_DF)`` and renders
    each one as a card styled by the ``.metric-card`` rule in ``custom_css``.

    Returns:
        An HTML fragment containing the four cards.
    """
    summary = get_leaderboard_summary(LEADERBOARD_DF)

    cards = [
        ("Total Submissions", f"{summary['total_submissions']}"),
        ("Unique Models", f"{summary['unique_models']}"),
        ("Best Score", f"{summary['best_score']:.1f}%"),
        ("Average Score", f"{summary['avg_score']:.1f}%"),
    ]
    card_html = "\n".join(
        '    <div class="metric-card">\n'
        f'        <h3>{label}</h3>\n'
        f'        <p>{value}</p>\n'
        '    </div>'
        for label, value in cards
    )
    return (
        '<div style="display: grid; '
        'grid-template-columns: repeat(auto-fit, minmax(180px, 1fr)); '
        'gap: 16px;">\n'
        f'{card_html}\n'
        '</div>'
    )


def _empty_figure() -> go.Figure:
    """Return the placeholder figure shown when the leaderboard has no rows."""
    fig = go.Figure()
    fig.add_annotation(
        text="No submissions yet",
        xref="paper", yref="paper",
        x=0.5, y=0.5, showarrow=False
    )
    return fig


def create_score_comparison_chart() -> go.Figure:
    """Create score comparison bar chart.

    Returns:
        A horizontal bar chart of ``overall_exact_match`` (scaled by 100) for
        the rows returned by ``get_top_n(LEADERBOARD_DF, 10)``, or the
        placeholder figure when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        return _empty_figure()

    # Get top 10 submissions
    top_10 = get_top_n(LEADERBOARD_DF, 10)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_10['overall_exact_match'] * 100,
        y=top_10['model_name'] + ' (' + top_10['model_provider'] + ')',
        orientation='h',
        marker=dict(color='rgba(102, 126, 234, 0.8)'),
        text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside'
    ))
    fig.update_layout(
        title='Top 10 Models - Exact Match Accuracy',
        xaxis_title='Accuracy (%)',
        yaxis_title='Model',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    return fig


def create_ocr_comparison_chart() -> go.Figure:
    """Create OCR type comparison chart.

    Returns:
        A bar chart of the mean ``overall_exact_match`` (scaled by 100) per
        ``ocr_type``, or the placeholder figure when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        return _empty_figure()

    ocr_stats = (
        LEADERBOARD_DF.groupby('ocr_type')['overall_exact_match']
        .agg(['mean', 'count'])
        .reset_index()
    )
    # One colour per bar, cycling the palette if there are more OCR types
    # than palette entries.
    bar_colors = [
        _OCR_BAR_COLORS[i % len(_OCR_BAR_COLORS)]
        for i in range(len(ocr_stats))
    ]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=ocr_stats['ocr_type'],
        y=ocr_stats['mean'] * 100,
        marker=dict(color=bar_colors),
        text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside',
        name='Accuracy'
    ))
    fig.update_layout(
        title='Performance by OCR Type',
        xaxis_title='OCR Type',
        yaxis_title='Average Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )
    return fig


def create_timeline_chart() -> go.Figure:
    """Create submission timeline chart.

    Returns:
        A figure with one marker per submission (by ``submission_date``) and a
        line tracing the running maximum of ``overall_exact_match``, or the
        placeholder figure when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        return _empty_figure()

    # sort_values returns a new frame, so adding a column here does not
    # modify LEADERBOARD_DF.
    df_sorted = LEADERBOARD_DF.sort_values('submission_date')
    df_sorted['cumulative_best'] = df_sorted['overall_exact_match'].cummax()

    fig = go.Figure()

    # Add all submissions as scatter
    fig.add_trace(go.Scatter(
        x=df_sorted['submission_date'],
        y=df_sorted['overall_exact_match'] * 100,
        mode='markers',
        name='Submissions',
        marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
        text=df_sorted['model_name'],
        # Plotly hover text uses <br> for line breaks.
        hovertemplate='%{text}<br>%{x}<br>%{y:.1f}%'
    ))

    # Add best score line
    fig.add_trace(go.Scatter(
        x=df_sorted['submission_date'],
        y=df_sorted['cumulative_best'] * 100,
        mode='lines',
        name='Best Score',
        line=dict(color='rgba(67, 233, 123, 0.8)', width=2)
    ))

    fig.update_layout(
        title='Submission Timeline & Progress',
        xaxis_title='Date',
        yaxis_title='Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        hovermode='x unified'
    )
    return fig


def _submission_problems(text_fields: dict, metric_fields: dict, total_papers) -> list:
    """Return human-readable problems with a submission; empty when valid."""
    problems = []
    for label, value in text_fields.items():
        if value is None or not str(value).strip():
            problems.append(f"{label} is required")
    for label, value in metric_fields.items():
        if value is None:
            problems.append(f"{label} is required")
        elif not 0 <= value <= 100:
            # Also rejects NaN, for which both comparisons are false.
            problems.append(f"{label} must be between 0 and 100")
    if total_papers is None:
        problems.append("Total Papers Evaluated is required")
    elif not total_papers >= 1:
        problems.append("Total Papers Evaluated must be at least 1")
    return problems


def _safe_id_fragment(text: str, max_length: int = 64) -> str:
    """Reduce free text to characters that are safe inside a file name."""
    cleaned = re.sub(r"[^A-Za-z0-9._@-]+", "_", text.strip()).strip("._")
    return cleaned[:max_length] or "anonymous"


def submit_result(
    model_name: str,
    model_provider: str,
    ocr_type: str,
    submitter: str,
    km_exact: float,
    km_tolerance: float,
    kcat_exact: float,
    kcat_tolerance: float,
    km_kcat_exact: float,
    km_kcat_tolerance: float,
    total_papers: int,
    notes: str
) -> str:
    """Submit a new result to the leaderboard.

    Validates the form values, writes the submission as a JSON file under
    ``leaderboard/data`` and reloads ``LEADERBOARD_DF``.

    Args:
        model_name, model_provider, ocr_type, submitter: Required text fields.
        km_exact ... km_kcat_tolerance: Metric values in percent (0-100);
            they are stored divided by 100.
        total_papers: Number of papers evaluated (at least 1).
        notes: Free-form notes, stored as given.

    Returns:
        A message for display in the UI: a success message containing the
        submission ID, or a message starting with "❌ Error:" describing what
        went wrong. This function does not raise; failures are reported in
        the returned text.
    """
    problems = _submission_problems(
        {
            "Model Name": model_name,
            "Model Provider": model_provider,
            "OCR Type": ocr_type,
            "Submitter Name/Email": submitter,
        },
        {
            "Km Exact Match": km_exact,
            "Km Tolerance": km_tolerance,
            "kcat Exact Match": kcat_exact,
            "kcat Tolerance": kcat_tolerance,
            "kcat/Km Exact Match": km_kcat_exact,
            "kcat/Km Tolerance": km_kcat_tolerance,
        },
        total_papers,
    )
    if problems:
        return "❌ Error: " + "; ".join(problems)

    # Top-level UI boundary: any failure is turned into a message for the
    # user instead of propagating into the Gradio event handler.
    try:
        # Calculate overall scores
        overall_exact = (km_exact + kcat_exact + km_kcat_exact) / 3
        overall_tolerance = (km_tolerance + kcat_tolerance + km_kcat_tolerance) / 3

        # Take the timestamp once so the ID and the stored date agree.
        now = datetime.now()
        paper_count = int(total_papers)

        # Create submission data
        submission = {
            # The submitter text is user input and ends up in a file name,
            # so it is reduced to a safe set of characters first.
            'submission_id': f"{now.strftime('%Y%m%d_%H%M%S')}_{_safe_id_fragment(submitter)}",
            'model_name': model_name,
            'model_provider': model_provider,
            'ocr_type': ocr_type,
            'submitter': submitter,
            'submission_date': now.isoformat(),
            'km_exact_match': km_exact / 100,
            'km_tolerance_match': km_tolerance / 100,
            'kcat_exact_match': kcat_exact / 100,
            'kcat_tolerance_match': kcat_tolerance / 100,
            'km_kcat_exact_match': km_kcat_exact / 100,
            'km_kcat_tolerance_match': km_kcat_tolerance / 100,
            'overall_exact_match': overall_exact / 100,
            'overall_tolerance_match': overall_tolerance / 100,
            'total_papers': paper_count,
            'total_entries': paper_count * 3,  # Approximate
            'notes': notes,
            'verified': False  # Needs verification
        }

        # Save to data directory
        data_dir = Path("leaderboard/data")
        data_dir.mkdir(parents=True, exist_ok=True)

        submission_file = data_dir / f"{submission['submission_id']}.json"
        with open(submission_file, 'w', encoding='utf-8') as f:
            json.dump(submission, f, indent=2)

        # Reload leaderboard data
        global LEADERBOARD_DF
        LEADERBOARD_DF = load_leaderboard_data()

        return f"✅ Submission successful! Your ID: {submission['submission_id']}\n\nPlease create a PR or contact the maintainer to verify your submission."

    except Exception as e:
        return f"❌ Error: {str(e)}"


def _load_charts() -> list:
    """Build the three figures for the Visualizations tab, in output order."""
    return [
        create_score_comparison_chart(),
        create_ocr_comparison_chart(),
        create_timeline_chart(),
    ]


# Build Gradio interface
with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
    gr.Markdown(_HEADER_MD)

    # Summary cards
    gr.HTML(create_summary_cards())

    with gr.Tabs():
        # Tab 1: Leaderboard Table
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("### Filter and Search")

            with gr.Row():
                model_provider_dropdown = gr.Dropdown(
                    choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
                    value="All",
                    label="Model Provider"
                )
                ocr_type_dropdown = gr.Dropdown(
                    choices=["All", "mathpix", "kimi", "pymupdf"],
                    value="All",
                    label="OCR Type"
                )
                verified_checkbox = gr.Checkbox(
                    label="Verified Only",
                    value=False
                )
                top_n_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=10,
                    label="Show Top N"
                )

            leaderboard_table = gr.Dataframe(
                label="Leaderboard",
                datatype=["markdown"] * 14,
                interactive=False,
                wrap=True
            )

            # The same filter widgets feed both the button and the initial load.
            filter_inputs = [
                model_provider_dropdown,
                ocr_type_dropdown,
                verified_checkbox,
                top_n_slider,
            ]

            refresh_btn = gr.Button("🔄 Refresh", variant="primary")
            refresh_btn.click(
                fn=create_leaderboard_table,
                inputs=filter_inputs,
                outputs=leaderboard_table
            )

            # Initial load
            demo.load(
                fn=create_leaderboard_table,
                inputs=filter_inputs,
                outputs=leaderboard_table
            )

        # Tab 2: Visualizations
        with gr.TabItem("📊 Visualizations"):
            with gr.Row():
                score_chart = gr.Plot(label="Top Models Comparison")
                ocr_chart = gr.Plot(label="OCR Type Comparison")

            with gr.Row():
                timeline_chart = gr.Plot(label="Submission Timeline")

            # Load charts
            demo.load(
                fn=_load_charts,
                outputs=[score_chart, ocr_chart, timeline_chart]
            )

        # Tab 3: Submit Results
        with gr.TabItem("📤 Submit Your Results"):
            gr.Markdown(_SUBMIT_MD)

            with gr.Row():
                model_name_input = gr.Textbox(
                    label="Model Name *",
                    placeholder="e.g., GPT-4, Claude-3.5-Sonnet"
                )
                model_provider_input = gr.Dropdown(
                    choices=["OpenAI", "Anthropic", "Kimi", "Other"],
                    label="Model Provider *"
                )

            with gr.Row():
                ocr_type_input = gr.Dropdown(
                    choices=["mathpix", "kimi", "pymupdf"],
                    label="OCR Type *"
                )
                submitter_input = gr.Textbox(
                    label="Submitter Name/Email *",
                    placeholder="Your name or contact"
                )

            gr.Markdown("### Performance Metrics (%)")

            with gr.Row():
                km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
                km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
                kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
                km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                total_papers_input = gr.Number(label="Total Papers Evaluated *", minimum=1, maximum=156)

            notes_input = gr.Textbox(
                label="Notes",
                placeholder="Any additional information about your setup (temperature, prompts, etc.)",
                lines=3
            )

            submit_btn = gr.Button("Submit Results", variant="primary")
            submission_output = gr.Markdown()

            submit_btn.click(
                fn=submit_result,
                inputs=[
                    model_name_input,
                    model_provider_input,
                    ocr_type_input,
                    submitter_input,
                    km_exact_input,
                    km_tolerance_input,
                    kcat_exact_input,
                    kcat_tolerance_input,
                    km_kcat_exact_input,
                    km_kcat_tolerance_input,
                    total_papers_input,
                    notes_input
                ],
                outputs=submission_output
            )

        # Tab 4: About
        with gr.TabItem("ℹ️ About"):
            gr.Markdown(_ABOUT_MD)

    gr.Markdown(_FOOTER_MD.format(datetime.now().strftime("%Y-%m-%d")))


if __name__ == "__main__":
    demo.launch()