Spaces:

jackkuo
/

llm-enzyme-kinetics-leaderboard

Sleeping

App Files Files Community

github-actions[bot] committed on Feb 4

Commit

a924780

1 Parent(s): bbffc2c

Update leaderboard from GitHub main branch

Browse files

Files changed (5) hide show

.gitattributes +0 -35
README.md +34 -5
app.py +474 -0
requirements.txt +4 -0
utils.py +127 -0

.gitattributes DELETED Viewed

@@ -1,35 +0,0 @@
-*.7z filter=lfs diff=lfs merge=lfs -text
-*.arrow filter=lfs diff=lfs merge=lfs -text
-*.bin filter=lfs diff=lfs merge=lfs -text
-*.bz2 filter=lfs diff=lfs merge=lfs -text
-*.ckpt filter=lfs diff=lfs merge=lfs -text
-*.ftz filter=lfs diff=lfs merge=lfs -text
-*.gz filter=lfs diff=lfs merge=lfs -text
-*.h5 filter=lfs diff=lfs merge=lfs -text
-*.joblib filter=lfs diff=lfs merge=lfs -text
-*.lfs.* filter=lfs diff=lfs merge=lfs -text
-*.mlmodel filter=lfs diff=lfs merge=lfs -text
-*.model filter=lfs diff=lfs merge=lfs -text
-*.msgpack filter=lfs diff=lfs merge=lfs -text
-*.npy filter=lfs diff=lfs merge=lfs -text
-*.npz filter=lfs diff=lfs merge=lfs -text
-*.onnx filter=lfs diff=lfs merge=lfs -text
-*.ot filter=lfs diff=lfs merge=lfs -text
-*.parquet filter=lfs diff=lfs merge=lfs -text
-*.pb filter=lfs diff=lfs merge=lfs -text
-*.pickle filter=lfs diff=lfs merge=lfs -text
-*.pkl filter=lfs diff=lfs merge=lfs -text
-*.pt filter=lfs diff=lfs merge=lfs -text
-*.pth filter=lfs diff=lfs merge=lfs -text
-*.rar filter=lfs diff=lfs merge=lfs -text
-*.safetensors filter=lfs diff=lfs merge=lfs -text
-saved_model/**/* filter=lfs diff=lfs merge=lfs -text
-*.tar.* filter=lfs diff=lfs merge=lfs -text
-*.tar filter=lfs diff=lfs merge=lfs -text
-*.tflite filter=lfs diff=lfs merge=lfs -text
-*.tgz filter=lfs diff=lfs merge=lfs -text
-*.wasm filter=lfs diff=lfs merge=lfs -text
-*.xz filter=lfs diff=lfs merge=lfs -text
-*.zip filter=lfs diff=lfs merge=lfs -text
-*.zst filter=lfs diff=lfs merge=lfs -text
-*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md CHANGED Viewed

@@ -1,12 +1,41 @@
 ---
-title: Llm Enzyme Kinetics Leaderboard
-emoji: 📈
 colorFrom: blue
-colorTo: red
 sdk: gradio
-sdk_version: 6.5.1
 app_file: app.py
 pinned: false
 ---
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: LLM Enzyme Kinetics Benchmark Leaderboard
+emoji: 🧪
 colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.0.0
 app_file: app.py
 pinned: false
+license: mit
 ---
+# LLM Enzyme Kinetics Extraction Benchmark Leaderboard
+Interactive leaderboard for comparing LLM performance on enzyme kinetics extraction from scientific literature.
+## 🏆 Features
+- Live leaderboard with real-time rankings
+- Interactive filters (model provider, OCR type)
+- Performance visualizations
+- Result submission system
+- Timeline tracking
+## 📊 Benchmark Info
+- **Papers**: 156 peer-reviewed publications
+- **Entries**: 4,244 enzyme kinetic entries
+- **Parameters**: Km, kcat, kcat/Km
+- **OCR Types**: Mathpix, Kimi, PyMuPDF
+## 🚀 How to Participate
+1. Clone the main repository
+2. Run the benchmark: `python scripts/run_benchmark.py --mode full`
+3. Submit your results through this leaderboard!
+## 📚 Documentation
+- [Full Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)
+- [Usage Guide](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)

app.py ADDED Viewed

	@@ -0,0 +1,474 @@

+"""
+LLM Enzyme Kinetics Extraction Benchmark Leaderboard
+Built with Gradio
+"""
+import gradio as gr
+import pandas as pd
+import plotly.graph_objects as go
+import plotly.express as px
+from datetime import datetime
+import json
+from pathlib import Path
+from utils import (
+    load_leaderboard_data, format_metrics, get_leaderboard_summary,
+    filter_leaderboard, get_top_n, create_comparison_data
+)
+# Custom stylesheet handed to gr.Blocks(css=...) when the interface is built.
+# It caps the page width and declares two helper classes (.metric-card,
+# .leaderboard-table).
+# NOTE(review): the summary cards in this file use inline styles rather than
+# .metric-card -- confirm whether these helper classes are still referenced.
+custom_css = """
+.gradio-container {
+    max-width: 1400px !important;
+}
+.metric-card {
+    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
+    padding: 20px;
+    border-radius: 10px;
+    color: white;
+    text-align: center;
+}
+.leaderboard-table {
+    font-size: 14px;
+}
+"""
+# Module-level snapshot of every submission, loaded once at import time.
+# submit_result() rebinds this global after it writes a new submission file.
+LEADERBOARD_DF = load_leaderboard_data()
+def create_leaderboard_table(
+    model_provider: str = "All",
+    ocr_type: str = "All",
+    verified_only: bool = False,
+    top_n: int = 50
+) -> pd.DataFrame:
+    """Create filtered leaderboard table"""
+    filtered_df = filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only)
+    top_df = get_top_n(filtered_df, top_n)
+    if top_df.empty:
+        return pd.DataFrame(columns=["Rank", "Model", "Provider", "OCR", "Submitter", "Date",
+                                    "Km (Exact)", "Km (±10%)", "kcat (Exact)", "kcat (±10%)",
+                                    "kcat/Km (Exact)", "kcat/Km (±10%)", "Overall (Exact)", "Overall (±10%)"])
+    # Format for display
+    display_df = pd.DataFrame({
+        'Rank': range(1, len(top_df) + 1),
+        'Model': top_df['model_name'],
+        'Provider': top_df['model_provider'],
+        'OCR': top_df['ocr_type'],
+        'Submitter': top_df['submitter'],
+        'Date': top_df['submission_date'].dt.strftime('%Y-%m-%d'),
+        'Km (Exact)': top_df['km_exact_match'].apply(format_metrics),
+        'Km (±10%)': top_df['km_tolerance_match'].apply(format_metrics),
+        'kcat (Exact)': top_df['kcat_exact_match'].apply(format_metrics),
+        'kcat (±10%)': top_df['kcat_tolerance_match'].apply(format_metrics),
+        'kcat/Km (Exact)': top_df['km_kcat_exact_match'].apply(format_metrics),
+        'kcat/Km (±10%)': top_df['km_kcat_tolerance_match'].apply(format_metrics),
+        'Overall (Exact)': top_df['overall_exact_match'].apply(format_metrics),
+        'Overall (±10%)': top_df['overall_tolerance_match'].apply(format_metrics),
+    })
+    return display_df
+def create_summary_cards() -> str:
+    """Render the four headline statistics as an HTML card grid.
+
+    Reads the current module-level LEADERBOARD_DF, summarises it with
+    get_leaderboard_summary, and interpolates the total submission count,
+    the number of unique models, and the best and average scores (shown
+    with one decimal place and a percent sign) into a four-column grid.
+
+    Returns:
+        The HTML fragment as a string.
+    """
+    summary = get_leaderboard_summary(LEADERBOARD_DF)
+    # Each card carries its own inline gradient/style; only the label and the
+    # interpolated value differ between the four cards.
+    html = f"""
+    <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 15px; margin-bottom: 20px;">
+        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
+            <div style="font-size: 14px; opacity: 0.9;">Total Submissions</div>
+            <div style="font-size: 32px; font-weight: bold;">{summary['total_submissions']}</div>
+        </div>
+        <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
+            <div style="font-size: 14px; opacity: 0.9;">Unique Models</div>
+            <div style="font-size: 32px; font-weight: bold;">{summary['unique_models']}</div>
+        </div>
+        <div style="background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
+            <div style="font-size: 14px; opacity: 0.9;">Best Score</div>
+            <div style="font-size: 32px; font-weight: bold;">{summary['best_score']:.1f}%</div>
+        </div>
+        <div style="background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
+            <div style="font-size: 14px; opacity: 0.9;">Average Score</div>
+            <div style="font-size: 32px; font-weight: bold;">{summary['avg_score']:.1f}%</div>
+        </div>
+    </div>
+    """
+    return html
+def create_score_comparison_chart() -> go.Figure:
+    """Create score comparison bar chart"""
+    if LEADERBOARD_DF.empty:
+        fig = go.Figure()
+        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
+                          x=0.5, y=0.5, showarrow=False)
+        return fig
+    # Get top 10 submissions
+    top_10 = get_top_n(LEADERBOARD_DF, 10)
+    fig = go.Figure()
+    fig.add_trace(go.Bar(
+        x=top_10['overall_exact_match'] * 100,
+        y=top_10['model_name'] + ' (' + top_10['model_provider'] + ')',
+        orientation='h',
+        marker=dict(color='rgba(102, 126, 234, 0.8)'),
+        text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
+        textposition='outside'
+    ))
+    fig.update_layout(
+        title='Top 10 Models - Exact Match Accuracy',
+        xaxis_title='Accuracy (%)',
+        yaxis_title='Model',
+        height=400,
+        margin=dict(l=20, r=20, t=40, b=20)
+    )
+    return fig
+def create_ocr_comparison_chart() -> go.Figure:
+    """Create OCR type comparison chart"""
+    if LEADERBOARD_DF.empty:
+        fig = go.Figure()
+        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
+                          x=0.5, y=0.5, showarrow=False)
+        return fig
+    ocr_stats = LEADERBOARD_DF.groupby('ocr_type')['overall_exact_match'].agg(['mean', 'count']).reset_index()
+    fig = go.Figure()
+    fig.add_trace(go.Bar(
+        x=ocr_stats['ocr_type'],
+        y=ocr_stats['mean'] * 100,
+        marker=dict(color=['rgba(102, 126, 234, 0.8)', 'rgba(240, 147, 251, 0.8)', 'rgba(79, 172, 254, 0.8)']),
+        text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
+        textposition='outside',
+        name='Accuracy'
+    ))
+    fig.update_layout(
+        title='Performance by OCR Type',
+        xaxis_title='OCR Type',
+        yaxis_title='Average Exact Match (%)',
+        height=400,
+        margin=dict(l=20, r=20, t=40, b=20)
+    )
+    return fig
+def create_timeline_chart() -> go.Figure:
+    """Create submission timeline chart"""
+    if LEADERBOARD_DF.empty:
+        fig = go.Figure()
+        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
+                          x=0.5, y=0.5, showarrow=False)
+        return fig
+    df_sorted = LEADERBOARD_DF.sort_values('submission_date')
+    df_sorted['cumulative_best'] = df_sorted['overall_exact_match'].cummax()
+    fig = go.Figure()
+    # Add all submissions as scatter
+    fig.add_trace(go.Scatter(
+        x=df_sorted['submission_date'],
+        y=df_sorted['overall_exact_match'] * 100,
+        mode='markers',
+        name='Submissions',
+        marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
+        text=df_sorted['model_name'],
+        hovertemplate='%{text}<br>%{x}<br>%{y:.1f}%'
+    ))
+    # Add best score line
+    fig.add_trace(go.Scatter(
+        x=df_sorted['submission_date'],
+        y=df_sorted['cumulative_best'] * 100,
+        mode='lines',
+        name='Best Score',
+        line=dict(color='rgba(67, 233, 123, 0.8)', width=2)
+    ))
+    fig.update_layout(
+        title='Submission Timeline & Progress',
+        xaxis_title='Date',
+        yaxis_title='Exact Match (%)',
+        height=400,
+        margin=dict(l=20, r=20, t=40, b=20),
+        hovermode='x unified'
+    )
+    return fig
+def submit_result(
+    model_name: str,
+    model_provider: str,
+    ocr_type: str,
+    submitter: str,
+    km_exact: float,
+    km_tolerance: float,
+    kcat_exact: float,
+    kcat_tolerance: float,
+    km_kcat_exact: float,
+    km_kcat_tolerance: float,
+    total_papers: int,
+    notes: str
+) -> str:
+    """Submit a new result to the leaderboard"""
+    try:
+        # Calculate overall scores
+        overall_exact = (km_exact + kcat_exact + km_kcat_exact) / 3
+        overall_tolerance = (km_tolerance + kcat_tolerance + km_kcat_tolerance) / 3
+        # Create submission data
+        submission = {
+            'submission_id': f"{datetime.now().strftime('%Y%m%d_%H%M%S')}_{submitter}",
+            'model_name': model_name,
+            'model_provider': model_provider,
+            'ocr_type': ocr_type,
+            'submitter': submitter,
+            'submission_date': datetime.now().isoformat(),
+            'km_exact_match': km_exact / 100,
+            'km_tolerance_match': km_tolerance / 100,
+            'kcat_exact_match': kcat_exact / 100,
+            'kcat_tolerance_match': kcat_tolerance / 100,
+            'km_kcat_exact_match': km_kcat_exact / 100,
+            'km_kcat_tolerance_match': km_kcat_tolerance / 100,
+            'overall_exact_match': overall_exact / 100,
+            'overall_tolerance_match': overall_tolerance / 100,
+            'total_papers': total_papers,
+            'total_entries': total_papers * 3,  # Approximate
+            'notes': notes,
+            'verified': False  # Needs verification
+        }
+        # Save to data directory
+        data_dir = Path("leaderboard/data")
+        data_dir.mkdir(parents=True, exist_ok=True)
+        submission_file = data_dir / f"{submission['submission_id']}.json"
+        with open(submission_file, 'w') as f:
+            json.dump(submission, f, indent=2)
+        # Reload leaderboard data
+        global LEADERBOARD_DF
+        LEADERBOARD_DF = load_leaderboard_data()
+        return f"✅ Submission successful! Your ID: {submission['submission_id']}\n\nPlease create a PR or contact the maintainer to verify your submission."
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+# Build Gradio interface
+with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
+    gr.Markdown(
+        """
+        # 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard
+        Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!
+        This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km)
+        from scientific literature.
+        📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
+        """
+    )
+    # Summary cards
+    gr.HTML(create_summary_cards())
+    with gr.Tabs():
+        # Tab 1: Leaderboard Table
+        with gr.TabItem("🏆 Leaderboard"):
+            gr.Markdown("### Filter and Search")
+            with gr.Row():
+                model_provider_dropdown = gr.Dropdown(
+                    choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
+                    value="All",
+                    label="Model Provider"
+                )
+                ocr_type_dropdown = gr.Dropdown(
+                    choices=["All", "mathpix", "kimi", "pymupdf"],
+                    value="All",
+                    label="OCR Type"
+                )
+                verified_checkbox = gr.Checkbox(
+                    label="Verified Only",
+                    value=False
+                )
+                top_n_slider = gr.Slider(
+                    minimum=10,
+                    maximum=100,
+                    value=50,
+                    step=10,
+                    label="Show Top N"
+                )
+            leaderboard_table = gr.Dataframe(
+                label="Leaderboard",
+                datatype=["markdown"] * 14,
+                interactive=False,
+                wrap=True
+            )
+            refresh_btn = gr.Button("🔄 Refresh", variant="primary")
+            refresh_btn.click(
+                fn=create_leaderboard_table,
+                inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
+                outputs=leaderboard_table
+            )
+            # Initial load
+            demo.load(
+                fn=create_leaderboard_table,
+                inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
+                outputs=leaderboard_table
+            )
+        # Tab 2: Visualizations
+        with gr.TabItem("📊 Visualizations"):
+            with gr.Row():
+                score_chart = gr.Plot(label="Top Models Comparison")
+                ocr_chart = gr.Plot(label="OCR Type Comparison")
+            with gr.Row():
+                timeline_chart = gr.Plot(label="Submission Timeline")
+            # Load charts
+            demo.load(
+                fn=lambda: [create_score_comparison_chart(), create_ocr_comparison_chart(), create_timeline_chart()],
+                outputs=[score_chart, ocr_chart, timeline_chart]
+            )
+        # Tab 3: Submit Results
+        with gr.TabItem("📤 Submit Your Results"):
+            gr.Markdown("""
+            ### Submit your benchmark results to the leaderboard!
+            **Instructions:**
+            1. Run the benchmark using the provided scripts
+            2. Collect your evaluation metrics
+            3. Fill in the form below
+            4. Your submission will be reviewed before appearing on the leaderboard
+            **Evaluation Scripts:**
+            ```bash
+            python scripts/run_benchmark.py --mode full
+            ```
+            """)
+            with gr.Row():
+                model_name_input = gr.Textbox(label="Model Name *", placeholder="e.g., GPT-4, Claude-3.5-Sonnet")
+                model_provider_input = gr.Dropdown(
+                    choices=["OpenAI", "Anthropic", "Kimi", "Other"],
+                    label="Model Provider *"
+                )
+            with gr.Row():
+                ocr_type_input = gr.Dropdown(
+                    choices=["mathpix", "kimi", "pymupdf"],
+                    label="OCR Type *"
+                )
+                submitter_input = gr.Textbox(label="Submitter Name/Email *", placeholder="Your name or contact")
+            gr.Markdown("### Performance Metrics (%)")
+            with gr.Row():
+                km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
+                km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)
+            with gr.Row():
+                kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
+                kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)
+            with gr.Row():
+                km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
+                km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)
+            with gr.Row():
+                total_papers_input = gr.Number(label="Total Papers Evaluated *", minimum=1, maximum=156)
+                notes_input = gr.Textbox(
+                    label="Notes",
+                    placeholder="Any additional information about your setup (temperature, prompts, etc.)",
+                    lines=3
+                )
+            submit_btn = gr.Button("Submit Results", variant="primary")
+            submission_output = gr.Markdown()
+            submit_btn.click(
+                fn=submit_result,
+                inputs=[
+                    model_name_input, model_provider_input, ocr_type_input, submitter_input,
+                    km_exact_input, km_tolerance_input, kcat_exact_input, kcat_tolerance_input,
+                    km_kcat_exact_input, km_kcat_tolerance_input, total_papers_input, notes_input
+                ],
+                outputs=submission_output
+            )
+        # Tab 4: About
+        with gr.TabItem("ℹ️ About"):
+            gr.Markdown("""
+            ## About the Benchmark
+            The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models
+            to extract structured enzyme kinetic data from scientific literature.
+            ### Dataset
+            - **Papers**: 156 peer-reviewed publications
+            - **Entries**: 4,244 manually curated enzyme kinetic entries
+            - **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
+            - **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)
+            ### Evaluation Metrics
+            1. **Exact Match Accuracy**: Value must match exactly
+            2. **Tolerance Match (±10%)**: Value within 10% of ground truth
+            3. Scores are calculated for each parameter (Km, kcat, kcat/Km)
+            ### How to Participate
+            1. Clone the repository:
+               ```bash
+               git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
+               ```
+            2. Install dependencies:
+               ```bash
+               conda create -n enzyme_benchmark python=3.10 -y
+               conda activate enzyme_benchmark
+               pip install -r requirements.txt
+               ```
+            3. Configure your API key in `.env`
+            4. Run the benchmark:
+               ```bash
+               python scripts/run_benchmark.py --mode full
+               ```
+            5. Submit your results through this leaderboard!
+            ### Citation
+            If you use this benchmark, please cite our repository.
+            """)
+    gr.Markdown(
+        """
+        ---
+        **[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)**
+        | **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)**
+        | **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**
+        *Last updated: {}
+        """.format(datetime.now().strftime("%Y-%m-%d"))
+    )
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,4 @@

+gradio>=4.0.0
+pandas>=2.0.0
+plotly>=5.0.0
+python-dotenv>=1.0.0

utils.py ADDED Viewed

	@@ -0,0 +1,127 @@

+"""Utility functions for leaderboard"""
+import pandas as pd
+from pathlib import Path
+from typing import Dict, List, Optional
+import json
+def load_leaderboard_data(data_dir: str = "leaderboard/data") -> pd.DataFrame:
+    """
+    Load all leaderboard data from JSON files
+    Args:
+        data_dir: Directory containing submission JSON files
+    Returns:
+        DataFrame with all submissions
+    """
+    data_path = Path(data_dir)
+    if not data_path.exists():
+        # Create empty DataFrame with default columns
+        return pd.DataFrame(columns=[
+            'submission_id', 'model_name', 'model_provider', 'ocr_type',
+            'submitter', 'submission_date', 'km_exact_match', 'km_tolerance_match',
+            'kcat_exact_match', 'kcat_tolerance_match', 'km_kcat_exact_match',
+            'km_kcat_tolerance_match', 'overall_exact_match', 'overall_tolerance_match',
+            'total_papers', 'total_entries', 'notes', 'verified'
+        ])
+    all_data = []
+    for json_file in data_path.glob("*.json"):
+        try:
+            with open(json_file, 'r') as f:
+                data = json.load(f)
+                all_data.append(data)
+        except Exception as e:
+            print(f"Error loading {json_file}: {e}")
+    if not all_data:
+        return pd.DataFrame(columns=[
+            'submission_id', 'model_name', 'model_provider', 'ocr_type',
+            'submitter', 'submission_date', 'km_exact_match', 'km_tolerance_match',
+            'kcat_exact_match', 'kcat_tolerance_match', 'km_kcat_exact_match',
+            'km_kcat_tolerance_match', 'overall_exact_match', 'overall_tolerance_match',
+            'total_papers', 'total_entries', 'notes', 'verified'
+        ])
+    df = pd.DataFrame(all_data)
+    # Convert date strings to datetime
+    if 'submission_date' in df.columns:
+        df['submission_date'] = pd.to_datetime(df['submission_date'])
+    return df.sort_values('overall_exact_match', ascending=False)
+def format_metrics(value: float, as_percentage: bool = True) -> str:
+    """Format metric value for display"""
+    if as_percentage:
+        return f"{value * 100:.2f}%"
+    return f"{value:.4f}"
+def get_leaderboard_summary(df: pd.DataFrame) -> Dict:
+    """Get summary statistics from leaderboard"""
+    if df.empty:
+        return {
+            'total_submissions': 0,
+            'unique_models': 0,
+            'best_score': 0.0,
+            'avg_score': 0.0
+        }
+    return {
+        'total_submissions': len(df),
+        'unique_models': df['model_name'].nunique(),
+        'best_score': df['overall_exact_match'].max() * 100,
+        'avg_score': df['overall_exact_match'].mean() * 100,
+        'verified_submissions': df['verified'].sum() if 'verified' in df.columns else 0
+    }
+def filter_leaderboard(
+    df: pd.DataFrame,
+    model_provider: Optional[str] = None,
+    ocr_type: Optional[str] = None,
+    verified_only: bool = False
+) -> pd.DataFrame:
+    """Filter leaderboard based on criteria"""
+    filtered_df = df.copy()
+    if model_provider and model_provider != "All":
+        filtered_df = filtered_df[filtered_df['model_provider'] == model_provider]
+    if ocr_type and ocr_type != "All":
+        filtered_df = filtered_df[filtered_df['ocr_type'] == ocr_type]
+    if verified_only and 'verified' in filtered_df.columns:
+        filtered_df = filtered_df[filtered_df['verified'] == True]
+    return filtered_df
+def get_top_n(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
+    """Get top N submissions"""
+    return df.head(n)
+def create_comparison_data(df: pd.DataFrame) -> Dict:
+    """Create data for comparison charts"""
+    if df.empty:
+        return {}
+    # Group by model provider
+    provider_stats = df.groupby('model_provider').agg({
+        'overall_exact_match': ['mean', 'max', 'count'],
+        'overall_tolerance_match': 'mean'
+    }).round(4)
+    # Group by OCR type
+    ocr_stats = df.groupby('ocr_type').agg({
+        'overall_exact_match': ['mean', 'max', 'count']
+    }).round(4)
+    return {
+        'by_provider': provider_stats.to_dict(),
+        'by_ocr': ocr_stats.to_dict()
+    }