"""
LLM Enzyme Kinetics Extraction Benchmark Leaderboard
Built with Gradio
"""

import gradio as gr
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from datetime import datetime
import json
from pathlib import Path
from utils import (
    load_leaderboard_data, format_metrics, get_leaderboard_summary,
    filter_leaderboard, get_top_n, create_comparison_data
)

# CSS for better styling
# Passed to `gr.Blocks(css=...)` where the interface is built further down.
# NOTE(review): `.metric-card` and `.leaderboard-table` are not attached to any
# component in the visible part of this file (the summary cards use inline
# styles instead) - confirm whether these rules are still needed.
custom_css = """
.gradio-container {
    max-width: 1400px !important;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    text-align: center;
}
.leaderboard-table {
    font-size: 14px;
}
"""

# Initialize leaderboard data
# Loaded once at import time. The table/chart/summary builders below read this
# module-level frame on every call, and `submit_result` rebinds it (via
# `global`) after writing a new submission file.
LEADERBOARD_DF = load_leaderboard_data()

def create_leaderboard_table(
    model_provider: str = "All",
    ocr_type: str = "All",
    verified_only: bool = False,
    top_n: int = 50
) -> pd.DataFrame:
    """Build the display-ready leaderboard for the current filter settings.

    The module-level ``LEADERBOARD_DF`` is narrowed with ``filter_leaderboard``
    and truncated with ``get_top_n``; the surviving rows are then projected
    onto the fourteen display columns.

    Args:
        model_provider: Provider filter forwarded to ``filter_leaderboard``.
        ocr_type: OCR filter forwarded to ``filter_leaderboard``.
        verified_only: Verified flag forwarded to ``filter_leaderboard``.
        top_n: Row limit forwarded to ``get_top_n``.

    Returns:
        A frame with the display columns. When no rows survive the filters the
        frame is empty but still carries all fourteen column headings.
    """
    # Display heading -> source column, in the order the metric columns appear.
    metric_sources = {
        'Km (Exact)': 'km_exact_match',
        'Km (±10%)': 'km_tolerance_match',
        'kcat (Exact)': 'kcat_exact_match',
        'kcat (±10%)': 'kcat_tolerance_match',
        'kcat/Km (Exact)': 'km_kcat_exact_match',
        'kcat/Km (±10%)': 'km_kcat_tolerance_match',
        'Overall (Exact)': 'overall_exact_match',
        'Overall (±10%)': 'overall_tolerance_match',
    }

    ranked = get_top_n(
        filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only),
        top_n,
    )

    if ranked.empty:
        # Keep the header row so the table widget still shows its columns.
        headings = ["Rank", "Model", "Provider", "OCR", "Submitter", "Date", *metric_sources]
        return pd.DataFrame(columns=headings)

    # Identity columns first; ranks simply number the rows in the order received.
    table = {
        'Rank': range(1, len(ranked) + 1),
        'Model': ranked['model_name'],
        'Provider': ranked['model_provider'],
        'OCR': ranked['ocr_type'],
        'Submitter': ranked['submitter'],
        'Date': ranked['submission_date'].dt.strftime('%Y-%m-%d'),
    }
    # Every metric column goes through the shared display formatter.
    for heading, source in metric_sources.items():
        table[heading] = ranked[source].apply(format_metrics)

    return pd.DataFrame(table)


def create_summary_cards() -> str:
    """Create summary statistics HTML

    Renders a four-column CSS grid of inline-styled cards (Total Submissions,
    Unique Models, Best Score, Average Score) from the mapping returned by
    ``get_leaderboard_summary(LEADERBOARD_DF)``.

    The mapping must provide the keys ``total_submissions``, ``unique_models``,
    ``best_score`` and ``avg_score``; the two scores are formatted with one
    decimal place and a trailing ``%``.

    NOTE(review): the scores are printed as-is, without multiplying by 100,
    whereas the per-row ``*_match`` columns elsewhere in this file hold 0-1
    fractions. Confirm that ``get_leaderboard_summary`` already returns
    percentages.

    Returns:
        The HTML fragment as a string.
    """
    summary = get_leaderboard_summary(LEADERBOARD_DF)

    # The card styles are inline, so the markup does not depend on `custom_css`.
    html = f"""
    <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 15px; margin-bottom: 20px;">
        <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Total Submissions</div>
            <div style="font-size: 32px; font-weight: bold;">{summary['total_submissions']}</div>
        </div>
        <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Unique Models</div>
            <div style="font-size: 32px; font-weight: bold;">{summary['unique_models']}</div>
        </div>
        <div style="background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Best Score</div>
            <div style="font-size: 32px; font-weight: bold;">{summary['best_score']:.1f}%</div>
        </div>
        <div style="background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
            <div style="font-size: 14px; opacity: 0.9;">Average Score</div>
            <div style="font-size: 32px; font-weight: bold;">{summary['avg_score']:.1f}%</div>
        </div>
    </div>
    """
    return html


def create_score_comparison_chart() -> go.Figure:
    """Create score comparison bar chart

    Draws a horizontal bar per submission for the ten rows returned by
    ``get_top_n(LEADERBOARD_DF, 10)``, using ``overall_exact_match`` scaled to
    a percentage and labelled ``"<model_name> (<model_provider>)"``.

    Returns:
        The bar chart, or a figure containing only a "No submissions yet"
        annotation when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        fig = go.Figure()
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                          x=0.5, y=0.5, showarrow=False)
        return fig

    # Get top 10 submissions
    top_10 = get_top_n(LEADERBOARD_DF, 10)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_10['overall_exact_match'] * 100,
        y=top_10['model_name'] + ' (' + top_10['model_provider'] + ')',
        orientation='h',
        marker=dict(color='rgba(102, 126, 234, 0.8)'),
        text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside'
    ))

    fig.update_layout(
        title='Top 10 Models - Exact Match Accuracy',
        xaxis_title='Accuracy (%)',
        yaxis_title='Model',
        # Plotly places the first category of a horizontal bar chart at the
        # bottom. The table view numbers ranks in row order (first row = rank 1),
        # so reverse the axis to put the first row at the top.
        yaxis=dict(autorange='reversed'),
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )

    return fig


def create_ocr_comparison_chart() -> go.Figure:
    """Create OCR type comparison chart

    Groups ``LEADERBOARD_DF`` by ``ocr_type`` and plots the mean
    ``overall_exact_match`` of each group as a percentage. The number of
    submissions behind each bar is shown in the hover text.

    Returns:
        The bar chart, or a figure containing only a "No submissions yet"
        annotation when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        fig = go.Figure()
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                          x=0.5, y=0.5, showarrow=False)
        return fig

    ocr_stats = LEADERBOARD_DF.groupby('ocr_type')['overall_exact_match'].agg(['mean', 'count']).reset_index()

    # Cycle the palette so every bar gets an explicit colour even when the data
    # contains more OCR types than there are palette entries.
    palette = ['rgba(102, 126, 234, 0.8)', 'rgba(240, 147, 251, 0.8)', 'rgba(79, 172, 254, 0.8)']
    bar_colors = [palette[i % len(palette)] for i in range(len(ocr_stats))]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=ocr_stats['ocr_type'],
        y=ocr_stats['mean'] * 100,
        marker=dict(color=bar_colors),
        text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside',
        # Surface the group size so a mean over one submission is not mistaken
        # for a mean over many.
        customdata=ocr_stats['count'],
        hovertemplate='%{x}<br>%{y:.1f}%<br>Submissions: %{customdata}',
        name='Accuracy'
    ))

    fig.update_layout(
        title='Performance by OCR Type',
        xaxis_title='OCR Type',
        yaxis_title='Average Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )

    return fig


def create_timeline_chart() -> go.Figure:
    """Plot every submission over time together with the running best score.

    Rows of ``LEADERBOARD_DF`` are ordered by ``submission_date``; each one is
    drawn as a marker at its ``overall_exact_match`` percentage, and a line
    traces the cumulative maximum of that score.

    Returns:
        The timeline figure, or a figure containing only a "No submissions yet"
        annotation when the leaderboard is empty.
    """
    if LEADERBOARD_DF.empty:
        placeholder = go.Figure()
        placeholder.add_annotation(
            text="No submissions yet",
            xref="paper",
            yref="paper",
            x=0.5,
            y=0.5,
            showarrow=False,
        )
        return placeholder

    # Chronological order is what makes the running maximum meaningful.
    history = LEADERBOARD_DF.sort_values('submission_date').assign(
        cumulative_best=lambda frame: frame['overall_exact_match'].cummax()
    )
    dates = history['submission_date']

    # One marker per submission.
    submissions_trace = go.Scatter(
        x=dates,
        y=history['overall_exact_match'] * 100,
        mode='markers',
        name='Submissions',
        marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
        text=history['model_name'],
        hovertemplate='%{text}<br>%{x}<br>%{y:.1f}%',
    )

    # Best score reached so far at each point in time.
    best_trace = go.Scatter(
        x=dates,
        y=history['cumulative_best'] * 100,
        mode='lines',
        name='Best Score',
        line=dict(color='rgba(67, 233, 123, 0.8)', width=2),
    )

    timeline = go.Figure(data=[submissions_trace, best_trace])
    timeline.update_layout(
        title='Submission Timeline & Progress',
        xaxis_title='Date',
        yaxis_title='Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        hovermode='x unified',
    )
    return timeline


def submit_result(
    model_name: str,
    model_provider: str,
    ocr_type: str,
    submitter: str,
    km_exact: float,
    km_tolerance: float,
    kcat_exact: float,
    kcat_tolerance: float,
    km_kcat_exact: float,
    km_kcat_tolerance: float,
    total_papers: int,
    notes: str
) -> str:
    """Submit a new result to the leaderboard

    Validates the form values, writes the submission as a JSON file under
    ``leaderboard/data`` and reloads the module-level ``LEADERBOARD_DF``.

    Args:
        model_name, model_provider, ocr_type, submitter: Required text fields;
            blank or missing values are rejected.
        km_exact ... km_kcat_tolerance: Metric percentages in the range 0-100.
            They are stored as 0-1 fractions; the overall scores are the mean
            of the three exact (respectively tolerance) metrics.
        total_papers: Whole number of papers evaluated, at least 1.
        notes: Optional free text stored with the submission.

    Returns:
        A status message starting with "✅" on success or "❌ Error:" when
        validation, file writing or reloading fails. This function never raises.
    """
    import re  # local import: only needed to build a filesystem-safe submission ID

    global LEADERBOARD_DF

    # --- Required text fields -------------------------------------------------
    text_fields = {
        "Model Name": model_name,
        "Model Provider": model_provider,
        "OCR Type": ocr_type,
        "Submitter Name/Email": submitter,
    }
    missing = [
        label for label, value in text_fields.items()
        if value is None or not str(value).strip()
    ]
    if missing:
        return f"❌ Error: missing required field(s): {', '.join(missing)}"

    model_name = str(model_name).strip()
    submitter = str(submitter).strip()

    # --- Metric percentages ---------------------------------------------------
    raw_metrics = (
        ("km_exact_match", "Km Exact Match", km_exact),
        ("km_tolerance_match", "Km Tolerance", km_tolerance),
        ("kcat_exact_match", "kcat Exact Match", kcat_exact),
        ("kcat_tolerance_match", "kcat Tolerance", kcat_tolerance),
        ("km_kcat_exact_match", "kcat/Km Exact Match", km_kcat_exact),
        ("km_kcat_tolerance_match", "kcat/Km Tolerance", km_kcat_tolerance),
    )
    pct = {}
    for key, label, raw in raw_metrics:
        try:
            value = float(raw)
        except (TypeError, ValueError):
            value = float("nan")
        # NaN fails the chained comparison, so empty and non-numeric inputs are
        # rejected here as well.
        if not 0 <= value <= 100:
            return f"❌ Error: {label} is required and must be a number between 0 and 100"
        pct[key] = value

    # --- Paper count ----------------------------------------------------------
    try:
        papers_value = float(total_papers)
    except (TypeError, ValueError):
        papers_value = float("nan")
    if not papers_value.is_integer() or papers_value < 1:
        return "❌ Error: Total Papers Evaluated is required and must be a whole number of at least 1"
    paper_count = int(papers_value)

    # Calculate overall scores
    overall_exact = (
        pct["km_exact_match"] + pct["kcat_exact_match"] + pct["km_kcat_exact_match"]
    ) / 3
    overall_tolerance = (
        pct["km_tolerance_match"] + pct["kcat_tolerance_match"] + pct["km_kcat_tolerance_match"]
    ) / 3

    # Sample the clock once so the ID and the stored date always agree.
    now = datetime.now()
    # The submitter text is user input and ends up in a file name: collapse
    # anything that is not a plain identifier character (notably path
    # separators) so the file cannot land outside the data directory.
    safe_submitter = re.sub(r"[^A-Za-z0-9._@-]+", "_", submitter)[:64]
    submission_id = f"{now.strftime('%Y%m%d_%H%M%S')}_{safe_submitter}"

    # Create submission data
    submission = {
        'submission_id': submission_id,
        'model_name': model_name,
        'model_provider': model_provider,
        'ocr_type': ocr_type,
        'submitter': submitter,
        'submission_date': now.isoformat(),
        'km_exact_match': pct["km_exact_match"] / 100,
        'km_tolerance_match': pct["km_tolerance_match"] / 100,
        'kcat_exact_match': pct["kcat_exact_match"] / 100,
        'kcat_tolerance_match': pct["kcat_tolerance_match"] / 100,
        'km_kcat_exact_match': pct["km_kcat_exact_match"] / 100,
        'km_kcat_tolerance_match': pct["km_kcat_tolerance_match"] / 100,
        'overall_exact_match': overall_exact / 100,
        'overall_tolerance_match': overall_tolerance / 100,
        'total_papers': paper_count,
        'total_entries': paper_count * 3,  # Approximate
        'notes': notes or "",
        'verified': False  # Needs verification
    }

    # This is a UI event handler, so any failure while persisting or reloading
    # is reported to the user as a message instead of propagating.
    try:
        # Save to data directory
        data_dir = Path("leaderboard/data")
        data_dir.mkdir(parents=True, exist_ok=True)

        submission_file = data_dir / f"{submission_id}.json"
        with open(submission_file, 'w', encoding='utf-8') as f:
            json.dump(submission, f, indent=2)

        # Reload leaderboard data
        LEADERBOARD_DF = load_leaderboard_data()
    except Exception as e:
        return f"❌ Error: {str(e)}"

    return f"✅ Submission successful! Your ID: {submission_id}\n\nPlease create a PR or contact the maintainer to verify your submission."


# Build Gradio interface
# Layout is defined by statement order inside the `with` blocks: header,
# summary cards, four tabs (leaderboard, charts, submission form, about) and a
# footer.
with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
    gr.Markdown(
        """
        # 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard

        Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!
        This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km)
        from scientific literature.

        📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
        """
    )

    # Summary cards
    # Pass the function itself rather than its result: the cards are then
    # rebuilt on every page load instead of being frozen at import time, so
    # they reflect submissions made after the app started.
    gr.HTML(value=create_summary_cards)

    with gr.Tabs():
        # Tab 1: Leaderboard Table
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("### Filter and Search")

            with gr.Row():
                model_provider_dropdown = gr.Dropdown(
                    choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
                    value="All",
                    label="Model Provider"
                )
                ocr_type_dropdown = gr.Dropdown(
                    choices=["All", "mathpix", "kimi", "pymupdf"],
                    value="All",
                    label="OCR Type"
                )
                verified_checkbox = gr.Checkbox(
                    label="Verified Only",
                    value=False
                )
                top_n_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=10,
                    label="Show Top N"
                )

            # Fourteen columns, matching the frame built by create_leaderboard_table.
            leaderboard_table = gr.Dataframe(
                label="Leaderboard",
                datatype=["markdown"] * 14,
                interactive=False,
                wrap=True
            )

            # Filters are applied only when the user presses Refresh.
            refresh_btn = gr.Button("🔄 Refresh", variant="primary")
            refresh_btn.click(
                fn=create_leaderboard_table,
                inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
                outputs=leaderboard_table
            )

            # Initial load
            demo.load(
                fn=create_leaderboard_table,
                inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
                outputs=leaderboard_table
            )

        # Tab 2: Visualizations
        with gr.TabItem("📊 Visualizations"):
            with gr.Row():
                score_chart = gr.Plot(label="Top Models Comparison")
                ocr_chart = gr.Plot(label="OCR Type Comparison")

            with gr.Row():
                timeline_chart = gr.Plot(label="Submission Timeline")

            # Load charts
            # All three figures are rebuilt from LEADERBOARD_DF on each page load.
            demo.load(
                fn=lambda: [create_score_comparison_chart(), create_ocr_comparison_chart(), create_timeline_chart()],
                outputs=[score_chart, ocr_chart, timeline_chart]
            )

        # Tab 3: Submit Results
        with gr.TabItem("📤 Submit Your Results"):
            gr.Markdown("""
            ### Submit your benchmark results to the leaderboard!

            **Instructions:**
            1. Run the benchmark using the provided scripts
            2. Collect your evaluation metrics
            3. Fill in the form below
            4. Your submission will be reviewed before appearing on the leaderboard

            **Evaluation Scripts:**
            ```bash
            python scripts/run_benchmark.py --mode full
            ```
            """)

            with gr.Row():
                model_name_input = gr.Textbox(label="Model Name *", placeholder="e.g., GPT-4, Claude-3.5-Sonnet")
                model_provider_input = gr.Dropdown(
                    choices=["OpenAI", "Anthropic", "Kimi", "Other"],
                    label="Model Provider *"
                )

            with gr.Row():
                ocr_type_input = gr.Dropdown(
                    choices=["mathpix", "kimi", "pymupdf"],
                    label="OCR Type *"
                )
                submitter_input = gr.Textbox(label="Submitter Name/Email *", placeholder="Your name or contact")

            gr.Markdown("### Performance Metrics (%)")

            with gr.Row():
                km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
                km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
                kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
                km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                # precision=0 makes the component deliver an int, matching the
                # `total_papers: int` parameter of submit_result.
                total_papers_input = gr.Number(label="Total Papers Evaluated *", minimum=1, maximum=156, precision=0)
                notes_input = gr.Textbox(
                    label="Notes",
                    placeholder="Any additional information about your setup (temperature, prompts, etc.)",
                    lines=3
                )

            submit_btn = gr.Button("Submit Results", variant="primary")
            submission_output = gr.Markdown()

            # Input order must match the positional parameters of submit_result.
            submit_btn.click(
                fn=submit_result,
                inputs=[
                    model_name_input, model_provider_input, ocr_type_input, submitter_input,
                    km_exact_input, km_tolerance_input, kcat_exact_input, kcat_tolerance_input,
                    km_kcat_exact_input, km_kcat_tolerance_input, total_papers_input, notes_input
                ],
                outputs=submission_output
            )

        # Tab 4: About
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ## About the Benchmark

            The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models
            to extract structured enzyme kinetic data from scientific literature.

            ### Dataset
            - **Papers**: 156 peer-reviewed publications
            - **Entries**: 4,244 manually curated enzyme kinetic entries
            - **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
            - **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)

            ### Evaluation Metrics
            1. **Exact Match Accuracy**: Value must match exactly
            2. **Tolerance Match (±10%)**: Value within 10% of ground truth
            3. Scores are calculated for each parameter (Km, kcat, kcat/Km)

            ### How to Participate
            1. Clone the repository:
               ```bash
               git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
               ```

            2. Install dependencies:
               ```bash
               conda create -n enzyme_benchmark python=3.10 -y
               conda activate enzyme_benchmark
               pip install -r requirements.txt
               ```

            3. Configure your API key in `.env`

            4. Run the benchmark:
               ```bash
               python scripts/run_benchmark.py --mode full
               ```

            5. Submit your results through this leaderboard!

            ### Citation
            If you use this benchmark, please cite our repository.
            """)

    # Footer. The date is formatted once, when this module is imported.
    gr.Markdown(
        """
        ---
        **[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)**
        | **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)**
        | **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**

        *Last updated: {}*
        """.format(datetime.now().strftime("%Y-%m-%d"))
    )


# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()