import os
import json
import glob
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import gradio as gr
from typing import Dict, List, Any, Tuple, Optional
from huggingface_hub import snapshot_download
from pathlib import Path

# Benchmarks shown on the main leaderboard.
#
# Each key is a task name looked up in the "results" section of a results
# JSON file. For every task:
#   "metric" - name of the score read for that task,
#   "name"   - column title used in the tables and charts,
#   "scale"  - [min, max] range used to normalise the score on the radar chart.
MAIN_BENCHMARKS = {
    "belebele_ukr_Cyrl": {
        "metric": "acc",
        "name": "Belebele Ukrainian",
        "scale": [0, 1],
    },
    "global_mmlu_full_uk": {
        "metric": "acc",
        "name": "MMLU Ukrainian",
        "scale": [0, 1],
    },
    # load_results overwrites flores_uk / long_flores_uk with the mean of the
    # EN->UK and UK->EN scores whenever both directions are present.
    "flores_uk": {
        "metric": "bleu",
        "name": "FLORES Ukrainian",
        "scale": [0, 40],
    },
    "long_flores_uk": {
        "metric": "bleu",
        "name": "Long FLORES Ukrainian",
        "scale": [0, 40],
    },
    "squad_uk": {
        "metric": "f1",
        "name": "SQuAD Ukrainian",
        "scale": [0, 100],
    },
    "xlsum_uk": {
        "metric": "bleu",
        "name": "XLSum Ukrainian",
        "scale": [0, 30],
    },
    "triviaqa_uk": {
        "metric": "exact_match",
        "name": "TriviaQA Ukrainian",
        "scale": [0, 1],
    },
    "arc_challenge_uk": {
        "metric": "exact_match",
        "name": "ARC Challenge Ukrainian",
        "scale": [0, 1],
    },
    "arc_easy_uk": {
        "metric": "acc",
        "name": "ARC Easy Ukrainian",
        "scale": [0, 1],
    },
    "winogrande_uk": {
        "metric": "acc",
        "name": "Winogrande Ukrainian",
        "scale": [0, 1],
    },
    "gsm8k_uk": {
        "metric": "exact_match",
        "name": "GSM8K Ukrainian",
        "scale": [0, 1],
    },
    "ifeval_uk": {
        "metric": "prompt_level_strict_acc",
        "name": "IFEval Ukrainian",
        "scale": [0, 1],
    },
    "wmt_en_uk": {
        "metric": "bleu",
        "name": "WMT EN→UK",
        "scale": [0, 40],
    },
    "zno_uk_geography": {
        "metric": "exact",
        "name": "ZNO Geography",
        "scale": [0, 1],
    },
    "zno_uk_history": {
        "metric": "exact",
        "name": "ZNO History",
        "scale": [0, 1],
    },
    "zno_uk_language_and_literature": {
        "metric": "exact",
        "name": "ZNO Language & Literature",
        "scale": [0, 1],
    },
    "zno_uk_math": {
        "metric": "exact",
        "name": "ZNO Math",
        "scale": [0, 1],
    },
}

# MMLU subcategories are intentionally not tracked: only the aggregate score
# (global_mmlu_full_uk above) is used, so this group stays empty.
MMLU_BENCHMARKS = {}

# FLORES translation directions - English<->Ukrainian pairs only.
FLORES_BENCHMARKS = {
    "flores_en-uk": {
        "metric": "bleu",
        "name": "FLORES EN→UK",
        "scale": [0, 40],
    },
    "flores_uk-en": {
        "metric": "bleu",
        "name": "FLORES UK→EN",
        "scale": [0, 40],
    },
}

# Long FLORES translation directions - English<->Ukrainian pairs only.
LONG_FLORES_BENCHMARKS = {
    "long_flores_en-uk": {
        "metric": "bleu",
        "name": "Long FLORES EN→UK",
        "scale": [0, 40],
    },
    "long_flores_uk-en": {
        "metric": "bleu",
        "name": "Long FLORES UK→EN",
        "scale": [0, 40],
    },
}

# Every benchmark from all groups, in group order, for the detailed view.
ALL_BENCHMARKS = {
    task: config
    for group in (
        MAIN_BENCHMARKS,
        MMLU_BENCHMARKS,
        FLORES_BENCHMARKS,
        LONG_FLORES_BENCHMARKS,
    )
    for task, config in group.items()
}


def extract_model_name(file_path: str) -> str:
    """Derive a model name from the location of a results file.

    Results are expected at ``<root>/<model_name>/results_*.json``, so the name
    of the directory that contains the file is used as the model name.

    Args:
        file_path: Path to a results JSON file.

    Returns:
        The name of the file's parent directory. When the file has no named
        parent (a bare file name, or a file directly under the filesystem
        root), the file name with ``results_`` and ``.json`` removed.
    """
    # os.path understands every separator valid on the current platform,
    # unlike a plain split on os.sep.
    parent_name = os.path.basename(os.path.dirname(file_path))
    if parent_name:
        return parent_name
    return os.path.basename(file_path).replace("results_", "").replace(".json", "")


def download_benchmark_dataset(
    repo_id: str = "lang-uk/ukrainian-llm-leaderboard-results",
    local_dir: str = "./eval-results",
    token: Optional[str] = None,
    force_download: bool = False,
) -> str:
    """
    Make sure the benchmark results are available locally, fetching them from
    the Hugging Face Hub when the local directory is missing or empty.

    Args:
        repo_id: Hugging Face dataset repository that holds the results
        local_dir: Directory the repository snapshot is written to
        token: Hugging Face access token (only needed for private repos)
        force_download: Download even when local_dir already contains files

    Returns:
        The directory to read results from: the existing directory, the path
        reported by the download, or local_dir unchanged if the download failed
    """
    target = Path(local_dir)

    # A populated directory is reused as-is unless a download is forced.
    if force_download:
        reuse_existing = False
    else:
        reuse_existing = target.exists() and any(target.iterdir())

    if reuse_existing:
        print(
            f"Directory {local_dir} already exists and contains files. Using existing data."
        )
        return str(target)

    try:
        target.mkdir(parents=True, exist_ok=True)

        print(f"Downloading benchmark dataset from {repo_id} to {local_dir}...")

        # Fetch a snapshot of the whole dataset repository.
        snapshot_path = snapshot_download(
            repo_id=repo_id,
            local_dir=local_dir,
            token=token,
            repo_type="dataset",
        )

        print(f"Dataset downloaded successfully to: {snapshot_path}")
    except Exception as err:
        # Best effort: report the failure and let the caller work with
        # whatever is (or is not) in the local directory.
        print(f"Error downloading dataset: {err}")
        print(f"Using local directory: {local_dir}")
        return local_dir

    return snapshot_path


# Filter suffixes a metric key may carry in a results file ("<metric>,<filter>"),
# in lookup priority order. The bare metric name is tried last.
_METRIC_FILTER_SUFFIXES = ("none", "remove_whitespace", "flexible-extract", "strict-match")


def _majority_n_shot(n_shot: Dict[str, Any]) -> int:
    """Return the most common few-shot count among a file's tasks (0 if none)."""
    shots = list(n_shot.values())
    if not shots:
        return 0
    # Walking the distinct counts in ascending order makes a tie resolve to
    # the smallest count.
    return max(sorted(set(shots)), key=shots.count)


def _find_metric_key(task_result: Dict[str, Any], metric: str) -> Optional[str]:
    """Return the key under which `metric` is stored in a task result, if any."""
    candidates = [f"{metric},{suffix}" for suffix in _METRIC_FILTER_SUFFIXES]
    candidates.append(metric)
    return next((key for key in candidates if key in task_result), None)


def load_results(results_dir: str = "eval-results") -> List[Dict[str, Any]]:
    """Load all results from JSON files in the results directory.

    Result files are searched recursively under ``<results_dir>/aggregated``
    (pattern ``results*.json``). If ``results_dir`` is missing or empty, a
    download of the dataset is attempted first.

    Each returned dict holds ``"model_name"`` (with an ``" (N-shot)"`` suffix,
    where N is the most common few-shot count in the file), ``"N-Shot"`` and
    one entry per benchmark from ALL_BENCHMARKS found in the file. When both
    translation directions are available, ``flores_uk`` / ``long_flores_uk``
    are set to their mean.

    A file that cannot be read or parsed is reported and skipped, and a model
    whose suffixed name was already loaded is skipped as a duplicate.

    Args:
        results_dir: Root directory of the results dataset.

    Returns:
        One dict per loaded model, or a single placeholder entry containing
        only ``"model_name"`` when no result files exist.
    """
    # Check if directory exists and has content, download if not
    if not os.path.exists(results_dir) or not os.listdir(results_dir):
        print(
            f"Results directory {results_dir} not found or empty. Attempting to download..."
        )
        results_dir = download_benchmark_dataset(local_dir=results_dir)

    results_dir = os.path.join(results_dir, "aggregated")

    # glob order depends on the filesystem; sorting keeps row order and the
    # choice of which duplicate survives the same on every run.
    pattern = os.path.join(results_dir, "**", "results*.json")
    result_files = sorted(glob.glob(pattern, recursive=True))

    if not result_files:
        print(f"No result files found in {results_dir}")
        # Placeholder row so the UI has something to show
        return [{"model_name": "No models found - Add files to eval-results directory"}]

    all_results = []
    seen_names = set()

    for file_path in result_files:
        try:
            with open(file_path, "r", encoding="utf-8") as f:
                data = json.load(f)

            n_shot = _majority_n_shot(data.get("n-shot") or {})
            base_name = data.get("model_name", extract_model_name(file_path))
            model_name = f"{base_name} ({n_shot}-shot)"

            # Compare the final display name: the same model evaluated with a
            # different shot count is a distinct leaderboard entry.
            if model_name in seen_names:
                print(f"Duplicate model name found: {model_name}. Skipping.")
                continue

            model_results = {"model_name": model_name, "N-Shot": n_shot}

            task_results = data.get("results", {})
            for benchmark, config in ALL_BENCHMARKS.items():
                if benchmark not in task_results:
                    continue
                task_result = task_results[benchmark]
                metric_key = _find_metric_key(task_result, config["metric"])
                if metric_key is not None:
                    model_results[benchmark] = task_result[metric_key]

            # Calculate mean of EN↔UK pairs for flores_uk
            if "flores_en-uk" in model_results and "flores_uk-en" in model_results:
                model_results["flores_uk"] = (
                    model_results["flores_en-uk"] + model_results["flores_uk-en"]
                ) / 2

            # Calculate mean of EN↔UK pairs for long_flores_uk
            if (
                "long_flores_en-uk" in model_results
                and "long_flores_uk-en" in model_results
            ):
                model_results["long_flores_uk"] = (
                    model_results["long_flores_en-uk"]
                    + model_results["long_flores_uk-en"]
                ) / 2

        except Exception as e:
            # Per-file boundary: one unreadable or malformed file must not
            # take the whole leaderboard down.
            print(f"Error loading {file_path}: {e}")
            continue

        seen_names.add(model_name)
        all_results.append(model_results)

    return all_results


def create_dataframe(
    results: List[Dict[str, Any]],
    benchmark_set: Dict[str, Dict],
    normalize_scores: bool = True,
) -> pd.DataFrame:
    """Build a display table with one row per model and one column per benchmark.

    Only benchmarks from `benchmark_set` that occur in `results` become
    columns; they are titled with the benchmark's configured "name", and the
    model column is titled "Model". With `normalize_scores`, a fixed set of
    accuracy-style columns is converted to percentages when the column mean is
    below 1, and every score column is rounded to two decimals.
    """
    if not results:
        return pd.DataFrame(columns=["Model"])

    raw = pd.DataFrame(results)

    # Benchmarks of this set that at least one model reported, in set order.
    present = [key for key in benchmark_set if key in raw.columns]
    selected = [col for col in ["model_name", *present] if col in raw.columns]
    titles = {
        "model_name": "Model",
        **{key: benchmark_set[key]["name"] for key in present},
    }

    # Nothing but the model name: return a one-column table.
    if len(selected) <= 1:
        return pd.DataFrame({"Model": raw["model_name"]})

    table = raw[selected].rename(columns=titles)
    if not normalize_scores:
        return table

    # Columns shown as percentages (in addition to any title starting with
    # "MMLU") when their values still look like fractions.
    percent_titles = (
        "Belebele Ukrainian",
        "SQuAD Ukrainian",
        "ARC Easy Ukrainian",
        "Winogrande Ukrainian",
        "IFEval Ukrainian",
    )
    for title in table.columns:
        if title == "Model":
            continue
        scores = table[title]
        percent_candidate = title.startswith("MMLU") or title in percent_titles
        if percent_candidate and scores.mean() < 1:
            scores = scores * 100
        table[title] = scores.round(2)

    return table


def calculate_average_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Add an "Average Rank" column and sort the table by it.

    Every column except "Model" (and an already present "Average Rank") is
    treated as a metric where higher is better. Models are ranked per metric
    (ties share the lowest rank) and the ranks are averaged per model, ignoring
    metrics the model has no value for.

    Args:
        df: Table with a "Model" column and one column per metric.

    Returns:
        A new DataFrame sorted by "Average Rank" ascending (best first). The
        input is never modified; it is returned as-is when it has no metric
        columns.
    """
    if len(df.columns) <= 1:  # Only Model column
        return df

    metric_columns = [
        col for col in df.columns if col not in ("Model", "Average Rank")
    ]
    if not metric_columns:
        return df

    # Rank on a separate frame so no helper columns are written to the input.
    ranks = df[metric_columns].rank(ascending=False, method="min")

    ranked = df.copy()
    ranked["Average Rank"] = ranks.mean(axis=1).round(2)

    # Lower average rank is better
    return ranked.sort_values("Average Rank", ascending=True)


def create_relative_scores(df: pd.DataFrame) -> pd.DataFrame:
    """Express each score as a percentage of the best score in its column.

    "Model" and "Average Rank" are left untouched, as is any column whose
    maximum is not greater than zero. The input DataFrame is not modified.
    """
    relative = df.copy()
    score_columns = [
        name for name in relative.columns if name not in ("Model", "Average Rank")
    ]

    for name in score_columns:
        best = relative[name].max()
        if not best > 0:
            # Nothing sensible to scale against (all zero, negative or missing).
            continue
        relative[name] = (relative[name] / best * 100).round(2)

    return relative


def create_radar_chart(
    df: pd.DataFrame, selected_models: List[str]
) -> Optional[plt.Figure]:
    """Create a radar chart comparing multiple models.

    Every column of `df` except "Model" and "Average Rank" becomes one axis.
    Scores are mapped to 0-100 using the benchmark's configured "scale" (or,
    for columns without a configured benchmark, 0 to 110% of the column
    maximum) and clipped to that range.

    Args:
        df: Table with a "Model" column and one column per metric.
        selected_models: Names (values of the "Model" column) to draw.

    Returns:
        The matplotlib figure, or None when no model is selected or `df` has
        no metric columns.
    """
    if not selected_models:
        return None

    # Get metric columns (excluding Model and Average Rank)
    metric_columns = [
        col for col in df.columns if col not in ("Model", "Average Rank")
    ]
    if not metric_columns:
        return None

    # One row per model: a repeated model name would otherwise yield more
    # values than there are axes.
    plot_df = df[df["Model"].isin(selected_models)].drop_duplicates(subset=["Model"])

    # Compute angle for each axis; repeat the first one to close the polygon
    angles = np.linspace(0, 2 * np.pi, len(metric_columns), endpoint=False).tolist()
    angles += angles[:1]

    fig, ax = plt.subplots(figsize=(10, 10), subplot_kw=dict(projection="polar"))

    # Column title -> configured [min, max] scale
    configured_scales = {
        config["name"]: config["scale"] for config in ALL_BENCHMARKS.values()
    }

    normalized_df = plot_df.copy()
    for col in metric_columns:
        observed_max = df[col].max()
        if col in configured_scales:
            low, high = configured_scales[col]
            # create_dataframe turns some [0, 1] metrics into percentages;
            # normalising those against [0, 1] would pin the axis at 100%.
            if high <= 1 and observed_max > high:
                low, high = low * 100, high * 100
        else:
            # Fallback to data-based scale
            low, high = 0, observed_max * 1.1

        if not high > low:
            # Zero-width or undefined range (all-zero / all-missing column)
            high = low + 1

        normalized_df[col] = ((plot_df[col] - low) / (high - low) * 100).clip(0, 100)

    # Plot each model
    for model in selected_models:
        model_rows = normalized_df[normalized_df["Model"] == model]
        if model_rows.empty:
            continue

        values = model_rows[metric_columns].iloc[0].tolist()
        values += values[:1]  # Complete the circle

        ax.plot(angles, values, "o-", linewidth=2, label=model)
        ax.fill(angles, values, alpha=0.15)

    # Set scale to 0-100
    ax.set_ylim(0, 100)

    # One tick per metric, labelled with the metric name
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(metric_columns, size=8)

    # Add concentric circles for reference
    ax.set_yticks([20, 40, 60, 80, 100])
    ax.set_yticklabels(["20%", "40%", "60%", "80%", "100%"], size=6)
    ax.grid(True)

    ax.legend(loc="upper right", bbox_to_anchor=(1.3, 1.1))

    # Address this figure/axes explicitly instead of pyplot's global
    # "current figure", which another request may have changed.
    ax.set_title("Model Comparison (Normalized to Scale)", size=14, y=1.08)
    fig.tight_layout()

    return fig


def create_bar_chart(df: pd.DataFrame, metric: str) -> Optional[plt.Figure]:
    """Create a horizontal bar chart of every model's value for one metric.

    Args:
        df: Table with a "Model" column and one column per metric.
        metric: Title of the column to plot.

    Returns:
        The matplotlib figure, or None when `metric` is not a column of `df`
        or is the "Model" column itself.
    """
    if metric not in df.columns or metric == "Model":
        return None

    # Grow the figure with the number of models so labels stay readable
    fig, ax = plt.subplots(figsize=(10, max(6, len(df) * 0.3)))

    # Ascending sort puts the largest value at the top of the chart
    plot_df = df.sort_values(metric, ascending=True)

    bars = ax.barh(plot_df["Model"], plot_df[metric])

    # Color bars with a gradient
    colors = plt.cm.viridis(np.linspace(0.3, 0.9, len(plot_df)))
    for bar, color in zip(bars, colors):
        bar.set_color(color)

    # TODO: proper fix for scales - the x-axis is auto-scaled for now; fixing
    # it to the benchmark's configured "scale" was disabled in the original.

    ax.set_xlabel(metric)
    ax.set_title(f"Model Performance on {metric}")
    ax.grid(axis="x", alpha=0.3)

    fig.tight_layout()
    return fig


_RANK_COLUMN = "Average Rank"


def _build_leaderboard_tables(
    results: List[Dict[str, Any]],
) -> Dict[str, pd.DataFrame]:
    """Build every ranked table the UI shows from one set of loaded results."""
    benchmark_sets = {
        "main": MAIN_BENCHMARKS,
        "detailed": ALL_BENCHMARKS,
        # MMLU: only the aggregate score, no subcategories
        "mmlu": {"global_mmlu_full_uk": MAIN_BENCHMARKS["global_mmlu_full_uk"]},
        # FLORES / Long FLORES: mean score plus the en-uk and uk-en directions
        "flores": {"flores_uk": MAIN_BENCHMARKS["flores_uk"], **FLORES_BENCHMARKS},
        "long_flores": {
            "long_flores_uk": MAIN_BENCHMARKS["long_flores_uk"],
            **LONG_FLORES_BENCHMARKS,
        },
    }
    return {
        key: calculate_average_rank(create_dataframe(results, benchmarks))
        for key, benchmarks in benchmark_sets.items()
    }


def _sorted_by_rank(df: pd.DataFrame) -> pd.DataFrame:
    """Sort by average rank when the table has one (it has none without scores)."""
    if _RANK_COLUMN in df.columns:
        return df.sort_values(_RANK_COLUMN, ascending=True)
    return df


def _load_readme_footer(path: str = "README.md") -> str:
    """Return the README text from its second "---" on, or "" if unreadable."""
    try:
        with open(path, "r", encoding="utf-8") as f:
            content = f.read()
    except OSError as e:
        print(f"Could not read {path}: {e}")
        return ""

    # Searching from index 5 skips a "---" at the very start of the file
    # (presumably the opening line of a front-matter block - verify).
    closing = content.find("---", 5)
    # find() returns -1 when there is no such marker; slicing from -1 would
    # keep only the last character, so show the whole file instead.
    return content[closing:] if closing != -1 else content


def create_leaderboard_app() -> gr.Blocks:
    """Create the Gradio interface for the leaderboard.

    Loads the results, builds the ranked tables and wires up four tabs (main
    leaderboard, detailed benchmarks, model comparison, visualizations),
    followed by a footer taken from README.md.

    Returns:
        The assembled (not yet launched) Gradio Blocks app.
    """
    # Current tables, keyed by view. Kept in a dict so the refresh handler
    # can replace them for all other handlers at once.
    tables = _build_leaderboard_tables(load_results("eval-results"))
    main_df = tables["main"]
    detailed_df = tables["detailed"]

    # Radio label -> key in `tables`
    category_tables = {
        "All Benchmarks": "detailed",
        "MMLU (Aggregate Only)": "mmlu",
        "FLORES Translation Pairs (EN↔UK)": "flores",
        "Long FLORES Translation Pairs (EN↔UK)": "long_flores",
    }

    # Create Gradio interface
    with gr.Blocks(
        title="Ukrainian Language Model Leaderboard",
        theme=gr.themes.Soft(),
    ) as app:
        gr.Markdown(
            """
        # 🇺🇦 Ukrainian Language Model Leaderboard
        
        Welcome to the Ukrainian Language Model Leaderboard! This dashboard displays performance metrics 
        for various language models on Ukrainian language benchmarks.
        """
        )

        # Main Leaderboard Tab
        with gr.Tab("📊 Main Leaderboard"):
            gr.Markdown(
                """
            ## Main Performance Metrics
            
            This table shows the core benchmarks for evaluating Ukrainian language understanding and generation.
            Models are ranked by their average performance across all metrics.
            
            **Note:** For FLORES benchmarks, only English↔Ukrainian pairs are shown. For MMLU, only the aggregate score is displayed.
            """
            )

            with gr.Row():
                with gr.Column(scale=4):
                    main_table = gr.Dataframe(
                        value=_sorted_by_rank(main_df),
                        label="Leaderboard",
                        interactive=False,
                        wrap=False,
                    )

                with gr.Column(scale=1):
                    gr.Markdown("### Display Options")
                    main_sort_by = gr.Dropdown(
                        choices=main_df.columns.tolist(),
                        # Only preselect the rank column when it exists
                        value=(
                            _RANK_COLUMN if _RANK_COLUMN in main_df.columns else None
                        ),
                        label="Sort by",
                    )
                    main_sort_asc = gr.Checkbox(value=True, label="Ascending Order")
                    main_show_relative = gr.Checkbox(
                        value=False, label="Show Relative Scores (%)"
                    )
                    refresh_btn = gr.Button("🔄 Refresh Data")

        # Detailed Benchmarks Tab
        with gr.Tab("📈 Detailed Benchmarks"):
            gr.Markdown(
                """
            ## Detailed Performance Breakdown
            
            Explore performance on specific benchmark categories.
            
            **Note:** MMLU shows only the aggregate score. FLORES and Long FLORES show only English↔Ukrainian translation pairs.
            """
            )

            with gr.Row():
                benchmark_category = gr.Radio(
                    choices=list(category_tables),
                    value="All Benchmarks",
                    label="Select Category",
                )
                remove_dupes = gr.Checkbox(value=False, label="Remove Duplicate Models")

            detailed_table = gr.Dataframe(
                value=_sorted_by_rank(detailed_df),
                label="Detailed Results",
                interactive=False,
                wrap=False,
            )

            with gr.Row():
                detailed_sort_by = gr.Dropdown(
                    choices=detailed_df.columns.tolist(),
                    value=(
                        _RANK_COLUMN if _RANK_COLUMN in detailed_df.columns else None
                    ),
                    label="Sort by",
                )
                detailed_sort_asc = gr.Checkbox(value=True, label="Ascending Order")

        # Model Comparison Tab
        with gr.Tab("🔍 Model Comparison"):
            gr.Markdown(
                """
            ## Compare Models
            
            Select multiple models to compare their performance across all benchmarks using a radar chart.
            """
            )

            with gr.Row():
                model_selector = gr.Dropdown(
                    choices=(
                        main_df["Model"].tolist() if "Model" in main_df.columns else []
                    ),
                    multiselect=True,
                    label="Select Models to Compare",
                )
                compare_btn = gr.Button("Generate Comparison")

            with gr.Row():
                comparison_chart = gr.Plot(label="Radar Chart Comparison")

            comparison_table = gr.Dataframe(
                label="Comparison Table",
                interactive=False,
                wrap=False,
            )

        # Visualizations Tab
        with gr.Tab("📊 Visualizations"):
            with gr.Row():
                with gr.Column():
                    metric_selector = gr.Dropdown(
                        choices=[col for col in main_df.columns if col != "Model"],
                        value=main_df.columns[1] if len(main_df.columns) > 1 else None,
                        label="Select Metric to Visualize",
                    )
                    viz_btn = gr.Button("Generate Visualization")

            with gr.Row():
                metric_plot = gr.Plot(label="Benchmark Performance")

            gr.Markdown(
                """
            ## Visualizations
            
            Select a metric to generate a bar chart showing the performance of all models on that specific benchmark.
            """
            )

        # Update functions
        def update_main_table(sort_by, sort_asc, show_relative):
            df = tables["main"].copy()

            if show_relative:
                df = create_relative_scores(df)

            if sort_by in df.columns:
                df = df.sort_values(sort_by, ascending=sort_asc)

            return df

        def update_detailed_table(category, sort_by, sort_asc, remove_duplicates):
            # Unknown categories fall back to the full table
            df = tables[category_tables.get(category, "detailed")].copy()

            if remove_duplicates and "Model" in df.columns:
                # Keep only the first occurrence of each model
                df = df.drop_duplicates(subset=["Model"])

            if sort_by in df.columns:
                df = df.sort_values(sort_by, ascending=sort_asc)

            return df

        def update_comparison(models):
            if not models:
                return None, pd.DataFrame()

            current = tables["main"]
            chart = create_radar_chart(current, models)
            table = current[current["Model"].isin(models)].copy()

            return chart, table

        def update_visualization(metric):
            if not metric:
                return None

            return create_bar_chart(tables["main"], metric)

        def refresh_data():
            # Reload from disk and rebuild every table
            tables.update(_build_leaderboard_tables(load_results()))

            current = tables["main"]
            model_list = current["Model"].tolist() if "Model" in current.columns else []

            # gr.update is the version-independent way to change component
            # properties; the per-component Dropdown.update was removed in
            # Gradio 4.
            return _sorted_by_rank(current), gr.update(choices=model_list)

        # Connect event handlers: any change to a display option re-renders
        # the corresponding table.
        main_inputs = [main_sort_by, main_sort_asc, main_show_relative]
        for control in main_inputs:
            control.change(
                fn=update_main_table,
                inputs=main_inputs,
                outputs=main_table,
            )

        refresh_btn.click(
            fn=refresh_data,
            inputs=[],
            outputs=[main_table, model_selector],
        )

        detailed_inputs = [
            benchmark_category,
            detailed_sort_by,
            detailed_sort_asc,
            remove_dupes,
        ]
        for control in detailed_inputs:
            control.change(
                fn=update_detailed_table,
                inputs=detailed_inputs,
                outputs=detailed_table,
            )

        compare_btn.click(
            fn=update_comparison,
            inputs=[model_selector],
            outputs=[comparison_chart, comparison_table],
        )

        viz_btn.click(
            fn=update_visualization, inputs=[metric_selector], outputs=[metric_plot]
        )

        # Footer
        footer = _load_readme_footer()
        if footer:
            gr.Markdown(footer)

    return app


# Script entry point: build the leaderboard UI and start the Gradio server.
# Nothing is built or launched when this module is merely imported.
if __name__ == "__main__":
    app = create_leaderboard_app()
    app.launch()