"""Utility functions for leaderboard"""
import pandas as pd
from pathlib import Path
from typing import Dict, List, Optional
import json
# Column layout returned when no submission could be loaded.
_LEADERBOARD_COLUMNS = [
    'submission_id', 'model_name', 'model_provider', 'ocr_type',
    'submitter', 'submission_date', 'km_exact_match', 'km_tolerance_match',
    'kcat_exact_match', 'kcat_tolerance_match', 'km_kcat_exact_match',
    'km_kcat_tolerance_match', 'overall_exact_match', 'overall_tolerance_match',
    'total_papers', 'total_entries', 'notes', 'verified',
]


def _empty_leaderboard() -> pd.DataFrame:
    """Return an empty DataFrame carrying the default leaderboard columns."""
    return pd.DataFrame(columns=_LEADERBOARD_COLUMNS)


def load_leaderboard_data(data_dir: str = "leaderboard/data") -> pd.DataFrame:
    """
    Load all leaderboard data from JSON files

    Every ``*.json`` file directly inside ``data_dir`` is read as one
    submission. Files that cannot be read or parsed, or whose top-level value
    is not a JSON object, are reported on stdout and skipped so that a single
    bad file does not hide the rest of the leaderboard.

    Args:
        data_dir: Directory containing submission JSON files

    Returns:
        DataFrame with all submissions, sorted by ``overall_exact_match`` in
        descending order when that column is present. If the directory does
        not exist or yields no usable submission, an empty DataFrame with the
        default leaderboard columns is returned.
    """
    data_path = Path(data_dir)
    if not data_path.exists():
        return _empty_leaderboard()

    all_data = []
    # glob() order is unspecified; sorting keeps the load order reproducible.
    for json_file in sorted(data_path.glob("*.json")):
        try:
            # JSON is UTF-8 by specification; do not depend on the locale default.
            with open(json_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (OSError, ValueError, RecursionError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {json_file}: {e}")
            continue
        if not isinstance(data, dict):
            # A list or scalar payload would corrupt the row layout of the frame.
            print(
                f"Error loading {json_file}: expected a JSON object, "
                f"got {type(data).__name__}"
            )
            continue
        all_data.append(data)

    if not all_data:
        return _empty_leaderboard()

    df = pd.DataFrame(all_data)

    # Convert date strings to datetime
    if 'submission_date' in df.columns:
        df['submission_date'] = pd.to_datetime(df['submission_date'])

    # Submissions are free-form dicts, so the ranking column may be absent.
    if 'overall_exact_match' not in df.columns:
        return df

    # A stable sort keeps tied scores in (sorted) file order.
    return df.sort_values('overall_exact_match', ascending=False, kind='stable')
def format_metrics(value: float, as_percentage: bool = True) -> str:
    """Render a metric as display text.

    With ``as_percentage`` set, the value is scaled by 100 and shown with two
    decimals followed by a percent sign; otherwise the raw value is shown with
    four decimals.
    """
    if not as_percentage:
        return format(value, ".4f")
    scaled = value * 100
    return format(scaled, ".2f") + "%"
def get_leaderboard_summary(df: pd.DataFrame) -> Dict:
    """Get summary statistics from leaderboard

    Args:
        df: Leaderboard frame. When non-empty it must contain the
            ``model_name`` and ``overall_exact_match`` columns; ``verified``
            is optional.

    Returns:
        Dict with the keys ``total_submissions``, ``unique_models``,
        ``best_score``, ``avg_score`` and ``verified_submissions``. The two
        scores are ``overall_exact_match`` scaled by 100. All values are
        built-in ``int``/``float`` objects. The same keys are returned for an
        empty frame, with every value zero.
    """
    if df.empty:
        return {
            'total_submissions': 0,
            'unique_models': 0,
            'best_score': 0.0,
            'avg_score': 0.0,
            # Present in both branches so callers can index it unconditionally.
            'verified_submissions': 0,
        }

    scores = df['overall_exact_match']

    if 'verified' in df.columns:
        # Count only values equal to True, the same rule filter_leaderboard
        # applies, so missing flags are never counted.
        verified_count = int(df['verified'].eq(True).sum())
    else:
        verified_count = 0

    # Coerce numpy scalars to built-in numbers so the summary can be serialised.
    return {
        'total_submissions': len(df),
        'unique_models': int(df['model_name'].nunique()),
        'best_score': float(scores.max() * 100),
        'avg_score': float(scores.mean() * 100),
        'verified_submissions': verified_count,
    }
def filter_leaderboard(
    df: pd.DataFrame,
    model_provider: Optional[str] = None,
    ocr_type: Optional[str] = None,
    verified_only: bool = False
) -> pd.DataFrame:
    """Return a narrowed copy of the leaderboard.

    A column filter is applied only when its value is non-empty and is not the
    literal "All". ``verified_only`` keeps rows whose ``verified`` value equals
    True and is ignored when the frame has no ``verified`` column. The input
    frame is never modified.
    """
    result = df.copy()

    # Equality filters are applied in the order provider, then OCR type.
    column_filters = (
        ('model_provider', model_provider),
        ('ocr_type', ocr_type),
    )
    for column, wanted in column_filters:
        if not wanted or wanted == "All":
            continue
        result = result[result[column] == wanted]

    if verified_only and 'verified' in result.columns:
        keep = result['verified'] == True
        result = result[keep]

    return result
def get_top_n(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """Return the first ``n`` rows of ``df`` in their existing order.

    No sorting happens here, so the frame must already be ranked for the
    result to be the "top" submissions.
    """
    leading_rows = df.head(n=n)
    return leading_rows
def create_comparison_data(df: pd.DataFrame) -> Dict:
    """Build the aggregates behind the comparison charts.

    Returns an empty dict for an empty frame. Otherwise the result has two
    entries, ``by_provider`` and ``by_ocr``, each the ``to_dict()`` form of a
    grouped aggregation rounded to four decimals. Their keys are
    ``(column, statistic)`` tuples that map to ``{group: value}`` dicts.
    """
    if df.empty:
        return {}

    comparison = {}

    # Per model provider: exact-match mean/max/count plus mean tolerance match.
    provider_spec = {
        'overall_exact_match': ['mean', 'max', 'count'],
        'overall_tolerance_match': 'mean',
    }
    per_provider = df.groupby('model_provider').agg(provider_spec)
    comparison['by_provider'] = per_provider.round(4).to_dict()

    # Per OCR type: exact-match mean/max/count only.
    ocr_spec = {'overall_exact_match': ['mean', 'max', 'count']}
    per_ocr = df.groupby('ocr_type').agg(ocr_spec)
    comparison['by_ocr'] = per_ocr.round(4).to_dict()

    return comparison
|