github-actions[bot] committed on
Commit
a924780
·
1 Parent(s): bbffc2c

Update leaderboard from GitHub main branch

Browse files
Files changed (5) hide show
  1. .gitattributes +0 -35
  2. README.md +34 -5
  3. app.py +474 -0
  4. requirements.txt +4 -0
  5. utils.py +127 -0
.gitattributes DELETED
@@ -1,35 +0,0 @@
1
- *.7z filter=lfs diff=lfs merge=lfs -text
2
- *.arrow filter=lfs diff=lfs merge=lfs -text
3
- *.bin filter=lfs diff=lfs merge=lfs -text
4
- *.bz2 filter=lfs diff=lfs merge=lfs -text
5
- *.ckpt filter=lfs diff=lfs merge=lfs -text
6
- *.ftz filter=lfs diff=lfs merge=lfs -text
7
- *.gz filter=lfs diff=lfs merge=lfs -text
8
- *.h5 filter=lfs diff=lfs merge=lfs -text
9
- *.joblib filter=lfs diff=lfs merge=lfs -text
10
- *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
- *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
- *.model filter=lfs diff=lfs merge=lfs -text
13
- *.msgpack filter=lfs diff=lfs merge=lfs -text
14
- *.npy filter=lfs diff=lfs merge=lfs -text
15
- *.npz filter=lfs diff=lfs merge=lfs -text
16
- *.onnx filter=lfs diff=lfs merge=lfs -text
17
- *.ot filter=lfs diff=lfs merge=lfs -text
18
- *.parquet filter=lfs diff=lfs merge=lfs -text
19
- *.pb filter=lfs diff=lfs merge=lfs -text
20
- *.pickle filter=lfs diff=lfs merge=lfs -text
21
- *.pkl filter=lfs diff=lfs merge=lfs -text
22
- *.pt filter=lfs diff=lfs merge=lfs -text
23
- *.pth filter=lfs diff=lfs merge=lfs -text
24
- *.rar filter=lfs diff=lfs merge=lfs -text
25
- *.safetensors filter=lfs diff=lfs merge=lfs -text
26
- saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
- *.tar.* filter=lfs diff=lfs merge=lfs -text
28
- *.tar filter=lfs diff=lfs merge=lfs -text
29
- *.tflite filter=lfs diff=lfs merge=lfs -text
30
- *.tgz filter=lfs diff=lfs merge=lfs -text
31
- *.wasm filter=lfs diff=lfs merge=lfs -text
32
- *.xz filter=lfs diff=lfs merge=lfs -text
33
- *.zip filter=lfs diff=lfs merge=lfs -text
34
- *.zst filter=lfs diff=lfs merge=lfs -text
35
- *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
README.md CHANGED
@@ -1,12 +1,41 @@
1
  ---
2
- title: Llm Enzyme Kinetics Leaderboard
3
- emoji: 📈
4
  colorFrom: blue
5
- colorTo: red
6
  sdk: gradio
7
- sdk_version: 6.5.1
8
  app_file: app.py
9
  pinned: false
 
10
  ---
11
 
12
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ title: LLM Enzyme Kinetics Benchmark Leaderboard
3
+ emoji: 🧪
4
  colorFrom: blue
5
+ colorTo: purple
6
  sdk: gradio
7
+ sdk_version: 4.0.0
8
  app_file: app.py
9
  pinned: false
10
+ license: mit
11
  ---
12
 
13
+ # LLM Enzyme Kinetics Extraction Benchmark Leaderboard
14
+
15
+ Interactive leaderboard for comparing LLM performance on enzyme kinetics extraction from scientific literature.
16
+
17
+ ## 🏆 Features
18
+
19
+ - Live leaderboard with real-time rankings
20
+ - Interactive filters (model provider, OCR type)
21
+ - Performance visualizations
22
+ - Result submission system
23
+ - Timeline tracking
24
+
25
+ ## 📊 Benchmark Info
26
+
27
+ - **Papers**: 156 peer-reviewed publications
28
+ - **Entries**: 4,244 enzyme kinetic entries
29
+ - **Parameters**: Km, kcat, kcat/Km
30
+ - **OCR Types**: Mathpix, Kimi, PyMuPDF
31
+
32
+ ## 🚀 How to Participate
33
+
34
+ 1. Clone the main repository
35
+ 2. Run the benchmark: `python scripts/run_benchmark.py --mode full`
36
+ 3. Submit your results through this leaderboard!
37
+
38
+ ## 📚 Documentation
39
+
40
+ - [Full Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)
41
+ - [Usage Guide](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)
app.py ADDED
@@ -0,0 +1,474 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ LLM Enzyme Kinetics Extraction Benchmark Leaderboard
3
+ Built with Gradio
4
+ """
5
+
6
+ import gradio as gr
7
+ import pandas as pd
8
+ import plotly.graph_objects as go
9
+ import plotly.express as px
10
+ from datetime import datetime
11
+ import json
12
+ from pathlib import Path
13
+ from utils import (
14
+ load_leaderboard_data, format_metrics, get_leaderboard_summary,
15
+ filter_leaderboard, get_top_n, create_comparison_data
16
+ )
17
+
18
# CSS for better styling
# This stylesheet is handed to gr.Blocks(css=...) when the interface is built.
# NOTE(review): the summary cards in this file use inline styles rather than
# the .metric-card class — confirm whether these class rules are still needed.
custom_css = """
.gradio-container {
    max-width: 1400px !important;
}
.metric-card {
    background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
    padding: 20px;
    border-radius: 10px;
    color: white;
    text-align: center;
}
.leaderboard-table {
    font-size: 14px;
}
"""

# Initialize leaderboard data
# Module-level cache of all submissions, loaded once at import time.
# submit_result() rebinds this global after it writes a new submission file.
LEADERBOARD_DF = load_leaderboard_data()
37
+
38
def create_leaderboard_table(
    model_provider: str = "All",
    ocr_type: str = "All",
    verified_only: bool = False,
    top_n: int = 50
) -> pd.DataFrame:
    """Build the display-ready leaderboard table for the current filters.

    Args:
        model_provider: Provider to keep, or "All" for no provider filter.
        ocr_type: OCR pipeline to keep, or "All" for no OCR filter.
        verified_only: When True, keep only submissions flagged as verified.
        top_n: Maximum number of rows to show.

    Returns:
        A DataFrame with human-readable column headers; metric columns are
        pre-formatted strings produced by ``format_metrics``.
    """
    # (display header, source column) pairs for the pre-formatted metric columns.
    metric_columns = (
        ('Km (Exact)', 'km_exact_match'),
        ('Km (±10%)', 'km_tolerance_match'),
        ('kcat (Exact)', 'kcat_exact_match'),
        ('kcat (±10%)', 'kcat_tolerance_match'),
        ('kcat/Km (Exact)', 'km_kcat_exact_match'),
        ('kcat/Km (±10%)', 'km_kcat_tolerance_match'),
        ('Overall (Exact)', 'overall_exact_match'),
        ('Overall (±10%)', 'overall_tolerance_match'),
    )

    matching = filter_leaderboard(LEADERBOARD_DF, model_provider, ocr_type, verified_only)
    rows = get_top_n(matching, top_n)

    if rows.empty:
        # Header-only frame so the UI still shows the column layout.
        headers = ["Rank", "Model", "Provider", "OCR", "Submitter", "Date"]
        headers.extend(label for label, _ in metric_columns)
        return pd.DataFrame(columns=headers)

    # Format for display
    table = {
        'Rank': range(1, len(rows) + 1),
        'Model': rows['model_name'],
        'Provider': rows['model_provider'],
        'OCR': rows['ocr_type'],
        'Submitter': rows['submitter'],
        'Date': rows['submission_date'].dt.strftime('%Y-%m-%d'),
    }
    for label, source in metric_columns:
        table[label] = rows[source].apply(format_metrics)

    return pd.DataFrame(table)
72
+
73
+
74
def create_summary_cards() -> str:
    """Render the headline leaderboard statistics as a row of four HTML cards.

    Returns:
        An HTML fragment showing total submissions, unique models, best score
        and average score, taken from ``get_leaderboard_summary``.
    """
    stats = get_leaderboard_summary(LEADERBOARD_DF)
    total_submissions = stats['total_submissions']
    unique_models = stats['unique_models']
    best_score = stats['best_score']
    avg_score = stats['avg_score']

    return f"""
    <div style="display: grid; grid-template-columns: repeat(4, 1fr); gap: 15px; margin-bottom: 20px;">
    <div style="background: linear-gradient(135deg, #667eea 0%, #764ba2 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
    <div style="font-size: 14px; opacity: 0.9;">Total Submissions</div>
    <div style="font-size: 32px; font-weight: bold;">{total_submissions}</div>
    </div>
    <div style="background: linear-gradient(135deg, #f093fb 0%, #f5576c 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
    <div style="font-size: 14px; opacity: 0.9;">Unique Models</div>
    <div style="font-size: 32px; font-weight: bold;">{unique_models}</div>
    </div>
    <div style="background: linear-gradient(135deg, #4facfe 0%, #00f2fe 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
    <div style="font-size: 14px; opacity: 0.9;">Best Score</div>
    <div style="font-size: 32px; font-weight: bold;">{best_score:.1f}%</div>
    </div>
    <div style="background: linear-gradient(135deg, #43e97b 0%, #38f9d7 100%); padding: 20px; border-radius: 10px; color: white; text-align: center;">
    <div style="font-size: 14px; opacity: 0.9;">Average Score</div>
    <div style="font-size: 32px; font-weight: bold;">{avg_score:.1f}%</div>
    </div>
    </div>
    """
99
+
100
+
101
def create_score_comparison_chart() -> go.Figure:
    """Create a horizontal bar chart of the ten leading submissions.

    Returns:
        A Plotly figure with one bar per submission (exact-match accuracy in
        percent), or a placeholder figure when there are no submissions.
    """
    if LEADERBOARD_DF.empty:
        fig = go.Figure()
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                           x=0.5, y=0.5, showarrow=False)
        return fig

    # Get top 10 submissions
    top_10 = get_top_n(LEADERBOARD_DF, 10)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=top_10['overall_exact_match'] * 100,
        y=top_10['model_name'] + ' (' + top_10['model_provider'] + ')',
        orientation='h',
        marker=dict(color='rgba(102, 126, 234, 0.8)'),
        text=top_10['overall_exact_match'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside'
    ))

    fig.update_layout(
        title='Top 10 Models - Exact Match Accuracy',
        xaxis_title='Accuracy (%)',
        yaxis_title='Model',
        # Horizontal bars place the first category at the bottom of the axis;
        # reverse it so the first (best-ranked) row is drawn at the top.
        yaxis_autorange='reversed',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )

    return fig
131
+
132
+
133
def create_ocr_comparison_chart() -> go.Figure:
    """Create a bar chart of mean exact-match accuracy per OCR type.

    Returns:
        A Plotly figure with one bar per distinct ``ocr_type`` value, or a
        placeholder figure when there are no submissions.
    """
    if LEADERBOARD_DF.empty:
        fig = go.Figure()
        fig.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                           x=0.5, y=0.5, showarrow=False)
        return fig

    ocr_stats = (
        LEADERBOARD_DF.groupby('ocr_type')['overall_exact_match']
        .mean()
        .reset_index(name='mean')
    )

    # Cycle the palette so every bar gets a colour, however many OCR types
    # the data contains (the palette itself only has three entries).
    palette = ['rgba(102, 126, 234, 0.8)', 'rgba(240, 147, 251, 0.8)', 'rgba(79, 172, 254, 0.8)']
    bar_colors = [palette[i % len(palette)] for i in range(len(ocr_stats))]

    fig = go.Figure()
    fig.add_trace(go.Bar(
        x=ocr_stats['ocr_type'],
        y=ocr_stats['mean'] * 100,
        marker=dict(color=bar_colors),
        text=ocr_stats['mean'].apply(lambda x: f'{x*100:.1f}%'),
        textposition='outside',
        name='Accuracy'
    ))

    fig.update_layout(
        title='Performance by OCR Type',
        xaxis_title='OCR Type',
        yaxis_title='Average Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20)
    )

    return fig
162
+
163
+
164
def create_timeline_chart() -> go.Figure:
    """Plot every submission over time together with the running best score.

    Returns:
        A Plotly figure with a marker per submission and a line tracing the
        cumulative maximum of exact-match accuracy, or a placeholder figure
        when there are no submissions.
    """
    if LEADERBOARD_DF.empty:
        placeholder = go.Figure()
        placeholder.add_annotation(text="No submissions yet", xref="paper", yref="paper",
                                   x=0.5, y=0.5, showarrow=False)
        return placeholder

    # sort_values returns a new frame, so adding a column does not touch the global.
    chronological = LEADERBOARD_DF.sort_values('submission_date')
    chronological['cumulative_best'] = chronological['overall_exact_match'].cummax()
    dates = chronological['submission_date']

    # Every submission as an individual marker.
    submissions_trace = go.Scatter(
        x=dates,
        y=chronological['overall_exact_match'] * 100,
        mode='markers',
        name='Submissions',
        marker=dict(size=8, color='rgba(102, 126, 234, 0.5)'),
        text=chronological['model_name'],
        hovertemplate='%{text}<br>%{x}<br>%{y:.1f}%'
    )

    # Best score achieved so far at each point in time.
    best_trace = go.Scatter(
        x=dates,
        y=chronological['cumulative_best'] * 100,
        mode='lines',
        name='Best Score',
        line=dict(color='rgba(67, 233, 123, 0.8)', width=2)
    )

    fig = go.Figure()
    for trace in (submissions_trace, best_trace):
        fig.add_trace(trace)

    fig.update_layout(
        title='Submission Timeline & Progress',
        xaxis_title='Date',
        yaxis_title='Exact Match (%)',
        height=400,
        margin=dict(l=20, r=20, t=40, b=20),
        hovermode='x unified'
    )

    return fig
207
+
208
+
209
def submit_result(
    model_name: str,
    model_provider: str,
    ocr_type: str,
    submitter: str,
    km_exact: float,
    km_tolerance: float,
    kcat_exact: float,
    kcat_tolerance: float,
    km_kcat_exact: float,
    km_kcat_tolerance: float,
    total_papers: int,
    notes: str
) -> str:
    """Validate a submission, store it as JSON and reload the leaderboard.

    Args:
        model_name: Name of the evaluated model (required).
        model_provider: Provider of the model (required).
        ocr_type: OCR pipeline used for the input papers (required).
        submitter: Name or contact of the submitter (required).
        km_exact, km_tolerance, kcat_exact, kcat_tolerance, km_kcat_exact,
            km_kcat_tolerance: Accuracy metrics in percent (0-100).
        total_papers: Number of papers evaluated (whole number, at least 1).
        notes: Optional free-text notes.

    Returns:
        A message for display: a success message containing the submission
        ID, or a message starting with "❌ Error:" describing what went wrong.
        The function never raises; all failures are reported via the message.
    """
    global LEADERBOARD_DF

    try:
        # Required text fields: reject None, empty and whitespace-only values.
        text_fields = {
            'Model Name': model_name,
            'Model Provider': model_provider,
            'OCR Type': ocr_type,
            'Submitter': submitter,
        }
        missing = [label for label, value in text_fields.items()
                   if not value or not str(value).strip()]
        if missing:
            return f"❌ Error: missing required field(s): {', '.join(missing)}"

        # Metrics are percentages; the range test is also False for NaN.
        metric_fields = {
            'Km Exact Match': km_exact,
            'Km Tolerance (±10%)': km_tolerance,
            'kcat Exact Match': kcat_exact,
            'kcat Tolerance (±10%)': kcat_tolerance,
            'kcat/Km Exact Match': km_kcat_exact,
            'kcat/Km Tolerance (±10%)': km_kcat_tolerance,
        }
        for label, value in metric_fields.items():
            if value is None or not 0 <= value <= 100:
                return f"❌ Error: {label} must be a number between 0 and 100"

        if total_papers is None or total_papers < 1 or int(total_papers) != total_papers:
            return "❌ Error: Total Papers Evaluated must be a whole number of at least 1"
        total_papers = int(total_papers)

        model_name = str(model_name).strip()
        submitter = str(submitter).strip()

        # The submitter is untrusted and ends up in a file name: replace
        # anything that could act as a path separator or special character.
        safe_submitter = "".join(
            ch if ch.isalnum() or ch in "-_.@" else "_" for ch in submitter
        ).strip("._") or "anonymous"

        # Calculate overall scores
        overall_exact = (km_exact + kcat_exact + km_kcat_exact) / 3
        overall_tolerance = (km_tolerance + kcat_tolerance + km_kcat_tolerance) / 3

        # Read the clock once so the ID and the stored date always agree.
        now = datetime.now()

        # Create submission data (metrics are stored as fractions in [0, 1])
        submission = {
            'submission_id': f"{now.strftime('%Y%m%d_%H%M%S')}_{safe_submitter}",
            'model_name': model_name,
            'model_provider': model_provider,
            'ocr_type': ocr_type,
            'submitter': submitter,
            'submission_date': now.isoformat(),
            'km_exact_match': km_exact / 100,
            'km_tolerance_match': km_tolerance / 100,
            'kcat_exact_match': kcat_exact / 100,
            'kcat_tolerance_match': kcat_tolerance / 100,
            'km_kcat_exact_match': km_kcat_exact / 100,
            'km_kcat_tolerance_match': km_kcat_tolerance / 100,
            'overall_exact_match': overall_exact / 100,
            'overall_tolerance_match': overall_tolerance / 100,
            'total_papers': total_papers,
            'total_entries': total_papers * 3,  # Approximate
            'notes': notes or "",
            'verified': False  # Needs verification
        }

        # Save to data directory
        data_dir = Path("leaderboard/data")
        data_dir.mkdir(parents=True, exist_ok=True)

        submission_file = data_dir / f"{submission['submission_id']}.json"
        # json.dump escapes non-ASCII by default, so the file is readable
        # regardless of the encoding the loader uses.
        with open(submission_file, 'w', encoding='utf-8') as f:
            json.dump(submission, f, indent=2)

        # Reload leaderboard data
        LEADERBOARD_DF = load_leaderboard_data()

        return f"✅ Submission successful! Your ID: {submission['submission_id']}\n\nPlease create a PR or contact the maintainer to verify your submission."

    except Exception as e:
        # UI boundary: report the failure to the user instead of crashing the app.
        return f"❌ Error: {str(e)}"
267
+
268
+
269
# Build Gradio interface
# Layout: header, summary cards, then four tabs (leaderboard table, charts,
# submission form, about) and a footer. The summary cards and the footer date
# are computed once, when this module is imported.
with gr.Blocks(css=custom_css, title="LLM Enzyme Kinetics Extraction Benchmark") as demo:
    gr.Markdown(
        """
        # 🧪 LLM Enzyme Kinetics Extraction Benchmark Leaderboard

        Welcome to the leaderboard for the **LLM Enzyme Kinetics Golden Benchmark**!
        This benchmark evaluates LLMs on extracting enzyme kinetic parameters (Km, kcat, kcat/Km)
        from scientific literature.

        📚 **Dataset**: 4,244 entries from 156 papers | 🎯 **Task**: Extract kinetic parameters from OCR-processed papers
        """
    )

    # Summary cards
    gr.HTML(create_summary_cards())

    with gr.Tabs():
        # Tab 1: Leaderboard Table
        with gr.TabItem("🏆 Leaderboard"):
            gr.Markdown("### Filter and Search")

            with gr.Row():
                model_provider_dropdown = gr.Dropdown(
                    choices=["All", "OpenAI", "Anthropic", "Kimi", "Other"],
                    value="All",
                    label="Model Provider"
                )
                ocr_type_dropdown = gr.Dropdown(
                    choices=["All", "mathpix", "kimi", "pymupdf"],
                    value="All",
                    label="OCR Type"
                )
                verified_checkbox = gr.Checkbox(
                    label="Verified Only",
                    value=False
                )
                top_n_slider = gr.Slider(
                    minimum=10,
                    maximum=100,
                    value=50,
                    step=10,
                    label="Show Top N"
                )

            leaderboard_table = gr.Dataframe(
                label="Leaderboard",
                # Cells contain user-submitted text (model name, submitter);
                # show them literally instead of rendering them as markdown.
                datatype=["str"] * 14,
                interactive=False,
                wrap=True
            )

            refresh_btn = gr.Button("🔄 Refresh", variant="primary")
            refresh_btn.click(
                fn=create_leaderboard_table,
                inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
                outputs=leaderboard_table
            )

            # Initial load
            demo.load(
                fn=create_leaderboard_table,
                inputs=[model_provider_dropdown, ocr_type_dropdown, verified_checkbox, top_n_slider],
                outputs=leaderboard_table
            )

        # Tab 2: Visualizations
        with gr.TabItem("📊 Visualizations"):
            with gr.Row():
                score_chart = gr.Plot(label="Top Models Comparison")
                ocr_chart = gr.Plot(label="OCR Type Comparison")

            with gr.Row():
                timeline_chart = gr.Plot(label="Submission Timeline")

            # Load charts
            demo.load(
                fn=lambda: [create_score_comparison_chart(), create_ocr_comparison_chart(), create_timeline_chart()],
                outputs=[score_chart, ocr_chart, timeline_chart]
            )

        # Tab 3: Submit Results
        with gr.TabItem("📤 Submit Your Results"):
            gr.Markdown("""
            ### Submit your benchmark results to the leaderboard!

            **Instructions:**
            1. Run the benchmark using the provided scripts
            2. Collect your evaluation metrics
            3. Fill in the form below
            4. Your submission will be reviewed before appearing on the leaderboard

            **Evaluation Scripts:**
            ```bash
            python scripts/run_benchmark.py --mode full
            ```
            """)

            with gr.Row():
                model_name_input = gr.Textbox(label="Model Name *", placeholder="e.g., GPT-4, Claude-3.5-Sonnet")
                model_provider_input = gr.Dropdown(
                    choices=["OpenAI", "Anthropic", "Kimi", "Other"],
                    label="Model Provider *"
                )

            with gr.Row():
                ocr_type_input = gr.Dropdown(
                    choices=["mathpix", "kimi", "pymupdf"],
                    label="OCR Type *"
                )
                submitter_input = gr.Textbox(label="Submitter Name/Email *", placeholder="Your name or contact")

            gr.Markdown("### Performance Metrics (%)")

            with gr.Row():
                km_exact_input = gr.Number(label="Km Exact Match *", minimum=0, maximum=100)
                km_tolerance_input = gr.Number(label="Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                kcat_exact_input = gr.Number(label="kcat Exact Match *", minimum=0, maximum=100)
                kcat_tolerance_input = gr.Number(label="kcat Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                km_kcat_exact_input = gr.Number(label="kcat/Km Exact Match *", minimum=0, maximum=100)
                km_kcat_tolerance_input = gr.Number(label="kcat/Km Tolerance (±10%) *", minimum=0, maximum=100)

            with gr.Row():
                total_papers_input = gr.Number(label="Total Papers Evaluated *", minimum=1, maximum=156)
                notes_input = gr.Textbox(
                    label="Notes",
                    placeholder="Any additional information about your setup (temperature, prompts, etc.)",
                    lines=3
                )

            submit_btn = gr.Button("Submit Results", variant="primary")
            submission_output = gr.Markdown()

            # Input order must match the positional parameters of submit_result.
            submit_btn.click(
                fn=submit_result,
                inputs=[
                    model_name_input, model_provider_input, ocr_type_input, submitter_input,
                    km_exact_input, km_tolerance_input, kcat_exact_input, kcat_tolerance_input,
                    km_kcat_exact_input, km_kcat_tolerance_input, total_papers_input, notes_input
                ],
                outputs=submission_output
            )

        # Tab 4: About
        with gr.TabItem("ℹ️ About"):
            gr.Markdown("""
            ## About the Benchmark

            The **LLM Enzyme Kinetics Golden Benchmark** evaluates the ability of Large Language Models
            to extract structured enzyme kinetic data from scientific literature.

            ### Dataset
            - **Papers**: 156 peer-reviewed publications
            - **Entries**: 4,244 manually curated enzyme kinetic entries
            - **Parameters**: Km, kcat, kcat/Km, pH, temperature, mutations
            - **OCR Versions**: 3 parallel OCR outputs (Mathpix, Kimi, PyMuPDF)

            ### Evaluation Metrics
            1. **Exact Match Accuracy**: Value must match exactly
            2. **Tolerance Match (±10%)**: Value within 10% of ground truth
            3. Scores are calculated for each parameter (Km, kcat, kcat/Km)

            ### How to Participate
            1. Clone the repository:
            ```bash
            git clone https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark.git
            ```

            2. Install dependencies:
            ```bash
            conda create -n enzyme_benchmark python=3.10 -y
            conda activate enzyme_benchmark
            pip install -r requirements.txt
            ```

            3. Configure your API key in `.env`

            4. Run the benchmark:
            ```bash
            python scripts/run_benchmark.py --mode full
            ```

            5. Submit your results through this leaderboard!

            ### Citation
            If you use this benchmark, please cite our repository.
            """)

    # Footer. The closing '*' completes the italic span around "Last updated".
    gr.Markdown(
        """
        ---
        **[GitHub Repository](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark)**
        | **[Documentation](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/README.md)**
        | **[How to Participate](https://github.com/JackKuo666/LLM-Enzyme-Kinetics-Golden-Benchmark/blob/main/USAGE.md)**

        *Last updated: {}*
        """.format(datetime.now().strftime("%Y-%m-%d"))
    )


if __name__ == "__main__":
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio>=4.0.0
2
+ pandas>=2.0.0
3
+ plotly>=5.0.0
4
+ python-dotenv>=1.0.0
utils.py ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Utility functions for leaderboard"""
2
+ import pandas as pd
3
+ from pathlib import Path
4
+ from typing import Dict, List, Optional
5
+ import json
6
+
7
+
8
# Columns every submission record is expected to carry; used for empty frames.
_LEADERBOARD_COLUMNS = (
    'submission_id', 'model_name', 'model_provider', 'ocr_type',
    'submitter', 'submission_date', 'km_exact_match', 'km_tolerance_match',
    'kcat_exact_match', 'kcat_tolerance_match', 'km_kcat_exact_match',
    'km_kcat_tolerance_match', 'overall_exact_match', 'overall_tolerance_match',
    'total_papers', 'total_entries', 'notes', 'verified',
)


def _empty_leaderboard() -> pd.DataFrame:
    """Return an empty DataFrame that has the standard leaderboard columns."""
    return pd.DataFrame(columns=list(_LEADERBOARD_COLUMNS))


def load_leaderboard_data(data_dir: str = "leaderboard/data") -> pd.DataFrame:
    """
    Load all leaderboard data from JSON files

    Files that cannot be read or parsed, and files whose top-level JSON value
    is not an object, are reported on stdout and skipped.

    Args:
        data_dir: Directory containing submission JSON files

    Returns:
        DataFrame with all submissions, sorted by ``overall_exact_match`` in
        descending order (ties keep file-name order). An empty DataFrame with
        the standard columns is returned when the directory is missing or
        holds no usable submissions.
    """
    data_path = Path(data_dir)
    if not data_path.is_dir():
        return _empty_leaderboard()

    records = []
    # Sorted so the load order (and therefore tie order) is deterministic.
    for json_file in sorted(data_path.glob("*.json")):
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                record = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers both malformed JSON and undecodable bytes.
            print(f"Error loading {json_file}: {e}")
            continue

        if not isinstance(record, dict):
            print(f"Error loading {json_file}: expected a JSON object, got {type(record).__name__}")
            continue
        records.append(record)

    if not records:
        return _empty_leaderboard()

    df = pd.DataFrame(records)

    # Convert date strings to datetime
    if 'submission_date' in df.columns:
        df['submission_date'] = pd.to_datetime(df['submission_date'])

    # Nothing to rank by if no record carries the score column.
    if 'overall_exact_match' not in df.columns:
        return df

    return df.sort_values('overall_exact_match', ascending=False, kind='mergesort')
54
+
55
+
56
def format_metrics(value: float, as_percentage: bool = True) -> str:
    """Render a metric as text.

    Args:
        value: The metric value; treated as a fraction when shown as a percentage.
        as_percentage: When True, scale by 100 and show two decimals followed
            by ``%``; otherwise show the raw value with four decimals.

    Returns:
        The formatted string.
    """
    if not as_percentage:
        return f"{value:.4f}"

    scaled = value * 100
    return f"{scaled:.2f}%"
61
+
62
+
63
def get_leaderboard_summary(df: pd.DataFrame) -> Dict:
    """Get summary statistics from leaderboard

    Args:
        df: Leaderboard DataFrame; when non-empty it must contain the
            ``model_name`` and ``overall_exact_match`` columns.

    Returns:
        Dict with the keys ``total_submissions``, ``unique_models``,
        ``best_score`` and ``avg_score`` (both in percent) and
        ``verified_submissions``. The same keys are present for an empty
        DataFrame, with all values zero.
    """
    if df.empty:
        return {
            'total_submissions': 0,
            'unique_models': 0,
            'best_score': 0.0,
            'avg_score': 0.0,
            'verified_submissions': 0
        }

    if 'verified' in df.columns:
        # eq(True) treats missing flags as "not verified" instead of failing in sum().
        verified_count = int(df['verified'].eq(True).sum())
    else:
        verified_count = 0

    scores = df['overall_exact_match']
    # Plain Python numbers keep the result JSON-serialisable and easy to format.
    return {
        'total_submissions': len(df),
        'unique_models': int(df['model_name'].nunique()),
        'best_score': float(scores.max() * 100),
        'avg_score': float(scores.mean() * 100),
        'verified_submissions': verified_count
    }
80
+
81
+
82
def filter_leaderboard(
    df: pd.DataFrame,
    model_provider: Optional[str] = None,
    ocr_type: Optional[str] = None,
    verified_only: bool = False
) -> pd.DataFrame:
    """Return the rows of ``df`` that satisfy the given criteria.

    Args:
        df: Leaderboard DataFrame; it is not modified.
        model_provider: Exact provider to keep; ``None``, empty or "All" disables the filter.
        ocr_type: Exact OCR type to keep; ``None``, empty or "All" disables the filter.
        verified_only: Keep only rows whose ``verified`` value equals True
            (ignored when the column is absent).

    Returns:
        A new, filtered DataFrame.
    """
    result = df.copy()

    # Both equality filters follow the same rule, so apply them in one loop.
    equality_filters = (('model_provider', model_provider), ('ocr_type', ocr_type))
    for column, wanted in equality_filters:
        if not wanted or wanted == "All":
            continue
        result = result[result[column] == wanted]

    if verified_only and 'verified' in result.columns:
        result = result[result['verified'].eq(True)]

    return result
101
+
102
+
103
def get_top_n(df: pd.DataFrame, n: int = 10) -> pd.DataFrame:
    """Return the first ``n`` rows of ``df``.

    The frame is not re-sorted here; rows are taken in their existing order.
    """
    leading_rows = df.iloc[:n]
    return leading_rows
106
+
107
+
108
def create_comparison_data(df: pd.DataFrame) -> Dict:
    """Aggregate scores per model provider and per OCR type.

    Args:
        df: Leaderboard DataFrame with ``model_provider``, ``ocr_type``,
            ``overall_exact_match`` and ``overall_tolerance_match`` columns.

    Returns:
        An empty dict for an empty DataFrame; otherwise a dict with the keys
        ``by_provider`` and ``by_ocr``, each holding the ``to_dict()`` form of
        the grouped statistics rounded to four decimals.
    """
    if df.empty:
        return {}

    exact_match_aggs = ['mean', 'max', 'count']

    # Group by model provider
    per_provider = df.groupby('model_provider').agg({
        'overall_exact_match': exact_match_aggs,
        'overall_tolerance_match': 'mean',
    })

    # Group by OCR type
    per_ocr = df.groupby('ocr_type').agg({'overall_exact_match': exact_match_aggs})

    comparison = {}
    comparison['by_provider'] = per_provider.round(4).to_dict()
    comparison['by_ocr'] = per_ocr.round(4).to_dict()
    return comparison