# Mellum 2 Base (Pretrain) — evaluation results
# Self-reported by JetBrains. Pre-training evaluations from Mellum 2 Technical Report (Table 5).
# Only entries for benchmarks confirmed to be registered as HF Hub Benchmarks are listed.
# Other reported benchmarks (HumanEval, MBPP, BBH, HellaSwag, ARC, etc.) can be added
# once their dataset repos define an `eval.yaml`.
#
# Layout: a top-level sequence with one mapping per reported score.
#   dataset.id       benchmark dataset repo id
#   dataset.task_id  task/subset within that dataset
#   value            score as reported in the technical report
#   date             kept quoted so parsers load it as a string, not a date object
#   notes            free-text evaluation conditions

# GPQA, `diamond` task
- dataset:
    id: Idavidrein/gpqa
    task_id: diamond
  value: 31.31
  date: "2026-05-27"
  notes: "pre-training eval, no-tools"

# GPQA, `main` task
- dataset:
    id: Idavidrein/gpqa
    task_id: main
  value: 35.04
  date: "2026-05-27"
  notes: "pre-training eval, no-tools"

# MMLU-Pro
- dataset:
    id: TIGER-Lab/MMLU-Pro
    task_id: mmlu_pro
  value: 59.31
  date: "2026-05-27"
  notes: "pre-training eval, no-tools, exact match"

# GSM8K
- dataset:
    id: openai/gsm8k
    task_id: gsm8k
  value: 81.73
  date: "2026-05-27"
  notes: "pre-training eval, no-tools, exact match"