# Mellum 2 Base (Pretrain) — evaluation results
# Self-reported by JetBrains. Values are the pre-training evaluations from the
# Mellum 2 Technical Report (Table 5).
# Only benchmarks confirmed to be registered as HF Hub Benchmarks are listed here.
# Other reported benchmarks (HumanEval, MBPP, BBH, HellaSwag, ARC, etc.) can be added
# once their dataset repos define an `eval.yaml`.

# GPQA (`diamond` task) result for the pre-trained base model.
# NOTE(review): `value` looks like a 0-100 percentage rather than a 0-1 fraction —
# confirm this matches the scale the benchmark expects.
- dataset:
    id: Idavidrein/gpqa
    task_id: diamond
  value: 31.31
  date: "2026-05-27"  # quoted so it stays a string, not a YAML timestamp
  notes: "pre-training eval, no-tools"

# GPQA (`main` task) result for the pre-trained base model.
# NOTE(review): `value` looks like a 0-100 percentage rather than a 0-1 fraction —
# confirm this matches the scale the benchmark expects.
- dataset:
    id: Idavidrein/gpqa
    task_id: main
  value: 35.04
  date: "2026-05-27"  # quoted so it stays a string, not a YAML timestamp
  notes: "pre-training eval, no-tools"

# MMLU-Pro (`mmlu_pro` task) result for the pre-trained base model.
# NOTE(review): "exact match" in `notes` presumably names the scoring metric, and
# `value` looks like a 0-100 percentage — confirm both against the benchmark.
- dataset:
    id: TIGER-Lab/MMLU-Pro
    task_id: mmlu_pro
  value: 59.31
  date: "2026-05-27"  # quoted so it stays a string, not a YAML timestamp
  notes: "pre-training eval, no-tools, exact match"

# GSM8K (`gsm8k` task) result for the pre-trained base model.
# NOTE(review): "exact match" in `notes` presumably names the scoring metric, and
# `value` looks like a 0-100 percentage — confirm both against the benchmark.
- dataset:
    id: openai/gsm8k
    task_id: gsm8k
  value: 81.73
  date: "2026-05-27"  # quoted so it stays a string, not a YAML timestamp
  notes: "pre-training eval, no-tools, exact match"