# Experimental WSD variant: longer stable phase, shorter decay.
# Goal: reduce the number of steps spent in the suspected unstable LR band (roughly 6e-5 to 1e-4).
# Keep experiment-only variants under configs/testing/.

# Filesystem locations and RNG seed for this run.
# dataset_dir and tokenizer_dir share the 202605011052_fresh_50_50_score100
# prefix, so they presumably come from the same data build — confirm before
# swapping either one independently.
dataset_dir: /mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced
# training.checkpoint_dir ends in the same run name as this path; rename both
# together when cloning this config for a new run.
output_dir: /mnt/apps/llm-nanochat/artifacts/runs/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay
# Same path as evaluation.probe_tokenizer_dir.
tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
seed: 1337

# Model shape (the run name in output_dir labels this "gpt2small").
model:
  # The tokenizer directory name ends in "32k"; presumably this must match the
  # tokenizer's vocabulary size — confirm against the tokenizer files.
  vocab_size: 32000
  # dim / n_heads = 768 / 12 = 64; presumably the per-head width — confirm
  # against the model code.
  dim: 768
  n_layers: 12
  n_heads: 12

# Optimizer, learning-rate schedule, and checkpointing settings.
training:
  sequence_length: 2500
  # The three phase lengths below sum to this value: 500 + 7000 + 2500 = 10000.
  max_steps: 10000
  # 6 x 16 = 96 sequences per optimizer step if the trainer multiplies these
  # two values (per process) — TODO confirm against the training loop.
  batch_size: 6
  grad_accum_steps: 16

  # learning_rate and peak_lr carry the same value. Which of the two the
  # schedule actually reads is not visible from this file, so change them
  # together.
  learning_rate: 0.0002
  peak_lr: 0.0002
  lr_schedule: wsd

  # Phase lengths in steps; presumably applied in the order
  # warmup -> stable -> decay — confirm against the scheduler.
  warmup_steps: 500
  stable_steps: 7000
  decay_steps: 2500
  # 20x below peak_lr; presumably the LR reached at the end of decay — confirm.
  final_lr: 1.0e-05

  adamw_betas:
    - 0.9
    - 0.95
  # Keep the explicit ".0" and signed exponent: YAML 1.1 loaders such as
  # PyYAML only resolve exponent notation as a float in this form.
  adamw_eps: 1.0e-08
  weight_decay: 0.1
  clip_grad_norm: 1.0

  save_every_steps: 500
  # Distinct from the top-level output_dir, but ends in the same run name.
  checkpoint_dir: /mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay
  precision: bf16

# Periodic validation and text-probe settings.
evaluation:
  # Both cadences are 1000 steps, i.e. twice training.save_every_steps (500).
  validation_every_steps: 1000
  validation_max_batches: 128
  probe_every_steps: 1000
  # Identical to the top-level tokenizer_dir.
  probe_tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
  probe_max_new_tokens: 32
  # Probes are grouped by language code (en, it). Every expected_next_text
  # starts with a space, and the double quotes are what preserve it — an
  # unquoted scalar would lose the leading space. Presumably the space matches
  # the continuation after the prompt's last word; confirm against the probe
  # scorer before editing.
  probe_prompts:
    en:
      - prompt: "The capital of Italy is"
        expected_next_text: " Rome"
      - prompt: "A small language model should"
        expected_next_text: " be"
    it:
      - prompt: "La capitale d'Italia è"
        expected_next_text: " Roma"
      - prompt: "Un piccolo modello linguistico dovrebbe"
        expected_next_text: " essere"