---
# Experimental WSD variant: longer stable phase, shorter decay.
# Goal: reduce time spent in the suspected unstable LR band ~6e-5 -> 1e-4.
# Keep experiment-only variants under configs/testing/.
#
# NOTE(review): this file had been collapsed onto a single comment line, so it
# parsed as an empty document. The nesting below is reconstructed from key
# order; confirm against the config loader's schema, in particular that
# checkpoint_dir and precision belong under `training`.

# Input data, run output location, and tokenizer used for training.
dataset_dir: /mnt/apps/llm-nanochat/datasets/202605011052_fresh_50_50_score100_2500_sourcebalanced
output_dir: /mnt/apps/llm-nanochat/artifacts/runs/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay
tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch

seed: 1337

# Model shape.
model:
  vocab_size: 32000
  dim: 768
  n_layers: 12
  n_heads: 12

training:
  sequence_length: 2500
  max_steps: 10000
  batch_size: 6
  grad_accum_steps: 16

  # learning_rate and peak_lr carry the same value here; which one the
  # trainer reads is not visible from this file, so keep them in sync.
  learning_rate: 0.0002
  peak_lr: 0.0002

  # WSD schedule phases: warmup_steps + stable_steps + decay_steps
  # = 500 + 7000 + 2500 = 10000 = max_steps.
  lr_schedule: wsd
  warmup_steps: 500
  stable_steps: 7000
  decay_steps: 2500
  final_lr: 1.0e-05

  # Optimizer settings.
  adamw_betas:
    - 0.9
    - 0.95
  adamw_eps: 1.0e-08
  weight_decay: 0.1
  clip_grad_norm: 1.0

  # Checkpointing and numeric precision.
  save_every_steps: 500
  checkpoint_dir: /mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay
  precision: bf16

evaluation:
  validation_every_steps: 1000
  validation_max_batches: 128

  # Generation probes; probe_tokenizer_dir is the same path as tokenizer_dir.
  probe_every_steps: 1000
  probe_tokenizer_dir: /mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch
  probe_max_new_tokens: 32

  # Prompts grouped by language code. Each expected_next_text starts with a
  # space, so the values stay quoted to preserve it.
  probe_prompts:
    en:
      - prompt: "The capital of Italy is"
        expected_next_text: " Rome"
      - prompt: "A small language model should"
        expected_next_text: " be"
    it:
      - prompt: "La capitale d'Italia è"
        expected_next_text: " Roma"
      - prompt: "Un piccolo modello linguistico dovrebbe"
        expected_next_text: " essere"