{ "checkpoint_config": { "actual_precision": "bf16", "adamw_betas": [ 0.9, 0.95 ], "adamw_eps": 1e-08, "attention_kernel_policy": "auto", "batch_size": 6, "benchmark": { "enable_central_tensorboard": true, "enable_local_tensorboard": true, "enabled": false, "output_path": "/mnt/apps/llm-nanochat/artifacts/runs/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay/throughput_benchmark.json", "warmup_steps": 0 }, "checkpoint_dir": "/mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay", "clip_grad_norm": 1.0, "compile": { "backend": null, "compile_setup_sec": 0.0, "diagnostic": null, "dynamic": false, "enabled": false, "error_policy": "raise", "fullgraph": false, "mode": null, "requested": false, "status": "disabled" }, "dataset": { "storage_mode": "indexed_jsonl" }, "decay_steps": 2500, "deterministic_algorithms": false, "device": "cuda", "dim": 768, "final_lr": 1e-05, "fp8_backend": null, "grad_accum_steps": 16, "learning_rate": 0.0002, "logging": { "enable_central_tensorboard": true, "enable_local_tensorboard": true, "metrics_flush_every_steps": 1, "metrics_writer": "persistent_jsonl_handle" }, "lr": 0.0002, "lr_schedule": "wsd", "max_seq_len": 2500, "max_steps": 10000, "n_heads": 12, "n_layers": 12, "optimizer": { "backend": "torch", "betas": [ 0.9, 0.95 ], "eps": 1e-08, "implementation": "torch.optim.AdamW", "learning_rate": 0.0002, "state_precision": "full_precision", "type": "adamw", "weight_decay": 0.1 }, "optimizer_backend": "torch", "optimizer_implementation": "torch.optim.AdamW", "optimizer_state_precision": "full_precision", "optimizer_type": "adamw", "peak_lr": 0.0002, "repro": { "attention_kernel_policy": "auto", "cublas_workspace_config": null, "cudnn_benchmark": true, "cudnn_deterministic": false, "deterministic_algorithms": false, "flash_sdp_enabled": true, "math_sdp_enabled": true, "mem_efficient_sdp_enabled": true, "pythonhashseed": "1337", "seed": 1337 }, "requested_precision": "bf16", "save_every_steps": 500, "scheduler": { "decay_steps": 2500, "final_lr": 1e-05, "peak_lr": 0.0002, "schedule_type": "wsd", "stable_steps": 7000, "total_steps": 10000, "warmup_steps": 500 }, "seed": 1337, "stable_steps": 7000, "train_cache_ram_bytes": 1073741824, "train_cache_ram_mb": 1024, "vocab_size": 32000, "warmup_steps": 500, "weight_decay": 0.1 }, "checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay/step_10000.pt", "exported_at": "2026-05-19T13:49:55.443068+00:00", "format": "llm-nanochat-safetensors-export", "global_step": 10000, "metadata_path": "/mnt/apps/llm-nanochat/hf_exports/gpt2small-en-it-nanochat-lr2e4-bs6-wsd-fastdecay-step10000/step_10000.safetensors.json", "model_config": { "dim": 768, "max_seq_len": 2500, "n_heads": 12, "n_layers": 12, "vocab_size": 32000 }, "num_parameters": 136128000, "num_tensors": 149, "provenance": { "checkpoint_dir": "/mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay", "checkpoint_name": "step_10000.pt", "checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay/step_10000.pt", "global_step": 10000, "packed_dataset_config_path": null, "run_dir": "/mnt/apps/llm-nanochat/artifacts/runs/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay", "tokenizer_dir": "/mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch", "training_config_path": "/home/descanso/.openclaw/workspace/python_project/llm-nanochat/configs/testing/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay.yaml" }, "safetensors_path": "/mnt/apps/llm-nanochat/hf_exports/gpt2small-en-it-nanochat-lr2e4-bs6-wsd-fastdecay-step10000/step_10000.safetensors", "source_checkpoint_path": "/mnt/apps/llm-nanochat/checkpoints/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay/step_10000.pt", "source_global_step": 10000, "tensor_names": [ "token_emb.weight", "pos_emb.weight", "blocks.layers.0.self_attn.in_proj_weight", "blocks.layers.0.self_attn.in_proj_bias", "blocks.layers.0.self_attn.out_proj.weight", "blocks.layers.0.self_attn.out_proj.bias", "blocks.layers.0.linear1.weight", "blocks.layers.0.linear1.bias", "blocks.layers.0.linear2.weight", "blocks.layers.0.linear2.bias", "blocks.layers.0.norm1.weight", "blocks.layers.0.norm1.bias", "blocks.layers.0.norm2.weight", "blocks.layers.0.norm2.bias", "blocks.layers.1.self_attn.in_proj_weight", "blocks.layers.1.self_attn.in_proj_bias", "blocks.layers.1.self_attn.out_proj.weight", "blocks.layers.1.self_attn.out_proj.bias", "blocks.layers.1.linear1.weight", "blocks.layers.1.linear1.bias", "blocks.layers.1.linear2.weight", "blocks.layers.1.linear2.bias", "blocks.layers.1.norm1.weight", "blocks.layers.1.norm1.bias", "blocks.layers.1.norm2.weight", "blocks.layers.1.norm2.bias", "blocks.layers.2.self_attn.in_proj_weight", "blocks.layers.2.self_attn.in_proj_bias", "blocks.layers.2.self_attn.out_proj.weight", "blocks.layers.2.self_attn.out_proj.bias", "blocks.layers.2.linear1.weight", "blocks.layers.2.linear1.bias", "blocks.layers.2.linear2.weight", "blocks.layers.2.linear2.bias", "blocks.layers.2.norm1.weight", "blocks.layers.2.norm1.bias", "blocks.layers.2.norm2.weight", "blocks.layers.2.norm2.bias", "blocks.layers.3.self_attn.in_proj_weight", "blocks.layers.3.self_attn.in_proj_bias", "blocks.layers.3.self_attn.out_proj.weight", "blocks.layers.3.self_attn.out_proj.bias", "blocks.layers.3.linear1.weight", "blocks.layers.3.linear1.bias", "blocks.layers.3.linear2.weight", "blocks.layers.3.linear2.bias", "blocks.layers.3.norm1.weight", "blocks.layers.3.norm1.bias", "blocks.layers.3.norm2.weight", "blocks.layers.3.norm2.bias", "blocks.layers.4.self_attn.in_proj_weight", "blocks.layers.4.self_attn.in_proj_bias", "blocks.layers.4.self_attn.out_proj.weight", "blocks.layers.4.self_attn.out_proj.bias", "blocks.layers.4.linear1.weight", "blocks.layers.4.linear1.bias", "blocks.layers.4.linear2.weight", "blocks.layers.4.linear2.bias", "blocks.layers.4.norm1.weight", "blocks.layers.4.norm1.bias", "blocks.layers.4.norm2.weight", "blocks.layers.4.norm2.bias", "blocks.layers.5.self_attn.in_proj_weight", "blocks.layers.5.self_attn.in_proj_bias", "blocks.layers.5.self_attn.out_proj.weight", "blocks.layers.5.self_attn.out_proj.bias", "blocks.layers.5.linear1.weight", "blocks.layers.5.linear1.bias", "blocks.layers.5.linear2.weight", "blocks.layers.5.linear2.bias", "blocks.layers.5.norm1.weight", "blocks.layers.5.norm1.bias", "blocks.layers.5.norm2.weight", "blocks.layers.5.norm2.bias", "blocks.layers.6.self_attn.in_proj_weight", "blocks.layers.6.self_attn.in_proj_bias", "blocks.layers.6.self_attn.out_proj.weight", "blocks.layers.6.self_attn.out_proj.bias", "blocks.layers.6.linear1.weight", "blocks.layers.6.linear1.bias", "blocks.layers.6.linear2.weight", "blocks.layers.6.linear2.bias", "blocks.layers.6.norm1.weight", "blocks.layers.6.norm1.bias", "blocks.layers.6.norm2.weight", "blocks.layers.6.norm2.bias", "blocks.layers.7.self_attn.in_proj_weight", "blocks.layers.7.self_attn.in_proj_bias", "blocks.layers.7.self_attn.out_proj.weight", "blocks.layers.7.self_attn.out_proj.bias", "blocks.layers.7.linear1.weight", "blocks.layers.7.linear1.bias", "blocks.layers.7.linear2.weight", "blocks.layers.7.linear2.bias", "blocks.layers.7.norm1.weight", "blocks.layers.7.norm1.bias", "blocks.layers.7.norm2.weight", "blocks.layers.7.norm2.bias", "blocks.layers.8.self_attn.in_proj_weight", "blocks.layers.8.self_attn.in_proj_bias", "blocks.layers.8.self_attn.out_proj.weight", "blocks.layers.8.self_attn.out_proj.bias", "blocks.layers.8.linear1.weight", "blocks.layers.8.linear1.bias", "blocks.layers.8.linear2.weight", "blocks.layers.8.linear2.bias", "blocks.layers.8.norm1.weight", "blocks.layers.8.norm1.bias", "blocks.layers.8.norm2.weight", "blocks.layers.8.norm2.bias", "blocks.layers.9.self_attn.in_proj_weight", "blocks.layers.9.self_attn.in_proj_bias", "blocks.layers.9.self_attn.out_proj.weight", "blocks.layers.9.self_attn.out_proj.bias", "blocks.layers.9.linear1.weight", "blocks.layers.9.linear1.bias", "blocks.layers.9.linear2.weight", "blocks.layers.9.linear2.bias", "blocks.layers.9.norm1.weight", "blocks.layers.9.norm1.bias", "blocks.layers.9.norm2.weight", "blocks.layers.9.norm2.bias", "blocks.layers.10.self_attn.in_proj_weight", "blocks.layers.10.self_attn.in_proj_bias", "blocks.layers.10.self_attn.out_proj.weight", "blocks.layers.10.self_attn.out_proj.bias", "blocks.layers.10.linear1.weight", "blocks.layers.10.linear1.bias", "blocks.layers.10.linear2.weight", "blocks.layers.10.linear2.bias", "blocks.layers.10.norm1.weight", "blocks.layers.10.norm1.bias", "blocks.layers.10.norm2.weight", "blocks.layers.10.norm2.bias", "blocks.layers.11.self_attn.in_proj_weight", "blocks.layers.11.self_attn.in_proj_bias", "blocks.layers.11.self_attn.out_proj.weight", "blocks.layers.11.self_attn.out_proj.bias", "blocks.layers.11.linear1.weight", "blocks.layers.11.linear1.bias", "blocks.layers.11.linear2.weight", "blocks.layers.11.linear2.bias", "blocks.layers.11.norm1.weight", "blocks.layers.11.norm1.bias", "blocks.layers.11.norm2.weight", "blocks.layers.11.norm2.bias", "ln_f.weight", "ln_f.bias", "head.weight" ], "tokenizer_reference": { "packed_dataset_config_path": null, "tokenizer_dir": "/mnt/apps/llm-nanochat/tokenizers/tok_202605011052_fresh_50_50_score100_32k_fromscratch", "training_config_path": "/home/descanso/.openclaw/workspace/python_project/llm-nanochat/configs/testing/20260517_stable-config-recipe-v5-gpt2small-lr2e4-batchmaxpossible-bs6-wsd-fastdecay.yaml" } }