# OpenWebText DD-v1 comparison run
# Matches lang-full-ddv1 proxy setup but trained on OpenWebText (~9.3B tokens)
# Purpose: matched-data comparison vs GPT-2 (same data, different architecture)

# Model
# NOTE(review): `proxy` is a symbolic name resolved by the training code; the
# actual layer/width settings are not visible in this file.
model_size: proxy

# AttnRes DD-v1
attn_res: true
# Six comma-separated integers, quoted so the value reaches the consumer as one
# string. NOTE(review): presumably layer indices marking AttnRes block
# boundaries — confirm against the model code.
attn_res_boundaries: "0,3,7,12,21,25"

# Data
# Read from /dev/shm (normally a RAM-backed tmpfs on Linux), so the .bin file
# has to be staged there before launch.
# NOTE(review): presumably pre-tokenized data, with "smollm2" naming the
# tokenizer — confirm.
data_path: /dev/shm/luxia-data/openwebtext_smollm2.bin
# 4096 x 16 = 65,536 tokens per micro-batch, assuming sequence_length is in
# tokens and micro_batch_size is in sequences — TODO confirm.
sequence_length: 4096
micro_batch_size: 16

# Training
# Token budget (9.3B). Keep this as plain digits: underscore digit separators
# are integers only under YAML 1.1 resolvers (e.g. PyYAML); the YAML 1.2 core
# schema would load such a value as a string.
total_tokens: 9300000000
# Separate learning rates for the Muon and AdamW parameter groups.
muon_lr: 0.02
adamw_lr: 0.0006
warmup_steps: 2000
# NOTE(review): despite the `_pct` suffix, 0.90 looks like a fraction of the
# run (decay begins at 90%) rather than a percentage — confirm against the
# scheduler.
decay_start_pct: 0.90
decay_type: sqrt
gradient_clip: 1.0

# Muon
muon_momentum: 0.95
muon_weight_decay: 0.01
# NOTE(review): "ns" presumably stands for Newton-Schulz (the orthogonalization
# step used by Muon); `gram_ns` is a symbolic name for a coefficient set that
# is defined in the training code, not in this file — confirm.
muon_ns_iterations: 5
muon_ns_coefficients: gram_ns

# NCA resume — co-trained NCA+AttnRes DD-v1 checkpoint (seed-17, 852M tokens)
# 852M tokens at step 6500 is about 131k tokens per step for that earlier run.
# The path is relative. NOTE(review): presumably resolved against the
# launcher's working directory — confirm.
resume_nca: checkpoints/nca-attnres-ddv1/step_00006500.pt

# Optimizations
# Feature toggles interpreted by the training code; their exact effect is not
# visible from this file.
compile: true
# `auto` leaves the choice of attention implementation to the training code.
attn_impl: auto
fp8: true
# NOTE(review): presumably toggles Liger kernels, disabled for this run —
# confirm.
use_liger: false

# Checkpointing — async, save every ~1B tokens, keep all for geometric analysis
checkpoint_dir: checkpoints/openwebtext-ddv1
# NOTE(review): 3800 matches "~1B tokens" only if save_every counts steps of
# ~262k tokens (3800 x 262,144 = ~0.996B, i.e. 4 x the 16 x 4096 micro-batch)
# — confirm against the launch world size / gradient accumulation.
save_every: 3800
# 9.3B total tokens at ~1B per save is about 9 checkpoints, so (assuming this
# is a retention cap) a limit of 20 never prunes anything, i.e. "keep all".
keep_checkpoints: 20
async_checkpoint: true
# Staging directory under /dev/shm. NOTE(review): presumably where async saves
# are written before landing in checkpoint_dir — confirm.
checkpoint_shm_dir: /dev/shm/luxia-base-ckpts

# Geometric monitoring
geo_monitor: true
# Two monitoring tiers on separate cadences. NOTE(review): presumably both are
# counted in training steps, with tier 2 the more expensive pass — confirm.
# 500 is not a multiple of 75, so the two tiers coincide only every 1500.
geo_monitor_tier1_every: 75
geo_monitor_tier2_every: 500

# Logging
# NOTE(review): presumably a step interval — confirm.
log_every: 10
# Weights & Biases reporting; project and run name identify this run there.
wandb: true
wandb_project: kotodama-ddv1-openwebtext
wandb_run_name: openwebtext-ddv1-9B-cotrained

# HF upload — push final checkpoint on completion
# Target repository in `namespace/name` form.
# NOTE(review): presumably requires Hugging Face credentials in the launch
# environment; none are configured in this file.
hf_upload_repo: aethera-gp/kotodama-owt-ddv1