naima-dojo-741hz-v4 / train_dojo_v4.py
misterJB's picture
fix: CUDA Error 802 — move to main(), add warmup loop, device_map=auto
3c90c91 verified
Raw
History Blame Contribute Delete
5.22 kB
#!/usr/bin/env python3
"""
DOJO v4 supplement top-up — runs inside HuggingFace container.
Base: misterJB/naima-dojo-741hz-v3 (pure BF16, quantization_config=null)
Data: dojo_gold_v4_supplement.jsonl (576 examples)
Out: misterJB/naima-dojo-741hz-v4
v3 has no MXFP4 — no CC9.0 requirement, no bypass patch needed.
Hub push at end (hub_strategy="end") β€” proven in v3.
FIX (2026-04-21): CUDA Error 802 "system not yet initialized"
Root cause: module-level from_pretrained called before CUDA is ready in container.
Fix: all code moved into main(), CUDA warmup loop added, device_map="auto".
"""
import os, sys, time, torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer
# Labels printed in the start-up banner.
CHAMBER = "DOJO"
HZ = 741
# Hub repo whose tokenizer and weights are loaded as the starting checkpoint.
BASE = "misterJB/naima-dojo-741hz-v3"
# Hub dataset repo, and the JSONL file inside it that is loaded as the train split.
DATASET = "misterJB/dojo-manifestation-training"
SUPPLEMENT = "dojo_gold_v4_supplement.jsonl"
# Hub repo the fine-tuned model is pushed to.
OUTPUT = "misterJB/naima-dojo-741hz-v4"
# Local directory handed to the trainer as its output_dir.
CKPT_DIR = "/tmp/dojo-v4-output"
# Hub access token, read once at import; a missing HF_TOKEN variable raises KeyError here.
HF_TOKEN = os.environ["HF_TOKEN"]
def _wait_for_cuda(attempts=6, delay_s=10):
    """Poll until CUDA reports at least one device, or give up.

    The container's GPU driver can take up to a minute to finish
    initialising, so the check is retried instead of failing immediately.

    Args:
        attempts: Number of availability checks to make.
        delay_s: Seconds to sleep after each failed check.

    Raises:
        RuntimeError: If no CUDA device is visible after every attempt.
    """
    for attempt in range(attempts):
        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            return
        print(
            f"[warmup] CUDA not ready (attempt {attempt + 1}/{attempts}) — waiting {delay_s}s...",
            flush=True,
        )
        time.sleep(delay_s)
    raise RuntimeError(f"CUDA unavailable after {attempts * delay_s}s — aborting")


def main():
    """Fine-tune the v3 checkpoint on the v4 supplement and push it to the Hub.

    Waits for CUDA, prints a banner, loads the tokenizer and BF16 weights
    from BASE, loads SUPPLEMENT from the DATASET repo, runs 3 epochs of SFT
    and finally pushes the result to OUTPUT.

    Raises:
        RuntimeError: If CUDA never becomes available during the warmup wait.
    """
    # All CUDA-touching work happens after this wait; loading the model
    # before the driver is ready is what produced CUDA Error 802.
    _wait_for_cuda()

    print(f"=== {CHAMBER} {HZ}Hz v4 Supplement Top-Up ===", flush=True)
    print(f"GPU : {torch.cuda.get_device_name(0)}", flush=True)
    print(f"VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB", flush=True)
    print(f"Base : {BASE}", flush=True)
    print(f"Out : {OUTPUT}", flush=True)

    # ── Load tokenizer ──────────────────────────────────────────────────────
    tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        # Fall back to EOS so batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.model_max_length = 2048

    # ── Load model (v3 is pure BF16 — no MXFP4 patch needed) ────────────────
    print("Loading v3 weights (BF16)...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE,
        torch_dtype=torch.bfloat16,
        device_map="auto",  # auto: Accelerate handles CUDA init gracefully
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    # The KV cache is an inference-time feature; keep it off while training.
    model.config.use_cache = False
    param_count = sum(p.numel() for p in model.parameters())
    print(f"✅ Model loaded — {param_count / 1e9:.1f}B parameters", flush=True)

    # ── Load supplement corpus ──────────────────────────────────────────────
    ds = load_dataset(DATASET, data_files={"train": SUPPLEMENT}, split="train", token=HF_TOKEN)
    print(f"✅ Supplement corpus: {len(ds)} examples", flush=True)

    # ── Training config ─────────────────────────────────────────────────────
    # Assuming the 576-example supplement and a single training process:
    # 576 examples / (batch 1 × grad_accum 16) = 36 optimizer steps per epoch,
    # so 3 epochs ≈ 108 optimizer steps in total. warmup_steps=32 therefore
    # spans roughly the first 30% of the run.
    args = SFTConfig(
        output_dir=CKPT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=5e-6,
        warmup_steps=32,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        bf16=True,
        gradient_checkpointing=False,  # target GPU has enough VRAM — avoids backward stall
        save_strategy="no",  # no mid-run saves — 20B optimizer state fills disk
        logging_steps=1,
        push_to_hub=True,
        hub_model_id=OUTPUT,
        hub_token=HF_TOKEN,
        hub_strategy="end",  # proven in v3
        report_to="none",
        optim="paged_adamw_8bit",
        dataloader_num_workers=0,  # avoid data loader deadlock in container
    )
    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=ds,
        processing_class=tokenizer,
    )

    # ── Train ───────────────────────────────────────────────────────────────
    print("🔱 Training begins...", flush=True)
    trainer.train()

    # ── Final push ──────────────────────────────────────────────────────────
    # save_strategy="no" means nothing is uploaded during training, so the
    # model is pushed explicitly here.
    trainer.push_to_hub(commit_message="DOJO 741Hz v4 — valformat gate binding, 3 epochs supplement")
    print(f"✅ DOJO v4 pushed to {OUTPUT}", flush=True)
    # The directory name starts with U+25FC followed by U+FE0E (text-style
    # variation selector); written as escapes so the invisible selector survives edits.
    print("→ NEXT: uv run python \u25fc\ufe0eDOJO/training/validate_spoke.py DOJO", flush=True)


# Run the training job only when executed as a script, never on import.
if __name__ == "__main__":
    main()