#!/usr/bin/env python3
"""
DOJO v4 supplement top-up — runs inside HuggingFace container.

Base:  misterJB/naima-dojo-741hz-v3  (pure BF16, quantization_config=null)
Data:  dojo_gold_v4_supplement.jsonl (576 examples)
Out:   misterJB/naima-dojo-741hz-v4

v3 has no MXFP4 — no CC9.0 requirement, no bypass patch needed.
Hub push at end (hub_strategy="end") — proven in v3.

FIX (2026-04-21): CUDA Error 802 "system not yet initialized"
  Root cause: module-level from_pretrained called before CUDA is ready in container.
  Fix: all code moved into main(), CUDA warmup loop added, device_map="auto".
"""

import os
import sys
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer

CHAMBER = "DOJO"
HZ = 741
BASE = "misterJB/naima-dojo-741hz-v3"
DATASET = "misterJB/dojo-manifestation-training"
SUPPLEMENT = "dojo_gold_v4_supplement.jsonl"
OUTPUT = "misterJB/naima-dojo-741hz-v4"
CKPT_DIR = "/tmp/dojo-v4-output"
# Read eagerly so a missing token fails at start-up (KeyError) rather than
# after the model has been downloaded.
HF_TOKEN = os.environ["HF_TOKEN"]


def _cuda_ready() -> bool:
    """Return True when torch reports CUDA available with at least one device."""
    return torch.cuda.is_available() and torch.cuda.device_count() > 0


def _wait_for_cuda(attempts: int = 6, delay_s: int = 10) -> None:
    """Block until CUDA is usable, polling up to ``attempts`` times.

    The probe is repeated once more after the last sleep, so the final wait
    is not wasted: a device that comes up during that last interval is
    detected instead of the run being aborted.

    Args:
        attempts: Number of probe/sleep rounds before giving up.
        delay_s: Seconds to sleep after each failed probe.

    Raises:
        RuntimeError: If CUDA is still unavailable after all rounds.
    """
    for attempt in range(1, attempts + 1):
        if _cuda_ready():
            return
        print(
            f"[warmup] CUDA not ready (attempt {attempt}/{attempts}) — waiting {delay_s}s...",
            flush=True,
        )
        time.sleep(delay_s)

    # Final probe after the last sleep.
    if _cuda_ready():
        return
    raise RuntimeError(f"CUDA unavailable after {attempts * delay_s}s — aborting")


def main() -> None:
    """Fine-tune the v3 checkpoint on the supplement corpus and push v4 to the Hub.

    Steps: wait for CUDA, load tokenizer and BF16 model from ``BASE``, load the
    ``SUPPLEMENT`` file from the ``DATASET`` repo, run SFT for three epochs,
    then push the result to ``OUTPUT``.

    Raises:
        RuntimeError: If CUDA never becomes available during warm-up.
    """
    # CUDA warmup: container GPU driver takes up to 60s to finish init
    _wait_for_cuda()

    print(f"=== {CHAMBER} {HZ}Hz v4 Supplement Top-Up ===", flush=True)
    print(f"GPU : {torch.cuda.get_device_name(0)}", flush=True)
    print(f"VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB", flush=True)
    print(f"Base : {BASE}", flush=True)
    print(f"Out : {OUTPUT}", flush=True)

    # ── Load tokenizer ────────────────────────────────────────────────────────
    tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        # No dedicated pad token: reuse EOS so batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): SFTConfig carries its own sequence-length setting in recent
    # TRL releases; confirm this assignment is what actually bounds truncation.
    tokenizer.model_max_length = 2048

    # ── Load model (v3 is pure BF16 — no MXFP4 patch needed) ──────────────────
    print("Loading v3 weights (BF16)...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE,
        torch_dtype=torch.bfloat16,
        # auto: Accelerate handles CUDA init gracefully.
        # NOTE(review): on a multi-GPU host "auto" may shard the model across
        # devices — confirm the container exposes a single GPU.
        device_map="auto",
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    model.config.use_cache = False  # KV cache is not needed while training
    param_count = sum(p.numel() for p in model.parameters())
    print(f"✅ Model loaded — {param_count / 1e9:.1f}B parameters", flush=True)

    # ── Load supplement corpus ────────────────────────────────────────────────
    ds = load_dataset(DATASET, data_files={"train": SUPPLEMENT}, split="train", token=HF_TOKEN)
    print(f"✅ Supplement corpus: {len(ds)} examples", flush=True)

    # ── Training config ───────────────────────────────────────────────────────
    # 576 examples / (batch 1 × grad_accum 16) = 36 optimizer steps per epoch;
    # × 3 epochs = 108 optimizer steps total (single device).
    # warmup_steps=32 is therefore roughly 30% of the whole run.
    args = SFTConfig(
        output_dir=CKPT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=5e-6,
        warmup_steps=32,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        bf16=True,
        gradient_checkpointing=False,  # author's note: H200 has ample VRAM — no need, avoids backward stall
        save_strategy="no",  # no mid-run saves — 20B optimizer state fills disk
        logging_steps=1,
        push_to_hub=True,
        hub_model_id=OUTPUT,
        hub_token=HF_TOKEN,
        hub_strategy="end",  # proven in v3
        report_to="none",
        optim="paged_adamw_8bit",
        dataloader_num_workers=0,  # avoid data loader deadlock in container
    )

    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=ds,
        processing_class=tokenizer,
    )

    # ── Train ─────────────────────────────────────────────────────────────────
    print("🔱 Training begins...", flush=True)
    trainer.train()

    # ── Final push ────────────────────────────────────────────────────────────
    # save_strategy="no" means nothing is saved during training, so the upload
    # is triggered explicitly here.
    trainer.push_to_hub(commit_message="DOJO 741Hz v4 — valformat gate binding, 3 epochs supplement")
    print(f"✅ DOJO v4 pushed to {OUTPUT}", flush=True)
    print("→ NEXT: uv run python ◼︎DOJO/training/validate_spoke.py DOJO", flush=True)


if __name__ == "__main__":
    main()