"""
DOJO v4 supplement top-up — runs inside HuggingFace container.

Base: misterJB/naima-dojo-741hz-v3 (pure BF16, quantization_config=null)
Data: dojo_gold_v4_supplement.jsonl (576 examples)
Out: misterJB/naima-dojo-741hz-v4

v3 has no MXFP4 — no CC9.0 requirement, no bypass patch needed.
Hub push at end (hub_strategy="end") — proven in v3.

FIX (2026-04-21): CUDA Error 802 "system not yet initialized"
Root cause: module-level from_pretrained called before CUDA is ready in container.
Fix: all code moved into main(), CUDA warmup loop added, device_map="auto".
"""
import os
import sys
import time

import torch
from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer
from trl import SFTConfig, SFTTrainer


# Labels shown in the startup banner.
CHAMBER = "DOJO"
HZ = 741
# Hub repository of the checkpoint that training starts from.
BASE = "misterJB/naima-dojo-741hz-v3"
# Hub dataset repository, and the JSONL file inside it used as the train split.
DATASET = "misterJB/dojo-manifestation-training"
SUPPLEMENT = "dojo_gold_v4_supplement.jsonl"
# Hub repository the fine-tuned model is pushed to.
OUTPUT = "misterJB/naima-dojo-741hz-v4"
# Local directory handed to the trainer as its output_dir.
CKPT_DIR = "/tmp/dojo-v4-output"
# Read eagerly: a missing HF_TOKEN raises KeyError as soon as the module loads.
HF_TOKEN = os.environ["HF_TOKEN"]


def main():
    """Fine-tune the v3 base model on the v4 supplement and push it to the Hub.

    Steps, in order: wait for CUDA to become available, load the tokenizer
    and BF16 weights from ``BASE``, load ``SUPPLEMENT`` from the ``DATASET``
    repository, run supervised fine-tuning, and push the result to ``OUTPUT``.

    Raises:
        RuntimeError: if CUDA is still unavailable after six checks spaced
            10 seconds apart.
    """
    # --- CUDA warmup ------------------------------------------------------
    # The GPU may not be initialised yet when the container starts (see the
    # Error 802 note in the module docstring), so poll before touching it.
    # The `else` belongs to the `for`: it runs only if no `break` happened.
    for attempt in range(6):
        if torch.cuda.is_available() and torch.cuda.device_count() > 0:
            break
        print(f"[warmup] CUDA not ready (attempt {attempt+1}/6) — waiting 10s...", flush=True)
        time.sleep(10)
    else:
        raise RuntimeError("CUDA unavailable after 60s — aborting")

    # --- Banner -----------------------------------------------------------
    print(f"=== {CHAMBER} {HZ}Hz v4 Supplement Top-Up ===", flush=True)
    print(f"GPU : {torch.cuda.get_device_name(0)}", flush=True)
    print(f"VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB", flush=True)
    print(f"Base : {BASE}", flush=True)
    print(f"Out : {OUTPUT}", flush=True)

    # --- Tokenizer --------------------------------------------------------
    tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True, token=HF_TOKEN)
    if tokenizer.pad_token is None:
        # Fall back to EOS so batches can be padded.
        tokenizer.pad_token = tokenizer.eos_token
    # NOTE(review): whether this cap is what SFTTrainer actually truncates to
    # depends on the installed TRL version (SFTConfig has its own length
    # setting) — confirm against the training logs.
    tokenizer.model_max_length = 2048

    # --- Model ------------------------------------------------------------
    print("Loading v3 weights (BF16)...", flush=True)
    model = AutoModelForCausalLM.from_pretrained(
        BASE,
        torch_dtype=torch.bfloat16,
        device_map="auto",
        trust_remote_code=True,
        token=HF_TOKEN,
    )
    # Turn off the generation cache for the training run.
    model.config.use_cache = False
    param_count = sum(p.numel() for p in model.parameters())
    print(f"✅ Model loaded — {param_count / 1e9:.1f}B parameters", flush=True)

    # --- Dataset ----------------------------------------------------------
    # Only the supplement file is pulled from the dataset repository.
    ds = load_dataset(DATASET, data_files={"train": SUPPLEMENT}, split="train", token=HF_TOKEN)
    print(f"✅ Supplement corpus: {len(ds)} examples", flush=True)

    # --- Training configuration -------------------------------------------
    # Batch size 1 with 16 accumulation steps gives 16 sequences per
    # optimizer step per device.  No intermediate checkpoints are written
    # (save_strategy="no"); the Hub upload happens once, at the end.
    args = SFTConfig(
        output_dir=CKPT_DIR,
        num_train_epochs=3,
        per_device_train_batch_size=1,
        gradient_accumulation_steps=16,
        learning_rate=5e-6,
        warmup_steps=32,
        lr_scheduler_type="cosine",
        weight_decay=0.01,
        bf16=True,
        gradient_checkpointing=False,
        save_strategy="no",
        logging_steps=1,
        push_to_hub=True,
        hub_model_id=OUTPUT,
        hub_token=HF_TOKEN,
        hub_strategy="end",
        report_to="none",
        # NOTE: the paged 8-bit AdamW optimizer is provided through
        # bitsandbytes, which must be installed in the container.
        optim="paged_adamw_8bit",
        dataloader_num_workers=0,
    )

    trainer = SFTTrainer(
        model=model,
        args=args,
        train_dataset=ds,
        processing_class=tokenizer,
    )

    # --- Train ------------------------------------------------------------
    print("🌱 Training begins...", flush=True)
    trainer.train()

    # --- Publish ----------------------------------------------------------
    trainer.push_to_hub(commit_message="DOJO 741Hz v4 — valformat gate binding, 3 epochs supplement")
    print(f"✅ DOJO v4 pushed to {OUTPUT}", flush=True)
    print("→ NEXT: uv run python ◼︎DOJO/training/validate_spoke.py DOJO", flush=True)


# Run the training job only when executed as a script, not on import.
if __name__ == "__main__":
    main()