misterJB committed on
Commit
3c90c91
·
verified ·
1 Parent(s): 877b37e

fix: CUDA Error 802 — move to main(), add warmup loop, device_map=auto

Browse files
Files changed (1) hide show
  1. train_dojo_v4.py +90 -87
train_dojo_v4.py CHANGED
@@ -7,13 +7,15 @@ Data: dojo_gold_v4_supplement.jsonl (576 examples)
7
  Out: misterJB/naima-dojo-741hz-v4
8
 
9
  v3 has no MXFP4 — no CC9.0 requirement, no bypass patch needed.
10
- Checkpoints go to /checkpoints (HF Storage Bucket mount) every 50 steps.
11
- Hub push happens at every checkpoint so progress survives timeout.
 
 
 
12
  """
13
- import os, torch
14
- from pathlib import Path
15
  from datasets import load_dataset
16
- from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer
17
  from trl import SFTConfig, SFTTrainer
18
 
19
  CHAMBER = "DOJO"
@@ -22,88 +24,89 @@ BASE = "misterJB/naima-dojo-741hz-v3"
22
  DATASET = "misterJB/dojo-manifestation-training"
23
  SUPPLEMENT = "dojo_gold_v4_supplement.jsonl"
24
  OUTPUT = "misterJB/naima-dojo-741hz-v4"
25
- CKPT_DIR = "/checkpoints/dojo-v4" # HF Storage Bucket mount
26
  HF_TOKEN = os.environ["HF_TOKEN"]
27
 
28
- print(f"=== {CHAMBER} {HZ}Hz v4 Supplement Top-Up ===")
29
- print(f"GPU : {torch.cuda.get_device_name(0) if torch.cuda.is_available() else 'none'}")
30
- print(f"VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB" if torch.cuda.is_available() else "VRAM: n/a")
31
- print(f"Base : {BASE}")
32
- print(f"Out : {OUTPUT}")
33
-
34
- # ── Load tokenizer ────────────────────────────────────────────────────────────
35
- tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True, token=HF_TOKEN)
36
- if tokenizer.pad_token is None:
37
- tokenizer.pad_token = tokenizer.eos_token
38
- tokenizer.model_max_length = 2048 # TRL 1.x: set here, not in SFTConfig/SFTTrainer
39
-
40
- # ── Load model (v3 is pure BF16 — no MXFP4 patch needed) ────────────────────
41
- print("Loading v3 weights (BF16)...")
42
- model = AutoModelForCausalLM.from_pretrained(
43
- BASE,
44
- torch_dtype=torch.bfloat16,
45
- device_map="cuda:0", # explicit single GPU — no CPU offload risk
46
- trust_remote_code=True,
47
- token=HF_TOKEN,
48
- )
49
- model.config.use_cache = False
50
- # gradient_checkpointing disabled — H200 150GB is sufficient, avoids backward stall
51
- param_count = sum(p.numel() for p in model.parameters())
52
- print(f"✅ Model loaded — {param_count / 1e9:.1f}B parameters")
53
-
54
- # ── Load supplement corpus ────────────────────────────────────────────────────
55
- ds = load_dataset(DATASET, data_files={"train": SUPPLEMENT}, split="train", token=HF_TOKEN)
56
- print(f"✅ Supplement corpus: {len(ds)} examples")
57
-
58
- # ── Check for existing checkpoint to resume ───────────────────────────────────
59
- resume_from = None
60
- ckpt_dir = Path(CKPT_DIR)
61
- if ckpt_dir.exists():
62
- checkpoints = sorted(ckpt_dir.glob("checkpoint-*"), key=lambda p: int(p.name.split("-")[1]))
63
- if checkpoints:
64
- resume_from = str(checkpoints[-1])
65
- print(f"✅ Resuming from checkpoint: {resume_from}")
66
  else:
67
- print("No prior checkpoint — starting fresh")
68
- else:
69
- print("No checkpoint dir starting fresh")
70
-
71
- # ── Training config ───────────────────────────────────────────────────────────
72
- # 576 examples / (batch 1 × grad_accum 16) = 36 optimizer steps/epoch; × 3 epochs = 108 total
73
- args = SFTConfig(
74
- output_dir=CKPT_DIR,
75
- num_train_epochs=3,
76
- per_device_train_batch_size=1,
77
- gradient_accumulation_steps=16, # effective batch 16
78
- learning_rate=5e-6, # half of v3 LR — prevents catastrophic forgetting
79
- warmup_steps=32, # TRL 1.1: warmup_ratio deprecated
80
- lr_scheduler_type="cosine",
81
- weight_decay=0.01,
82
- bf16=True,
83
- gradient_checkpointing=False, # H200 has 150GB — no need, avoids backward pass stall
84
- save_strategy="no", # no mid-run saves — 20B optimizer state fills disk
85
- logging_steps=1, # every step visible in job logs
86
- push_to_hub=True,
87
- hub_model_id=OUTPUT,
88
- hub_token=HF_TOKEN,
89
- hub_strategy="end", # push only at end — matches v3 proven strategy
90
- report_to="none",
91
- optim="paged_adamw_8bit", # 8-bit paged: fits in 150GB with 20B model
92
- dataloader_num_workers=0, # avoid data loader deadlock in container
93
- )
94
-
95
- trainer = SFTTrainer(
96
- model=model,
97
- args=args,
98
- train_dataset=ds,
99
- processing_class=tokenizer,
100
- )
101
-
102
- # ── Train ─────────────────────────────────────────────────────────────────────
103
- print("🔱 Training begins...")
104
- trainer.train(resume_from_checkpoint=resume_from)
105
-
106
- # ── Final push ────────────────────────────────────────────────────────────────
107
- trainer.push_to_hub(commit_message="DOJO 741Hz v4 — valformat gate binding, 3 epochs supplement")
108
- print(f"✅ DOJO v4 pushed to {OUTPUT}")
109
- print("→ NEXT: uv run python ◼︎DOJO/training/validate_spoke.py DOJO")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7
  Out: misterJB/naima-dojo-741hz-v4
8
 
9
  v3 has no MXFP4 — no CC9.0 requirement, no bypass patch needed.
10
+ Hub push at end (hub_strategy="end") proven in v3.
11
+
12
+ FIX (2026-04-21): CUDA Error 802 "system not yet initialized"
13
+ Root cause: module-level from_pretrained called before CUDA is ready in container.
14
+ Fix: all code moved into main(), CUDA warmup loop added, device_map="auto".
15
  """
16
+ import os, sys, time, torch
 
17
  from datasets import load_dataset
18
+ from transformers import AutoModelForCausalLM, AutoTokenizer
19
  from trl import SFTConfig, SFTTrainer
20
 
21
  CHAMBER = "DOJO"
 
24
  DATASET = "misterJB/dojo-manifestation-training"
25
  SUPPLEMENT = "dojo_gold_v4_supplement.jsonl"
26
  OUTPUT = "misterJB/naima-dojo-741hz-v4"
27
+ CKPT_DIR = "/tmp/dojo-v4-output"
28
  HF_TOKEN = os.environ["HF_TOKEN"]
29
 
30
+
31
+ def main():
32
+ # CUDA warmup: container GPU driver takes up to 60s to finish init
33
+ for attempt in range(6):
34
+ if torch.cuda.is_available() and torch.cuda.device_count() > 0:
35
+ break
36
+ print(f"[warmup] CUDA not ready (attempt {attempt+1}/6) — waiting 10s...", flush=True)
37
+ time.sleep(10)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
  else:
39
+ raise RuntimeError("CUDA unavailable after 60s aborting")
40
+
41
+ print(f"=== {CHAMBER} {HZ}Hz v4 Supplement Top-Up ===", flush=True)
42
+ print(f"GPU : {torch.cuda.get_device_name(0)}", flush=True)
43
+ print(f"VRAM : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB", flush=True)
44
+ print(f"Base : {BASE}", flush=True)
45
+ print(f"Out : {OUTPUT}", flush=True)
46
+
47
+ # ── Load tokenizer ────────────────────────────────────────────────────────────
48
+ tokenizer = AutoTokenizer.from_pretrained(BASE, trust_remote_code=True, token=HF_TOKEN)
49
+ if tokenizer.pad_token is None:
50
+ tokenizer.pad_token = tokenizer.eos_token
51
+ tokenizer.model_max_length = 2048
52
+
53
+ # ── Load model (v3 is pure BF16 — no MXFP4 patch needed) ────────────────────
54
+ print("Loading v3 weights (BF16)...", flush=True)
55
+ model = AutoModelForCausalLM.from_pretrained(
56
+ BASE,
57
+ torch_dtype=torch.bfloat16,
58
+ device_map="auto", # auto: Accelerate handles CUDA init gracefully
59
+ trust_remote_code=True,
60
+ token=HF_TOKEN,
61
+ )
62
+ model.config.use_cache = False
63
+ param_count = sum(p.numel() for p in model.parameters())
64
+ print(f"✅ Model loaded {param_count / 1e9:.1f}B parameters", flush=True)
65
+
66
+ # ── Load supplement corpus ────────────────────────────────────────────────────
67
+ ds = load_dataset(DATASET, data_files={"train": SUPPLEMENT}, split="train", token=HF_TOKEN)
68
+ print(f"✅ Supplement corpus: {len(ds)} examples", flush=True)
69
+
70
+ # ── Training config ───────────────────────────────────────────────────────────
71
+ # 576 examples / (batch 1 × grad_accum 16) = 36 optimizer steps/epoch; × 3 epochs = 108 total
72
+ args = SFTConfig(
73
+ output_dir=CKPT_DIR,
74
+ num_train_epochs=3,
75
+ per_device_train_batch_size=1,
76
+ gradient_accumulation_steps=16,
77
+ learning_rate=5e-6,
78
+ warmup_steps=32,
79
+ lr_scheduler_type="cosine",
80
+ weight_decay=0.01,
81
+ bf16=True,
82
+ gradient_checkpointing=False, # H200 has 150GB — no need, avoids backward stall
83
+ save_strategy="no", # no mid-run saves — 20B optimizer state fills disk
84
+ logging_steps=1,
85
+ push_to_hub=True,
86
+ hub_model_id=OUTPUT,
87
+ hub_token=HF_TOKEN,
88
+ hub_strategy="end", # proven in v3
89
+ report_to="none",
90
+ optim="paged_adamw_8bit",
91
+ dataloader_num_workers=0, # avoid data loader deadlock in container
92
+ )
93
+
94
+ trainer = SFTTrainer(
95
+ model=model,
96
+ args=args,
97
+ train_dataset=ds,
98
+ processing_class=tokenizer,
99
+ )
100
+
101
+ # ── Train ─────────────────────────────────────────────────────────────────────
102
+ print("🔱 Training begins...", flush=True)
103
+ trainer.train()
104
+
105
+ # ── Final push ────────────────────────────────────────────────────────────────
106
+ trainer.push_to_hub(commit_message="DOJO 741Hz v4 — valformat gate binding, 3 epochs supplement")
107
+ print(f"✅ DOJO v4 pushed to {OUTPUT}", flush=True)
108
+ print("→ NEXT: uv run python ◼︎DOJO/training/validate_spoke.py DOJO", flush=True)
109
+
110
+
111
+ if __name__ == "__main__":
112
+ main()