{"input":"i need this to sound more warm and clear","target":"I need this to sound warmer and clearer."}
{"input":"rough text here","target":"Polished text here."}
"""Training loop, evaluation, checkpointing, generation, CLI commands, and UI. This module holds the behavioural core of EDEN. The cleanly separable pieces (configuration, model, data, runtime helpers) live in sibling modules and are imported here. """ from __future__ import annotations import argparse import gc import http.server import json import math import os import random import shlex import shutil import subprocess import sys import threading import time import urllib.parse from dataclasses import asdict from pathlib import Path from typing import Callable, Iterable import torch import torch.nn as nn import torch.nn.functional as F from .constants import * from .io_utils import * from .config import * from .runtime import * from .model import * from .data import * def lr_lambda_factory(total_updates: int, warmup: int, min_ratio: float): def lr_lambda(step: int) -> float: if step < warmup: return max(1e-6, step / max(1, warmup)) progress = (step - warmup) / max(1, total_updates - warmup) return max(min_ratio, 0.5 * (1.0 + math.cos(math.pi * progress))) return lr_lambda def checkpoint_payload( model: EdenTransformer, optimizer, scheduler, cfg: TrainConfig, epoch: int, step: int, best_val: float, completed_epoch: bool = False, ) -> dict: return { "model_state": {k: v.detach().cpu() for k, v in model.state_dict().items()}, "optimizer_state": optimizer.state_dict() if optimizer is not None else None, "scheduler_state": scheduler.state_dict() if scheduler is not None else None, "config": asdict(cfg), "epoch": epoch, "step": step, "best_val": best_val, "completed_epoch": completed_epoch, "special_tokens": SPECIAL_TOKENS, } def save_checkpoint( path: Path, model: EdenTransformer, optimizer, scheduler, cfg: TrainConfig, epoch: int, step: int, best_val: float, completed_epoch: bool = False, ) -> None: path.parent.mkdir(parents=True, exist_ok=True) tmp = path.with_suffix(".tmp") torch.save( checkpoint_payload(model, optimizer, scheduler, cfg, epoch, step, best_val, completed_epoch), tmp, ) tmp.replace(path) def load_checkpoint(path: Path, map_location: str | torch.device = "cpu") -> dict: return torch.load(path, map_location=map_location, weights_only=False) def latest_checkpoint() -> Path: candidates = all_checkpoint_files() if not candidates: raise FileNotFoundError("No checkpoint found. Train first with: python3 main.py train") best_candidates = [p for p in candidates if p.name == "best.pt"] if best_candidates: return best_candidates[0] return candidates[0] @torch.no_grad() def evaluate( model: EdenTransformer, rows: list[tuple[str, str]], tok, cfg: TrainConfig, device: torch.device, max_batches: int = 100, ) -> tuple[float, float]: model.eval() total_loss = 0.0 total_tokens = 0 correct = 0 batches = make_batches(rows, cfg.batch_size, shuffle_batches=False)[:max_batches] for batch in batches: batch_rows = [rows[i] for i in batch] src, tin, tout = collate_pairs(batch_rows, tok, cfg) src = src.to(device) tin = tin.to(device) tout = tout.to(device) logits = model(src, tin) loss = F.cross_entropy( logits.float().reshape(-1, logits.size(-1)), tout.reshape(-1), ignore_index=-100, reduction="sum", ) mask = tout.ne(-100) preds = logits.argmax(-1) correct += (preds[mask] == tout[mask]).sum().item() seen = mask.sum().item() total_tokens += seen total_loss += loss.item() del src, tin, tout, logits, loss, preds, mask model.train() return total_loss / max(1, total_tokens), correct / max(1, total_tokens) def build_model_from_cfg(cfg: TrainConfig, device: torch.device) -> EdenTransformer: model = EdenTransformer(cfg) return model.to(device) def maybe_prepare_data(args, cfg: TrainConfig) -> None: custom_data_requested = bool(getattr(args, "data", None)) if ( PAIRS_PATH.exists() and TOKENIZER_PATH.exists() and not getattr(args, "rebuild_data", False) and not custom_data_requested ): return prepare_args = argparse.Namespace( recipe=getattr(args, "recipe", "m5-smart"), max_pairs=getattr(args, "max_pairs", cfg.max_pairs), vocab_size=cfg.vocab_size, include_c4=getattr(args, "include_c4", False), data=getattr(args, "data", None), force=True, ) command_prepare(prepare_args) def train_loop( cfg: TrainConfig, rows: list[tuple[str, str]], tok, device: torch.device, resume_path: Path | None = None, finetune: bool = False, checkpoint_dir: Path | None = None, ) -> Path: checkpoint_dir = checkpoint_dir or CHECKPOINT_DIR checkpoint_dir.mkdir(parents=True, exist_ok=True) session_dir = checkpoint_dir.parent if checkpoint_dir.name == "checkpoints" else checkpoint_dir session_name = session_dir.name if session_dir != CHECKPOINT_DIR else "legacy" set_seed(cfg.seed) train_rows, val_rows = split_train_val(list(rows), cfg.val_split, cfg.seed) batches_per_epoch = math.ceil(len(train_rows) / cfg.batch_size) updates_per_epoch = math.ceil(batches_per_epoch / cfg.grad_accum) total_updates = max(1, updates_per_epoch * cfg.epochs) model = build_model_from_cfg(cfg, device) optimizer = torch.optim.AdamW( model.parameters(), lr=cfg.lr, betas=(0.9, 0.98), eps=1e-9, weight_decay=cfg.weight_decay, ) scheduler = torch.optim.lr_scheduler.LambdaLR( optimizer, lr_lambda_factory(total_updates, cfg.warmup_steps, cfg.min_lr_ratio), ) start_epoch = 1 global_step = 0 best_val = float("inf") if resume_path: ckpt = load_checkpoint(resume_path, map_location="cpu") model.load_state_dict(ckpt["model_state"]) model.to(device) if not finetune: if ckpt.get("optimizer_state"): optimizer.load_state_dict(ckpt["optimizer_state"]) if ckpt.get("scheduler_state"): scheduler.load_state_dict(ckpt["scheduler_state"]) saved_epoch = max(1, int(ckpt.get("epoch", 1))) start_epoch = saved_epoch + 1 if ckpt.get("completed_epoch") else saved_epoch global_step = int(ckpt.get("step", 0)) best_val = float(ckpt.get("best_val", best_val)) log(f"Loaded checkpoint: {resume_path}") exact_params = model.parameter_count() total_steps = max(1, batches_per_epoch * cfg.epochs) log("") log("EDEN training") log(f" device: {device}") log(f" model: {exact_params / 1e6:.1f}M parameters") log(f" data: {len(train_rows):,} train / {len(val_rows):,} validation pairs") log(f" context: {cfg.max_len} tokens") log(f" batch: {cfg.batch_size} x accum {cfg.grad_accum} = effective {cfg.batch_size * cfg.grad_accum}") log(f" session: {session_name}") log(f" checkpoints: {checkpoint_dir}") log("") write_run_state( status="running", mode="finetune" if finetune else "train", device=str(device), params=exact_params, train_pairs=len(train_rows), val_pairs=len(val_rows), epoch=start_epoch, epochs=cfg.epochs, completed_epochs=max(0, start_epoch - 1), epoch_progress=0.0, epoch_steps_done=0, epoch_total_steps=batches_per_epoch, step=global_step, total_steps=total_steps, best_val=None if best_val == float("inf") else best_val, checkpoint=str(checkpoint_dir / "latest.pt"), session=session_name, session_dir=str(session_dir), config=asdict(cfg), ) write_metric( "start", epoch=start_epoch, epochs=cfg.epochs, step=global_step, total_steps=total_steps, params=exact_params, train_pairs=len(train_rows), val_pairs=len(val_rows), device=str(device), ) optimizer.zero_grad(set_to_none=True) running_loss = 0.0 running_count = 0 last_log = time.time() for epoch in range(start_epoch, cfg.epochs + 1): model.train() batches = make_batches(train_rows, cfg.batch_size, shuffle_batches=True) for batch_i, batch in enumerate(batches, start=1): if PAUSE_REQUEST_PATH.exists(): pause_path = checkpoint_dir / "pause.pt" latest_path = checkpoint_dir / "latest.pt" save_checkpoint(pause_path, model, optimizer, scheduler, cfg, epoch, global_step, best_val) save_checkpoint(latest_path, model, optimizer, scheduler, cfg, epoch, global_step, best_val) try: PAUSE_REQUEST_PATH.unlink() except FileNotFoundError: pass write_run_state( status="paused", epoch=epoch, epochs=cfg.epochs, completed_epochs=max(0, epoch - 1), epoch_progress=batch_i / max(1, len(batches)), epoch_steps_done=batch_i, epoch_total_steps=len(batches), step=global_step, total_steps=total_steps, progress=min(1.0, global_step / total_steps), checkpoint=str(pause_path), best_val=None if best_val == float("inf") else best_val, ) write_metric( "pause", epoch=epoch, epochs=cfg.epochs, step=global_step, total_steps=total_steps, checkpoint=str(pause_path), ) cleanup_device(device) log(f"Training paused. Checkpoint saved: {pause_path}") return pause_path rss, total_ram, frac = memory_fraction() if frac >= cfg.memory_stop_fraction: path = checkpoint_dir / "watchdog.pt" save_checkpoint(path, model, optimizer, scheduler, cfg, epoch, global_step, best_val) write_run_state( status="stopped", reason="memory_watchdog", epoch=epoch, step=global_step, completed_epochs=max(0, epoch - 1), epoch_progress=batch_i / max(1, len(batches)), epoch_steps_done=batch_i, epoch_total_steps=len(batches), checkpoint=str(path), ram_gb=rss, ram_total_gb=total_ram, ram_fraction=frac, ) write_metric( "stop", reason="memory_watchdog", epoch=epoch, step=global_step, ram_gb=rss, ram_total_gb=total_ram, ram_fraction=frac, ) cleanup_device(device) raise SystemExit( f"Memory watchdog stopped safely at {rss:.1f}/{total_ram:.0f} GB " f"({frac * 100:.0f}%). Saved resumable checkpoint: {path}" ) batch_rows = [train_rows[i] for i in batch] src, tin, tout = collate_pairs(batch_rows, tok, cfg) src = src.to(device) tin = tin.to(device) tout = tout.to(device) logits = model(src, tin) loss = F.cross_entropy( logits.float().reshape(-1, logits.size(-1)), tout.reshape(-1), ignore_index=-100, label_smoothing=cfg.label_smoothing, ) if not torch.isfinite(loss): optimizer.zero_grad(set_to_none=True) cleanup_device(device) raise RuntimeError("Loss became NaN/Inf. Try lowering lr or using the survivor recipe.") (loss / cfg.grad_accum).backward() loss_value = float(loss.item()) running_loss += loss_value running_count += 1 global_step += 1 if global_step % cfg.grad_accum == 0 or batch_i == len(batches): grad_norm = nn.utils.clip_grad_norm_(model.parameters(), cfg.grad_clip) if torch.isfinite(grad_norm): optimizer.step() scheduler.step() optimizer.zero_grad(set_to_none=True) del src, tin, tout, logits, loss if global_step % cfg.empty_cache_every == 0: cleanup_device(device) if global_step % cfg.log_every_steps == 0: elapsed = time.time() - last_log avg_loss = running_loss / max(1, running_count) lr = scheduler.get_last_lr()[0] rss, total_ram, frac = memory_fraction() log( f"epoch {epoch}/{cfg.epochs} step {global_step:,} " f"loss {avg_loss:.4f} lr {lr:.2e} " f"ram {rss:.1f}/{total_ram:.0f}GB ({frac * 100:.0f}%) " f"{elapsed:.1f}s" ) progress = min(1.0, global_step / total_steps) write_run_state( status="running", epoch=epoch, epochs=cfg.epochs, completed_epochs=max(0, epoch - 1), epoch_progress=batch_i / max(1, len(batches)), epoch_steps_done=batch_i, epoch_total_steps=len(batches), step=global_step, total_steps=total_steps, train_loss=avg_loss, lr=lr, progress=progress, ram_gb=rss, ram_total_gb=total_ram, ram_fraction=frac, ) write_metric( "step", epoch=epoch, epochs=cfg.epochs, step=global_step, total_steps=total_steps, loss=avg_loss, lr=lr, progress=progress, ram_gb=rss, ram_total_gb=total_ram, ram_fraction=frac, ) running_loss = 0.0 running_count = 0 last_log = time.time() if global_step % cfg.eval_every_steps == 0: val_loss, token_acc = evaluate(model, val_rows, tok, cfg, device) log(f"validation step {global_step:,}: loss {val_loss:.4f}, token_acc {token_acc * 100:.1f}%") save_checkpoint(checkpoint_dir / "latest.pt", model, optimizer, scheduler, cfg, epoch, global_step, best_val) if val_loss < best_val: best_val = val_loss save_checkpoint(checkpoint_dir / "best.pt", model, optimizer, scheduler, cfg, epoch, global_step, best_val) log(f"new best checkpoint: {checkpoint_dir / 'best.pt'}") write_run_state( status="running", epoch=epoch, step=global_step, completed_epochs=max(0, epoch - 1), epoch_progress=batch_i / max(1, len(batches)), epoch_steps_done=batch_i, epoch_total_steps=len(batches), val_loss=val_loss, token_acc=token_acc, quality_percent=token_acc * 100.0, best_val=best_val, ) write_metric( "val", epoch=epoch, epochs=cfg.epochs, step=global_step, total_steps=total_steps, val_loss=val_loss, token_acc=token_acc, quality_percent=token_acc * 100.0, ) cleanup_device(device) if global_step % cfg.save_every_steps == 0: save_checkpoint(checkpoint_dir / "latest.pt", model, optimizer, scheduler, cfg, epoch, global_step, best_val) val_loss, token_acc = evaluate(model, val_rows, tok, cfg, device) log(f"end epoch {epoch}: val_loss {val_loss:.4f}, token_acc {token_acc * 100:.1f}%") save_checkpoint( checkpoint_dir / "latest.pt", model, optimizer, scheduler, cfg, epoch, global_step, best_val, completed_epoch=True, ) if val_loss < best_val: best_val = val_loss save_checkpoint( checkpoint_dir / "best.pt", model, optimizer, scheduler, cfg, epoch, global_step, best_val, completed_epoch=True, ) write_run_state( status="running", epoch=epoch, epochs=cfg.epochs, completed_epochs=epoch, epoch_progress=1.0, epoch_steps_done=len(batches), epoch_total_steps=len(batches), step=global_step, total_steps=total_steps, val_loss=val_loss, token_acc=token_acc, quality_percent=token_acc * 100.0, best_val=best_val, ) write_metric( "epoch", epoch=epoch, epochs=cfg.epochs, step=global_step, total_steps=total_steps, val_loss=val_loss, token_acc=token_acc, quality_percent=token_acc * 100.0, ) cleanup_device(device) final_path = checkpoint_dir / "final.pt" save_checkpoint( final_path, model, optimizer, scheduler, cfg, cfg.epochs, global_step, best_val, completed_epoch=True, ) if not (checkpoint_dir / "best.pt").exists(): shutil.copy2(final_path, checkpoint_dir / "best.pt") log(f"Training complete. Final checkpoint: {final_path}") write_run_state( status="done", epoch=cfg.epochs, epochs=cfg.epochs, completed_epochs=cfg.epochs, epoch_progress=1.0, epoch_steps_done=batches_per_epoch, epoch_total_steps=batches_per_epoch, step=global_step, total_steps=total_steps, progress=1.0, checkpoint=str(final_path), best_val=best_val, ) write_metric("done", epoch=cfg.epochs, step=global_step, total_steps=total_steps, best_val=best_val) return final_path @torch.no_grad() def beam_generate( model: EdenTransformer, src: torch.Tensor, cfg: TrainConfig, beam_size: int, max_new_tokens: int, length_penalty: float, repetition_penalty: float, ) -> list[int]: model.eval() device = src.device memory, src_padding = model.encode(src) beams: list[tuple[list[int], float, bool]] = [([BOS_ID], 0.0, False)] for _ in range(max_new_tokens): candidates: list[tuple[list[int], float, bool]] = [] active = [b for b in beams if not b[2]] if not active: break for tokens, score, done in beams: if done: candidates.append((tokens, score, True)) continue tgt = torch.tensor([tokens[-cfg.max_len:]], dtype=torch.long, device=device) hidden = model.decode(tgt, memory, src_padding) logits = model.lm_head(hidden[:, -1, :]).float().squeeze(0) if repetition_penalty != 1.0: for token_id in set(tokens): if 0 <= token_id < logits.numel(): logits[token_id] /= repetition_penalty logits[UNK_ID] = -float("inf") logits[PAD_ID] = -float("inf") logits[BOS_ID] = -float("inf") log_probs = F.log_softmax(logits, dim=-1) values, indices = torch.topk(log_probs, k=min(beam_size, log_probs.numel())) for value, index in zip(values.tolist(), indices.tolist()): new_tokens = tokens + [int(index)] candidates.append((new_tokens, score + float(value), int(index) == EOS_ID)) def rank(item: tuple[list[int], float, bool]) -> float: toks, score, _ = item length = max(1, len(toks) - 1) return score / (length ** length_penalty) candidates.sort(key=rank, reverse=True) beams = candidates[:beam_size] if all(done for _, _, done in beams): break best = max(beams, key=lambda item: item[1] / (max(1, len(item[0]) - 1) ** length_penalty)) out = best[0][1:] if EOS_ID in out: out = out[: out.index(EOS_ID)] return [t for t in out if t not in (PAD_ID, BOS_ID, EOS_ID, UNK_ID)] def clamp_float(value, low: float, high: float, default: float) -> float: try: number = float(value) except (TypeError, ValueError): number = default return max(low, min(high, number)) def clamp_int(value, low: int, high: int, default: int) -> int: try: number = int(value) except (TypeError, ValueError): number = default return max(low, min(high, number)) def filter_top_k_top_p(logits: torch.Tensor, top_k: int, top_p: float) -> torch.Tensor: filtered = logits.clone() if top_k > 0 and top_k < filtered.numel(): threshold = torch.topk(filtered, top_k).values[-1] filtered[filtered < threshold] = -float("inf") if 0.0 < top_p < 1.0: sorted_logits, sorted_indices = torch.sort(filtered, descending=True) probs = F.softmax(sorted_logits, dim=-1) cumulative = torch.cumsum(probs, dim=-1) remove = cumulative > top_p remove[1:] = remove[:-1].clone() remove[0] = False filtered[sorted_indices[remove]] = -float("inf") return filtered @torch.no_grad() def token_generate( model: EdenTransformer, src: torch.Tensor, cfg: TrainConfig, strategy: str, max_new_tokens: int, temperature: float, top_k: int, top_p: float, repetition_penalty: float, ) -> list[int]: model.eval() device = src.device memory, src_padding = model.encode(src) tokens = [BOS_ID] for _ in range(max_new_tokens): tgt = torch.tensor([tokens[-cfg.max_len:]], dtype=torch.long, device=device) hidden = model.decode(tgt, memory, src_padding) logits = model.lm_head(hidden[:, -1, :]).float().squeeze(0) if repetition_penalty != 1.0: for token_id in set(tokens): if 0 <= token_id < logits.numel(): logits[token_id] /= repetition_penalty logits[UNK_ID] = -float("inf") logits[PAD_ID] = -float("inf") logits[BOS_ID] = -float("inf") if strategy == "sample": logits = logits / max(0.05, temperature) logits = filter_top_k_top_p(logits, top_k, top_p) probs = F.softmax(logits, dim=-1) if not torch.isfinite(probs).all() or float(probs.sum().item()) <= 0: next_id = int(torch.argmax(logits).item()) else: next_id = int(torch.multinomial(probs.detach().cpu(), 1).item()) else: next_id = int(torch.argmax(logits).item()) if next_id == EOS_ID: break if next_id not in (PAD_ID, BOS_ID, EOS_ID, UNK_ID): tokens.append(next_id) return tokens[1:] def decode_token_piece(tok, token_id: int) -> str: text = tok.decode([token_id]).replace("\u0120", " ").replace("\u010a", "\n") return text if text else f"[{token_id}]" def chunk_text_for_model(text: str, tok, cfg: TrainConfig) -> list[str]: text = normalise_text(text) ids = tok.encode(text).ids max_src = cfg.max_len - 2 if len(ids) <= max_src: return [text] chunks = [] current = [] current_ids = [] for sent in sentence_split(text) or [text]: sent_ids = tok.encode(sent).ids if current and len(current_ids) + len(sent_ids) > max_src: chunks.append(" ".join(current)) current = [] current_ids = [] if len(sent_ids) > max_src: for i in range(0, len(sent_ids), max_src): chunks.append(tok.decode(sent_ids[i : i + max_src])) else: current.append(sent) current_ids.extend(sent_ids) if current: chunks.append(" ".join(current)) return chunks def enhance_text( text: str, model: EdenTransformer, tok, cfg: TrainConfig, device: torch.device, beam_size: int | None = None, strategy: str = "beam", max_new_tokens: int | None = None, temperature: float = 0.7, top_k: int = 40, top_p: float = 0.9, length_penalty: float | None = None, repetition_penalty: float | None = None, return_details: bool = False, ): strategy = strategy if strategy in {"beam", "greedy", "sample"} else "beam" beam = max(1, int(beam_size or cfg.beam_size)) max_tokens = clamp_int(max_new_tokens, 8, max(8, cfg.max_len - 1), min(256, cfg.max_len - 1)) temp = clamp_float(temperature, 0.05, 2.0, 0.7) top_k_value = clamp_int(top_k, 0, 200, 40) top_p_value = clamp_float(top_p, 0.05, 1.0, 0.9) len_penalty = clamp_float(length_penalty, 0.05, 2.0, cfg.length_penalty) rep_penalty = clamp_float(repetition_penalty, 1.0, 2.0, cfg.repetition_penalty) outputs = [] trace = [] for chunk in chunk_text_for_model(text, tok, cfg): src_tokens = tok.encode(chunk).ids[: cfg.max_len - 2] src = torch.tensor([[BOS_ID] + src_tokens + [EOS_ID]], dtype=torch.long, device=device) if strategy == "beam": out_ids = beam_generate( model, src, cfg, beam_size=max(1, beam), max_new_tokens=max_tokens, length_penalty=len_penalty, repetition_penalty=rep_penalty, ) else: out_ids = token_generate( model, src, cfg, strategy=strategy, max_new_tokens=max_tokens, temperature=temp, top_k=top_k_value, top_p=top_p_value, repetition_penalty=rep_penalty, ) trace.extend(decode_token_piece(tok, token_id) for token_id in out_ids[:400]) decoded = tok.decode(out_ids).replace("\u0120", " ").replace("\u010a", "\n") decoded = normalise_text(decoded) outputs.append(decoded or chunk) result = normalise_text(" ".join(outputs)) if return_details: return { "output": result, "tokens": trace, "settings": { "strategy": strategy, "beam_size": beam, "max_new_tokens": max_tokens, "temperature": temp, "top_k": top_k_value, "top_p": top_p_value, "length_penalty": len_penalty, "repetition_penalty": rep_penalty, }, } return result def load_model_for_inference(checkpoint: Path | None, force_cpu: bool = False): ckpt_path = checkpoint or latest_checkpoint() ckpt = load_checkpoint(ckpt_path, map_location="cpu") cfg = TrainConfig(**ckpt["config"]) tok = load_tokenizer() device = device_for_training(force_cpu) model = build_model_from_cfg(cfg, device) model.load_state_dict(ckpt["model_state"]) model.to(device) model.eval() log(f"Loaded {ckpt_path.name} on {device} ({model.parameter_count() / 1e6:.1f}M params)") return model, tok, cfg, device def command_prepare(args) -> None: ensure_dirs() cfg = apply_recipe(args.recipe) if args.max_pairs: cfg.max_pairs = int(args.max_pairs) if args.vocab_size: cfg.vocab_size = int(args.vocab_size) if PAIRS_PATH.exists() and TOKENIZER_PATH.exists() and not args.force: log(f"Prepared data already exists in {DATA_DIR}") log("Use --force to rebuild it.") return set_seed(cfg.seed) rows: list[tuple[str, str]] = [] if args.data: custom_path = Path(args.data) log(f"Loading custom pairs from {custom_path}...") rows.extend(read_pairs_file(custom_path)) builtin_limit = max(1000, cfg.max_pairs - len(rows)) rows.extend(load_builtin_pairs(builtin_limit, args.include_c4, log)) rows = dedupe_pairs(rows, cfg.max_pairs) random.shuffle(rows) save_pairs(rows, PAIRS_PATH) log(f"Saved {len(rows):,} training pairs to {PAIRS_PATH}") log(f"Training tokenizer with vocab_size={cfg.vocab_size}...") tok = train_tokenizer(rows, cfg.vocab_size, TOKENIZER_PATH) cfg.vocab_size = tok.get_vocab_size() CONFIG_PATH.write_text(json.dumps(asdict(cfg), indent=2), encoding="utf-8") log(f"Tokenizer saved to {TOKENIZER_PATH} (actual vocab={cfg.vocab_size})") log("Prepare complete.") def command_train(args) -> None: ensure_dirs() resume = Path(args.resume) if args.resume else None if resume: ckpt = load_checkpoint(resume, map_location="cpu") cfg = TrainConfig(**ckpt["config"]) else: cfg = apply_recipe(args.recipe) if resume: log("Resume mode: using the checkpoint's saved training settings.") if args.epochs and not resume: cfg.epochs = int(args.epochs) if args.max_pairs and not resume: cfg.max_pairs = int(args.max_pairs) if args.lr and not resume: cfg.lr = float(args.lr) if getattr(args, "max_len", None) and not resume: cfg.max_len = int(args.max_len) if getattr(args, "batch_size", None) and not resume: cfg.batch_size = int(args.batch_size) if getattr(args, "grad_accum", None) and not resume: cfg.grad_accum = int(args.grad_accum) if getattr(args, "memory_stop_fraction", None) and not resume: cfg.memory_stop_fraction = float(args.memory_stop_fraction) maybe_prepare_data(args, cfg) rows = load_prepared_pairs(PAIRS_PATH) if not rows: raise SystemExit("No training pairs found. Run: python3 main.py prepare") rows = rows[: cfg.max_pairs] tok = load_tokenizer(TOKENIZER_PATH) tokenizer_vocab = tok.get_vocab_size() if resume and tokenizer_vocab != cfg.vocab_size: raise SystemExit( "The tokenizer does not match this checkpoint. Resume needs the same " "eden_system/data/tokenizer.json that was used when the checkpoint was created." ) cfg.vocab_size = tokenizer_vocab CONFIG_PATH.write_text(json.dumps(asdict(cfg), indent=2), encoding="utf-8") device = device_for_training(args.force_cpu) if resume: resume_session = session_dir_from_checkpoint(resume) session_dir = resume_session or next_training_session_dir() else: session_dir = next_training_session_dir() train_loop( cfg, rows, tok, device, resume_path=resume, finetune=False, checkpoint_dir=session_dir / "checkpoints", ) def command_finetune(args) -> None: ensure_dirs() base = Path(args.checkpoint) if args.checkpoint else latest_checkpoint() ckpt = load_checkpoint(base, map_location="cpu") cfg = TrainConfig(**ckpt["config"]) cfg.epochs = int(args.epochs) cfg.lr = float(args.lr) cfg.max_pairs = int(args.max_pairs) if args.max_pairs else cfg.max_pairs cfg.warmup_steps = min(cfg.warmup_steps, 200) custom = read_pairs_file(Path(args.data)) if args.mix_base and PAIRS_PATH.exists(): base_rows = load_prepared_pairs(PAIRS_PATH) random.shuffle(base_rows) keep = min(len(base_rows), max(len(custom), 2000)) rows = dedupe_pairs(custom + base_rows[:keep], cfg.max_pairs) else: rows = dedupe_pairs(custom, cfg.max_pairs) if len(rows) < 10: raise SystemExit("Fine-tuning needs at least 10 valid input/target pairs.") tok = load_tokenizer(TOKENIZER_PATH) tokenizer_vocab = tok.get_vocab_size() if tokenizer_vocab != cfg.vocab_size: raise SystemExit( "The tokenizer does not match this checkpoint. Fine-tuning needs the same " "eden_system/data/tokenizer.json that was used when the checkpoint was created." ) cfg.vocab_size = tokenizer_vocab device = device_for_training(args.force_cpu) log(f"Fine-tuning from {base} on {len(rows):,} pairs...") session_dir = next_training_session_dir() train_loop(cfg, rows, tok, device, resume_path=base, finetune=True, checkpoint_dir=session_dir / "checkpoints") def command_enhance(args) -> None: model, tok, cfg, device = load_model_for_inference( Path(args.checkpoint) if args.checkpoint else None, force_cpu=args.force_cpu, ) text = " ".join(args.text).strip() if not text: text = sys.stdin.read().strip() if not text: raise SystemExit("Provide text as an argument or via stdin.") result = enhance_text(text, model, tok, cfg, device, beam_size=args.beam) print(result) def command_interactive(args) -> None: model, tok, cfg, device = load_model_for_inference( Path(args.checkpoint) if args.checkpoint else None, force_cpu=args.force_cpu, ) log("Interactive EDEN. Type text and press Enter. Type /quit to stop.") while True: try: text = input("\nrough> ").strip() except (EOFError, KeyboardInterrupt): print() break if text.lower() in {"/q", "/quit", "quit", "exit"}: break if not text: continue print("clean> " + enhance_text(text, model, tok, cfg, device, beam_size=args.beam)) def command_eval(args) -> None: model, tok, cfg, device = load_model_for_inference( Path(args.checkpoint) if args.checkpoint else None, force_cpu=args.force_cpu, ) rows = load_prepared_pairs(PAIRS_PATH) if args.data: rows = read_pairs_file(Path(args.data)) if args.samples: rows = rows[: int(args.samples)] if not rows: raise SystemExit("No eval pairs found.") loss, acc = evaluate(model, rows, tok, cfg, device, max_batches=args.max_batches) log(f"eval loss {loss:.4f}, token_acc {acc * 100:.1f}% on {len(rows):,} pairs") def command_info(_args) -> None: ensure_dirs() log(f"Workspace: {ROOT}") log(f"System dir: {SYSTEM_DIR}") log(f"PyTorch: {torch.__version__}") log(f"MPS built: {torch.backends.mps.is_built()}") log(f"MPS available: {torch.backends.mps.is_available()}") for name in RECIPES: cfg = apply_recipe(name) log( f"Recipe {name}: ~{model_param_count(cfg) / 1e6:.1f}M params, " f"ctx {cfg.max_len}, batch {cfg.batch_size} x accum {cfg.grad_accum}, " f"pairs {cfg.max_pairs:,}" ) if PAIRS_PATH.exists(): log(f"Prepared pairs: {sum(1 for _ in PAIRS_PATH.open('r', encoding='utf-8')):,}") if TOKENIZER_PATH.exists(): tok = load_tokenizer(TOKENIZER_PATH) log(f"Tokenizer vocab: {tok.get_vocab_size():,}") checkpoints = all_checkpoint_files() if checkpoints: log("Checkpoints:") for path in checkpoints[:8]: try: label = str(path.relative_to(SYSTEM_DIR)) except ValueError: label = str(path) log(f" {label} ({path.stat().st_size / 1024 ** 2:.1f} MB)") def command_smoke(args) -> None: ensure_dirs() smoke_dir = SYSTEM_DIR / "smoke" smoke_dir.mkdir(parents=True, exist_ok=True) rows = [] for clean in SEED_CLEAN_SENTENCES: add_pair(rows, corrupt_sentence(clean, 0.5), clean) add_pair(rows, clean.lower(), clean) rows = dedupe_pairs(rows, 32) tok_path = smoke_dir / "tokenizer.json" tok = train_tokenizer(rows, 512, tok_path) cfg = TrainConfig( vocab_size=tok.get_vocab_size(), d_model=64, n_heads=4, n_layers=1, dim_feedforward=128, max_len=64, batch_size=2, grad_accum=1, epochs=1, eval_every_steps=9999, save_every_steps=9999, log_every_steps=1, memory_stop_fraction=0.95, ) device = device_for_training(args.force_cpu) model = build_model_from_cfg(cfg, device) opt = torch.optim.AdamW(model.parameters(), lr=1e-3) for step in range(3): batch = rows[step * 2 : step * 2 + 2] src, tin, tout = collate_pairs(batch, tok, cfg) src, tin, tout = src.to(device), tin.to(device), tout.to(device) logits = model(src, tin) loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), tout.reshape(-1), ignore_index=-100) loss.backward() opt.step() opt.zero_grad(set_to_none=True) log(f"smoke step {step + 1}: loss {float(loss.item()):.4f}") cleanup_device(device) log("Smoke test passed.") def checkpoint_options() -> list[dict]: ensure_dirs() paths = all_checkpoint_files() out = [] for path in paths: try: stat = path.stat() except OSError: continue session_dir = session_dir_from_checkpoint(path) session = session_dir.name if session_dir else "legacy" try: label = str(path.relative_to(SYSTEM_DIR)) except ValueError: label = str(path) out.append({ "name": path.name, "path": str(path), "label": label, "session": session, "size_mb": stat.st_size / 1024 ** 2, "mtime": stat.st_mtime, }) return out def resolve_checkpoint_path(value: str | None) -> Path: if not value: return latest_checkpoint() path = Path(value).expanduser() if not path.is_absolute(): path = ROOT / path path = path.resolve() if not path.exists(): raise FileNotFoundError(f"Checkpoint not found: {path}") if path.suffix != ".pt": raise ValueError("Checkpoint must be a .pt file.") return path def resolve_finetune_data_path(value: str | None) -> Path: if not value: raise ValueError("Choose a fine-tune data file first.") path = Path(value).expanduser() if not path.is_absolute(): path = ROOT / path path = path.resolve() if not path.exists(): raise FileNotFoundError(f"Fine-tune data file not found: {path}") if path.suffix.lower() not in {".jsonl", ".ndjson", ".json", ".csv", ".tsv"}: raise ValueError("Fine-tune data must be JSONL, JSON, CSV, or TSV.") return path UI_HTML = r"""
{"input":"i need this to sound more warm and clear","target":"I need this to sound warmer and clearer."}
{"input":"rough text here","target":"Polished text here."}