"""#17 design-soul — the DEEPEST activation: mechanistic steering of the residual stream
(ActAdd / Representation Engineering, Zou et al. 2023; Contrastive Activation Addition, Rimsky et al. 2024).

The art/design heritage (Rams, Swiss, Albers, Bauhaus) is encoded in the weights but LATENT.
The activation spectrum, shallow → deep:
  0. PROMPT naming ("Swiss style") — weak token-level activation (the CANON does this)
  1. DENSE semantic priming (names+era+type) — floods the context toward the art subspace
  2. FEW-SHOT exemplars (the gold seeds) — in-context pattern-match, far stronger than description
  3. PERSONA ("you are Müller-Brockmann") — role-conditions the whole distribution
  4. ACTIVATION STEERING ← THIS FILE — inject the "elite" DIRECTION into the residual stream during the
     forward pass, so even a GENERIC prompt yields elite output. Bypasses prompting entirely.
  5. HEAL (weights) — SFT on elite-bespoke bakes the same direction permanently.

The steering vector and the heal are two views of ONE thing: move the model onto the elite-art manifold —
transiently (4, a vector added at inference) or permanently (5, folded into the weights by the flywheel).
For a MoE there are TWO injection sites (see `ExpertSteer` note): the residual stream AND the router.

GPU-free: the vector math + injection logic is unit-tested below with mock activations. Capturing real
activations + generating steered needs the model (GPU) — runs after miniF2F.
"""
import mlx.core as mx
import mlx.nn as nn


def steering_vector(elite_acts: mx.array, generic_acts: mx.array) -> mx.array:
    """Return the unit-norm 'elite design' DIRECTION in activation space.

    The direction is mean(elite residual) − mean(generic residual), divided by its L2 norm
    (plus 1e-8 so an all-zero difference yields a zero vector instead of NaN).

    elite_acts / generic_acts are [N, d] mean-pooled residual-stream activations at one layer,
    captured by running the model on elite-bespoke vs cookie-cutter designs (the same contrast
    the gate enforces). The two sets may have different N.

    The arithmetic is done in float32 and the result is float32: the two means usually share a
    large common component, so subtracting them in bf16/fp16 would keep mostly rounding noise.

    Raises:
        ValueError: if either activation set is empty (its mean would be NaN).
    """
    if elite_acts.size == 0 or generic_acts.size == 0:
        raise ValueError("steering_vector needs at least one elite and one generic activation")
    elite_mean = elite_acts.astype(mx.float32).mean(axis=0)
    generic_mean = generic_acts.astype(mx.float32).mean(axis=0)
    v = elite_mean - generic_mean
    return v / (mx.linalg.norm(v) + 1e-8)


def inject(h: mx.array, v: mx.array, alpha: float = 6.0) -> mx.array:
    """Add the steering direction to the residual stream: h ← h + α·‖h‖·v.

    Scaling by the per-token activation norm ‖h‖ (taken over the last axis) keeps the nudge
    proportional across layers/positions (stable). Hook this after a mid/late decoder layer in
    glm_moe_dsa's forward. α≈4–10 typical; too high → degenerate, too low → no effect (sweep it).

    h is [..., d] and v is [d] (broadcast over the leading axes). The result always has h's
    dtype: the update is computed in float32 and cast back, so a float32 steering vector does
    not silently promote a bf16/fp16 residual stream (and everything downstream) to float32.
    """
    # Norm in float32: avoids low-precision rounding/overflow in the sum of squares.
    scale = mx.linalg.norm(h.astype(mx.float32), axis=-1, keepdims=True)
    delta = alpha * scale * v.astype(mx.float32)
    # Cast the update, not h, so h's own values are left untouched by any round trip.
    return h + delta.astype(h.dtype)


# ── MoE-specific DEEPEST activation (note for the GPU pass) ──────────────────────────────────────────────
# Our model is MoE (77 pruned experts). Beyond the residual stream, the ROUTER is a second steering site: if a
# subset of experts encode the aesthetic/design knowledge, add a bias to their router logits (like the Lean
# tactic logit-bias, but on expert selection) → activate the "design experts" directly. Identify them by which
# experts fire most on the elite seeds vs generic (gather router probs over both sets; the high-elite/low-generic
# experts are the aesthetic ones). This is novel + MoE-native; residual steering (above) is the model-agnostic core.
class _SteeredLayer(nn.Module):
    """Wraps a decoder layer; injects the steering direction into its residual-stream output — the rung-4 hook.

    `inner` is the wrapped layer (any callable), `steer` the direction handed to `inject`, and
    `alpha` the steering strength. All call arguments are forwarded to `inner` untouched.

    NOTE(review): assigning an mx.array attribute on an nn.Module presumably registers `steer`
    in the module's parameter tree — confirm that is acceptable for saving/fine-tuning while hooked.
    """

    def __init__(self, inner, v, alpha):
        super().__init__()
        self.inner = inner
        self.steer = v
        self.alpha = alpha

    def __call__(self, *a, **k):
        out = self.inner(*a, **k)
        if isinstance(out, tuple):
            # glm_moe_dsa layers return (h, shared_topk): steer only h, pass the rest through.
            return (inject(out[0], self.steer, self.alpha), *out[1:])
        return inject(out, self.steer, self.alpha)


def apply_steering(model, layer_idx: int, v: mx.array, alpha: float = 6.0):
    """WIRE the steering hook into the loaded model's forward at `layer_idx`.

    Opt-in at runtime; does NOT modify the published glm_moe_dsa loader. After mlx_lm.load:
    `remove = apply_steering(model, 40, v)`; remove() undoes it. Best at a mid/late layer
    (~⅔ depth) where the residual carries the semantic direction. α≈4–10 (sweep it).

    The layer at `model.model.layers[layer_idx]` is replaced in place by a `_SteeredLayer`
    wrapping it. The returned `remove()` takes no arguments, returns None, and unwires only
    THIS hook:
      * if the hook is still the outermost wrapper, the layer it wraps is put back;
      * if later hooks were stacked on top, the hook is spliced out of the wrapper chain,
        leaving the others wired;
      * if the hook is no longer present (already removed, or the layer was replaced by
        something else), it is a no-op.
    So hooks stacked on one layer (e.g. during an α sweep) can be removed in any order without
    dropping a live hook or resurrecting a removed one.
    """
    layers = model.model.layers
    hook = _SteeredLayer(layers[layer_idx], v, alpha)
    layers[layer_idx] = hook

    def remove() -> None:
        node = layers[layer_idx]
        if node is hook:
            # Restore whatever the hook wraps NOW: its inner layer may itself have been
            # re-pointed when a hook underneath was spliced out.
            layers[layer_idx] = hook.inner
            return
        # The hook may be buried under later wrappers: walk the chain and splice it out.
        while isinstance(node, _SteeredLayer):
            if node.inner is hook:
                node.inner = hook.inner
                return
            node = node.inner

    return remove


def _selftest():
    """GPU-free check of the vector math, the injection, and the layer hook using mock activations."""
    from types import SimpleNamespace

    mx.random.seed(0)
    d = 64
    elite_dir = mx.random.normal((d,))
    elite_dir = elite_dir / mx.linalg.norm(elite_dir)  # the (hidden) true "elite" direction
    elite = elite_dir + 0.15 * mx.random.normal((24, d))  # elite designs cluster near it
    generic = -elite_dir + 0.15 * mx.random.normal((24, d))  # cookie-cutter cluster opposite

    v = steering_vector(elite, generic)
    align = float(v @ elite_dir)
    print(f" recovered steering vector ↔ true elite direction: cos = {align:.3f}")
    assert align > 0.9, align

    h = -elite_dir * 2.0  # a 'generic' activation to steer
    before = float(h @ elite_dir)
    after = float(inject(h[None], v, alpha=0.4)[0] @ elite_dir)
    print(f" generic activation projection onto elite: {before:+.2f} → {after:+.2f} (steered toward elite ✓)")
    assert after > before

    # rung-4 HOOK: wire a mock layer, verify the forward output is steered + removal is clean
    def generic_layer(x):
        return -elite_dir * mx.ones((1, 1, d))  # a 'generic' layer output

    holder = SimpleNamespace(model=SimpleNamespace(layers=[generic_layer]))
    remove = apply_steering(holder, 0, v, alpha=0.4)
    hooked = holder.model.layers[0](mx.zeros((1, 1, d)))
    proj = float(hooked[0, 0] @ elite_dir)
    print(f" HOOK wired into layer → output steered toward elite: proj = {proj:+.2f}")
    assert proj > -1.0
    remove()
    assert not isinstance(holder.model.layers[0], _SteeredLayer)  # cleanly unwired
    remove()  # a second call is a no-op
    assert holder.model.layers[0] is generic_layer

    # tuple-returning layers: only the first element is steered, the rest passes through
    holder.model.layers = [lambda x: (generic_layer(x), "aux")]
    remove = apply_steering(holder, 0, v, alpha=0.4)
    steered, aux = holder.model.layers[0](mx.zeros((1, 1, d)))
    assert aux == "aux"
    assert float(steered[0, 0] @ elite_dir) > -1.0
    remove()

    # stacked hooks removed out of order: neither dropped early nor resurrected
    holder.model.layers = [generic_layer]
    remove_first = apply_steering(holder, 0, v, alpha=0.4)
    remove_second = apply_steering(holder, 0, v, alpha=0.4)
    remove_first()
    assert isinstance(holder.model.layers[0], _SteeredLayer)  # second hook still wired
    remove_second()
    assert holder.model.layers[0] is generic_layer

    print(" design_steering selftest PASS — extracts the elite-art DIRECTION + WIRES a hook that injects it")
    print(" into the residual stream (ActAdd/RepE). Generic prompt → elite, mechanistically. GPU captures real acts.")


if __name__ == "__main__":
    _selftest()