"use client";
import { motion } from "framer-motion";
import { COLORS, VizFrame } from "./common";

/**
 * Image -> grid of patches -> sequence of tokens. Bridges the "what is a token
 * for an image?" question that arises when introducing attention on vision.
 *
 * The figure is laid out around an N x N patch grid (N = 4, so 16 patches) and
 * iterates over those N * N entries three times: once for the image grid, once
 * for the patch-embedding column, and once for the token sequence, whose
 * entries are labelled t1 .. t16.
 *
 * NOTE(review): in this view the returned expression contains only JSX
 * comments, text and expression containers; the element tags (presumably the
 * imported `VizFrame`, `motion` and `COLORS` are used there) are not visible.
 * Confirm against the original file before editing the markup.
 *
 * @param width  Overall figure width; defaults to 920. Used here to derive the
 *               width of each token cell.
 * @param height Overall figure height; defaults to 380. Not referenced by the
 *               visible code; presumably forwarded to the frame (TODO confirm).
 */
export function PatchTokens({
  width = 920,
  height = 380,
}: {
  width?: number;
  height?: number;
}) {
  // Patches per side; every loop below runs over N * N patches/tokens.
  const N = 4;
  // Side length of the square image; presumably in pixels (TODO confirm).
  const imgSize = 200;
  // Side length of a single patch: the image side split into N equal parts.
  const patchPx = imgSize / N;
  // Presumably the top padding above the image (usage not visible here).
  const padY = 40;
  // Presumably the x offset of the image's left edge (usage not visible here).
  const imgX = 30;
  // Sits 60 units past the bottom of the image (padY + imgSize); presumably
  // the y position of the token-sequence row.
  const tokensY = padY + imgSize + 60;
  // Width of one token cell: the figure width minus 90 (60 + 30, presumably
  // side margins) shared equally among the N * N tokens.
  const tokenW = (width - 60 - 30) / (N * N);
  return (
    {/* Left label */}
    image (H × W × C)
    {/* Image with grid */}
    {Array.from({ length: N * N }, (_, k) => {
      // Row-major decomposition of the flat patch index k.
      const r = Math.floor(k / N); // row index, 0 .. N - 1
      const c = k % N; // column index, 0 .. N - 1
      return (
      );
    })}
    {/* Arrow */}
    flatten
    {/* Right column: patch embeddings */}
    patch embeddings (T × d)
    {Array.from({ length: N * N }, (_, k) => (
    ))}
    {/* Sequence below */}
    token sequence fed to attention — each token attends to every other
    {Array.from({ length: N * N }, (_, k) => (
      t{k + 1}
    ))}
  );
}