Spaces:

abdelstark
/

latent-inspector-showcase

Running

App Files Files Community

latent-inspector-showcase / slides.html

abdelstark

Initial publish: landing + slides + sample compare report + PCA assets

acb2bb6 verified about 2 months ago

raw

history blame contribute delete

42.7 kB

	<!DOCTYPE html>
	<html lang="en">
	<head>
	<meta charset="utf-8">
	<meta name="viewport" content="width=device-width, initial-scale=1.0">
	<title>How AI Models See the World</title>
	<meta name="description" content="Slide deck comparing how four vision models (DINOv2, I-JEPA, V-JEPA 2, EUPE) represent the same image, measured with latent-inspector.">
	<!-- The web fonts are requested through an @import inside the inline style element, so the browser discovers them late. Opening the connections here hides part of that latency. -->
	<link rel="preconnect" href="https://fonts.googleapis.com">
	<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
	<!-- reveal.js core, theme, and code-highlight styles, all pinned to 5.1.0. -->
	<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5.1.0/dist/reveal.css">
	<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5.1.0/dist/theme/black.css" id="theme">
	<link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/reveal.js@5.1.0/plugin/highlight/monokai.css">
	<style>
	@import url('https://fonts.googleapis.com/css2?family=Inter:wght@300;400;500;600;700;800&family=JetBrains+Mono:wght@400;500;600&display=swap');

	/* Design tokens for the deck. Everything below the palette is an override of a --r-* property. */
	:root {
	/* Surfaces and borders. --bg is the same value the slides pass via data-background-color. */
	--bg: #08080c;
	--bg-card: #101018;
	--bg-card-hover: #14141e; /* not referenced by any rule in this style block */
	--border: #1e1e2c;
	--border-accent: #2a2a3c;
	/* Text colors, brightest to faintest. */
	--text: #d8d8e4;
	--text-secondary: #8888a0;
	--text-dim: #555568;
	--text-muted: #3a3a4c;
	--white: #f0f0f8;
	/* Accent hues. The rgba() literals in the card-glow and badge rules repeat
	   orange/blue/green/red/purple as RGB triplets; keep them in sync when changing a hue. */
	--orange: #e8773a;
	--orange-soft: #c46830; /* not referenced by any rule in this style block */
	--blue: #5b9cf5;
	--green: #4ac0a0;
	--red: #e05555;
	--purple: #9580e0;
	--yellow: #d4b040;
	/* --r-* overrides: presumably the custom properties read by the reveal.js theme
	   stylesheet linked in the head (confirm against the pinned black.css). */
	--r-background-color: var(--bg);
	--r-main-font: 'Inter', system-ui, sans-serif;
	--r-heading-font: 'Inter', system-ui, sans-serif;
	--r-main-color: var(--text);
	--r-heading-color: var(--white);
	--r-heading-text-transform: none;
	}

	.reveal { font-weight: 300; font-size: 28px; line-height: 1.5; }
	.reveal h1 { font-weight: 800; font-size: 2.6em; letter-spacing: -0.03em; line-height: 1.1; }
	.reveal h2 { font-weight: 700; font-size: 1.8em; letter-spacing: -0.02em; color: var(--white); line-height: 1.2; }
	/* h3 is used as a small uppercase eyebrow label. Each property is declared once:
	   the rule previously also set font-size 1.1em and letter-spacing -0.01em, both of
	   which were overridden by later declarations in the same rule and never applied. */
	.reveal h3 { font-weight: 600; font-size: 0.85em; color: var(--text-secondary); letter-spacing: 0.05em; text-transform: uppercase; }

	/* ── Accent colors ── */
	.c-orange { color: var(--orange); }
	.c-blue { color: var(--blue); }
	.c-green { color: var(--green); }
	.c-red { color: var(--red); }
	.c-purple { color: var(--purple); }
	.c-yellow { color: var(--yellow); }
	.c-dim { color: var(--text-dim); }
	.c-sec { color: var(--text-secondary); }

	/* ── Metric readouts: monospace numerals with a small caption underneath ── */
	.metric { font-family: 'JetBrains Mono', monospace; font-weight: 600; line-height: 1; }

	/* Size steps, largest first. */
	.metric-xl {
	font-size: 4.5em;
	}
	.metric-lg {
	font-size: 3em;
	}
	.metric-md {
	font-size: 2em;
	}

	/* Caption under a readout; switches back to the sans-serif body face. */
	.metric-label {
	font-family: 'Inter', sans-serif;
	font-size: 0.55em;
	font-weight: 400;
	color: var(--text-secondary);
	margin-top: 0.3em;
	}

	/* ── Cards ── */
	/* Base panel: dark surface, hairline border, left-aligned content. */
	.card {
	background: var(--bg-card);
	border: 1px solid var(--border);
	border-radius: 12px;
	padding: 1.2em 1.5em;
	text-align: left;
	}
	/* Tinted borders; orange and red also add a faint outer glow. */
	.card-glow-orange { border-color: rgba(232,119,58,0.25); box-shadow: 0 0 40px rgba(232,119,58,0.05); }
	.card-glow-red { border-color: rgba(224,85,85,0.25); box-shadow: 0 0 40px rgba(224,85,85,0.05); }
	.card-glow-green { border-color: rgba(74,192,160,0.2); }
	.card-glow-blue { border-color: rgba(91,156,245,0.2); }
	.card-glow-purple { border-color: rgba(149,128,224,0.2); }

	.card p { margin: 0.3em 0; }
	/* Default emphasis inside a card is orange. */
	.card strong { color: var(--orange); font-weight: 500; }
	/* ".card strong" is more specific than a lone accent utility class, so a
	   <strong class="c-red"> inside a card would otherwise still render orange.
	   These rules let an explicit accent class win. */
	.card strong.c-orange { color: var(--orange); }
	.card strong.c-blue { color: var(--blue); }
	.card strong.c-green { color: var(--green); }
	.card strong.c-red { color: var(--red); }
	.card strong.c-purple { color: var(--purple); }
	.card strong.c-yellow { color: var(--yellow); }
	.card strong.c-dim { color: var(--text-dim); }
	.card strong.c-sec { color: var(--text-secondary); }

	/* ── Tables ── */
	.reveal table { margin: 0.6em auto; font-size: 0.68em; border-collapse: separate; border-spacing: 0; }
	.reveal table th {
	background: var(--bg-card);
	color: var(--text-secondary);
	font-weight: 600;
	padding: 0.6em 1em;
	border-bottom: 1px solid var(--border-accent);
	font-size: 0.85em;
	text-transform: uppercase;
	letter-spacing: 0.04em;
	}
	.reveal table td {
	padding: 0.55em 1em;
	border-bottom: 1px solid var(--border);
	}
	.reveal table tr:last-child td { border-bottom: none; }
	.cell-hi-red { background: rgba(224,85,85,0.1) !important; color: var(--red) !important; font-weight: 600; }
	.cell-hi-green { background: rgba(74,192,160,0.1) !important; color: var(--green) !important; font-weight: 600; }

	/* ── CKA matrix ── */
	.matrix { font-family: 'JetBrains Mono', monospace; font-size: 0.62em; }
	.matrix td, .matrix th { text-align: center; padding: 0.6em 0.8em; }
	.matrix .diag { color: var(--text-muted); }
	.matrix .zero { color: var(--red); font-weight: 700; font-size: 1.15em; }
	.matrix .high { color: var(--green); font-weight: 500; }

	/* ── Code ── */
	.reveal pre { box-shadow: none; font-size: 0.62em; margin: 0.8em 0; }
	.reveal code { font-family: 'JetBrains Mono', monospace; }
	.reveal pre code {
	padding: 1.2em 1.5em;
	border-radius: 10px;
	background: var(--bg-card);
	border: 1px solid var(--border);
	line-height: 1.6;
	}

	/* ── Layout ── */
	.flex { display: flex; gap: 1.5em; align-items: flex-start; }
	.flex > div { flex: 1; }
	.flex-center { display: flex; gap: 2em; align-items: center; justify-content: center; }
	.flex-baseline { display: flex; gap: 2em; align-items: flex-end; justify-content: center; }

	/* ── Image grid ── */
	.pca-grid {
	display: grid;
	grid-template-columns: 1fr 1fr;
	gap: 12px;
	max-width: 680px;
	margin: 0 auto;
	}
	.pca-grid figure { margin: 0; text-align: center; }
	.pca-grid img {
	width: 100%;
	border-radius: 8px;
	border: 1px solid var(--border);
	}
	.pca-grid figcaption {
	font-size: 0.5em;
	color: var(--text-secondary);
	margin-top: 0.4em;
	}

	/* ── Single image showcase ── */
	.showcase-img {
	max-height: 420px;
	border-radius: 10px;
	border: 1px solid var(--border);
	box-shadow: 0 8px 40px rgba(0,0,0,0.4);
	}

	/* ── Divider ── */
	.rule { width: 48px; height: 2px; background: var(--border-accent); margin: 1.2em auto; }
	.rule-orange { background: var(--orange); opacity: 0.4; }

	/* ── Utility ── */
	.small { font-size: 0.75em; }
	.smaller { font-size: 0.6em; color: var(--text-dim); }
	.tiny { font-size: 0.48em; color: var(--text-muted); }
	.mono { font-family: 'JetBrains Mono', monospace; }
	.fw-400 { font-weight: 400; }
	.fw-500 { font-weight: 500; }
	.mt-1 { margin-top: 0.5em; }
	.mt-2 { margin-top: 1em; }
	.mt-3 { margin-top: 1.5em; }
	.mb-0 { margin-bottom: 0; }
	.tl { text-align: left; }

	/* ── Progress & slide number ── */
	.reveal .progress { color: var(--orange); height: 2px; }
	.reveal .slide-number { font-family: 'JetBrains Mono', monospace; font-size: 0.45em; color: var(--text-muted); }

	/* ── Section divider slides ── */
	.section-title { display: flex; flex-direction: column; justify-content: center; min-height: 100%; }
	.section-title h2 { font-size: 2.4em; margin-bottom: 0.1em; }
	.section-title .subtitle { color: var(--text-secondary); font-size: 0.8em; font-weight: 300; }

	/* ── Model badge: small monospace pill naming a model ── */
	.badge { display: inline-block; padding: 0.15em 0.6em; border-radius: 6px; font-family: 'JetBrains Mono', monospace; font-size: 0.65em; font-weight: 500; }

	/* One tint per model: translucent fill, solid text, faint border of the same hue. */
	.badge-green {
	color: var(--green);
	background: rgba(74,192,160,0.12);
	border: 1px solid rgba(74,192,160,0.2);
	}
	.badge-blue {
	color: var(--blue);
	background: rgba(91,156,245,0.12);
	border: 1px solid rgba(91,156,245,0.2);
	}
	.badge-purple {
	color: var(--purple);
	background: rgba(149,128,224,0.12);
	border: 1px solid rgba(149,128,224,0.2);
	}
	.badge-red {
	color: var(--red);
	background: rgba(224,85,85,0.12);
	border: 1px solid rgba(224,85,85,0.2);
	}

	/* ── Arrow connector ── */
	.arrow { color: var(--text-muted); font-size: 1.4em; line-height: 1; }

	/* ── Keyline quote ── */
	.keyline {
	border-left: 3px solid var(--orange);
	padding: 0.6em 1.2em;
	background: rgba(232,119,58,0.04);
	border-radius: 0 8px 8px 0;
	margin: 0.8em 0;
	font-size: 0.88em;
	text-align: left;
	}

	/* ── Terminal switch indicator ── */
	.terminal-cue {
	display: inline-block;
	background: var(--bg-card);
	border: 1px solid var(--border);
	border-radius: 6px;
	padding: 0.3em 0.8em;
	font-family: 'JetBrains Mono', monospace;
	font-size: 0.55em;
	color: var(--text-dim);
	margin-top: 1.5em;
	}
	.terminal-cue::before { content: "▸ "; color: var(--green); }
	</style>
	</head>
	<body>
	<div class="reveal">
	<div class="slides">

	<section data-background-color="#08080c">
	<div class="card card-glow-red" style="max-width:1100px; margin:1.5em auto 0; text-align:left;">
	<h2 style="margin-top:0;">EUPE Notice</h2>
	<p>The earlier EUPE ONNX export used in this deck was broken and produced misleading geometry claims.</p>
	<p>The corrected export and refreshed compare artifacts now live in <span class="mono">demo/reports/eupe-vs-ssl-reference.html</span> and <span class="mono">demo/reports/eupe-compare.json</span>.</p>
	<p><a href="reports/20260408-123006/report.html">Open sample compare report</a> · <a href="reports/">Browse reports</a></p>
	<p>The deck below has been rewritten around the corrected benchmark and revised interpretation.</p>
	</div>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- TITLE -->
	<!-- ════════════════════════════════════════════ -->
	<section data-background-color="#08080c">
	<!-- Title slide: deck name, tagline, tool credit line, and repository pointer. -->
	<div style="margin-top:1em;">
	<h1 style="margin-bottom:0.1em;">How AI Models<br><span class="c-orange">See the World</span></h1>
	<div class="rule rule-orange" style="margin:0.8em auto;"></div>
	<p class="c-sec fw-400" style="font-size:0.75em;">A deep dive into self-supervised vision model representations</p>
	<!-- The separator is a plain pipe; a backslash-escaped pipe would show the backslash on the slide. -->
	<p class="mt-3" style="font-size:0.55em;"><span class="mono c-dim">latent-inspector</span> <span class="c-dim">|</span> <span class="c-dim">Rust + ONNX Runtime</span></p>
	<p class="tiny mt-2">github.com/AbdelStark/latent-inspector</p>
	</div>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 1 — THE HOOK -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act I</p>
	<h2>Same Image.<br>Four Models.<br><span class="c-orange">Four Realities.</span></h2>
	</div>
	</section>
	<section>
	<h3>The Input</h3>
	<img src="elephant_sample_image.jpg" alt="African elephant on savanna" class="showcase-img">
	<p class="smaller mt-1">One photograph. 224 x 224 pixels. Three color channels. Every model sees the same pixels.</p>
	</section>
	<section>
	<h3>What each model sees</h3>
	<p class="small c-sec mb-0">Top 3 PCA components mapped to <span class="c-red">R</span><span class="c-green">G</span><span class="c-blue">B</span></p>
	<div class="pca-grid mt-1">
	<figure>
	<img src="dinov2-vit-l14_pca.png" alt="DINOv2 PCA">
	<figcaption><span class="c-green">DINOv2</span> — clean object segmentation</figcaption>
	</figure>
	<figure>
	<img src="ijepa-vit-h14_pca.png" alt="I-JEPA PCA">
	<figcaption><span class="c-blue">I-JEPA</span> — fine-grained spatial detail</figcaption>
	</figure>
	<figure>
	<img src="vjepa2-vitl-img16-256_pca.png" alt="V-JEPA 2 PCA">
	<figcaption><span class="c-purple">V-JEPA 2</span> — strong spatial coherence from the corrected 16-frame image path</figcaption>
	</figure>
	<figure>
	<img src="eupe-vit-b16_pca.png" alt="EUPE PCA">
	<figcaption><span class="c-red">EUPE</span> — compact, sharper grouping</figcaption>
	</figure>
	</div>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 2 — SELF-SUPERVISED LEARNING -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act II</p>
	<h2>Self-Supervised Learning</h2>
	<p class="subtitle">Learning to see without being told what to look at</p>
	</div>
	</section>
	<section>
	<h3>The bottleneck</h3>
	<div class="flex">
	<div class="card">
	<p class="c-red fw-500">Supervised</p>
	<p class="small c-sec">Human labels for every image</p>
	<p class="small c-sec">ImageNet: 14M images, years of work</p>
	<p class="small c-sec">Millions of dollars</p>
	</div>
	<div class="card card-glow-green">
	<p class="c-green fw-500">Self-Supervised</p>
	<p class="small c-sec">No labels needed</p>
	<p class="small c-sec">The internet: billions of images</p>
	<p class="small c-sec">Zero annotation cost</p>
	</div>
	</div>
	<p class="small mt-2">The trick: invent a <em>task</em> that requires no labels<br>but forces the model to understand the image's structure.</p>
	</section>
	<section>
	<h3>Different questions, different understanding</h3>
	<div class="card card-glow-orange tl">
	<p><strong>Self-distillation:</strong> <span class="c-sec">"Two views of the same image. Produce the same representation for both."</span></p>
	<p class="mt-1"><strong>Latent prediction:</strong> <span class="c-sec">"I masked part of the image. Predict the <em>representation</em> of the hidden part."</span></p>
	<p class="mt-1"><strong>Video prediction:</strong> <span class="c-sec">"Predict the representation of the next frame."</span></p>
	<p class="mt-1"><strong>Proxy distillation:</strong> <span class="c-sec">"First build one large proxy teacher from multiple experts, then compress that proxy into a small generalist student."</span></p>
	</div>
	<div class="keyline mt-2">
	Each question creates a different learning pressure.<br>
	That pressure <strong class="c-orange">sculpts the geometry</strong> of the representation.
	</div>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- THE FOUR MODELS -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">The Cast</p>
	<h2>Four Models</h2>
	</div>
	</section>
	<section>
	<div class="flex" style="align-items:stretch;">
	<div class="card card-glow-green" style="flex:1;">
	<p><span class="badge badge-green">DINOv2</span> <span class="c-dim small">ViT-L/14 · 304M · 1024-dim</span></p>
	<p class="small c-sec mt-1">Self-distillation. Student matches a slowly-evolving teacher across augmented views.</p>
	<p class="small mt-1">The model learns that <span class="c-green fw-500">objects are the things that stay stable</span> when everything else changes.</p>
	<p class="tiny mt-1">Meta FAIR · Oquab et al., 2023</p>
	</div>
	<div class="card card-glow-blue" style="flex:1;">
	<p><span class="badge badge-blue">I-JEPA</span> <span class="c-dim small">ViT-H/14 · 632M · 1280-dim</span></p>
	<p class="small c-sec mt-1">Masks large image regions. Predicts the <em>representation</em> of missing patches, not pixels.</p>
	<p class="small mt-1"><span class="c-blue fw-500">Predicts meaning, not appearance.</span> Every patch must encode unique spatial context.</p>
	<p class="tiny mt-1">Meta FAIR · Assran et al., 2023 · Yann LeCun's JEPA</p>
	</div>
	</div>
	<div class="flex mt-1" style="align-items:stretch;">
	<div class="card card-glow-purple" style="flex:1;">
	<p><span class="badge badge-purple">V-JEPA 2</span> <span class="c-dim small">ViT-L/16 · 304M · 1024-dim</span></p>
	<p class="small c-sec mt-1">Video prediction in latent space. Predicts future frames, not pixels.</p>
	<p class="small mt-1">Even on a static photo, carries <span class="c-purple fw-500">an implicit prior about motion and time</span>.</p>
	<p class="tiny mt-1">Meta FAIR · Bardes et al., 2025</p>
	</div>
	<div class="card card-glow-red" style="flex:1;">
	<p><span class="badge badge-red">EUPE</span> <span class="c-dim small">ViT-B/16 · 86M · 768-dim</span></p>
	<p class="small c-sec mt-1">Proxy-distilled from a 1.9B universal teacher that aggregates multiple specialist teachers.</p>
	<p class="small mt-1"><span class="c-red fw-500">Compact generalist.</span> Lower-rank, more top-heavy, and more locally coherent than the SSL-only models.</p>
	<p class="tiny mt-1">Meta FAIR · Zhu et al., 2026</p>
	</div>
	</div>
	</section>
	<section>
	<h2>Same image. Different training pressures.</h2>
	<div class="rule rule-orange"></div>
	<p class="fw-400">The model families differ too, but the biggest point still holds: <strong class="c-orange">training pressure reshapes representation geometry</strong>.</p>
	<p class="small c-sec mt-2">Let's measure how that pressure<br>reshapes the entire geometry of the representation.</p>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 3 — THE TOOL -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act III</p>
	<h2>The Instrument</h2>
	</div>
	</section>
	<section>
	<pre><code class="language-bash">cargo install latent-inspector</code></pre>
	<div class="flex mt-2">
	<div class="card tl">
	<p class="small"><span class="c-orange fw-500">Rust</span> <span class="c-dim">single binary, no Python env</span></p>
	<p class="small mt-1"><span class="c-blue fw-500">ONNX Runtime</span> <span class="c-dim">real inference, verified models</span></p>
	<p class="small mt-1"><span class="c-green fw-500">Validated</span> <span class="c-dim">SHA-256 checksums, golden references</span></p>
	</div>
	<div class="card tl">
	<p class="small"><span class="mono c-sec">compare</span> <span class="c-dim">cross-model metrics + matrices</span></p>
	<p class="small mt-1"><span class="mono c-sec">inspect</span> <span class="c-dim">single-model deep diagnostics</span></p>
	<p class="small mt-1"><span class="mono c-sec">tui</span> <span class="c-dim">interactive terminal dashboard</span></p>
	<p class="small mt-1"><span class="mono c-sec">validate</span> <span class="c-dim">model integrity verification</span></p>
	</div>
	</div>
	<div class="terminal-cue">switch to terminal: latent-inspector models --verbose</div>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 4 — WHAT IS A REPRESENTATION -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act IV</p>
	<h2>What is a <span class="c-blue">Representation</span>?</h2>
	</div>
	</section>
	<section>
	<h3>From pixels to vectors</h3>
	<div class="card tl" style="max-width:700px; margin:0 auto;">
	<p class="mono small"><span class="c-dim">Image</span> <span class="c-sec">224 x 224 px, 3 channels</span></p>
	<p class="mono small c-dim mt-1">  ↓  split into 14 x 14 pixel patches</p>
	<p class="mono small mt-1"><span class="c-sec">256 patches</span></p>
	<p class="mono small c-dim mt-1">  ↓  linear projection to 1024 dimensions</p>
	<p class="mono small mt-1"><span class="c-orange">256 vectors x 1024 numbers</span></p>
	<p class="mono small c-dim mt-1">  ↓  24 Transformer layers (self-attention + FFN)</p>
	<p class="mono small mt-1"><span class="c-green">256 refined vectors = the representation</span></p>
	</div>
	<p class="small c-sec mt-2">262,144 floating-point numbers. That's what we analyze.</p>
	</section>
	<section>
	<h3>PCA — Principal Component Analysis</h3>
	<div class="card card-glow-orange tl">
	<p>1024 dimensions is too many to visualize. PCA finds the <strong>directions of maximum variation</strong>.</p>
	<p class="mt-1">Map the top 3 directions to <span class="c-red fw-500">Red</span>, <span class="c-green fw-500">Green</span>, <span class="c-blue fw-500">Blue</span> channels.</p>
	<p class="mt-1">Same-colored regions = the model considers those patches <strong>similar</strong>.</p>
	</div>
	<p class="smaller mt-2">Colors are relative to each model. You cannot compare "red" across models.</p>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 5 — PCA DEEP ANALYSIS -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act V</p>
	<h2>How Each Model <span class="c-orange">Sees</span></h2>
	</div>
	</section>
	<section>
	<div class="flex-center">
	<div>
	<img src="dinov2-vit-l14_pca.png" alt="DINOv2 PCA" style="height:340px; border-radius:10px; border:1px solid var(--border);">
	</div>
	<div class="tl" style="max-width:480px;">
	<p><span class="badge badge-green">DINOv2</span></p>
	<h2 style="font-size:1.3em; margin:0.3em 0;">Emergent Segmentation</h2>
	<p class="small c-sec">Sharp boundaries. The elephant body clusters in one color. Background in another.</p>
	<p class="small c-sec mt-1">Self-distillation forces consistency across augmented views. The most consistent thing across crops, rotations, and color shifts is the <strong class="c-green">object itself</strong>.</p>
	</div>
	</div>
	</section>
	<section>
	<div class="flex-center">
	<div>
	<img src="ijepa-vit-h14_pca.png" alt="I-JEPA PCA" style="height:340px; border-radius:10px; border:1px solid var(--border);">
	</div>
	<div class="tl" style="max-width:480px;">
	<p><span class="badge badge-blue">I-JEPA</span></p>
	<h2 style="font-size:1.3em; margin:0.3em 0;">Fine-Grained Detail</h2>
	<p class="small c-sec">More colors. Trunk, legs, ears each distinct. Background has spatial structure.</p>
	<p class="small c-sec mt-1">The prediction objective forces each patch to be unique. If adjacent patches had identical representations, the model <strong class="c-blue">couldn't predict which one is missing</strong>.</p>
	</div>
	</div>
	</section>
	<section>
	<div class="flex-center">
	<div>
	<img src="vjepa2-vitl-img16-256_pca.png" alt="V-JEPA 2 PCA" style="height:340px; border-radius:10px; border:1px solid var(--border);">
	</div>
	<div class="tl" style="max-width:480px;">
	<p><span class="badge badge-purple">V-JEPA 2</span></p>
	<h2 style="font-size:1.3em; margin:0.3em 0;">Spatiotemporal Coherence</h2>
	<p class="small c-sec">Much cleaner local continuity once the still image is adapted through the 16-frame evaluation path.</p>
	<p class="small c-sec mt-1">The corrected image wrapper keeps the video prior, but it now looks like a coherent image representation rather than a distorted 2-frame surrogate.</p>
	</div>
	</div>
	</section>
	<section>
	<div class="flex-center">
	<div>
	<img src="eupe-vit-b16_pca.png" alt="EUPE PCA" style="height:340px; border-radius:10px; border:1px solid var(--border);">
	</div>
	<div class="tl" style="max-width:480px;">
	<p><span class="badge badge-red">EUPE</span></p>
	<h2 style="font-size:1.3em; margin:0.3em 0;">Compact Compression</h2>
	<p class="small c-sec">Sharper grouping and much stronger local agreement than the other three models.</p>
	<p class="small c-sec mt-1">The corrected export still shows a compressed representation, but it remains clearly image-dependent and structurally meaningful.</p>
	</div>
	</div>
	<p class="small c-dim mt-2">Let's quantify exactly how different.</p>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 6 — METRICS -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act VI</p>
	<h2>Measuring Representations</h2>
	</div>
	<div class="terminal-cue">switch to terminal: latent-inspector compare</div>
	</section>
	<section>
	<h3>Effective Rank</h3>
	<p class="small c-sec">How many dimensions the model <em>actually uses</em>. Like a 1024-channel mixing board — how many channels carry signal?</p>
	<!-- Effective rank per model; the "Of" column is the embedding width listed for each model on the cast slide. -->
	<table>
	<thead>
	<tr><th scope="col">Model</th><th scope="col">Effective Rank</th><th scope="col">Of</th></tr>
	</thead>
	<tbody>
	<tr><td><span class="c-green">DINOv2</span></td><td class="cell-hi-green">60</td><td class="c-dim">1024</td></tr>
	<tr><td><span class="c-blue">I-JEPA</span></td><td>44</td><td class="c-dim">1280</td></tr>
	<tr><td><span class="c-purple">V-JEPA 2</span></td><td>51</td><td class="c-dim">1024</td></tr>
	<tr><td><span class="c-red">EUPE</span></td><td class="cell-hi-red">22</td><td class="c-dim">768</td></tr>
	</tbody>
	</table>
	<p class="smaller mt-1">DINOv2 stays the broadest spread. EUPE remains the most concentrated model in this four-model set.</p>
	</section>
	<section>
	<h3>Patch Entropy</h3>
	<p class="small c-sec">How differentiated are the patches? High = every patch says something unique.</p>
	<!-- Patch entropy per model. The third column holds short remarks and has no visible heading, so it gets an accessible name instead. -->
	<table>
	<thead>
	<tr><th scope="col">Model</th><th scope="col">Entropy</th><th scope="col" aria-label="Notes"></th></tr>
	</thead>
	<tbody>
	<tr><td><span class="c-green">DINOv2</span></td><td>2.52</td><td></td></tr>
	<tr><td><span class="c-blue">I-JEPA</span></td><td class="cell-hi-green">2.89</td><td class="small c-sec">every patch is unique</td></tr>
	<tr><td><span class="c-purple">V-JEPA 2</span></td><td>2.89</td><td class="small c-sec">high variation once the image path is corrected</td></tr>
	<tr><td><span class="c-red">EUPE</span></td><td>2.83</td><td class="small c-sec">compact, still differentiated</td></tr>
	</tbody>
	</table>
	<p class="smaller mt-1">I-JEPA <em>must</em> differentiate to predict. EUPE stays fairly expressive on this metric even while compressing variance much more aggressively elsewhere.</p>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 7 — ISOTROPY REVEAL -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act VII</p>
	<h2><span class="c-orange">Isotropy</span></h2>
	<p class="subtitle">Do patches point in diverse directions — or all the same way?</p>
	</div>
	</section>
	<section>
	<p class="small c-sec">Picture a room of 256 compasses.<br>1.0 = every compass points a different direction. 0.0 = they all point north.</p>
	<div class="rule"></div>
	<div class="flex-baseline mt-2">
	<div style="text-align:center;">
	<div class="metric metric-lg c-green">0.796</div>
	<div class="metric-label">DINOv2</div>
	</div>
	<div style="text-align:center;">
	<div class="metric metric-lg c-blue">0.788</div>
	<div class="metric-label">I-JEPA</div>
	</div>
	<div style="text-align:center;">
	<div class="metric metric-lg c-purple">0.417</div>
	<div class="metric-label">V-JEPA 2</div>
	</div>
	<div style="text-align:center;" class="fragment" data-fragment-index="1">
	<div class="metric metric-xl c-red">0.375</div>
	<div class="metric-label">EUPE</div>
	</div>
	</div>
	</section>
	<section>
	<div class="metric metric-xl c-red" style="margin-bottom:0.3em;">0.375</div>
	<p class="c-sec">Low, but <strong class="c-red">not near zero</strong>.</p>
	<div class="rule"></div>
	<div class="card card-glow-red tl mt-2" style="max-width:700px; margin-left:auto; margin-right:auto;">
	<p>EUPE still uses fewer directions than the SSL-only models, but the corrected export shows a <strong>compressed representation</strong>, not a degenerate one.</p>
	<p class="mt-1 c-sec">That matches the surviving qualitative story: sharper, more top-heavy features and much stronger local agreement.</p>
	</div>
	<div class="flex-center mt-2">
	<div class="card tl" style="flex:0 1 auto;">
	<p class="small"><span class="c-sec">Top-10 variance:</span> <span class="c-red fw-500">87.0%</span></p>
	<p class="smaller">still by far the most top-heavy model</p>
	</div>
	<div class="card tl" style="flex:0 1 auto;">
	<p class="small"><span class="c-sec">Components @ 90%:</span> <span class="c-red fw-500">13</span></p>
	<p class="smaller">vs 31, 22, and 29 for the others</p>
	</div>
	</div>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 8 — CKA -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act VIII</p>
	<h2>Cross-Model Similarity</h2>
	<p class="subtitle">CKA — Centered Kernel Alignment</p>
	</div>
	</section>
	<section>
	<h3>What CKA measures</h3>
	<div class="card card-glow-orange tl" style="max-width:700px; margin:0 auto;">
	<p>Do two models organize their representations in <strong>similar geometric structures</strong>?</p>
	<p class="c-sec mt-1">Like two restaurant critics: do they agree on <em>which restaurants are similar to each other</em>?</p>
	<p class="mt-1"><span class="mono c-green">1.000</span> = identical geometry     <span class="mono c-red">0.000</span> = completely unrelated</p>
	</div>
	</section>
	<section>
	<h3>The CKA Matrix</h3>
	<!-- Symmetric CKA matrix. Row labels stay as td cells: the th styling (background, uppercase, secondary color) would override their accent colors. -->
	<table class="matrix">
	<thead>
	<tr><th></th><th scope="col"><span class="c-green">DINOv2</span></th><th scope="col"><span class="c-blue">I-JEPA</span></th><th scope="col"><span class="c-purple">V-JEPA 2</span></th><th scope="col"><span class="c-red">EUPE</span></th></tr>
	</thead>
	<tbody>
	<tr><td class="c-green fw-500">DINOv2</td><td class="diag">1.000</td><td>0.329</td><td class="high">0.495</td><td>0.150</td></tr>
	<tr><td class="c-blue fw-500">I-JEPA</td><td>0.329</td><td class="diag">1.000</td><td>0.381</td><td>0.115</td></tr>
	<tr><td class="c-purple fw-500">V-JEPA 2</td><td class="high">0.495</td><td>0.381</td><td class="diag">1.000</td><td>0.103</td></tr>
	<tr><td class="c-red fw-500">EUPE</td><td>0.150</td><td>0.115</td><td>0.103</td><td class="diag">1.000</td></tr>
	</tbody>
	</table>
	<p class="small mt-2">The corrected image path pulls V-JEPA 2 much closer to DINOv2 and I-JEPA. EUPE is still the weakest match to the others, but now clearly as a coherent compressed outlier rather than an export artifact.</p>
	</section>
	<section>
	<h3>Three findings</h3>
	<div class="tl" style="max-width:750px; margin:0 auto;">
	<div class="card mt-1 fragment">
	<p><span class="c-green fw-500">1.</span> DINOv2 ↔ V-JEPA 2 = <span class="mono c-green">0.495</span> <span class="c-dim">— highest pair</span></p>
	<p class="small c-sec">The corrected 16-frame image path reveals that V-JEPA 2 is much closer to DINOv2 on still images than the retired 2-frame surrogate implied.</p>
	</div>
	<div class="card mt-1 fragment">
	<p><span class="c-blue fw-500">2.</span> I-JEPA ↔ V-JEPA 2 = <span class="mono c-blue">0.381</span></p>
	<p class="small c-sec">V-JEPA 2 still keeps a video-shaped bias, but on images it now sits much closer to the two SSL image encoders than to the old surrogate geometry.</p>
	</div>
	<div class="card card-glow-red mt-1 fragment">
	<p><span class="c-red fw-500">3.</span> EUPE stays weakest against everyone: <span class="mono c-red">0.150 / 0.115 / 0.103</span></p>
	<p class="small c-sec">The stronger surviving EUPE signal is compression, not total disagreement. The gap is real; the earlier near-zero magnitude was artifact-driven.</p>
	</div>
	</div>
	</section>
	<section>
	<h3>The Actual Distillation Story</h3>
	<div class="card card-glow-red tl" style="max-width:650px; margin:1em auto;">
	<p class="c-sec">The 86M student does <strong class="c-red">not</strong> directly distill from multiple teachers at once.</p>
	<p class="mt-1">It distills from a merged 1.9B proxy teacher, and the paper explicitly compares against the direct multi-teacher baseline.</p>
	<p class="mt-1 c-sec">That makes the corrected CKA numbers easier to read: EUPE reorganizes the geometry substantially, but it remains coherent.</p>
	</div>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- k-NN OVERLAP -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<h3>k-NN Overlap — Local Neighborhood Agreement</h3>
	<p class="small c-sec">For each patch, find its 10 nearest neighbors in each model. What fraction do they share?</p>
	<!-- Symmetric k-NN overlap matrix. Row labels stay as td cells: the th styling (background, uppercase, secondary color) would override their accent colors. -->
	<table class="matrix">
	<thead>
	<tr><th></th><th scope="col"><span class="c-green">DINOv2</span></th><th scope="col"><span class="c-blue">I-JEPA</span></th><th scope="col"><span class="c-purple">V-JEPA 2</span></th><th scope="col"><span class="c-red">EUPE</span></th></tr>
	</thead>
	<tbody>
	<tr><td class="c-green fw-500">DINOv2</td><td class="diag">1.000</td><td>0.278</td><td class="high">0.366</td><td>0.168</td></tr>
	<tr><td class="c-blue fw-500">I-JEPA</td><td>0.278</td><td class="diag">1.000</td><td>0.311</td><td class="cell-hi-red">0.122</td></tr>
	<tr><td class="c-purple fw-500">V-JEPA 2</td><td class="high">0.366</td><td>0.311</td><td class="diag">1.000</td><td>0.226</td></tr>
	<tr><td class="c-red fw-500">EUPE</td><td>0.168</td><td class="cell-hi-red">0.122</td><td>0.226</td><td class="diag">1.000</td></tr>
	</tbody>
	</table>
	<div class="flex-center mt-2">
	<div class="card tl" style="flex:0 1 auto;"><p class="small"><span class="c-green fw-500">36.6%</span> <span class="c-dim">DINOv2 ↔ V-JEPA 2 — highest</span></p></div>
	<div class="card tl" style="flex:0 1 auto;"><p class="small"><span class="c-red fw-500">12.2%</span> <span class="c-dim">I-JEPA ↔ EUPE — lowest</span></p></div>
	</div>
	<p class="small c-sec mt-2">The corrected adapter changes the local story as well: V-JEPA 2 now shares many more neighborhoods with DINOv2 and I-JEPA than the 2-frame surrogate suggested. This is patch-neighborhood overlap on one image, not the paper's ImageNet k-NN classification metric.</p>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 9 — TOOLKIT -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<!-- Act IX: a title slide followed by three sub-slides, one per CLI command (inspect, tui, validate). -->
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act IX</p>
	<h2>The Toolkit</h2>
	</div>
	<!-- Presenter cue shown on the title slide. -->
	<div class="terminal-cue">switch to terminal for live demos</div>
	</section>
	<section>
	<!-- inspect: one image, one model. -->
	<h3>Single Model Deep-Dive</h3>
	<pre><code class="language-bash">latent-inspector inspect elephant.jpg --model dinov2-vit-l14</code></pre>
	<p class="small c-sec mt-1">Full diagnostics: PCA variance spectrum, patch norm distributions,<br>CLS token analysis, all metrics in one view.</p>
	</section>
	<section>
	<!-- tui: one image, four models passed as a comma-separated list. Whitespace inside the pre element is rendered as written. -->
	<h3>Interactive Terminal UI</h3>
	<pre><code class="language-bash">latent-inspector tui elephant.jpg \
	-m dinov2-vit-l14,ijepa-vit-h14,vjepa2-vitl-img16-256,eupe-vit-b16</code></pre>
	<p class="small c-sec mt-1">Dashboard · Inspector · Compare · Spectrum<br>All interactive. All in the terminal.</p>
	</section>
	<section>
	<!-- validate: the same four models, each passed with its own flag; the badges show the signal count listed for each model. -->
	<h3>Validation Pipeline</h3>
	<pre><code class="language-bash">latent-inspector validate --model dinov2-vit-l14 --model ijepa-vit-h14 \
	--model vjepa2-vitl-img16-256 --model eupe-vit-b16</code></pre>
	<div class="flex-center mt-2">
	<span class="badge badge-green">DINOv2 · 73 signals</span>
	<span class="badge badge-blue">I-JEPA · 45 signals</span>
	<span class="badge badge-purple">V-JEPA 2 · 45 signals</span>
	<span class="badge badge-red">EUPE · 73 signals</span>
	</div>
	<p class="small c-sec mt-2">Preprocessing contracts. Golden references. Zero drift.<br>Not vibes — <strong>verifiable measurements</strong>.</p>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- ACT 10 — INSIGHT -->
	<!-- ════════════════════════════════════════════ -->
	<section>
	<!-- Act X: a title slide, a ranked list of four forces, then the closing question about perception. -->
	<section>
	<div class="section-title">
	<p class="c-sec" style="font-size:0.7em; letter-spacing:0.1em; text-transform:uppercase;">Act X</p>
	<h2>What Shapes <span class="c-orange">Perception</span>?</h2>
	</div>
	</section>
	<section>
	<!-- Each ranked line carries the reveal.js fragment class, so the four lines appear one step at a time. -->
	<h3>The hierarchy of forces</h3>
	<div class="tl" style="max-width:700px; margin:0 auto;">
	<p class="fragment"><span class="c-orange fw-600" style="font-size:1.3em;">1</span>  <span class="fw-500">Training objective</span> <span class="c-dim">— the dominant force</span></p>
	<p class="fragment mt-1"><span class="c-blue fw-600" style="font-size:1.3em;">2</span>  <span class="fw-500">Architecture</span> <span class="c-dim">— the container that constrains geometry</span></p>
	<p class="fragment mt-1"><span class="c-purple fw-600" style="font-size:1.3em;">3</span>  <span class="fw-500">Modality</span> <span class="c-dim">— image vs video matters more than paradigm</span></p>
	<p class="fragment mt-1"><span class="c-red fw-600" style="font-size:1.3em;">4</span>  <span class="fw-500">Model size</span> <span class="c-dim">— the compression budget</span></p>
	</div>
	</section>
	<section>
	<!-- Takeaway slide: reframes model choice from accuracy alone to the kind of perception a model has. -->
	<h2 style="line-height:1.3;">Different <span class="c-orange">World Models</span>.<br>Different Ways of Seeing.</h2>
	<div class="rule rule-orange"></div>
	<p class="c-sec" style="max-width:700px; margin:0 auto;">If you're building a system that needs to understand the physical world — a robot, an autonomous vehicle, a world model — the question isn't just <em>"which model gets the best accuracy."</em></p>
	<p class="mt-2 fw-500" style="font-size:1.1em;">What kind of <span class="c-orange">perception</span> does this model have?</p>
	</section>
	</section>

	<!-- ════════════════════════════════════════════ -->
	<!-- CLOSING -->
	<!-- ════════════════════════════════════════════ -->
	<section data-background-color="#08080c">
	<!-- Closing slide: install command, supported models, and links to the sample report and the source repository. -->
	<div style="margin-top:1.5em;">
	<p class="mono c-dim" style="font-size:0.65em; letter-spacing:0.05em;">OPEN SOURCE · RUST · RUNS IN SECONDS</p>
	<h1 style="margin:0.3em 0;">latent-inspector</h1>
	<pre style="max-width:500px; margin:0.5em auto;"><code class="language-bash">cargo install latent-inspector</code></pre>
	<div class="rule rule-orange mt-2"></div>
	<div class="flex-center mt-2" style="gap:0.8em;">
	<span class="badge badge-green">DINOv2</span>
	<span class="badge badge-blue">I-JEPA</span>
	<span class="badge badge-purple">V-JEPA 2</span>
	<span class="badge badge-red">EUPE</span>
	<span class="c-dim small" style="margin-left:0.5em;">MAE · CLIP · SigLIP · DINOv3 coming</span>
	</div>
	<p class="mt-2" style="font-size:0.7em;"><a href="reports/20260408-123006/report.html">Sample compare report</a> · <a href="reports/">Reports index</a></p>
	<!-- The repository address is a real link so it can be followed by mouse or keyboard from the hosted deck. -->
	<p class="mt-3" style="font-size:0.75em;"><strong><a href="https://github.com/AbdelStark/latent-inspector">github.com/AbdelStark/latent-inspector</a></strong></p>
	</div>
	</section>

	</div>
	</div>

	<script src="https://cdn.jsdelivr.net/npm/reveal.js@5.1.0/dist/reveal.js"></script>
	<script src="https://cdn.jsdelivr.net/npm/reveal.js@5.1.0/plugin/highlight/highlight.js"></script>
	<script>
	// Start the reveal.js deck with the syntax-highlighting plugin enabled.
	Reveal.initialize({
	// Keep the current slide in the URL hash so reloads and shared links land on it.
	hash: true,
	// Show the slide number as "current / total", in every view.
	slideNumber: 'c/t',
	showSlideNumber: 'all',
	// Slide and background transitions are both switched off.
	transition: 'none',
	transitionSpeed: 'fast',
	backgroundTransition: 'none',
	// Authored slide size; reveal.js scales the deck to fit the viewport.
	width: 1920,
	height: 1080,
	// Fraction of the display kept empty around the slide content.
	margin: 0.06,
	// Vertically center slide content.
	center: true,
	plugins: [ RevealHighlight ]
	});
	</script>
	</body>
	</html>