Spaces:

Paulescu
/

lfm2.5-audio-web-gpu-demo

Running

App Files Files Community

lfm2.5-audio-web-gpu-demo / audio-model.js

ykhrustalev's picture

Fix TTS bug - Error: input 'depth_slices_in' is missing in 'feeds'. (#3)

c72d46d 3 months ago

history blame contribute delete

80.9 kB

	/**
	* LFM2-Audio Model Runner for ONNX Runtime Web
	*
	* Runs audio model inference using ONNX models:
	* 1. decoder.onnx - LFM2 backbone (shared with text)
	* 2. audio_encoder.onnx - Conformer encoder for ASR (mel → embeddings)
	* 3. audio_embedding.onnx - Audio code embeddings for TTS
	* 4. audio_detokenizer.onnx - Audio codes → STFT features
	* 5. vocoder_depthformer.onnx - Autoregressive codebook prediction
	*
	* Supports ASR mode for the webapp (transcription).
	*/

	import * as ort from 'onnxruntime-web';
	import { AutoTokenizer, env } from '@huggingface/transformers';
	import { loadMelConfig, computeMelSpectrogram, loadAudioFile } from './audio-processor.js';

	// Cache configuration
	const CACHE_NAME = 'onnx-models-v1';
	const IDB_NAME = 'onnx-model-cache';
	const IDB_STORE = 'models';

	// IndexedDB helpers for fallback caching
	let idbPromise = null;

	function openIDB() {
	if (idbPromise) return idbPromise;

	idbPromise = new Promise((resolve, reject) => {
	const request = indexedDB.open(IDB_NAME, 1);
	request.onerror = () => reject(request.error);
	request.onsuccess = () => resolve(request.result);
	request.onupgradeneeded = (event) => {
	const db = event.target.result;
	if (!db.objectStoreNames.contains(IDB_STORE)) {
	db.createObjectStore(IDB_STORE);
	}
	};
	});

	return idbPromise;
	}

	async function idbGet(key) {
	try {
	const db = await openIDB();
	return new Promise((resolve, reject) => {
	const tx = db.transaction(IDB_STORE, 'readonly');
	const store = tx.objectStore(IDB_STORE);
	const request = store.get(key);
	request.onerror = () => reject(request.error);
	request.onsuccess = () => resolve(request.result);
	});
	} catch (e) {
	return null;
	}
	}

	async function idbSet(key, value) {
	try {
	const db = await openIDB();
	return new Promise((resolve, reject) => {
	const tx = db.transaction(IDB_STORE, 'readwrite');
	const store = tx.objectStore(IDB_STORE);
	const request = store.put(value, key);
	request.onerror = () => reject(request.error);
	request.onsuccess = () => resolve();
	});
	} catch (e) {
	// Ignore cache write failures
	}
	}

	// Special tokens for audio model
	const SPECIAL_TOKENS = {
	AUDIO_START: 128, // <\|audio_start\|>
	TEXT_START: 129, // <\|text_start\|>
	TEXT_END: 130, // <\|text_end\|>
	MIXED_START: 131, // <\|mixed_start\|>
	MIXED_END: 132, // <\|mixed_end\|>
	IM_END: 7, // <\|im_end\|>
	};

	// Audio codebook constants
	const NUM_CODEBOOKS = 8;
	const CODEBOOK_VOCAB = 2049;
	const END_OF_AUDIO_TOKEN = 2048;

	// Default system prompts (matching Python lfm2-audio-infer)
	const DEFAULT_SYSTEM_PROMPT_ASR = 'Perform ASR.';
	const DEFAULT_SYSTEM_PROMPT_TTS = 'Perform TTS. Use the UK female voice.';
	const DEFAULT_SYSTEM_PROMPT_INTERLEAVED = 'Respond with interleaved text and audio.';

	// Max tokens defaults (matching liquid-audio)
	// Each audio frame = 80ms (6x upsampling in detokenizer, 320 hop, 24kHz)
	// 1024 frames ≈ 82 seconds of audio
	const DEFAULT_MAX_TOKENS_AUDIO = 1024; // TTS and interleaved modes
	const DEFAULT_MAX_TOKENS_TEXT = 100; // ASR mode

	// Timestamped logging helper
	let _logStartTime = null;
	function log(...args) {
	if (_logStartTime === null) {
	_logStartTime = performance.now();
	}
	const elapsed = ((performance.now() - _logStartTime) / 1000).toFixed(2);
	console.log(`[${elapsed}s]`, ...args);
	}
	function logReset() {
	_logStartTime = performance.now();
	}

	/**
	* Fetch with caching support using Cache API or IndexedDB fallback
	*/
	async function fetchWithCache(url, options = {}) {
	if (!url.startsWith('http://') && !url.startsWith('https://')) {
	return fetch(url, options);
	}

	const fileName = url.split('/').pop();

	// Try Cache API first
	if (typeof caches !== 'undefined') {
	try {
	const cache = await caches.open(CACHE_NAME);
	const cached = await cache.match(url);
	if (cached) {
	console.log(`[Cache HIT] ${fileName}`);
	return cached;
	}

	console.log(`[Cache MISS] Fetching ${fileName}...`);
	const response = await fetch(url, options);

	if (response.ok) {
	cache.put(url, response.clone());
	}

	return response;
	} catch (e) {
	// Fall through to IndexedDB
	}
	}

	// Try IndexedDB fallback
	if (typeof indexedDB !== 'undefined') {
	try {
	const cached = await idbGet(url);
	if (cached) {
	console.log(`[IDB Cache HIT] ${fileName}`);
	return new Response(cached.data, {
	status: 200,
	headers: { 'Content-Type': cached.contentType \|\| 'application/octet-stream' },
	});
	}

	console.log(`[IDB Cache MISS] Fetching ${fileName}...`);
	const response = await fetch(url, options);

	if (response.ok) {
	const clone = response.clone();
	const data = await clone.arrayBuffer();
	const contentType = response.headers.get('Content-Type') \|\| 'application/octet-stream';
	await idbSet(url, { data, contentType });
	}

	return response;
	} catch (e) {
	console.warn('IndexedDB cache failed:', e);
	}
	}

	// Direct fetch as last resort
	console.log(`[No Cache] Fetching ${fileName}...`);
	return fetch(url, options);
	}

	/**
	* Clear the model cache (both Cache API and IndexedDB)
	*/
	export async function clearModelCache() {
	let deleted = false;

	// Clear Cache API
	if (typeof caches !== 'undefined') {
	try {
	deleted = await caches.delete(CACHE_NAME);
	} catch (e) {
	// Ignore
	}
	}

	// Clear IndexedDB
	if (typeof indexedDB !== 'undefined') {
	try {
	await new Promise((resolve, reject) => {
	const request = indexedDB.deleteDatabase(IDB_NAME);
	request.onerror = () => reject(request.error);
	request.onsuccess = () => resolve();
	});
	idbPromise = null; // Reset the cached promise
	deleted = true;
	} catch (e) {
	// Ignore
	}
	}

	console.log(deleted ? 'Model cache cleared' : 'No cache to clear');
	return deleted;
	}

	/**
	* Get cache storage usage info
	*/
	export async function getCacheInfo() {
	if ('storage' in navigator && 'estimate' in navigator.storage) {
	const estimate = await navigator.storage.estimate();
	return {
	used: estimate.usage \|\| 0,
	available: estimate.quota \|\| 0,
	};
	}
	return null;
	}

	/**
	* Load tokenizer from model path
	*/
	async function loadTokenizerFromPath(modelPath) {
	const isRemote = modelPath.startsWith('http://') \|\| modelPath.startsWith('https://');
	console.log(`Loading tokenizer from ${isRemote ? 'remote' : 'local'}: ${modelPath}`);

	const fetchOptions = isRemote ? { mode: 'cors', credentials: 'omit' } : {};

	const [tokenizerResponse, configResponse] = await Promise.all([
	fetchWithCache(`${modelPath}/tokenizer.json`, fetchOptions),
	fetchWithCache(`${modelPath}/tokenizer_config.json`, fetchOptions),
	]);

	if (!tokenizerResponse.ok) {
	throw new Error(`Failed to fetch tokenizer.json: ${tokenizerResponse.status}`);
	}
	if (!configResponse.ok) {
	throw new Error(`Failed to fetch tokenizer_config.json: ${configResponse.status}`);
	}

	const tokenizerJSON = await tokenizerResponse.text();
	const configJSON = await configResponse.text();

	// Parse tokenizer.json to extract special token IDs
	const tokenizerData = JSON.parse(tokenizerJSON);
	const specialTokens = {};

	if (tokenizerData.added_tokens) {
	for (const token of tokenizerData.added_tokens) {
	specialTokens[token.content] = token.id;
	}
	console.log('Found special tokens:', Object.keys(specialTokens).length);
	}

	// Create tokenizer using transformers.js
	const fakeModelId = `tokenizer-${Date.now()}`;

	const fileCache = {
	'tokenizer.json': tokenizerJSON,
	'tokenizer_config.json': configJSON,
	};

	const originalFetch = globalThis.fetch;
	globalThis.fetch = async (input, init) => {
	const url = typeof input === 'string' ? input : input.url;

	if (url.includes(fakeModelId)) {
	for (const [filename, content] of Object.entries(fileCache)) {
	if (url.includes(filename)) {
	return new Response(content, {
	status: 200,
	headers: { 'Content-Type': 'application/json' },
	});
	}
	}
	return new Response('Not found', { status: 404 });
	}

	return originalFetch(input, init);
	};

	const originalAllowLocal = env.allowLocalModels;
	env.allowLocalModels = false;

	try {
	const tokenizer = await AutoTokenizer.from_pretrained(fakeModelId);
	console.log('Tokenizer created successfully');
	return { tokenizer, specialTokens };
	} finally {
	globalThis.fetch = originalFetch;
	env.allowLocalModels = originalAllowLocal;
	}
	}

	export class AudioModel {
	constructor() {
	this.tokenizer = null;
	this.decoderSession = null;
	this.audioEncoderSession = null;
	this.audioEmbeddingSession = null;
	this.audioEmbeddingWeight = null; // Direct lookup (faster than ONNX)
	this.audioDetokenizerSession = null;
	this.vocoderSession = null;
	this.config = null;
	this.embedTokensWeight = null;

	// Model config
	this.hiddenSize = 2048;
	this.numLayers = 16;
	this.numKVHeads = 8;
	this.headDim = 64;
	this.convL = 3;
	this.layerTypes = [];
	this.vocabSize = 65536;

	// === Stateful cache for multi-turn conversation ===
	this.cache = null;
	this.cacheSeqLen = 0;
	}

	/**
	* Reset conversation state (KV cache).
	* Call this to start a new conversation.
	*/
	reset() {
	this.cache = null;
	this.cacheSeqLen = 0;
	log('Conversation state reset');
	}

	/**
	* Load the audio model from a directory
	* @param {string} modelPath - Path to model directory
	* @param {object} options - Loading options
	*/
	async load(modelPath, options = {}) {
	const { progressCallback, device = 'webgpu', quantization = null } = options;

	const report = (status, progress = 0, file = '') => {
	if (progressCallback) {
	progressCallback({ status, progress, file });
	}
	};

	const executionProviders = device === 'webgpu'
	? ['webgpu', 'wasm']
	: ['wasm'];

	try {
	// Load mel config for audio processing
	await loadMelConfig(modelPath);

	// Load tokenizer
	report('loading', 0, 'tokenizer');
	const { tokenizer } = await loadTokenizerFromPath(modelPath);
	this.tokenizer = tokenizer;

	// Load config
	report('loading', 5, 'config');
	const configResponse = await fetch(`${modelPath}/config.json`, {
	mode: 'cors',
	credentials: 'omit',
	});
	this.config = await configResponse.json();

	// Extract model dimensions from config
	const lfmConfig = this.config.lfm \|\| {};
	this.hiddenSize = lfmConfig.hidden_size \|\| 2048;
	this.numLayers = lfmConfig.num_hidden_layers \|\| 16;
	this.numKVHeads = lfmConfig.num_key_value_heads \|\| 8;
	this.headDim = Math.floor(this.hiddenSize / (lfmConfig.num_attention_heads \|\| 32));
	this.convL = lfmConfig.conv_L_cache \|\| 3;
	this.layerTypes = lfmConfig.layer_types \|\| [];
	this.vocabSize = lfmConfig.vocab_size \|\| 65536;

	console.log('Model config:', {
	hiddenSize: this.hiddenSize,
	numLayers: this.numLayers,
	numKVHeads: this.numKVHeads,
	headDim: this.headDim,
	});

	// Parse quantization config
	const quantConfig = typeof quantization === 'object' ? quantization : {
	decoder: quantization,
	audioEncoder: quantization,
	audioEmbedding: quantization,
	audioDetokenizer: quantization,
	vocoder: quantization,
	};

	// Helper to load ONNX model with external data
	const loadOnnxWithExternalData = async (name, progress, quantSuffix = null, extraOptions = {}) => {
	const suffix = quantSuffix ? `_${quantSuffix}` : '';
	const fileName = `${name}${suffix}`;
	report('loading', progress, `${fileName}.onnx`);

	const onnxPath = `${modelPath}/onnx/${fileName}.onnx`;
	const fetchOptions = { mode: 'cors', credentials: 'omit' };

	console.log(`Loading ${fileName}...`);

	const sessionOptions = { executionProviders, ...extraOptions };

	const onnxResponse = await fetchWithCache(onnxPath, fetchOptions);
	if (!onnxResponse.ok) {
	throw new Error(`Failed to fetch ${fileName}.onnx: ${onnxResponse.status}`);
	}
	const onnxBuffer = await onnxResponse.arrayBuffer();
	console.log(`Loaded ${fileName}.onnx: ${(onnxBuffer.byteLength / 1024 / 1024).toFixed(1)} MB`);

	// Load external data files
	sessionOptions.externalData = [];

	// Try single .onnx_data file
	const singleDataPath = `${modelPath}/onnx/${fileName}.onnx_data`;
	try {
	const dataResponse = await fetchWithCache(singleDataPath, fetchOptions);
	const contentType = dataResponse.headers.get('content-type') \|\| '';
	if (dataResponse.ok && !contentType.includes('text/html')) {
	const dataBuffer = await dataResponse.arrayBuffer();
	if (dataBuffer.byteLength > 1000) { // Sanity check
	console.log(`Loaded ${fileName}.onnx_data: ${(dataBuffer.byteLength / 1024 / 1024).toFixed(1)} MB`);
	sessionOptions.externalData.push({
	path: `${fileName}.onnx_data`,
	data: new Uint8Array(dataBuffer),
	});
	}
	}
	} catch (e) {
	// File doesn't exist
	}

	// Try numbered files - stop on first 404
	for (let i = 1; ; i++) {
	const numberedDataPath = `${modelPath}/onnx/${fileName}.onnx_data_${i}`;
	const dataResponse = await fetch(numberedDataPath, fetchOptions);
	if (dataResponse.status === 404 \|\| !dataResponse.ok) break;
	const contentType = dataResponse.headers.get('content-type') \|\| '';
	if (contentType.includes('text/html')) break;
	const dataBuffer = await dataResponse.arrayBuffer();
	if (dataBuffer.byteLength < 1000) break;
	console.log(`Loaded ${fileName}.onnx_data_${i}: ${(dataBuffer.byteLength / 1024 / 1024).toFixed(1)} MB`);
	sessionOptions.externalData.push({
	path: `${fileName}.onnx_data_${i}`,
	data: new Uint8Array(dataBuffer),
	});
	}

	if (sessionOptions.externalData.length === 0) {
	delete sessionOptions.externalData;
	}

	const session = await ort.InferenceSession.create(new Uint8Array(onnxBuffer), sessionOptions);
	console.log(`Session created for ${fileName}`);
	return session;
	};

	// Load decoder
	// On WebGPU: keep KV cache outputs on GPU to avoid GPU→CPU→GPU roundtrips between steps
	const decoderOpts = device === 'webgpu' ? (() => {
	const loc = {};
	for (let i = 0; i < this.layerTypes.length; i++) {
	if (this.layerTypes[i] === 'conv') {
	loc[`present_conv.${i}`] = 'gpu-buffer';
	} else {
	loc[`present.${i}.key`] = 'gpu-buffer';
	loc[`present.${i}.value`] = 'gpu-buffer';
	}
	}
	return { preferredOutputLocation: loc };
	})() : {};
	this.decoderSession = await loadOnnxWithExternalData('decoder', 10, quantConfig.decoder, decoderOpts);

	// Load embed_tokens weight for text embedding lookup
	report('loading', 30, 'embed_tokens');
	this.embedTokensWeight = await this.loadEmbedTokensWeight(modelPath);

	// Load audio encoder (for ASR)
	this.audioEncoderSession = await loadOnnxWithExternalData('audio_encoder', 50, quantConfig.audioEncoder);

	// Load audio embedding (for TTS) - try binary first, fallback to ONNX
	report('loading', 65, 'audio_embedding');
	this.audioEmbeddingWeight = await this.loadAudioEmbeddingWeight(modelPath);
	if (!this.audioEmbeddingWeight) {
	// Fallback to ONNX model
	this.audioEmbeddingSession = await loadOnnxWithExternalData('audio_embedding', 70, quantConfig.audioEmbedding);
	} else {
	console.log('Using direct audio embedding lookup (binary)');
	}

	// Load audio detokenizer (for TTS output)
	try {
	this.audioDetokenizerSession = await loadOnnxWithExternalData('audio_detokenizer', 85, quantConfig.audioDetokenizer);
	} catch (e) {
	console.warn('Audio detokenizer not available:', e);
	}

	// Load vocoder (for TTS)
	// On WebGPU: keep KV cache on GPU to avoid GPU→CPU→GPU roundtrips between steps
	try {
	const vocoderOpts = device === 'webgpu'
	? { preferredOutputLocation: { new_keys: 'gpu-buffer', new_values: 'gpu-buffer', depth_slices: 'gpu-buffer' } }
	: {};
	this.vocoderSession = await loadOnnxWithExternalData('vocoder_depthformer', 95, quantConfig.vocoder, vocoderOpts);
	} catch (e) {
	console.warn('Vocoder not available:', e);
	}

	report('done', 100, '');
	return true;

	} catch (error) {
	let errorMessage = error;
	if (typeof error === 'number') {
	errorMessage = `ONNX Runtime error code: ${error}`;
	} else if (error instanceof Error) {
	errorMessage = error.message;
	}
	console.error('Failed to load audio model:', errorMessage);
	throw new Error(errorMessage);
	}
	}

	/**
	* Get audio embeddings for given token indices and sum across codebooks
	*
	* Uses direct array indexing if binary weight available (fast),
	* falls back to ONNX session otherwise.
	*
	* @param {number[]} audioTokens - Array of 8 token indices (one per codebook)
	* @returns {Float32Array} Summed embedding [hiddenSize]
	*/
	async getAudioEmbedding(audioTokens) {
	const NUM_CODEBOOKS = 8;
	const hiddenSize = this.hiddenSize;
	const summedEmbeds = new Float32Array(hiddenSize);

	if (this.audioEmbeddingWeight) {
	// Direct lookup (much faster - no ONNX call)
	const weight = this.audioEmbeddingWeight.weight;
	for (let cb = 0; cb < NUM_CODEBOOKS; cb++) {
	const tokenIdx = audioTokens[cb];
	const offset = tokenIdx * hiddenSize;
	for (let h = 0; h < hiddenSize; h++) {
	summedEmbeds[h] += weight[offset + h];
	}
	}
	} else {
	// Fallback to ONNX session
	const audioTokensTensor = new ort.Tensor('int64', new BigInt64Array(audioTokens.map(BigInt)), [1, NUM_CODEBOOKS]);
	const result = await this.audioEmbeddingSession.run({ audio_codes: audioTokensTensor });
	const embeddings = result.audio_embeds.data;

	for (let cb = 0; cb < NUM_CODEBOOKS; cb++) {
	for (let h = 0; h < hiddenSize; h++) {
	summedEmbeds[h] += embeddings[cb * hiddenSize + h];
	}
	}
	}

	return summedEmbeds;
	}

	/**
	* Load audio_embedding.weight from raw binary file for direct lookup
	*
	* This eliminates ONNX model calls (352 per generation → 0).
	* Falls back to ONNX session if binary not available.
	*/
	async loadAudioEmbeddingWeight(modelPath) {
	const fetchOptions = { mode: 'cors', credentials: 'omit' };

	try {
	// Load metadata
	const metaResponse = await fetchWithCache(`${modelPath}/onnx/audio_embedding.json`, fetchOptions);
	if (!metaResponse.ok) {
	console.log('audio_embedding.json not found, will use ONNX model');
	return null;
	}
	const meta = await metaResponse.json();
	console.log('audio_embedding metadata:', meta);

	// Load binary weight
	const binResponse = await fetchWithCache(`${modelPath}/onnx/audio_embedding.bin`, fetchOptions);
	if (!binResponse.ok) {
	console.log('audio_embedding.bin not found, will use ONNX model');
	return null;
	}
	const buffer = await binResponse.arrayBuffer();
	const weight = new Float32Array(buffer);

	if (weight.length !== meta.vocab_size * meta.hidden_size) {
	console.error('audio_embedding size mismatch:', weight.length, 'expected:', meta.vocab_size * meta.hidden_size);
	return null;
	}

	console.log(`Loaded audio_embedding: [${meta.vocab_size}, ${meta.hidden_size}] (${(buffer.byteLength / 1e6).toFixed(1)} MB)`);
	return { weight, vocabSize: meta.vocab_size, hiddenSize: meta.hidden_size };
	} catch (e) {
	console.log('Failed to load audio_embedding.bin:', e);
	return null;
	}
	}

	/**
	* Load embed_tokens.weight from raw binary file for text embedding lookup
	*
	* The Python export saves embed_tokens.weight as:
	* - embed_tokens.bin: raw float32 binary (vocab_size * hidden_size * 4 bytes)
	* - embed_tokens.json: metadata (vocab_size, hidden_size)
	*/
	async loadEmbedTokensWeight(modelPath) {
	const fetchOptions = { mode: 'cors', credentials: 'omit' };

	// Load metadata
	const metaResponse = await fetchWithCache(`${modelPath}/onnx/embed_tokens.json`, fetchOptions);
	if (!metaResponse.ok) {
	console.warn('embed_tokens.json not found, TTS will be unavailable');
	return null;
	}
	const meta = await metaResponse.json();
	console.log('embed_tokens metadata:', meta);

	// Load binary weight
	const binResponse = await fetchWithCache(`${modelPath}/onnx/embed_tokens.bin`, fetchOptions);
	if (!binResponse.ok) {
	console.warn('embed_tokens.bin not found, TTS will be unavailable');
	return null;
	}
	const buffer = await binResponse.arrayBuffer();
	const weight = new Float32Array(buffer);

	if (weight.length !== meta.vocab_size * meta.hidden_size) {
	console.error('embed_tokens size mismatch:', weight.length, 'expected:', meta.vocab_size * meta.hidden_size);
	return null;
	}

	console.log(`Loaded embed_tokens: [${meta.vocab_size}, ${meta.hidden_size}] (${(buffer.byteLength / 1e6).toFixed(1)} MB)`);
	return { weight, vocabSize: meta.vocab_size, hiddenSize: meta.hidden_size };
	}

	/**
	* Get text embeddings for token IDs using pre-loaded weight
	* @param {number[]} tokenIds - Array of token IDs
	* @returns {ort.Tensor} - Embeddings tensor [1, seq_len, hidden_size]
	*/
	getTextEmbeddings(tokenIds) {
	if (!this.embedTokensWeight) {
	throw new Error('embed_tokens weight not loaded');
	}

	const { weight, hiddenSize } = this.embedTokensWeight;
	const seqLen = tokenIds.length;
	const embeddings = new Float32Array(seqLen * hiddenSize);

	for (let i = 0; i < seqLen; i++) {
	const tokenId = tokenIds[i];
	const srcOffset = tokenId * hiddenSize;
	const dstOffset = i * hiddenSize;
	embeddings.set(weight.subarray(srcOffset, srcOffset + hiddenSize), dstOffset);
	}

	return new ort.Tensor('float32', embeddings, [1, seqLen, hiddenSize]);
	}

	/**
	* Initialize KV cache for generation
	*/
	initializeCache() {
	const cache = {};

	for (let idx = 0; idx < this.layerTypes.length; idx++) {
	const layerType = this.layerTypes[idx];
	if (layerType === 'conv') {
	cache[`past_conv.${idx}`] = new ort.Tensor(
	'float32',
	new Float32Array(1 * this.hiddenSize * this.convL),
	[1, this.hiddenSize, this.convL]
	);
	} else {
	cache[`past_key_values.${idx}.key`] = new ort.Tensor(
	'float32',
	new Float32Array(0),
	[1, this.numKVHeads, 0, this.headDim]
	);
	cache[`past_key_values.${idx}.value`] = new ort.Tensor(
	'float32',
	new Float32Array(0),
	[1, this.numKVHeads, 0, this.headDim]
	);
	}
	}

	return cache;
	}

	/**
	* Update cache from decoder outputs
	*/
	updateCache(cache, outputs) {
	for (const name of Object.keys(outputs)) {
	if (name.startsWith('present_conv.')) {
	const cacheName = name.replace('present_conv', 'past_conv');
	if (cacheName in cache) {
	cache[cacheName] = outputs[name];
	}
	} else if (name.startsWith('present.')) {
	const cacheName = name.replace('present.', 'past_key_values.');
	if (cacheName in cache) {
	cache[cacheName] = outputs[name];
	}
	}
	}
	}

	/**
	* Run decoder with embeddings
	*/
	async runDecoder(embeds, attentionMask, cache) {
	const feeds = {
	inputs_embeds: embeds,
	attention_mask: attentionMask,
	...cache,
	};

	const outputs = await this.decoderSession.run(feeds);

	return {
	logits: outputs.logits,
	hiddenStates: outputs.hidden_states,
	outputs,
	};
	}

	/**
	* Sample next token
	*/
	sampleToken(logits, temperature = 0.7) {
	if (temperature === 0) {
	let maxIdx = 0;
	let maxVal = logits[0];
	for (let i = 1; i < logits.length; i++) {
	if (logits[i] > maxVal) {
	maxVal = logits[i];
	maxIdx = i;
	}
	}
	return maxIdx;
	}

	// Temperature sampling
	const scaledLogits = new Float32Array(logits.length);
	let maxLogit = -Infinity;
	for (let i = 0; i < logits.length; i++) {
	scaledLogits[i] = logits[i] / temperature;
	maxLogit = Math.max(maxLogit, scaledLogits[i]);
	}

	let sumExp = 0;
	for (let i = 0; i < scaledLogits.length; i++) {
	scaledLogits[i] = Math.exp(scaledLogits[i] - maxLogit);
	sumExp += scaledLogits[i];
	}

	const probs = new Float32Array(scaledLogits.length);
	for (let i = 0; i < probs.length; i++) {
	probs[i] = scaledLogits[i] / sumExp;
	}

	// Sample from distribution
	const r = Math.random();
	let cumsum = 0;
	for (let i = 0; i < probs.length; i++) {
	cumsum += probs[i];
	if (r < cumsum) return i;
	}
	return probs.length - 1;
	}

	/**
	* Transcribe audio to text (ASR mode)
	* @param {Float32Array} audioData - Audio samples in [-1, 1]
	* @param {number} sampleRate - Audio sample rate
	* @param {object} options - Generation options
	*/
	async transcribe(audioData, sampleRate, options = {}) {
	const {
	maxNewTokens = DEFAULT_MAX_TOKENS_TEXT,
	temperature = 0,
	systemPrompt = DEFAULT_SYSTEM_PROMPT_ASR,
	onToken,
	} = options;

	if (!this.audioEncoderSession) {
	throw new Error('Audio encoder not loaded');
	}

	if (!this.embedTokensWeight) {
	throw new Error('embed_tokens not loaded - required for ASR');
	}

	logReset();
	log('=== ASR Transcription ===');
	log('Audio samples:', audioData.length, 'Sample rate:', sampleRate);

	// 1. Compute mel spectrogram
	const { melFeatures, numFrames } = computeMelSpectrogram(audioData, sampleRate);
	log('Mel spectrogram frames:', numFrames);

	// 2. Run audio encoder
	const melTensor = new ort.Tensor(
	'float32',
	melFeatures,
	[1, numFrames, 128] // [batch, time, n_mels]
	);

	const melLengths = new ort.Tensor(
	'int64',
	new BigInt64Array([BigInt(numFrames)]),
	[1]
	);

	const encoderOutputs = await this.audioEncoderSession.run({
	mel_spectrogram: melTensor,
	mel_lengths: melLengths,
	});

	const audioEmbeds = encoderOutputs.audio_embeddings;
	log('Audio embeddings shape:', audioEmbeds.dims);

	// 3. Build prompt: prefix + audio + suffix
	const prefixText = `<\|startoftext\|><\|im_start\|>system\n${systemPrompt}<\|im_end\|>\n<\|im_start\|>user\n`;
	const suffixText = '<\|im_end\|>\n<\|im_start\|>assistant\n';

	// Use add_special_tokens: false to match Python behavior (prompt already has special tokens)
	const prefixIds = Array.from(this.tokenizer.encode(prefixText, { add_special_tokens: false }));
	const suffixIds = Array.from(this.tokenizer.encode(suffixText, { add_special_tokens: false }));

	log('Prefix tokens:', prefixIds.length, 'Suffix tokens:', suffixIds.length);

	// Get text embeddings
	const prefixEmbeds = this.getTextEmbeddings(prefixIds);
	const suffixEmbeds = this.getTextEmbeddings(suffixIds);

	// 4. Concatenate embeddings: prefix + audio + suffix
	const prefixLen = prefixIds.length;
	const audioLen = audioEmbeds.dims[1];
	const suffixLen = suffixIds.length;
	const totalLen = prefixLen + audioLen + suffixLen;

	const { hiddenSize } = this.embedTokensWeight;
	const allEmbeds = new Float32Array(totalLen * hiddenSize);

	// Copy prefix embeddings
	allEmbeds.set(prefixEmbeds.data, 0);
	// Copy audio embeddings
	allEmbeds.set(new Float32Array(audioEmbeds.data.buffer, audioEmbeds.data.byteOffset, audioLen * hiddenSize), prefixLen * hiddenSize);
	// Copy suffix embeddings
	allEmbeds.set(suffixEmbeds.data, (prefixLen + audioLen) * hiddenSize);

	const inputEmbeds = new ort.Tensor('float32', allEmbeds, [1, totalLen, hiddenSize]);
	const attentionMask = new ort.Tensor('int64', new BigInt64Array(totalLen).fill(1n), [1, totalLen]);

	// 5. Initialize cache and run prefill
	const cache = this.initializeCache();
	let { logits, hiddenStates, outputs } = await this.runDecoder(inputEmbeds, attentionMask, cache);
	this.updateCache(cache, outputs);

	// 6. Generate tokens
	const generatedTokens = [];
	let currentLen = totalLen;

	for (let i = 0; i < maxNewTokens; i++) {
	// Get logits for last position - shape is [1, seq_len, vocab_size]
	const logitsData = logits.data;
	const seqLen = logits.dims[1];
	const lastLogits = new Float32Array(this.vocabSize);
	const offset = (seqLen - 1) * this.vocabSize;
	for (let j = 0; j < this.vocabSize; j++) {
	lastLogits[j] = logitsData[offset + j];
	}
	const nextToken = this.sampleToken(lastLogits, temperature);

	// Check for stop tokens
	if (nextToken === this.tokenizer.eos_token_id \|\| nextToken === SPECIAL_TOKENS.IM_END) {
	log('Stop token reached');
	break;
	}

	generatedTokens.push(nextToken);
	if (onToken) {
	const text = this.tokenizer.decode(generatedTokens);
	onToken(text, nextToken);
	}

	// Get embedding for next token
	const nextEmbeds = this.getTextEmbeddings([nextToken]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);

	// Run decoder with single token
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, cache));
	this.updateCache(cache, outputs);
	}

	const result = this.tokenizer.decode(generatedTokens);
	log(`Generated ${generatedTokens.length} tokens: "${result}"`);
	return result;
	}

	/**
	* Generate response from messages
	* @param {Array} messages - Chat messages
	* @param {object} options - Generation options
	*/
	async generate(messages, options = {}) {
	const { maxNewTokens = 256, onToken, audioData = null, sampleRate = null } = options;

	// If audio data provided, do ASR
	if (audioData && sampleRate) {
	return this.transcribe(audioData, sampleRate, {
	maxNewTokens,
	onToken,
	});
	}

	// Text-only generation (simplified)
	const prompt = this.tokenizer.apply_chat_template(messages, {
	add_generation_prompt: true,
	tokenize: false,
	});

	const inputIds = this.tokenizer.encode(prompt);
	console.log('Input tokens:', inputIds.length);

	// Initialize cache
	const cache = this.initializeCache();
	const generatedTokens = [];

	// Note: Full implementation needs proper text embedding support
	// This is a placeholder that shows the model is loaded

	return '[Text generation requires full embedding support - model loaded successfully]';
	}

	/**
	* Initialize reusable vocoder tensors to reduce allocation overhead
	*/
	_initVocoderCache() {
	if (this._vocoderCache) return;

	const numLayers = 6;
	const numKvHeads = 8;
	const headDim = 32;

	// Pre-allocate data arrays
	const stepIdxData = new BigInt64Array(1);
	const prevTokenData = new BigInt64Array(1);
	const seqlensKData = new Int32Array(1);
	const totalSeqLenData = new Int32Array(1);

	// Pre-allocate tensors that can be reused
	this._vocoderCache = {
	hiddenTensor: null, // Created per-call since hiddenState changes
	stepIdxData,
	prevTokenData,
	seqlensKData,
	totalSeqLenData,
	// Pre-create reusable tensors (ONNX Runtime reads from the data array)
	stepIdxTensor: new ort.Tensor('int64', stepIdxData, []),
	prevTokenTensor: new ort.Tensor('int64', prevTokenData, [1]),
	seqlensKTensor: new ort.Tensor('int32', seqlensKData, [1]),
	totalSeqLenTensor: new ort.Tensor('int32', totalSeqLenData, []),
	emptyKeysData: new Float32Array(0),
	emptyValuesData: new Float32Array(0),
	emptyDepthSlicesData: new Float32Array(8 * 1024), // zeros for step 0
	// Reusable sampling arrays
	scaledLogits: new Float32Array(2049), // codebook vocab size
	indices: new Uint16Array(2049), // Use typed array for faster reset
	probs: new Float32Array(64), // top-k size
	};

	// Initialize indices
	for (let i = 0; i < 2049; i++) {
	this._vocoderCache.indices[i] = i;
	}
	}

	/**
	* Sample audio codes using vocoder depthformer
	* Optimized to reduce tensor creation overhead
	* @param {Float32Array} hiddenState - [hidden_size] hidden state
	* @param {number} temperature - Sampling temperature
	* @param {number} topK - Top-k sampling
	* @returns {number[]} - 8 codebook values
	*/
	async sampleAudioCodes(hiddenState, temperature = 0.8, topK = 64) {
	if (!this.vocoderSession) {
	throw new Error('Vocoder not loaded');
	}

	// Initialize cache on first call
	this._initVocoderCache();
	const cache = this._vocoderCache;

	const numCodebooks = 8;
	const numLayers = 6;
	const numKvHeads = 8;
	const headDim = 32;

	const codes = [];
	let prevToken = 0;

	// Create hidden state tensor (must be new since data changes)
	const hiddenTensor = new ort.Tensor('float32', hiddenState, [1, this.hiddenSize]);

	// Initialize empty KV cache
	let pastKeys = new ort.Tensor(
	'float32',
	cache.emptyKeysData,
	[numLayers, 1, numKvHeads, 0, headDim]
	);
	let pastValues = new ort.Tensor(
	'float32',
	cache.emptyValuesData,
	[numLayers, 1, numKvHeads, 0, headDim]
	);

	// Reuse step_idx and prev_token tensors by updating their data
	cache.stepIdxData[0] = 0n;
	cache.prevTokenData[0] = 0n;

	// depth_slices_in: zeros at step 0 (model ignores it), then fed back from output
	let depthSlicesIn = new ort.Tensor('float32', cache.emptyDepthSlicesData, [1, 8, 1024]);

	for (let i = 0; i < numCodebooks; i++) {
	// Update mutable tensor data (tensor objects reuse the underlying data arrays)
	cache.stepIdxData[0] = BigInt(i);
	cache.prevTokenData[0] = BigInt(prevToken);
	cache.seqlensKData[0] = i;
	cache.totalSeqLenData[0] = i + 1;

	const feeds = {
	hidden_states: hiddenTensor,
	depth_slices_in: depthSlicesIn,
	step_idx: cache.stepIdxTensor,
	prev_token: cache.prevTokenTensor,
	past_keys: pastKeys,
	past_values: pastValues,
	seqlens_k: cache.seqlensKTensor,
	total_seq_len: cache.totalSeqLenTensor,
	};

	const outputs = await this.vocoderSession.run(feeds);
	depthSlicesIn = outputs.depth_slices;
	const logits = outputs.logits.data;
	const vocabSize = logits.length;

	// Sample with temperature and top-k (reusing cached arrays)
	let token;
	if (temperature <= 0) {
	// Greedy
	token = 0;
	let maxVal = logits[0];
	for (let j = 1; j < vocabSize; j++) {
	if (logits[j] > maxVal) {
	maxVal = logits[j];
	token = j;
	}
	}
	} else {
	// Top-k sampling with reused arrays
	const scaledLogits = cache.scaledLogits;
	const indices = cache.indices;
	const probs = cache.probs;

	// Scale logits by temperature and find top-k in single pass
	// Use partial selection sort (O(k*n) which is fast for small k)
	for (let j = 0; j < vocabSize; j++) {
	scaledLogits[j] = logits[j] / temperature;
	indices[j] = j;
	}

	// Partial sort to get top-k
	for (let j = 0; j < topK; j++) {
	let maxIdx = j;
	for (let k = j + 1; k < vocabSize; k++) {
	if (scaledLogits[indices[k]] > scaledLogits[indices[maxIdx]]) {
	maxIdx = k;
	}
	}
	// Swap
	const tmp = indices[j];
	indices[j] = indices[maxIdx];
	indices[maxIdx] = tmp;
	}

	// Softmax over top-k
	const maxLogit = scaledLogits[indices[0]];
	let sumExp = 0;
	for (let j = 0; j < topK; j++) {
	probs[j] = Math.exp(scaledLogits[indices[j]] - maxLogit);
	sumExp += probs[j];
	}
	for (let j = 0; j < topK; j++) {
	probs[j] /= sumExp;
	}

	// Sample
	const r = Math.random();
	let cumsum = 0;
	token = indices[topK - 1]; // Default to last
	for (let j = 0; j < topK; j++) {
	cumsum += probs[j];
	if (r < cumsum) {
	token = indices[j];
	break;
	}
	}
	}

	codes.push(token);
	prevToken = token;

	// Update KV cache
	pastKeys = outputs.new_keys;
	pastValues = outputs.new_values;
	}

	return codes;
	}

	/**
	* Generate speech from text (TTS mode)
	* @param {string} text - Text to convert to speech
	* @param {object} options - Generation options
	* @returns {object} - { audioCodes: number[][], textTokens: number[] }
	*/
	async generateSpeech(text, options = {}) {
	const {
	maxNewTokens = DEFAULT_MAX_TOKENS_AUDIO,
	textTemperature = 0.7,
	audioTemperature = 0.8,
	audioTopK = 64,
	systemPrompt = DEFAULT_SYSTEM_PROMPT_TTS,
	onToken,
	onAudioFrame,
	} = options;

	logReset();
	log('=== TTS Generation ===');
	log('Text:', text);

	if (!this.embedTokensWeight) {
	throw new Error('embed_tokens not loaded - required for TTS');
	}

	if (!this.vocoderSession) {
	throw new Error('Vocoder not loaded - required for TTS');
	}

	// Build TTS prompt
	const prompt = `<\|startoftext\|><\|im_start\|>system\n${systemPrompt}<\|im_end\|>\n<\|im_start\|>user\n${text}<\|im_end\|>\n<\|im_start\|>assistant\n`;
	// Use add_special_tokens: false to match Python behavior (prompt already has special tokens)
	const inputIds = Array.from(this.tokenizer.encode(prompt, { add_special_tokens: false }));
	log('TTS prompt tokens:', inputIds.length);

	// Get embeddings and run prefill
	const inputEmbeds = this.getTextEmbeddings(inputIds);
	const cache = this.initializeCache();
	const attentionMask = new ort.Tensor('int64', new BigInt64Array(inputIds.length).fill(1n), [1, inputIds.length]);

	let { logits, hiddenStates, outputs } = await this.runDecoder(inputEmbeds, attentionMask, cache);
	this.updateCache(cache, outputs);

	// Phase 1: Generate text until <\|audio_start\|> token
	const textTokens = [];
	let currentLen = inputIds.length;
	let inAudioMode = false;
	let tokensGenerated = 0;

	while (tokensGenerated < maxNewTokens && !inAudioMode) {
	const logitsData = logits.data;
	const seqLen = logits.dims[1];
	// Get logits for last position - shape is [1, seq_len, vocab_size]
	const lastLogits = new Float32Array(this.vocabSize);
	const offset = (seqLen - 1) * this.vocabSize;
	for (let i = 0; i < this.vocabSize; i++) {
	lastLogits[i] = logitsData[offset + i];
	}
	const nextToken = this.sampleToken(lastLogits, textTemperature);

	tokensGenerated++;

	if (nextToken === this.tokenizer.eos_token_id) {
	console.warn('Model produced EOS before audio, TTS may not work');
	break;
	}

	if (nextToken === SPECIAL_TOKENS.AUDIO_START) {
	log('Model entered audio mode');
	inAudioMode = true;
	// Feed audio_start token to get hidden states for first audio frame
	const nextEmbeds = this.getTextEmbeddings([SPECIAL_TOKENS.AUDIO_START]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, cache));
	this.updateCache(cache, outputs);
	break;
	}

	textTokens.push(nextToken);

	// Continue text generation
	const nextEmbeds = this.getTextEmbeddings([nextToken]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, cache));
	this.updateCache(cache, outputs);
	}

	if (!inAudioMode) {
	console.warn('Model did not enter audio mode, forcing audio generation');
	// Force audio start token
	const nextEmbeds = this.getTextEmbeddings([SPECIAL_TOKENS.AUDIO_START]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, cache));
	this.updateCache(cache, outputs);
	tokensGenerated++;
	}

	// Phase 2: Generate audio frames using depthformer
	const audioCodes = [];
	const startTime = performance.now();

	while (tokensGenerated < maxNewTokens) {
	// Get hidden state for last position
	const hiddenData = hiddenStates.data;
	const seqLen = hiddenStates.dims[1];
	const lastHidden = hiddenData.slice((seqLen - 1) * this.hiddenSize, seqLen * this.hiddenSize);

	// Sample audio codes
	const frameCodes = await this.sampleAudioCodes(lastHidden, audioTemperature, audioTopK);

	// Check for end-of-audio
	// Only check first codebook (matching liquid-audio TTS behavior)
	if (frameCodes[0] >= END_OF_AUDIO_TOKEN) {
	log(`End of audio at frame ${audioCodes.length}`);
	break;
	}

	// Log progress periodically
	if (audioCodes.length % 50 === 0) {
	log(`Generated ${audioCodes.length} audio frames`);
	}

	audioCodes.push(frameCodes);
	tokensGenerated++;

	if (onAudioFrame) {
	onAudioFrame(frameCodes, audioCodes.length);
	}

	// Feed back audio codes to continue generation
	// Audio embedding expects tokens in range [0, 16392) where:
	// token = codebook_idx * 2049 + code_value
	const clampedCodes = frameCodes.map(c => Math.min(c, 2047));
	const audioTokens = clampedCodes.map((code, idx) => idx * CODEBOOK_VOCAB + code);

	// Get summed embeddings for all 8 codebooks
	const summedEmbeds = await this.getAudioEmbedding(audioTokens);
	const nextEmbeds = new ort.Tensor('float32', summedEmbeds, [1, 1, this.hiddenSize]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, cache));
	this.updateCache(cache, outputs);
	}

	const elapsed = (performance.now() - startTime) / 1000;
	const framesPerSec = audioCodes.length / elapsed;
	log(`Generated ${audioCodes.length} audio frames in ${elapsed.toFixed(2)}s (${framesPerSec.toFixed(1)} frames/s)`);

	const textOutput = textTokens.length > 0 ? this.tokenizer.decode(textTokens) : '';
	return { audioCodes, textTokens, textOutput };
	}

	/**
	* Generate interleaved response (mixed text/audio) with stateful KV cache.
	*
	* The cache is preserved between calls for multi-turn conversation.
	* Call reset() to start a new conversation.
	*
	* @param {Float32Array} audioData - Input audio samples
	* @param {number} sampleRate - Audio sample rate
	* @param {string} textPrompt - Optional text prompt (unused, for API compatibility)
	* @param {object} options - Generation options
	* @returns {object} - { text: string, audioCodes: number[][] }
	*/
	async generateInterleaved(audioData, sampleRate, textPrompt = '', options = {}) {
	const {
	maxNewTokens = DEFAULT_MAX_TOKENS_AUDIO,
	textTemperature = 1.0,
	audioTemperature = 1.0,
	audioTopK = 4,
	systemPrompt = DEFAULT_SYSTEM_PROMPT_INTERLEAVED,
	onToken,
	onAudioFrame,
	} = options;

	// Counter-based mode switching (matching liquid-audio)
	const INTERLEAVED_N_TEXT = 6;
	const INTERLEAVED_N_AUDIO = 12;

	logReset();
	log('=== Interleaved Generation ===');
	log('Cache state:', this.cache ? `exists (seq_len=${this.cacheSeqLen})` : 'null (new conversation)');
	log('Audio samples:', audioData.length, 'Sample rate:', sampleRate);

	if (!this.audioEncoderSession) {
	throw new Error('Audio encoder not loaded - required for interleaved mode');
	}

	if (!this.embedTokensWeight) {
	throw new Error('embed_tokens not loaded - required for interleaved mode');
	}

	if (!this.vocoderSession) {
	throw new Error('Vocoder not loaded - required for interleaved mode');
	}

	// Timing accumulators
	let timeAudioEncode = 0;
	let timePrefill = 0;
	let timeTextDecode = 0;
	let timeAudioDecode = 0;
	let timeVocoder = 0;
	let timeAudioEmbed = 0;

	// 1. Compute mel spectrogram and encode audio
	let tStep = performance.now();
	const { melFeatures, numFrames } = computeMelSpectrogram(audioData, sampleRate);
	const timeMel = performance.now() - tStep;

	const melTensor = new ort.Tensor('float32', melFeatures, [1, numFrames, 128]);
	const melLengths = new ort.Tensor('int64', new BigInt64Array([BigInt(numFrames)]), [1]);

	tStep = performance.now();
	const encoderOutputs = await this.audioEncoderSession.run({
	mel_spectrogram: melTensor,
	mel_lengths: melLengths,
	});
	timeAudioEncode = performance.now() - tStep;

	const audioEmbeds = encoderOutputs.audio_embeddings;
	log(`Mel: ${timeMel.toFixed(0)}ms, AudioEnc: ${timeAudioEncode.toFixed(0)}ms, frames: ${numFrames}`);

	const { hiddenSize } = this.embedTokensWeight;

	// 2. Build prompt based on whether this is first turn or continuation
	let inputEmbeds;
	let newSeqLen;

	if (this.cache === null) {
	// === First turn: full prompt with system message ===
	log('First turn - initializing conversation');
	this.cache = this.initializeCache();
	this.cacheSeqLen = 0;

	const prefixText = `<\|startoftext\|><\|im_start\|>system\n${systemPrompt}<\|im_end\|>\n<\|im_start\|>user\n`;
	const suffixText = '<\|im_end\|>\n<\|im_start\|>assistant\n';

	const prefixIds = Array.from(this.tokenizer.encode(prefixText, { add_special_tokens: false }));
	const suffixIds = Array.from(this.tokenizer.encode(suffixText, { add_special_tokens: false }));

	const prefixEmbeds = this.getTextEmbeddings(prefixIds);
	const suffixEmbeds = this.getTextEmbeddings(suffixIds);

	const prefixLen = prefixIds.length;
	const audioLen = audioEmbeds.dims[1];
	const suffixLen = suffixIds.length;
	newSeqLen = prefixLen + audioLen + suffixLen;

	const allEmbeds = new Float32Array(newSeqLen * hiddenSize);
	allEmbeds.set(prefixEmbeds.data, 0);
	allEmbeds.set(
	new Float32Array(audioEmbeds.data.buffer, audioEmbeds.data.byteOffset, audioLen * hiddenSize),
	prefixLen * hiddenSize
	);
	allEmbeds.set(suffixEmbeds.data, (prefixLen + audioLen) * hiddenSize);

	inputEmbeds = new ort.Tensor('float32', allEmbeds, [1, newSeqLen, hiddenSize]);
	} else {
	// === Continuation: user turn only ===
	log(`Continuing conversation (cache seq_len=${this.cacheSeqLen})`);

	const userPrefixText = '<\|im_start\|>user\n';
	const suffixText = '<\|im_end\|>\n<\|im_start\|>assistant\n';

	const userPrefixIds = Array.from(this.tokenizer.encode(userPrefixText, { add_special_tokens: false }));
	const suffixIds = Array.from(this.tokenizer.encode(suffixText, { add_special_tokens: false }));

	const userPrefixEmbeds = this.getTextEmbeddings(userPrefixIds);
	const suffixEmbeds = this.getTextEmbeddings(suffixIds);

	const userPrefixLen = userPrefixIds.length;
	const audioLen = audioEmbeds.dims[1];
	const suffixLen = suffixIds.length;
	newSeqLen = userPrefixLen + audioLen + suffixLen;

	const allEmbeds = new Float32Array(newSeqLen * hiddenSize);
	allEmbeds.set(userPrefixEmbeds.data, 0);
	allEmbeds.set(
	new Float32Array(audioEmbeds.data.buffer, audioEmbeds.data.byteOffset, audioLen * hiddenSize),
	userPrefixLen * hiddenSize
	);
	allEmbeds.set(suffixEmbeds.data, (userPrefixLen + audioLen) * hiddenSize);

	inputEmbeds = new ort.Tensor('float32', allEmbeds, [1, newSeqLen, hiddenSize]);
	}

	// 3. Run prefill with attention mask covering full sequence
	const totalLen = this.cacheSeqLen + newSeqLen;
	const attentionMask = new ort.Tensor('int64', new BigInt64Array(totalLen).fill(1n), [1, totalLen]);

	tStep = performance.now();
	let { logits, hiddenStates, outputs } = await this.runDecoder(inputEmbeds, attentionMask, this.cache);
	timePrefill = performance.now() - tStep;
	this.updateCache(this.cache, outputs);
	this.cacheSeqLen = totalLen;
	log(`Prefill: ${timePrefill.toFixed(0)}ms, new tokens: ${newSeqLen}, total: ${totalLen}`);

	// 4. Generate with counter-based mode switching
	const textTokens = [];
	const audioCodes = [];
	let currentLen = totalLen;
	let inAudioMode = false;
	let modalityLeft = INTERLEAVED_N_TEXT;
	let textDone = false;

	const startTime = performance.now();

	for (let step = 0; step < maxNewTokens; step++) {
	modalityLeft--;

	if (inAudioMode) {
	// Generate audio frame using depthformer
	const hiddenData = hiddenStates.data;
	const seqLen = hiddenStates.dims[1];
	const lastHidden = hiddenData.slice((seqLen - 1) * hiddenSize, seqLen * hiddenSize);

	tStep = performance.now();
	const frameCodes = await this.sampleAudioCodes(lastHidden, audioTemperature, audioTopK);
	timeVocoder += performance.now() - tStep;

	// Switch back to text after N audio frames (if text not done)
	if (modalityLeft <= 0 && !textDone) {
	inAudioMode = false;
	modalityLeft = INTERLEAVED_N_TEXT;
	}

	// Check for end of audio - first codebook == 2048 (matching liquid-audio)
	if (frameCodes[0] === END_OF_AUDIO_TOKEN) {
	log(`End of audio at step ${step}`);
	// Set all codes to 2048 (matching liquid-audio)
	for (let i = 0; i < NUM_CODEBOOKS; i++) {
	frameCodes[i] = END_OF_AUDIO_TOKEN;
	}
	inAudioMode = false;
	// Don't save this frame, but still feed it back
	} else {
	// Save valid frame (clamped to 0-2047)
	const clampedFrame = frameCodes.map(c => Math.min(c, 2047));
	audioCodes.push(clampedFrame);

	if (onAudioFrame) {
	onAudioFrame(clampedFrame, audioCodes.length);
	}

	if (audioCodes.length % 50 === 0) {
	log(`Generated ${audioCodes.length} audio frames`);
	}
	}

	// Get embeddings for next step (always feed back, even for 2048 frames)
	tStep = performance.now();
	const feedCodes = frameCodes.map(c => c === END_OF_AUDIO_TOKEN ? END_OF_AUDIO_TOKEN : Math.min(c, 2047));
	const audioTokens = feedCodes.map((code, idx) => idx * CODEBOOK_VOCAB + code);

	// Get summed embeddings for all 8 codebooks
	const summedEmbeds = await this.getAudioEmbedding(audioTokens);
	timeAudioEmbed += performance.now() - tStep;

	const nextEmbeds = new ort.Tensor('float32', summedEmbeds, [1, 1, hiddenSize]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	tStep = performance.now();
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, this.cache));
	timeAudioDecode += performance.now() - tStep;
	this.updateCache(this.cache, outputs);

	} else {
	// Generate text token
	const logitsData = logits.data;
	const seqLen = logits.dims[1];
	// Get logits for last position - shape is [1, seq_len, vocab_size]
	const lastLogits = new Float32Array(this.vocabSize);
	const offset = (seqLen - 1) * this.vocabSize;
	for (let i = 0; i < this.vocabSize; i++) {
	lastLogits[i] = logitsData[offset + i];
	}
	const token = this.sampleToken(lastLogits, textTemperature);

	// Check for end of turn
	if (token === this.tokenizer.eos_token_id \|\| token === SPECIAL_TOKENS.IM_END) {
	log(`End of turn at step ${step}`);
	break;
	}

	// Check for <\|text_end\|> token (130)
	if (token === SPECIAL_TOKENS.TEXT_END) {
	log(`Text end at step ${step}`);
	textDone = true;
	}

	// Switch to audio after N text tokens OR text_end
	if (modalityLeft <= 0 \|\| textDone) {
	inAudioMode = true;
	modalityLeft = INTERLEAVED_N_AUDIO;
	}

	textTokens.push(token);

	if (onToken) {
	const decodedText = this.tokenizer.decode(textTokens, { skip_special_tokens: true });
	onToken(decodedText, token);
	}

	// Get embedding for next step
	const nextEmbeds = this.getTextEmbeddings([token]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	tStep = performance.now();
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, this.cache));
	timeTextDecode += performance.now() - tStep;
	this.updateCache(this.cache, outputs);
	}
	}

	// 5. Feed <\|im_end\|> token to close assistant turn in cache
	const imEndEmbeds = this.getTextEmbeddings([SPECIAL_TOKENS.IM_END]);
	currentLen++;
	const finalMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ outputs } = await this.runDecoder(imEndEmbeds, finalMask, this.cache));
	this.updateCache(this.cache, outputs);
	this.cacheSeqLen = currentLen;

	// Decode with skip_special_tokens to clean up special tokens like <\|text_end\|>
	const text = this.tokenizer.decode(textTokens, { skip_special_tokens: true });

	// Print timing summary
	log(`=== Summary ===`);
	log(` Mel: ${timeMel.toFixed(0)}ms, AudioEnc: ${timeAudioEncode.toFixed(0)}ms, Prefill: ${timePrefill.toFixed(0)}ms`);
	log(` TextDec: ${timeTextDecode.toFixed(0)}ms (${textTokens.length} tok), AudioDec: ${timeAudioDecode.toFixed(0)}ms`);
	log(` Vocoder: ${timeVocoder.toFixed(0)}ms, AudioEmbed: ${timeAudioEmbed.toFixed(0)}ms`);
	log(`Output: ${textTokens.length} text tokens, ${audioCodes.length} audio frames`);
	log(`Text: "${text}"`);
	log(`Cache seq_len: ${this.cacheSeqLen}`);

	return { text, audioCodes };
	}

	/**
	* Generate interleaved response from text-only input (continuation turn).
	* Uses the stateful KV cache from previous turns. Produces both text AND audio.
	*
	* @param {string} userText - User's text message
	* @param {object} options - Generation options
	* @returns {object} - { text: string, audioCodes: number[][] }
	*/
	async generateInterleavedFromText(userText, options = {}) {
	const {
	maxNewTokens = DEFAULT_MAX_TOKENS_AUDIO,
	textTemperature = 1.0,
	audioTemperature = 1.0,
	audioTopK = 4,
	systemPrompt = DEFAULT_SYSTEM_PROMPT_INTERLEAVED,
	onToken,
	onAudioFrame,
	} = options;

	// Counter-based mode switching (matching liquid-audio)
	const INTERLEAVED_N_TEXT = 6;
	const INTERLEAVED_N_AUDIO = 12;

	logReset();
	log('=== Text-Only Interleaved Generation ===');
	log(`Cache state: ${this.cache ? `exists (seq_len=${this.cacheSeqLen})` : 'null (new conversation)'}`);
	log(`User text: ${userText}`);

	if (!this.embedTokensWeight) {
	throw new Error('embed_tokens not loaded - required for text generation');
	}

	if (!this.vocoderSession) {
	throw new Error('Vocoder not loaded - required for interleaved mode');
	}

	// Timing accumulators
	let timePrefill = 0;
	let timeTextDecode = 0;
	let timeAudioDecode = 0;
	let timeVocoder = 0;
	let timeAudioEmbed = 0;
	let tStep;

	const { hiddenSize } = this.embedTokensWeight;

	// Build prompt based on whether this is first turn or continuation
	let inputEmbeds;
	let newSeqLen;

	if (this.cache === null) {
	// === First turn: full prompt with system message ===
	log('First turn - initializing conversation');
	this.cache = this.initializeCache();
	this.cacheSeqLen = 0;

	const prefixText = `<\|startoftext\|><\|im_start\|>system\n${systemPrompt}<\|im_end\|>\n<\|im_start\|>user\n${userText}<\|im_end\|>\n<\|im_start\|>assistant\n`;

	const prefixIds = Array.from(this.tokenizer.encode(prefixText, { add_special_tokens: false }));
	const prefixEmbeds = this.getTextEmbeddings(prefixIds);

	newSeqLen = prefixIds.length;
	inputEmbeds = new ort.Tensor('float32', prefixEmbeds.data, [1, newSeqLen, hiddenSize]);
	} else {
	// === Continuation: user turn only ===
	log(`Continuing conversation (cache seq_len=${this.cacheSeqLen})`);

	const userTurnText = `<\|im_start\|>user\n${userText}<\|im_end\|>\n<\|im_start\|>assistant\n`;
	const userTurnIds = Array.from(this.tokenizer.encode(userTurnText, { add_special_tokens: false }));
	const userTurnEmbeds = this.getTextEmbeddings(userTurnIds);

	newSeqLen = userTurnIds.length;
	inputEmbeds = new ort.Tensor('float32', userTurnEmbeds.data, [1, newSeqLen, hiddenSize]);
	}

	// Run prefill with attention mask covering full sequence
	const totalLen = this.cacheSeqLen + newSeqLen;
	const attentionMask = new ort.Tensor('int64', new BigInt64Array(totalLen).fill(1n), [1, totalLen]);

	tStep = performance.now();
	let { logits, hiddenStates, outputs } = await this.runDecoder(inputEmbeds, attentionMask, this.cache);
	timePrefill = performance.now() - tStep;
	this.updateCache(this.cache, outputs);
	this.cacheSeqLen = totalLen;
	log(`Prefill: ${timePrefill.toFixed(0)}ms, new tokens: ${newSeqLen}, total: ${totalLen}`);

	// Generate with counter-based mode switching
	const textTokens = [];
	const audioCodes = [];
	let currentLen = totalLen;
	let inAudioMode = false;
	let modalityLeft = INTERLEAVED_N_TEXT;
	let textDone = false;

	for (let step = 0; step < maxNewTokens; step++) {
	modalityLeft--;

	if (inAudioMode) {
	// Generate audio frame using depthformer
	const hiddenData = hiddenStates.data;
	const seqLen = hiddenStates.dims[1];
	const lastHidden = hiddenData.slice((seqLen - 1) * hiddenSize, seqLen * hiddenSize);

	tStep = performance.now();
	const frameCodes = await this.sampleAudioCodes(lastHidden, audioTemperature, audioTopK);
	timeVocoder += performance.now() - tStep;

	// Switch back to text after N audio frames (if text not done)
	if (modalityLeft <= 0 && !textDone) {
	inAudioMode = false;
	modalityLeft = INTERLEAVED_N_TEXT;
	}

	// Check for end of audio
	if (frameCodes[0] === END_OF_AUDIO_TOKEN) {
	log(`End of audio at step ${step}`);
	for (let i = 0; i < NUM_CODEBOOKS; i++) {
	frameCodes[i] = END_OF_AUDIO_TOKEN;
	}
	inAudioMode = false;
	} else {
	const clampedFrame = frameCodes.map(c => Math.min(c, 2047));
	audioCodes.push(clampedFrame);

	if (onAudioFrame) {
	onAudioFrame(clampedFrame, audioCodes.length);
	}

	if (audioCodes.length % 50 === 0) {
	log(`Generated ${audioCodes.length} audio frames`);
	}
	}

	// Get embeddings for next step
	tStep = performance.now();
	const feedCodes = frameCodes.map(c => c === END_OF_AUDIO_TOKEN ? END_OF_AUDIO_TOKEN : Math.min(c, 2047));
	const audioTokens = feedCodes.map((code, idx) => idx * CODEBOOK_VOCAB + code);
	const summedEmbeds = await this.getAudioEmbedding(audioTokens);
	timeAudioEmbed += performance.now() - tStep;

	const nextEmbeds = new ort.Tensor('float32', summedEmbeds, [1, 1, hiddenSize]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	tStep = performance.now();
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, this.cache));
	timeAudioDecode += performance.now() - tStep;
	this.updateCache(this.cache, outputs);

	} else {
	// Generate text token
	const logitsData = logits.data;
	const seqLen = logits.dims[1];
	const lastLogits = new Float32Array(this.vocabSize);
	const offset = (seqLen - 1) * this.vocabSize;
	for (let i = 0; i < this.vocabSize; i++) {
	lastLogits[i] = logitsData[offset + i];
	}
	const token = this.sampleToken(lastLogits, textTemperature);

	// Check for end of turn
	if (token === this.tokenizer.eos_token_id \|\| token === SPECIAL_TOKENS.IM_END) {
	log(`End of turn at step ${step}`);
	break;
	}

	// Check for <\|text_end\|> token
	if (token === SPECIAL_TOKENS.TEXT_END) {
	log(`Text end at step ${step}`);
	textDone = true;
	}

	// Switch to audio after N text tokens OR text_end
	if (modalityLeft <= 0 \|\| textDone) {
	inAudioMode = true;
	modalityLeft = INTERLEAVED_N_AUDIO;
	}

	textTokens.push(token);

	if (onToken) {
	const decodedText = this.tokenizer.decode(textTokens, { skip_special_tokens: true });
	onToken(decodedText, token);
	}

	// Get embedding for next step
	const nextEmbeds = this.getTextEmbeddings([token]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	tStep = performance.now();
	({ logits, hiddenStates, outputs } = await this.runDecoder(nextEmbeds, nextMask, this.cache));
	timeTextDecode += performance.now() - tStep;
	this.updateCache(this.cache, outputs);
	}
	}

	// Feed <\|im_end\|> token to close assistant turn in cache
	const imEndEmbeds = this.getTextEmbeddings([SPECIAL_TOKENS.IM_END]);
	currentLen++;
	const finalMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ outputs } = await this.runDecoder(imEndEmbeds, finalMask, this.cache));
	this.updateCache(this.cache, outputs);
	this.cacheSeqLen = currentLen;

	const text = this.tokenizer.decode(textTokens, { skip_special_tokens: true });

	log(`=== Summary ===`);
	log(` Prefill: ${timePrefill.toFixed(0)}ms`);
	log(` TextDec: ${timeTextDecode.toFixed(0)}ms (${textTokens.length} tok), AudioDec: ${timeAudioDecode.toFixed(0)}ms`);
	log(` Vocoder: ${timeVocoder.toFixed(0)}ms, AudioEmbed: ${timeAudioEmbed.toFixed(0)}ms`);
	log(`Output: ${textTokens.length} text tokens, ${audioCodes.length} audio frames`);
	log(`Text: "${text}"`);
	log(`Cache seq_len: ${this.cacheSeqLen}`);

	return { text, audioCodes };
	}

	/**
	* Generate text-only response (for follow-up turns without audio).
	* Uses the stateful KV cache from previous interleaved turns.
	*
	* @param {string} userText - User's text input
	* @param {object} options - Generation options
	* @returns {object} - { text: string }
	*/
	async generateTextOnly(userText, options = {}) {
	const {
	maxNewTokens = 256,
	temperature = 0.7,
	systemPrompt = 'You are a helpful assistant.',
	onToken,
	} = options;

	logReset();
	log('=== Text-Only Generation ===');
	log('Cache state:', this.cache ? `exists (seq_len=${this.cacheSeqLen})` : 'null (new conversation)');
	log('User text:', userText);

	if (!this.embedTokensWeight) {
	throw new Error('embed_tokens not loaded');
	}

	const { hiddenSize } = this.embedTokensWeight;

	// Build prompt based on whether we have existing cache
	let inputEmbeds;
	let newSeqLen;

	if (this.cache === null) {
	// First turn: include system message
	log('First turn - initializing conversation');
	this.cache = this.initializeCache();
	this.cacheSeqLen = 0;

	const promptText = `<\|startoftext\|><\|im_start\|>system\n${systemPrompt}<\|im_end\|>\n<\|im_start\|>user\n${userText}<\|im_end\|>\n<\|im_start\|>assistant\n`;
	const promptIds = Array.from(this.tokenizer.encode(promptText, { add_special_tokens: false }));
	inputEmbeds = this.getTextEmbeddings(promptIds);
	newSeqLen = promptIds.length;
	} else {
	// Continuation: just user turn
	log(`Continuing conversation (cache seq_len=${this.cacheSeqLen})`);

	const turnText = `<\|im_start\|>user\n${userText}<\|im_end\|>\n<\|im_start\|>assistant\n`;
	const turnIds = Array.from(this.tokenizer.encode(turnText, { add_special_tokens: false }));
	inputEmbeds = this.getTextEmbeddings(turnIds);
	newSeqLen = turnIds.length;
	}

	// Run prefill
	const totalLen = this.cacheSeqLen + newSeqLen;
	const attentionMask = new ort.Tensor('int64', new BigInt64Array(totalLen).fill(1n), [1, totalLen]);

	let { logits, outputs } = await this.runDecoder(inputEmbeds, attentionMask, this.cache);
	this.updateCache(this.cache, outputs);
	this.cacheSeqLen = totalLen;

	// Generate tokens
	const textTokens = [];
	let currentLen = totalLen;

	for (let i = 0; i < maxNewTokens; i++) {
	const logitsData = logits.data;
	const seqLen = logits.dims[1];
	const lastLogits = new Float32Array(this.vocabSize);
	const offset = (seqLen - 1) * this.vocabSize;
	for (let j = 0; j < this.vocabSize; j++) {
	lastLogits[j] = logitsData[offset + j];
	}
	const nextToken = this.sampleToken(lastLogits, temperature);

	// Check for stop tokens
	if (nextToken === this.tokenizer.eos_token_id \|\| nextToken === SPECIAL_TOKENS.IM_END) {
	log('Stop token reached');
	break;
	}

	textTokens.push(nextToken);

	if (onToken) {
	const text = this.tokenizer.decode(textTokens, { skip_special_tokens: true });
	onToken(text, nextToken);
	}

	// Get embedding for next token
	const nextEmbeds = this.getTextEmbeddings([nextToken]);
	currentLen++;
	const nextMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ logits, outputs } = await this.runDecoder(nextEmbeds, nextMask, this.cache));
	this.updateCache(this.cache, outputs);
	}

	// Feed <\|im_end\|> to close turn
	const imEndEmbeds = this.getTextEmbeddings([SPECIAL_TOKENS.IM_END]);
	currentLen++;
	const finalMask = new ort.Tensor('int64', new BigInt64Array(currentLen).fill(1n), [1, currentLen]);
	({ outputs } = await this.runDecoder(imEndEmbeds, finalMask, this.cache));
	this.updateCache(this.cache, outputs);
	this.cacheSeqLen = currentLen;

	const text = this.tokenizer.decode(textTokens, { skip_special_tokens: true });
	log(`Generated ${textTokens.length} tokens: "${text}"`);
	log(`Cache seq_len: ${this.cacheSeqLen}`);

	return { text };
	}

	/**
	* Decode audio codes to waveform using audio detokenizer + ISTFT
	* @param {number[][]} audioCodes - Array of [8] codebook values per frame
	* @returns {Float32Array} - Audio waveform samples in [-1, 1]
	*/
	async decodeAudioCodes(audioCodes) {
	if (!this.audioDetokenizerSession) {
	throw new Error('Audio detokenizer not loaded');
	}

	if (audioCodes.length < 2) {
	console.warn('Not enough audio codes to decode');
	return new Float32Array(0);
	}

	const decodeStart = performance.now();
	log(`Decoding ${audioCodes.length} audio frames...`);

	// ISTFT parameters (fixed for this model)
	const nFft = 1280;
	const hopLength = 320;
	const winLength = 1280;
	const nFftBins = nFft / 2 + 1;

	// Stack codes: [T, 8] -> [8, T] and add batch -> [1, 8, T]
	const T = audioCodes.length;
	const codesTransposed = new BigInt64Array(8 * T);
	for (let t = 0; t < T; t++) {
	for (let cb = 0; cb < 8; cb++) {
	codesTransposed[cb * T + t] = BigInt(Math.min(audioCodes[t][cb], 2047));
	}
	}

	// Run detokenizer: [1, 8, T] -> [1, T, 1282]
	const codesTensor = new ort.Tensor('int64', codesTransposed, [1, 8, T]);
	const detokStart = performance.now();
	const detokOutputs = await this.audioDetokenizerSession.run({ audio_codes: codesTensor });
	const stftFeatures = detokOutputs.stft_features;
	log(`Detokenizer: ${(performance.now() - detokStart).toFixed(0)}ms, STFT frames: ${stftFeatures.dims[1]}`);

	// Get raw data - shape is [1, T, 1282], we need to skip batch dimension
	const stftData = stftFeatures.data;
	const actualT = stftFeatures.dims[1];

	// Convert to complex STFT: [log_magnitude \| angle] -> complex
	const complexStft = new Array(nFftBins);
	for (let f = 0; f < nFftBins; f++) {
	complexStft[f] = new Array(actualT);
	for (let t = 0; t < actualT; t++) {
	const logMag = stftData[t * 1282 + f];
	const angle = stftData[t * 1282 + nFftBins + f];
	const mag = Math.exp(logMag);
	// Store as [real, imag]
	complexStft[f][t] = [mag * Math.cos(angle), mag * Math.sin(angle)];
	}
	}

	// ISTFT with 'same' padding
	const istftStart = performance.now();
	const waveform = this.istftSamePadding(complexStft, nFft, hopLength, winLength, actualT);
	log(`ISTFT: ${(performance.now() - istftStart).toFixed(0)}ms`);

	// Find max/min without spread operator (avoid stack overflow on large arrays)
	let waveMax = -Infinity, waveMin = Infinity;
	for (let i = 0; i < waveform.length; i++) {
	if (waveform[i] > waveMax) waveMax = waveform[i];
	if (waveform[i] < waveMin) waveMin = waveform[i];
	}
	log('ISTFT output - length:', waveform.length, 'max:', waveMax.toFixed(4), 'min:', waveMin.toFixed(4));

	// Check for invalid values
	if (isNaN(waveMax) \|\| isNaN(waveMin) \|\| !isFinite(waveMax) \|\| !isFinite(waveMin)) {
	console.error('ISTFT produced invalid values (NaN/Inf)');
	return new Float32Array(0);
	}

	// Normalize to [-1, 1]
	let maxVal = Math.max(Math.abs(waveMax), Math.abs(waveMin));
	if (maxVal > 0) {
	for (let i = 0; i < waveform.length; i++) {
	waveform[i] = (waveform[i] / maxVal) * 0.9;
	}
	} else {
	console.warn('ISTFT produced all-zero waveform');
	}

	log(`Decoded audio: ${waveform.length} samples (${(waveform.length / 24000).toFixed(2)}s)`);
	return waveform;
	}

	/**
	* ISTFT with 'same' padding matching liquid_audio.
	* Uses Bluestein FFT for O(N log N) IRFFT on any size.
	*
	* Matches Python: np.fft.irfft(spec, n_fft, axis=0, norm="backward")
	*/
	istftSamePadding(complexStft, nFft, hopLength, winLength, T) {
	const N = complexStft.length; // nFftBins = nFft/2 + 1 = 641
	const pad = Math.floor((winLength - hopLength) / 2);

	// Generate Hann window
	const window = new Float32Array(winLength);
	for (let i = 0; i < winLength; i++) {
	window[i] = 0.5 * (1 - Math.cos(2 * Math.PI * i / (winLength - 1)));
	}

	// Initialize Bluestein FFT for size nFft (cached for reuse)
	if (!this._bluesteinCache \|\| this._bluesteinCache.n !== nFft) {
	this._bluesteinCache = this._initBluestein(nFft);
	}
	const bluestein = this._bluesteinCache;

	// Pre-allocate buffers for IFFT
	const fullRe = new Float32Array(nFft);
	const fullIm = new Float32Array(nFft);

	// Process all frames
	const ifftFrames = new Array(T);

	for (let t = 0; t < T; t++) {
	// Build full spectrum from one-sided (conjugate symmetry)
	fullRe.fill(0);
	fullIm.fill(0);

	// Copy positive frequencies
	for (let k = 0; k < N; k++) {
	fullRe[k] = complexStft[k][t][0];
	fullIm[k] = complexStft[k][t][1];
	}

	// Mirror negative frequencies (conjugate symmetry for real signal)
	for (let k = 1; k < N - 1; k++) {
	fullRe[nFft - k] = fullRe[k];
	fullIm[nFft - k] = -fullIm[k];
	}

	// IFFT using Bluestein: IFFT(X) = conj(FFT(conj(X))) / N
	// Conjugate input
	for (let i = 0; i < nFft; i++) fullIm[i] = -fullIm[i];
	// FFT
	this._bluesteinFFT(fullRe, fullIm, bluestein);
	// Conjugate and scale
	for (let i = 0; i < nFft; i++) {
	fullRe[i] /= nFft;
	fullIm[i] = -fullIm[i] / nFft;
	}

	// Apply window (take first winLength samples)
	const windowedFrame = new Float32Array(winLength);
	for (let n = 0; n < winLength; n++) {
	windowedFrame[n] = fullRe[n] * window[n];
	}
	ifftFrames[t] = windowedFrame;

	// Debug first frame
	if (t === 0) {
	let maxVal = 0;
	let hasNaN = false;
	for (let n = 0; n < winLength; n++) {
	if (isNaN(windowedFrame[n]) \|\| !isFinite(windowedFrame[n])) {
	hasNaN = true;
	break;
	}
	const absVal = Math.abs(windowedFrame[n] / (window[n] + 1e-10));
	if (absVal > maxVal) maxVal = absVal;
	}
	if (hasNaN) {
	console.error('IRFFT frame 0 contains NaN/Inf values!');
	}
	}
	}

	// Overlap-add
	const outputSize = (T - 1) * hopLength + winLength;
	const audio = new Float32Array(outputSize);
	const windowEnvelope = new Float32Array(outputSize);
	const windowSq = new Float32Array(winLength);
	for (let i = 0; i < winLength; i++) {
	windowSq[i] = window[i] * window[i];
	}

	for (let t = 0; t < T; t++) {
	const start = t * hopLength;
	for (let n = 0; n < winLength; n++) {
	audio[start + n] += ifftFrames[t][n];
	windowEnvelope[start + n] += windowSq[n];
	}
	}

	// Normalize and trim padding
	const trimmedLength = outputSize - 2 * pad;
	const trimmed = new Float32Array(trimmedLength);
	for (let i = 0; i < trimmedLength; i++) {
	const srcIdx = i + pad;
	if (windowEnvelope[srcIdx] > 1e-8) {
	trimmed[i] = audio[srcIdx] / windowEnvelope[srcIdx];
	} else {
	trimmed[i] = audio[srcIdx];
	}
	}

	return trimmed;
	}

	/**
	* Initialize Bluestein FFT for size n (any size, not just power of 2)
	*/
	_initBluestein(n) {
	// Bluestein's algorithm: converts any-size FFT to power-of-2 FFT via convolution
	// FFT size for convolution: next power of 2 >= 2n - 1
	let m = 1;
	while (m < 2 * n - 1) m <<= 1;

	// Chirp sequence: W_n^(k^2/2) = exp(-πi * k² / n)
	const chirpRe = new Float32Array(n);
	const chirpIm = new Float32Array(n);
	for (let k = 0; k < n; k++) {
	const angle = Math.PI * k * k / n;
	chirpRe[k] = Math.cos(angle);
	chirpIm[k] = -Math.sin(angle); // exp(-i*angle)
	}

	// Precompute FFT of chirp filter (b sequence)
	// b[k] = conj(chirp[k]) for k in [0, n-1]
	// b[m-k] = conj(chirp[k]) for k in [1, n-1]
	// conj(chirp[k]) = chirpRe[k] - i*chirpIm[k]
	const bRe = new Float32Array(m);
	const bIm = new Float32Array(m);
	bRe[0] = chirpRe[0];
	bIm[0] = -chirpIm[0]; // conjugate
	for (let k = 1; k < n; k++) {
	bRe[k] = chirpRe[k];
	bIm[k] = -chirpIm[k]; // conjugate
	bRe[m - k] = chirpRe[k];
	bIm[m - k] = -chirpIm[k]; // conjugate
	}

	// FFT of b (in-place)
	this._fftRadix2InPlace(bRe, bIm, m, false);

	// Precompute twiddle factors for radix-2 FFT of size m
	const twiddleRe = new Float32Array(m / 2);
	const twiddleIm = new Float32Array(m / 2);
	for (let i = 0; i < m / 2; i++) {
	const angle = -2 * Math.PI * i / m;
	twiddleRe[i] = Math.cos(angle);
	twiddleIm[i] = Math.sin(angle);
	}

	return { n, m, chirpRe, chirpIm, bRe, bIm, twiddleRe, twiddleIm };
	}

	/**
	* Bluestein FFT for any size
	*/
	_bluesteinFFT(re, im, cache) {
	const { n, m, chirpRe, chirpIm, bRe, bIm, twiddleRe, twiddleIm } = cache;

	// a[k] = x[k] * chirp[k] for k in [0, n-1], zero-padded to m
	// chirp[k] = chirpRe[k] + i*chirpIm[k]
	// (re + iim) (chirpRe + ichirpIm) = (rechirpRe - imchirpIm) + i(imchirpRe + rechirpIm)
	const aRe = new Float32Array(m);
	const aIm = new Float32Array(m);
	for (let k = 0; k < n; k++) {
	aRe[k] = re[k] * chirpRe[k] - im[k] * chirpIm[k];
	aIm[k] = im[k] * chirpRe[k] + re[k] * chirpIm[k];
	}

	// FFT of a
	this._fftRadix2(aRe, aIm, twiddleRe, twiddleIm);

	// Pointwise multiply: a = a * b
	for (let k = 0; k < m; k++) {
	const tmpRe = aRe[k] * bRe[k] - aIm[k] * bIm[k];
	const tmpIm = aRe[k] * bIm[k] + aIm[k] * bRe[k];
	aRe[k] = tmpRe;
	aIm[k] = tmpIm;
	}

	// IFFT of a (using FFT with conjugate trick)
	for (let k = 0; k < m; k++) aIm[k] = -aIm[k];
	this._fftRadix2(aRe, aIm, twiddleRe, twiddleIm);
	for (let k = 0; k < m; k++) {
	aRe[k] /= m;
	aIm[k] = -aIm[k] / m;
	}

	// X[k] = chirp[k] * y[k]
	// Same multiplication as for a: (aRe + iaIm) (chirpRe + i*chirpIm)
	for (let k = 0; k < n; k++) {
	re[k] = aRe[k] * chirpRe[k] - aIm[k] * chirpIm[k];
	im[k] = aIm[k] * chirpRe[k] + aRe[k] * chirpIm[k];
	}
	}

	/**
	* In-place radix-2 FFT (Cooley-Tukey) with precomputed twiddles
	*/
	_fftRadix2(re, im, twiddleRe, twiddleIm) {
	const n = re.length;

	// Bit-reversal permutation
	for (let i = 0, j = 0; i < n; i++) {
	if (i < j) {
	let tmp = re[i]; re[i] = re[j]; re[j] = tmp;
	tmp = im[i]; im[i] = im[j]; im[j] = tmp;
	}
	let k = n >> 1;
	while (k > 0 && k <= j) { j -= k; k >>= 1; }
	j += k;
	}

	// Cooley-Tukey butterflies
	for (let len = 2; len <= n; len <<= 1) {
	const halfLen = len >> 1;
	const step = n / len;
	for (let i = 0; i < n; i += len) {
	for (let j = 0; j < halfLen; j++) {
	const twIdx = j * step;
	const wRe = twiddleRe[twIdx];
	const wIm = twiddleIm[twIdx];
	const u = i + j;
	const v = u + halfLen;
	const tRe = wRe * re[v] - wIm * im[v];
	const tIm = wRe * im[v] + wIm * re[v];
	re[v] = re[u] - tRe;
	im[v] = im[u] - tIm;
	re[u] += tRe;
	im[u] += tIm;
	}
	}
	}
	}

	/**
	* In-place radix-2 FFT without precomputed twiddles (for initialization)
	*/
	_fftRadix2InPlace(re, im, n, inverse = false) {
	// Bit-reversal
	for (let i = 0, j = 0; i < n; i++) {
	if (i < j) {
	let tmp = re[i]; re[i] = re[j]; re[j] = tmp;
	tmp = im[i]; im[i] = im[j]; im[j] = tmp;
	}
	let k = n >> 1;
	while (k > 0 && k <= j) { j -= k; k >>= 1; }
	j += k;
	}

	// Butterflies
	const sign = inverse ? 1 : -1;
	for (let len = 2; len <= n; len <<= 1) {
	const halfLen = len >> 1;
	const angle = sign * 2 * Math.PI / len;
	const wRe = Math.cos(angle);
	const wIm = Math.sin(angle);
	for (let i = 0; i < n; i += len) {
	let curRe = 1, curIm = 0;
	for (let j = 0; j < halfLen; j++) {
	const u = i + j;
	const v = u + halfLen;
	const tRe = curRe * re[v] - curIm * im[v];
	const tIm = curRe * im[v] + curIm * re[v];
	re[v] = re[u] - tRe;
	im[v] = im[u] - tIm;
	re[u] += tRe;
	im[u] += tIm;
	const newRe = curRe * wRe - curIm * wIm;
	curIm = curRe * wIm + curIm * wRe;
	curRe = newRe;
	}
	}
	}

	if (inverse) {
	for (let i = 0; i < n; i++) {
	re[i] /= n;
	im[i] /= n;
	}
	}
	}

	/**
	* Free resources
	*/
	dispose() {
	this.tokenizer = null;
	this.decoderSession = null;
	this.audioEncoderSession = null;
	this.audioEmbeddingSession = null;
	this.audioEmbeddingWeight = null;
	this.audioDetokenizerSession = null;
	this.vocoderSession = null;
	this.embedTokensWeight = null;
	}
	}

	// Re-export audio utilities
	export { loadAudioFile };

	export default AudioModel;