function freeConfig(config, Module) { if ('buffer' in config) { Module._free(config.buffer); } if ('config' in config) { freeConfig(config.config, Module) } if ('matcha' in config) { freeConfig(config.matcha, Module) } if ('kokoro' in config) { freeConfig(config.kokoro, Module) } if ('kitten' in config) { freeConfig(config.kitten, Module) } if ('zipvoice' in config) { freeConfig(config.zipvoice, Module) } if ('pocket' in config) { freeConfig(config.pocket, Module) } if ('supertonic' in config) { freeConfig(config.supertonic, Module) } if (config.ptr) { Module._free(config.ptr); } } // The user should free the returned pointers function initSherpaOnnxOfflineTtsVitsModelConfig(config, Module) { const modelLen = Module.lengthBytesUTF8(config.model || '') + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const dictDir = '' const dictDirLen = Module.lengthBytesUTF8(dictDir) + 1; const n = modelLen + lexiconLen + tokensLen + dataDirLen + dictDirLen; const buffer = Module._malloc(n); const len = 8 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8(config.model || '', buffer + offset, modelLen); offset += modelLen; Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); offset += lexiconLen; Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); offset += tokensLen; Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); offset += dataDirLen; Module.stringToUTF8(dictDir, buffer + offset, dictDirLen); offset += dictDirLen; offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += modelLen; Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += lexiconLen; Module.setValue(ptr + 8, buffer + offset, 'i8*'); offset += tokensLen; Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += dataDirLen; Module.setValue(ptr + 16, config.noiseScale || 0.667, 'float'); Module.setValue(ptr + 20, config.noiseScaleW || 0.8, 'float'); Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float'); Module.setValue(ptr + 28, buffer + offset, 'i8*'); offset += dictDirLen; return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) { const acousticModelLen = Module.lengthBytesUTF8(config.acousticModel) + 1; const vocoderLen = Module.lengthBytesUTF8(config.vocoder) + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const dictDir = ''; const dictDirLen = Module.lengthBytesUTF8(dictDir) + 1; const n = acousticModelLen + vocoderLen + lexiconLen + tokensLen + dataDirLen + dictDirLen; const buffer = Module._malloc(n); const len = 8 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8( config.acousticModel || '', buffer + offset, acousticModelLen); offset += acousticModelLen; Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen); offset += vocoderLen; Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); offset += lexiconLen; Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); offset += tokensLen; Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); offset += dataDirLen; Module.stringToUTF8(dictDir, buffer + offset, dictDirLen); offset += dictDirLen; offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += acousticModelLen; Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += vocoderLen; Module.setValue(ptr + 8, buffer + offset, 'i8*'); offset += lexiconLen; Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += tokensLen; Module.setValue(ptr + 16, buffer + offset, 'i8*'); offset += dataDirLen; Module.setValue(ptr + 20, config.noiseScale || 0.667, 'float'); Module.setValue(ptr + 24, config.lengthScale || 1.0, 'float'); Module.setValue(ptr + 28, buffer + offset, 'i8*'); offset += dictDirLen; return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) { const modelLen = Module.lengthBytesUTF8(config.model) + 1; const voicesLen = Module.lengthBytesUTF8(config.voices) + 1; const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const dictDir = ''; const dictDirLen = Module.lengthBytesUTF8(dictDir) + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; const langLen = Module.lengthBytesUTF8(config.lang || '') + 1; const n = modelLen + voicesLen + tokensLen + dataDirLen + dictDirLen + lexiconLen + langLen; const buffer = Module._malloc(n); const len = 8 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8(config.model || '', buffer + offset, modelLen); offset += modelLen; Module.stringToUTF8(config.voices || '', buffer + offset, voicesLen); offset += voicesLen; Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); offset += tokensLen; Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); offset += dataDirLen; Module.stringToUTF8(dictDir, buffer + offset, dictDirLen); offset += dictDirLen; Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); offset += lexiconLen; Module.stringToUTF8(config.lang || '', buffer + offset, langLen); offset += langLen; offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += modelLen; Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += voicesLen; Module.setValue(ptr + 8, buffer + offset, 'i8*'); offset += tokensLen; Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += dataDirLen; Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float'); Module.setValue(ptr + 20, buffer + offset, 'i8*'); offset += dictDirLen; Module.setValue(ptr + 24, buffer + offset, 'i8*'); offset += lexiconLen; Module.setValue(ptr + 28, buffer + offset, 'i8*'); offset += langLen; return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsKittenModelConfig(config, Module) { const modelLen = Module.lengthBytesUTF8(config.model) + 1; const voicesLen = Module.lengthBytesUTF8(config.voices) + 1; const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const n = modelLen + voicesLen + tokensLen + dataDirLen; const buffer = Module._malloc(n); const len = 5 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8(config.model || '', buffer + offset, modelLen); offset += modelLen; Module.stringToUTF8(config.voices || '', buffer + offset, voicesLen); offset += voicesLen; Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); offset += tokensLen; Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); offset += dataDirLen; offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += modelLen; Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += voicesLen; Module.setValue(ptr + 8, buffer + offset, 'i8*'); offset += tokensLen; Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += dataDirLen; Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float'); return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsZipVoiceModelConfig(config, Module) { const tokensLen = Module.lengthBytesUTF8(config.tokens || '') + 1; const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1; const dataDirLen = Module.lengthBytesUTF8(config.dataDir || '') + 1; const lexiconLen = Module.lengthBytesUTF8(config.lexicon || '') + 1; const n = tokensLen + encoderLen + decoderLen + vocoderLen + dataDirLen + lexiconLen; const buffer = Module._malloc(n); const len = 10 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8(config.tokens || '', buffer + offset, tokensLen); offset += tokensLen; Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); offset += encoderLen; Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); offset += decoderLen; Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen); offset += vocoderLen; Module.stringToUTF8(config.dataDir || '', buffer + offset, dataDirLen); offset += dataDirLen; Module.stringToUTF8(config.lexicon || '', buffer + offset, lexiconLen); offset += lexiconLen; offset = 0; Module.setValue(ptr, buffer + offset, 'i8*'); offset += tokensLen; Module.setValue(ptr + 4, buffer + offset, 'i8*'); offset += encoderLen; Module.setValue(ptr + 8, buffer + offset, 'i8*'); offset += decoderLen; Module.setValue(ptr + 12, buffer + offset, 'i8*'); offset += vocoderLen; Module.setValue(ptr + 16, buffer + offset, 'i8*'); offset += dataDirLen; Module.setValue(ptr + 20, buffer + offset, 'i8*'); offset += lexiconLen; Module.setValue(ptr + 24, config.featScale || 0.1, 'float'); Module.setValue(ptr + 28, config.tShift || 0.5, 'float'); Module.setValue(ptr + 32, config.targetRMS || 0.1, 'float'); Module.setValue(ptr + 36, config.guidanceScale || 1.0, 'float'); return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsPocketModelConfig(config, Module) { const lmFlowLen = Module.lengthBytesUTF8(config.lmFlow || '') + 1; const lmMainLen = Module.lengthBytesUTF8(config.lmMain || '') + 1; const encoderLen = Module.lengthBytesUTF8(config.encoder || '') + 1; const decoderLen = Module.lengthBytesUTF8(config.decoder || '') + 1; const textConditionerLen = Module.lengthBytesUTF8(config.textConditioner || '') + 1; const vocabJsonLen = Module.lengthBytesUTF8(config.vocabJson || '') + 1; const tokenScoresJsonLen = Module.lengthBytesUTF8(config.tokenScoresJson || '') + 1; const n = lmFlowLen + lmMainLen + encoderLen + decoderLen + textConditionerLen + vocabJsonLen + tokenScoresJsonLen; const buffer = Module._malloc(n); const len = 8 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8(config.lmFlow || '', buffer + offset, lmFlowLen); offset += lmFlowLen; Module.stringToUTF8(config.lmMain || '', buffer + offset, lmMainLen); offset += lmMainLen; Module.stringToUTF8(config.encoder || '', buffer + offset, encoderLen); offset += encoderLen; Module.stringToUTF8(config.decoder || '', buffer + offset, decoderLen); offset += decoderLen; Module.stringToUTF8( config.textConditioner || '', buffer + offset, textConditionerLen); offset += textConditionerLen; Module.stringToUTF8(config.vocabJson || '', buffer + offset, vocabJsonLen); offset += vocabJsonLen; Module.stringToUTF8( config.tokenScoresJson || '', buffer + offset, tokenScoresJsonLen); offset += tokenScoresJsonLen; offset = 0; Module.setValue(ptr + 0 * 4, buffer + offset, 'i8*'); offset += lmFlowLen; Module.setValue(ptr + 1 * 4, buffer + offset, 'i8*'); offset += lmMainLen; Module.setValue(ptr + 2 * 4, buffer + offset, 'i8*'); offset += encoderLen; Module.setValue(ptr + 3 * 4, buffer + offset, 'i8*'); offset += decoderLen; Module.setValue(ptr + 4 * 4, buffer + offset, 'i8*'); offset += textConditionerLen; Module.setValue(ptr + 5 * 4, buffer + offset, 'i8*'); offset += vocabJsonLen; Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*'); offset += tokenScoresJsonLen; Module.setValue( ptr + 7 * 4, config.voiceEmbeddingCacheCapacity !== undefined ? config.voiceEmbeddingCacheCapacity : 50, 'i32'); return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) { const durationPredictorLen = Module.lengthBytesUTF8(config.durationPredictor || '') + 1; const textEncoderLen = Module.lengthBytesUTF8(config.textEncoder || '') + 1; const vectorEstimatorLen = Module.lengthBytesUTF8(config.vectorEstimator || '') + 1; const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1; const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1; const unicodeIndexerLen = Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1; const voiceStyleLen = Module.lengthBytesUTF8(config.voiceStyle || '') + 1; const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen + vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen; const buffer = Module._malloc(n); const len = 7 * 4; const ptr = Module._malloc(len); let offset = 0; Module.stringToUTF8( config.durationPredictor || '', buffer + offset, durationPredictorLen); offset += durationPredictorLen; Module.stringToUTF8( config.textEncoder || '', buffer + offset, textEncoderLen); offset += textEncoderLen; Module.stringToUTF8( config.vectorEstimator || '', buffer + offset, vectorEstimatorLen); offset += vectorEstimatorLen; Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen); offset += vocoderLen; Module.stringToUTF8(config.ttsJson || '', buffer + offset, ttsJsonLen); offset += ttsJsonLen; Module.stringToUTF8( config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen); offset += unicodeIndexerLen; Module.stringToUTF8(config.voiceStyle || '', buffer + offset, voiceStyleLen); offset += voiceStyleLen; offset = 0; Module.setValue(ptr + 0 * 4, buffer + offset, 'i8*'); offset += durationPredictorLen; Module.setValue(ptr + 1 * 4, buffer + offset, 'i8*'); offset += textEncoderLen; Module.setValue(ptr + 2 * 4, buffer + offset, 'i8*'); offset += vectorEstimatorLen; Module.setValue(ptr + 3 * 4, buffer + offset, 'i8*'); offset += vocoderLen; Module.setValue(ptr + 4 * 4, buffer + offset, 'i8*'); offset += ttsJsonLen; Module.setValue(ptr + 5 * 4, buffer + offset, 'i8*'); offset += unicodeIndexerLen; Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*'); offset += voiceStyleLen; return { buffer: buffer, ptr: ptr, len: len, }; } function initSherpaOnnxOfflineTtsModelConfig(config, Module) { if (!('offlineTtsVitsModelConfig' in config)) { config.offlineTtsVitsModelConfig = { model: '', lexicon: '', tokens: '', noiseScale: 0.667, noiseScaleW: 0.8, lengthScale: 1.0, dataDir: '', }; } if (!('offlineTtsMatchaModelConfig' in config)) { config.offlineTtsMatchaModelConfig = { acousticModel: '', vocoder: '', lexicon: '', tokens: '', noiseScale: 0.667, lengthScale: 1.0, dataDir: '', }; } if (!('offlineTtsKokoroModelConfig' in config)) { config.offlineTtsKokoroModelConfig = { model: '', voices: '', tokens: '', lengthScale: 1.0, dataDir: '', lexicon: '', lang: '', }; } if (!('offlineTtsKittenModelConfig' in config)) { config.offlineTtsKittenModelConfig = { model: '', voices: '', tokens: '', lengthScale: 1.0, }; } if (!('offlineTtsZipVoiceModelConfig' in config)) { config.offlineTtsZipVoiceModelConfig = { tokens: '', encoder: '', decoder: '', vocoder: '', dataDir: '', lexicon: '', featScale: 0.1, tShift: 0.5, targetRMS: 0.1, guidanceScale: 1.0, }; } if (!('offlineTtsPocketModelConfig' in config)) { config.offlineTtsPocketModelConfig = { lmFlow: '', lmMain: '', encoder: '', decoder: '', textConditioner: '', vocabJson: '', tokenScoresJson: '', voiceEmbeddingCacheCapacity: 50, }; } if (!('offlineTtsSupertonicModelConfig' in config)) { config.offlineTtsSupertonicModelConfig = { durationPredictor: '', textEncoder: '', vectorEstimator: '', vocoder: '', ttsJson: '', unicodeIndexer: '', voiceStyle: '', }; } const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig( config.offlineTtsVitsModelConfig, Module); const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig( config.offlineTtsMatchaModelConfig, Module); const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig( config.offlineTtsKokoroModelConfig, Module); const kittenModelConfig = initSherpaOnnxOfflineTtsKittenModelConfig( config.offlineTtsKittenModelConfig, Module); const zipVoiceModelConfig = initSherpaOnnxOfflineTtsZipVoiceModelConfig( config.offlineTtsZipVoiceModelConfig, Module); const pocketModelConfig = initSherpaOnnxOfflineTtsPocketModelConfig( config.offlineTtsPocketModelConfig, Module); const supertonicModelConfig = initSherpaOnnxOfflineTtsSupertonicModelConfig( config.offlineTtsSupertonicModelConfig, Module); const len = vitsModelConfig.len + matchaModelConfig.len + kokoroModelConfig.len + kittenModelConfig.len + zipVoiceModelConfig.len + pocketModelConfig.len + supertonicModelConfig.len + 3 * 4; const ptr = Module._malloc(len); let offset = 0; Module._CopyHeap(vitsModelConfig.ptr, vitsModelConfig.len, ptr + offset); offset += vitsModelConfig.len; Module.setValue(ptr + offset, config.numThreads || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, config.debug || 0, 'i32'); offset += 4; const providerLen = Module.lengthBytesUTF8(config.provider || 'cpu') + 1; const buffer = Module._malloc(providerLen); Module.stringToUTF8(config.provider || 'cpu', buffer, providerLen); Module.setValue(ptr + offset, buffer, 'i8*'); offset += 4; Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset); offset += matchaModelConfig.len; Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset); offset += kokoroModelConfig.len; Module._CopyHeap(kittenModelConfig.ptr, kittenModelConfig.len, ptr + offset); offset += kittenModelConfig.len; Module._CopyHeap( zipVoiceModelConfig.ptr, zipVoiceModelConfig.len, ptr + offset); offset += zipVoiceModelConfig.len; Module._CopyHeap(pocketModelConfig.ptr, pocketModelConfig.len, ptr + offset); offset += pocketModelConfig.len; Module._CopyHeap( supertonicModelConfig.ptr, supertonicModelConfig.len, ptr + offset); offset += supertonicModelConfig.len; return { buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig, matcha: matchaModelConfig, kokoro: kokoroModelConfig, kitten: kittenModelConfig, zipvoice: zipVoiceModelConfig, pocket: pocketModelConfig, supertonic: supertonicModelConfig, }; } function initSherpaOnnxOfflineTtsConfig(config, Module) { const modelConfig = initSherpaOnnxOfflineTtsModelConfig(config.offlineTtsModelConfig, Module); const len = modelConfig.len + 4 * 4; const ptr = Module._malloc(len); let offset = 0; Module._CopyHeap(modelConfig.ptr, modelConfig.len, ptr + offset); offset += modelConfig.len; const ruleFstsLen = Module.lengthBytesUTF8(config.ruleFsts || '') + 1; const ruleFarsLen = Module.lengthBytesUTF8(config.ruleFars || '') + 1; const buffer = Module._malloc(ruleFstsLen + ruleFarsLen); Module.stringToUTF8(config.ruleFsts || '', buffer, ruleFstsLen); Module.stringToUTF8(config.ruleFars || '', buffer + ruleFstsLen, ruleFarsLen); Module.setValue(ptr + offset, buffer, 'i8*'); offset += 4; Module.setValue(ptr + offset, config.maxNumSentences || 1, 'i32'); offset += 4; Module.setValue(ptr + offset, buffer + ruleFstsLen, 'i8*'); offset += 4; Module.setValue(ptr + offset, config.silenceScale || 0.2, 'float'); offset += 4; return { buffer: buffer, ptr: ptr, len: len, config: modelConfig, }; } /* const genConfig = { silenceScale: 0.2, speed: 1.0, sid: 1, referenceAudio: myFloat32Array, // optional referenceSampleRate: 16000, // used if referenceAudio is required referenceText: "Hello world", // optional numSteps: 5, // optional extra: { bar: "ok", foo: 0.8, foobar: 10} }; */ // Allocate a SherpaOnnxGenerationConfig in WASM function initSherpaOnnxGenerationConfig(config, Module) { const len = 9 * 4; const ptr = Module._malloc(len); // float silence_scale Module.setValue(ptr + 0 * 4, config.silenceScale || 0.2, 'float'); // float speed Module.setValue(ptr + 1 * 4, config.speed || 1.0, 'float'); // int32_t sid Module.setValue(ptr + 2 * 4, config.sid || 0, 'i32'); // const float* reference_audio let referenceAudioPtr = 0; if (config.referenceAudio && config.referenceAudio.length > 0) { referenceAudioPtr = Module._malloc(config.referenceAudio.length * 4); Module.HEAPF32.set(config.referenceAudio, referenceAudioPtr / 4); } Module.setValue(ptr + 3 * 4, referenceAudioPtr, 'i8*'); // int32_t reference_audio_len Module.setValue( ptr + 4 * 4, config.referenceAudio ? config.referenceAudio.length : 0, 'i32'); // int32_t reference_sample_rate Module.setValue(ptr + 5 * 4, config.referenceSampleRate || 0, 'i32'); // const char* reference_text let referenceTextPtr = 0; if (config.referenceText) { const textLen = Module.lengthBytesUTF8(config.referenceText) + 1; referenceTextPtr = Module._malloc(textLen); Module.stringToUTF8(config.referenceText, referenceTextPtr, textLen); } Module.setValue(ptr + 6 * 4, referenceTextPtr, 'i8*'); // int32_t num_steps Module.setValue(ptr + 7 * 4, config.numSteps || 5, 'i32'); // const char* extra (JSON string) let extraPtr = 0; let extraStr = null; if (config.extra) { if (typeof config.extra === 'object') { extraStr = JSON.stringify(config.extra); } else if (typeof config.extra === 'string') { extraStr = config.extra; } } if (extraStr !== null) { const extraLen = Module.lengthBytesUTF8(extraStr) + 1; extraPtr = Module._malloc(extraLen); Module.stringToUTF8(extraStr, extraPtr, extraLen); } Module.setValue(ptr + 8 * 4, extraPtr, 'i8*'); return { ptr, referenceAudioPtr, referenceTextPtr, extraPtr, }; } // Free the memory allocated for a SherpaOnnxGenerationConfig function freeSherpaOnnxGenerationConfig(cfg, Module) { if (!cfg) return; if (cfg.referenceAudioPtr) Module._free(cfg.referenceAudioPtr); if (cfg.referenceTextPtr) Module._free(cfg.referenceTextPtr); if (cfg.extraPtr) Module._free(cfg.extraPtr); if (cfg.ptr) Module._free(cfg.ptr); } class OfflineTts { constructor(configObj, Module) { const config = initSherpaOnnxOfflineTtsConfig(configObj, Module) const handle = Module._SherpaOnnxCreateOfflineTts(config.ptr); freeConfig(config, Module); this.handle = handle; this.sampleRate = Module._SherpaOnnxOfflineTtsSampleRate(this.handle); this.numSpeakers = Module._SherpaOnnxOfflineTtsNumSpeakers(this.handle); this.Module = Module } free() { if (!this.handle) return; this.Module._SherpaOnnxDestroyOfflineTts(this.handle); this.handle = 0 } // { // text: "hello", // sid: 1, // speed: 1.0 // } generate(config) { if (!this.handle) { throw new Error('OfflineTts has been freed'); } if (!config || !config.text) { throw new Error('config.text is required'); } const textLen = this.Module.lengthBytesUTF8(config.text) + 1; const textPtr = this.Module._malloc(textLen); this.Module.stringToUTF8(config.text, textPtr, textLen); const genConfig = { sid: config.sid ?? 0, speed: config.speed ?? 1.0, }; const cfgWasm = initSherpaOnnxGenerationConfig(genConfig, this.Module); const h = this.Module._SherpaOnnxOfflineTtsGenerateWithConfig( this.handle, textPtr, cfgWasm.ptr, 0, 0); freeSherpaOnnxGenerationConfig(cfgWasm, this.Module); this.Module._free(textPtr); if (!h) { throw new Error('TTS generation failed'); } const base = h / 4; const samplesPtr = this.Module.HEAPU32[base]; const numSamples = this.Module.HEAP32[base + 1]; const sampleRate = this.Module.HEAP32[base + 2]; const heapSamples = this.Module.HEAPF32.subarray( samplesPtr / 4, samplesPtr / 4 + numSamples); const samples = new Float32Array(heapSamples); this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h); return {samples: samples, sampleRate: sampleRate}; } generateWithConfig(text, genConfig) { if (!this.handle) { throw new Error('OfflineTts has been freed'); } const cfgWasm = initSherpaOnnxGenerationConfig(genConfig, this.Module); const textLen = this.Module.lengthBytesUTF8(text) + 1; const textPtr = this.Module._malloc(textLen); this.Module.stringToUTF8(text, textPtr, textLen); let callbackPtr = 0; if (genConfig.callback) { callbackPtr = this.Module.addFunction((samplesPtr, n, progress, arg) => { const heapSamples = this.Module.HEAPF32.subarray(samplesPtr / 4, samplesPtr / 4 + n); const samples = new Float32Array(heapSamples); return genConfig.callback(samples, n, progress, arg); }, 'iiifi'); } let audioPtr = 0; try { audioPtr = this.Module._SherpaOnnxOfflineTtsGenerateWithConfig( this.handle, textPtr, cfgWasm.ptr, callbackPtr, 0); } finally { this.Module._free(textPtr); freeSherpaOnnxGenerationConfig(cfgWasm, this.Module); if (callbackPtr) { this.Module.removeFunction(callbackPtr); } } if (!audioPtr) { throw new Error('Failed to generate audio'); } const base = audioPtr / 4; const samplesPtr = this.Module.HEAPU32[base]; // float* samples const numSamples = this.Module.HEAP32[base + 1]; // int32 num_samples const sampleRate = this.Module.HEAP32[base + 2]; // int32 sample_rate const heapSamples = this.Module.HEAPF32.subarray( samplesPtr / 4, samplesPtr / 4 + numSamples); const samples = new Float32Array(heapSamples); this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(audioPtr); return {samples, sampleRate}; } save(filename, audio) { const samples = audio.samples; const sampleRate = audio.sampleRate; const ptr = this.Module._malloc(samples.length * 4); this.Module.HEAPF32.set(samples, ptr / 4); const filenameLen = this.Module.lengthBytesUTF8(filename) + 1; const buffer = this.Module._malloc(filenameLen); this.Module.stringToUTF8(filename, buffer, filenameLen); this.Module._SherpaOnnxWriteWave(ptr, samples.length, sampleRate, buffer); this.Module._free(buffer); this.Module._free(ptr); } } let modelType = 0; function getDefaultOfflineTtsModelType() { return modelType; } function createOfflineTts(Module, myConfig) { const vits = { model: '', lexicon: '', tokens: '', dataDir: '', noiseScale: 0.667, noiseScaleW: 0.8, lengthScale: 1.0, }; const matcha = { acousticModel: '', vocoder: '', lexicon: '', tokens: '', dataDir: '', noiseScale: 0.667, lengthScale: 1.0, }; const offlineTtsKokoroModelConfig = { model: '', voices: '', tokens: '', dataDir: '', lengthScale: 1.0, lexicon: '', lang: '', }; const offlineTtsKittenModelConfig = { model: '', voices: '', tokens: '', dataDir: '', lengthScale: 1.0, }; const offlineTtsZipVoiceModelConfig = { tokens: '', encoder: '', decoder: '', vocoder: '', dataDir: '', lexicon: '', featScale: 0.1, tShift: 0.5, targetRMS: 0.1, guidanceScale: 1.0, }; const offlineTtsPocketModelConfig = { lmFlow: '', lmMain: '', encoder: '', decoder: '', textConditioner: '', vocabJson: '', tokenScoresJson: '', voiceEmbeddingCacheCapacity: 50, }; let ruleFsts = ''; switch (modelType) { case 0: // vits vits.model = './model.onnx'; vits.tokens = './tokens.txt'; vits.dataDir = './espeak-ng-data'; break; case 1: // matcha zh-en // https://k2-fsa.github.io/sherpa/onnx/tts/all/Chinese-English/matcha-icefall-zh-en.html matcha.acousticModel = './model-steps-3.onnx'; matcha.vocoder = './vocos-16khz-univ.onnx'; matcha.lexicon = './lexicon.txt'; matcha.tokens = './tokens.txt'; matcha.dataDir = './espeak-ng-data'; ruleFsts = './phone-zh.fst,./date-zh.fst,./number-zh.fst'; break; case 2: // matcha zh // https://k2-fsa.github.io/sherpa/onnx/tts/all/Chinese/matcha-icefall-zh-baker.html matcha.acousticModel = './model-steps-3.onnx'; matcha.vocoder = './vocos-22khz-univ.onnx'; matcha.lexicon = './lexicon.txt'; matcha.tokens = './tokens.txt'; ruleFsts = './phone.fst,./date.fst,./number.fst'; break; case 3: // matcha en // https://k2-fsa.github.io/sherpa/onnx/tts/all/English/matcha-icefall-en_US-ljspeech.html matcha.acousticModel = './model-steps-3.onnx'; matcha.vocoder = './vocos-22khz-univ.onnx'; matcha.tokens = './tokens.txt'; matcha.dataDir = './espeak-ng-data'; break; case 4: // zipvoice zh-en // https://k2-fsa.github.io/sherpa/onnx/tts/zipvoice.html offlineTtsZipVoiceModelConfig.tokens = './tokens.txt'; offlineTtsZipVoiceModelConfig.encoder = './encoder.int8.onnx'; offlineTtsZipVoiceModelConfig.decoder = './decoder.int8.onnx'; offlineTtsZipVoiceModelConfig.vocoder = './vocos_24khz.onnx'; offlineTtsZipVoiceModelConfig.dataDir = './espeak-ng-data'; offlineTtsZipVoiceModelConfig.lexicon = './lexicon.txt'; break; case 5: // pocket tts // https://k2-fsa.github.io/sherpa/onnx/tts/pocket.html offlineTtsPocketModelConfig.lmFlow = './lm_flow.int8.onnx'; offlineTtsPocketModelConfig.lmMain = './lm_main.int8.onnx'; offlineTtsPocketModelConfig.encoder = './encoder.onnx'; offlineTtsPocketModelConfig.decoder = './decoder.int8.onnx'; offlineTtsPocketModelConfig.textConditioner = './text_conditioner.onnx'; offlineTtsPocketModelConfig.vocabJson = './vocab.json'; offlineTtsPocketModelConfig.tokenScoresJson = './token_scores.json'; break; } const offlineTtsModelConfig = { offlineTtsVitsModelConfig: vits, offlineTtsMatchaModelConfig: matcha, offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig, offlineTtsKittenModelConfig: offlineTtsKittenModelConfig, offlineTtsZipVoiceModelConfig: offlineTtsZipVoiceModelConfig, offlineTtsPocketModelConfig: offlineTtsPocketModelConfig, numThreads: 1, debug: 1, provider: 'cpu', }; let offlineTtsConfig = { offlineTtsModelConfig: offlineTtsModelConfig, ruleFsts: ruleFsts, ruleFars: '', maxNumSentences: 1, } if (myConfig) { offlineTtsConfig = myConfig; } return new OfflineTts(offlineTtsConfig, Module); } if (typeof process == 'object' && typeof process.versions == 'object' && typeof process.versions.node == 'string') { module.exports = { createOfflineTts, getDefaultOfflineTtsModelType, }; }