Spaces:

D3vShoaib
/

web-assembly-tts-sherpa-onnx-en

Running

App Files Files Community

csukuangfj committed on Mar 18

Commit

1d8ae66

1 Parent(s): e1364a2

update model

Browse files

Files changed (6) hide show

app-tts.js +54 -5
index.html +3 -2
sherpa-onnx-tts.js +24 -13
sherpa-onnx-tts.worker.js +24 -3
sherpa-onnx-wasm-main-tts.js +0 -0
sherpa-onnx-wasm-main-tts.wasm +2 -2

app-tts.js CHANGED Viewed

@@ -11,6 +11,7 @@ const speedValue = document.getElementById('speedValue');
 const textArea = document.getElementById('text');
 const soundClips = document.getElementById('sound-clips');
 const statusElement = document.getElementById('status');
 speedValue.innerHTML = speedInput.value;
@@ -19,7 +20,7 @@ let index = 0;
 let audioCtx = null;
 const worker = new Worker("sherpa-onnx-tts.worker.js");
 let ttsInstanceInfo = {
-  modelType: 0,
   numSpeakers: 0,
   isReady: false,
 };
@@ -28,8 +29,13 @@ worker.onmessage = (e) => {
     Module.setStatus(e.data.status);
     return;
   }
   if (e.data.type === "sherpa-onnx-tts-ready") {
-    ttsInstanceInfo.modelType = e.data.modelType ?? 0;
     ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
     ttsInstanceInfo.isReady = true;
     generateBtn.disabled = false;
@@ -39,11 +45,18 @@ worker.onmessage = (e) => {
     return;
   }
   if (e.data.type === "error") {
-    Module.setStatus(e.data.message);
     return;
   }
   if (e.data.type === "sherpa-onnx-tts-result") {
     let audio = e.data;
     console.log(audio.samples.length, audio.sampleRate);
@@ -113,6 +126,15 @@ function updateUiForModelType() {
   referenceTextSection.classList.toggle('hidden', !isZipVoice);
 }
 function getMonoSamples(audioBuffer) {
   if (audioBuffer.numberOfChannels === 1) {
     return new Float32Array(audioBuffer.getChannelData(0));
@@ -152,6 +174,21 @@ function isWaveFile(file) {
   return name.toLowerCase().endsWith('.wav');
 }
 generateBtn.onclick = async function() {
   const isZipVoice = ttsInstanceInfo.modelType === 4;
@@ -199,7 +236,7 @@ generateBtn.onclick = async function() {
     const referenceText = referenceTextInput.value.trim();
     if (referenceText.length === 0) {
-      alert('Please input the reference text');
       return;
     }
@@ -211,10 +248,13 @@ generateBtn.onclick = async function() {
       referenceText: referenceText,
       numSteps: 4,
       extra: {
-        min_char_in_sentence: 30,
       },
     };
     worker.postMessage({
       text,
       genConfig,
@@ -236,14 +276,18 @@ function createAudioTag(generateAudio) {
   const text = textArea.value.trim().substring(0, 100);
   const clipName = `${index} ${text} ...`;
   index += 1;
   const clipContainer = document.createElement('article');
   const clipLabel = document.createElement('p');
   const audio = document.createElement('audio');
   const deleteButton = document.createElement('button');
   clipContainer.classList.add('clip');
   audio.setAttribute('controls', '');
   deleteButton.textContent = 'Delete';
   deleteButton.className = 'delete';
@@ -252,6 +296,7 @@ function createAudioTag(generateAudio) {
   clipContainer.appendChild(audio);
   clipContainer.appendChild(clipLabel);
   clipContainer.appendChild(deleteButton);
   soundClips.appendChild(clipContainer);
@@ -260,6 +305,10 @@ function createAudioTag(generateAudio) {
   const audioURL = window.URL.createObjectURL(blob);
   audio.src = audioURL;
   deleteButton.onclick = function(e) {
     let evtTgt = e.target;
     evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);

 const textArea = document.getElementById('text');
 const soundClips = document.getElementById('sound-clips');
 const statusElement = document.getElementById('status');
+const generationStatusElement = document.getElementById('generationStatus');
 speedValue.innerHTML = speedInput.value;
 let audioCtx = null;
 const worker = new Worker("sherpa-onnx-tts.worker.js");
 let ttsInstanceInfo = {
+  modelType: null,
   numSpeakers: 0,
   isReady: false,
 };
     Module.setStatus(e.data.status);
     return;
   }
+  if (e.data.type === "sherpa-onnx-tts-generation-progress") {
+    const percent = Math.max(0, Math.min(100, (e.data.progress || 0) * 100));
+    setGenerationStatus(`Generating audio... ${percent.toFixed(2)}%`);
+    return;
+  }
   if (e.data.type === "sherpa-onnx-tts-ready") {
+    ttsInstanceInfo.modelType = e.data.modelType;
     ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
     ttsInstanceInfo.isReady = true;
     generateBtn.disabled = false;
     return;
   }
   if (e.data.type === "error") {
+    generateBtn.disabled = false;
+    if (ttsInstanceInfo.isReady) {
+      setGenerationStatus(e.data.message);
+    } else {
+      Module.setStatus(e.data.message);
+    }
     return;
   }
   if (e.data.type === "sherpa-onnx-tts-result") {
     let audio = e.data;
+    generateBtn.disabled = false;
+    setGenerationStatus('');
     console.log(audio.samples.length, audio.sampleRate);
   referenceTextSection.classList.toggle('hidden', !isZipVoice);
 }
+function setGenerationStatus(status) {  // Show the given status text in the generation status element, or hide the element when the text is falsy.
+  if (!generationStatusElement) {  // Nothing to update when the 'generationStatus' element lookup produced no element.
+    return;
+  }
+  generationStatusElement.textContent = status;  // Assigned via textContent, so the message is treated as plain text.
+  generationStatusElement.style.display = status ? 'block' : 'none';  // An empty (falsy) status hides the element.
+}
 function getMonoSamples(audioBuffer) {
   if (audioBuffer.numberOfChannels === 1) {
     return new Float32Array(audioBuffer.getChannelData(0));
   return name.toLowerCase().endsWith('.wav');
 }
+function sanitizeFilename(name) {  // Return name with every run of characters other than letters, digits, '.', '_' and '-' collapsed into a single '-'.
+  return name.replace(/[^a-zA-Z0-9._-]+/g, '-');
+}
+function downloadBlob(blob, filename) {  // Offer blob as a download named filename by programmatically clicking a temporary <a download> link.
+  const url = window.URL.createObjectURL(blob);
+  const link = document.createElement('a');
+  link.href = url;
+  link.download = filename;  // File name attached to the link's download attribute.
+  document.body.appendChild(link);  // The link is attached to the document only for the duration of the click.
+  link.click();
+  document.body.removeChild(link);
+  window.URL.revokeObjectURL(url);  // Release the object URL created at the top of this function.
+}
 generateBtn.onclick = async function() {
   const isZipVoice = ttsInstanceInfo.modelType === 4;
     const referenceText = referenceTextInput.value.trim();
     if (referenceText.length === 0) {
+      alert('Please input the transcript of the reference audio');
       return;
     }
       referenceText: referenceText,
       numSteps: 4,
       extra: {
+        min_char_in_sentence: 10,
       },
     };
+    generateBtn.disabled = true;
+    setGenerationStatus('Generating audio...');
     worker.postMessage({
       text,
       genConfig,
   const text = textArea.value.trim().substring(0, 100);
   const clipName = `${index} ${text} ...`;
+  const filename = `${sanitizeFilename(clipName)}.wav`;
   index += 1;
   const clipContainer = document.createElement('article');
   const clipLabel = document.createElement('p');
   const audio = document.createElement('audio');
+  const saveButton = document.createElement('button');
   const deleteButton = document.createElement('button');
   clipContainer.classList.add('clip');
   audio.setAttribute('controls', '');
+  saveButton.textContent = 'Save';
+  saveButton.className = 'save';
   deleteButton.textContent = 'Delete';
   deleteButton.className = 'delete';
   clipContainer.appendChild(audio);
   clipContainer.appendChild(clipLabel);
+  clipContainer.appendChild(saveButton);
   clipContainer.appendChild(deleteButton);
   soundClips.appendChild(clipContainer);
   const audioURL = window.URL.createObjectURL(blob);
   audio.src = audioURL;
+  saveButton.onclick = function() {
+    downloadBlob(blob, filename);
+  };
   deleteButton.onclick = function(e) {
     let evtTgt = e.target;
     evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);

index.html CHANGED Viewed

@@ -44,9 +44,9 @@
         <br/>
       </div>
       <div id="referenceTextSection" class="hidden">
-        <label for="referenceText">Reference text: </label>
         <br/>
-        <textarea id="referenceText" rows="3" placeholder="Please enter the transcription of the reference audio"></textarea>
         <br/>
         <br/>
       </div>
@@ -59,6 +59,7 @@
       <br/>
       <br/>
       <button id="generateBtn" disabled>Generate</button>
     </div>
     <section flex="1" overflow="auto" id="sound-clips">

         <br/>
       </div>
       <div id="referenceTextSection" class="hidden">
+        <label for="referenceText">Reference transcript (must match the reference audio): </label>
         <br/>
+        <textarea id="referenceText" rows="3" placeholder="Please enter the transcript of the reference audio exactly"></textarea>
         <br/>
         <br/>
       </div>
       <br/>
       <br/>
       <button id="generateBtn" disabled>Generate</button>
+      <div id="generationStatus" style="display: none; margin-top: 0.75rem; font-size: 0.95rem; color: #6c757d;"></div>
     </div>
     <section flex="1" overflow="auto" id="sound-clips">

sherpa-onnx-tts.js CHANGED Viewed

@@ -429,16 +429,14 @@ function initSherpaOnnxOfflineTtsPocketModelConfig(config, Module) {
 function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
   const durationPredictorLen =
       Module.lengthBytesUTF8(config.durationPredictor || '') + 1;
-  const textEncoderLen =
-      Module.lengthBytesUTF8(config.textEncoder || '') + 1;
   const vectorEstimatorLen =
       Module.lengthBytesUTF8(config.vectorEstimator || '') + 1;
   const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1;
   const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1;
   const unicodeIndexerLen =
       Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1;
-  const voiceStyleLen =
-      Module.lengthBytesUTF8(config.voiceStyle || '') + 1;
   const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen +
       vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen;
@@ -471,8 +469,7 @@ function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
       config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen);
   offset += unicodeIndexerLen;
-  Module.stringToUTF8(
-      config.voiceStyle || '', buffer + offset, voiceStyleLen);
   offset += voiceStyleLen;
   offset = 0;
@@ -873,13 +870,27 @@ class OfflineTts {
     const textPtr = this.Module._malloc(textLen);
     this.Module.stringToUTF8(text, textPtr, textLen);
-    const audioPtr = this.Module._SherpaOnnxOfflineTtsGenerateWithConfig(
-        this.handle, textPtr, cfgWasm.ptr,
-        0,  // callback
-        0   // callback arg
-    );
-    this.Module._free(textPtr);
-    freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
     if (!audioPtr) {
       throw new Error('Failed to generate audio');

 function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
   const durationPredictorLen =
       Module.lengthBytesUTF8(config.durationPredictor || '') + 1;
+  const textEncoderLen = Module.lengthBytesUTF8(config.textEncoder || '') + 1;
   const vectorEstimatorLen =
       Module.lengthBytesUTF8(config.vectorEstimator || '') + 1;
   const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1;
   const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1;
   const unicodeIndexerLen =
       Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1;
+  const voiceStyleLen = Module.lengthBytesUTF8(config.voiceStyle || '') + 1;
   const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen +
       vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen;
       config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen);
   offset += unicodeIndexerLen;
+  Module.stringToUTF8(config.voiceStyle || '', buffer + offset, voiceStyleLen);
   offset += voiceStyleLen;
   offset = 0;
     const textPtr = this.Module._malloc(textLen);
     this.Module.stringToUTF8(text, textPtr, textLen);
+    let callbackPtr = 0;
+    if (genConfig.callback) {
+      callbackPtr = this.Module.addFunction((samplesPtr, n, progress, arg) => {
+        const heapSamples =
+            this.Module.HEAPF32.subarray(samplesPtr / 4, samplesPtr / 4 + n);
+        const samples = new Float32Array(heapSamples);
+        return genConfig.callback(samples, n, progress, arg);
+      }, 'iiifi');
+    }
+    let audioPtr = 0;
+    try {
+      audioPtr = this.Module._SherpaOnnxOfflineTtsGenerateWithConfig(
+          this.handle, textPtr, cfgWasm.ptr, callbackPtr, 0);
+    } finally {
+      this.Module._free(textPtr);
+      freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
+      if (callbackPtr) {
+        this.Module.removeFunction(callbackPtr);
+      }
+    }
     if (!audioPtr) {
       throw new Error('Failed to generate audio');

sherpa-onnx-tts.worker.js CHANGED Viewed

@@ -28,6 +28,18 @@ self.Module = {
 };
 importScripts("sherpa-onnx-wasm-main-tts.js");
 importScripts("sherpa-onnx-tts.js");
 self.onmessage = async (e) => {
   const { type, text, sid, speed, genConfig } = e.data;
   if (type === "generate") {
@@ -53,7 +65,7 @@ self.onmessage = async (e) => {
     } catch (err) {
       self.postMessage({
         type: "error",
-        message: "Generation failed: " + err.message,
       });
     }
   } else if (type === "generateWithConfig") {
@@ -61,7 +73,16 @@ self.onmessage = async (e) => {
       return;
     }
     try {
-      const audio = tts.generateWithConfig(text, genConfig || {});
       const samples = audio.samples;
       const sampleRate = audio.sampleRate;
       self.postMessage(
@@ -75,7 +96,7 @@ self.onmessage = async (e) => {
     } catch (err) {
       self.postMessage({
         type: "error",
-        message: "Generation failed: " + err.message,
       });
     }
   }

 };
 importScripts("sherpa-onnx-wasm-main-tts.js");
 importScripts("sherpa-onnx-tts.js");
+function getErrorMessage(err) {  // Convert a caught value to text: for an Error, its message (followed by the stack when present); otherwise the value coerced to a string.
+  if (err instanceof Error) {
+    if (err.stack) {
+      return `${err.message}\n${err.stack}`;  // NOTE(review): some engines already begin stack with the message, so it may appear twice here; confirm.
+    }
+    return err.message;
+  }
+  return `${err}`;  // Non-Error values are stringified through template-literal coercion.
+}
 self.onmessage = async (e) => {
   const { type, text, sid, speed, genConfig } = e.data;
   if (type === "generate") {
     } catch (err) {
       self.postMessage({
         type: "error",
+        message: "Generation failed: " + getErrorMessage(err),
       });
     }
   } else if (type === "generateWithConfig") {
       return;
     }
     try {
+      const config = Object.assign({}, genConfig || {});
+      config.callback = (samples, n, progress) => {
+        self.postMessage({
+          type: "sherpa-onnx-tts-generation-progress",
+          progress: progress,
+        });
+        return 1;
+      };
+      const audio = tts.generateWithConfig(text, config);
       const samples = audio.samples;
       const sampleRate = audio.sampleRate;
       self.postMessage(
     } catch (err) {
       self.postMessage({
         type: "error",
+        message: "Generation failed: " + getErrorMessage(err),
       });
     }
   }

sherpa-onnx-wasm-main-tts.js CHANGED Viewed

The diff for this file is too large to render. See raw diff

sherpa-onnx-wasm-main-tts.wasm CHANGED Viewed

@@ -1,3 +1,3 @@
 version https://git-lfs.github.com/spec/v1
-oid sha256:6edcd44a15b7c385405a142b473d9f83eff0fb5c1ca683e2a729addedb0bd21b
-size 11967286

 version https://git-lfs.github.com/spec/v1
+oid sha256:fafbc0801ce90199530579097ddb6db2e7fffefc2f6ca237f571db22956f15bc
+size 11967283