csukuangfj committed on
Commit
1d8ae66
·
1 Parent(s): e1364a2

update model

Browse files
app-tts.js CHANGED
@@ -11,6 +11,7 @@ const speedValue = document.getElementById('speedValue');
11
  const textArea = document.getElementById('text');
12
  const soundClips = document.getElementById('sound-clips');
13
  const statusElement = document.getElementById('status');
 
14
 
15
  speedValue.innerHTML = speedInput.value;
16
 
@@ -19,7 +20,7 @@ let index = 0;
19
  let audioCtx = null;
20
  const worker = new Worker("sherpa-onnx-tts.worker.js");
21
  let ttsInstanceInfo = {
22
- modelType: 0,
23
  numSpeakers: 0,
24
  isReady: false,
25
  };
@@ -28,8 +29,13 @@ worker.onmessage = (e) => {
28
  Module.setStatus(e.data.status);
29
  return;
30
  }
 
 
 
 
 
31
  if (e.data.type === "sherpa-onnx-tts-ready") {
32
- ttsInstanceInfo.modelType = e.data.modelType ?? 0;
33
  ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
34
  ttsInstanceInfo.isReady = true;
35
  generateBtn.disabled = false;
@@ -39,11 +45,18 @@ worker.onmessage = (e) => {
39
  return;
40
  }
41
  if (e.data.type === "error") {
42
- Module.setStatus(e.data.message);
 
 
 
 
 
43
  return;
44
  }
45
  if (e.data.type === "sherpa-onnx-tts-result") {
46
  let audio = e.data;
 
 
47
 
48
  console.log(audio.samples.length, audio.sampleRate);
49
 
@@ -113,6 +126,15 @@ function updateUiForModelType() {
113
  referenceTextSection.classList.toggle('hidden', !isZipVoice);
114
  }
115
 
 
 
 
 
 
 
 
 
 
116
  function getMonoSamples(audioBuffer) {
117
  if (audioBuffer.numberOfChannels === 1) {
118
  return new Float32Array(audioBuffer.getChannelData(0));
@@ -152,6 +174,21 @@ function isWaveFile(file) {
152
  return name.toLowerCase().endsWith('.wav');
153
  }
154
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
155
  generateBtn.onclick = async function() {
156
  const isZipVoice = ttsInstanceInfo.modelType === 4;
157
 
@@ -199,7 +236,7 @@ generateBtn.onclick = async function() {
199
 
200
  const referenceText = referenceTextInput.value.trim();
201
  if (referenceText.length === 0) {
202
- alert('Please input the reference text');
203
  return;
204
  }
205
 
@@ -211,10 +248,13 @@ generateBtn.onclick = async function() {
211
  referenceText: referenceText,
212
  numSteps: 4,
213
  extra: {
214
- min_char_in_sentence: 30,
215
  },
216
  };
217
 
 
 
 
218
  worker.postMessage({
219
  text,
220
  genConfig,
@@ -236,14 +276,18 @@ function createAudioTag(generateAudio) {
236
 
237
  const text = textArea.value.trim().substring(0, 100);
238
  const clipName = `${index} ${text} ...`;
 
239
  index += 1;
240
 
241
  const clipContainer = document.createElement('article');
242
  const clipLabel = document.createElement('p');
243
  const audio = document.createElement('audio');
 
244
  const deleteButton = document.createElement('button');
245
  clipContainer.classList.add('clip');
246
  audio.setAttribute('controls', '');
 
 
247
  deleteButton.textContent = 'Delete';
248
  deleteButton.className = 'delete';
249
 
@@ -252,6 +296,7 @@ function createAudioTag(generateAudio) {
252
  clipContainer.appendChild(audio);
253
 
254
  clipContainer.appendChild(clipLabel);
 
255
  clipContainer.appendChild(deleteButton);
256
  soundClips.appendChild(clipContainer);
257
 
@@ -260,6 +305,10 @@ function createAudioTag(generateAudio) {
260
  const audioURL = window.URL.createObjectURL(blob);
261
  audio.src = audioURL;
262
 
 
 
 
 
263
  deleteButton.onclick = function(e) {
264
  let evtTgt = e.target;
265
  evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
 
11
  const textArea = document.getElementById('text');
12
  const soundClips = document.getElementById('sound-clips');
13
  const statusElement = document.getElementById('status');
14
+ const generationStatusElement = document.getElementById('generationStatus');
15
 
16
  speedValue.innerHTML = speedInput.value;
17
 
 
20
  let audioCtx = null;
21
  const worker = new Worker("sherpa-onnx-tts.worker.js");
22
  let ttsInstanceInfo = {
23
+ modelType: null,
24
  numSpeakers: 0,
25
  isReady: false,
26
  };
 
29
  Module.setStatus(e.data.status);
30
  return;
31
  }
32
+ if (e.data.type === "sherpa-onnx-tts-generation-progress") {
33
+ const percent = Math.max(0, Math.min(100, (e.data.progress || 0) * 100));
34
+ setGenerationStatus(`Generating audio... ${percent.toFixed(2)}%`);
35
+ return;
36
+ }
37
  if (e.data.type === "sherpa-onnx-tts-ready") {
38
+ ttsInstanceInfo.modelType = e.data.modelType;
39
  ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
40
  ttsInstanceInfo.isReady = true;
41
  generateBtn.disabled = false;
 
45
  return;
46
  }
47
  if (e.data.type === "error") {
48
+ generateBtn.disabled = false;
49
+ if (ttsInstanceInfo.isReady) {
50
+ setGenerationStatus(e.data.message);
51
+ } else {
52
+ Module.setStatus(e.data.message);
53
+ }
54
  return;
55
  }
56
  if (e.data.type === "sherpa-onnx-tts-result") {
57
  let audio = e.data;
58
+ generateBtn.disabled = false;
59
+ setGenerationStatus('');
60
 
61
  console.log(audio.samples.length, audio.sampleRate);
62
 
 
126
  referenceTextSection.classList.toggle('hidden', !isZipVoice);
127
  }
128
 
129
+ function setGenerationStatus(status) {
130
+ if (!generationStatusElement) {
131
+ return;
132
+ }
133
+
134
+ generationStatusElement.textContent = status;
135
+ generationStatusElement.style.display = status ? 'block' : 'none';
136
+ }
137
+
138
  function getMonoSamples(audioBuffer) {
139
  if (audioBuffer.numberOfChannels === 1) {
140
  return new Float32Array(audioBuffer.getChannelData(0));
 
174
  return name.toLowerCase().endsWith('.wav');
175
  }
176
 
177
+ function sanitizeFilename(name) {
178
+ return name.replace(/[^a-zA-Z0-9._-]+/g, '-');
179
+ }
180
+
181
+ function downloadBlob(blob, filename) {
182
+ const url = window.URL.createObjectURL(blob);
183
+ const link = document.createElement('a');
184
+ link.href = url;
185
+ link.download = filename;
186
+ document.body.appendChild(link);
187
+ link.click();
188
+ document.body.removeChild(link);
189
+ window.URL.revokeObjectURL(url);
190
+ }
191
+
192
  generateBtn.onclick = async function() {
193
  const isZipVoice = ttsInstanceInfo.modelType === 4;
194
 
 
236
 
237
  const referenceText = referenceTextInput.value.trim();
238
  if (referenceText.length === 0) {
239
+ alert('Please input the transcript of the reference audio');
240
  return;
241
  }
242
 
 
248
  referenceText: referenceText,
249
  numSteps: 4,
250
  extra: {
251
+ min_char_in_sentence: 10,
252
  },
253
  };
254
 
255
+ generateBtn.disabled = true;
256
+ setGenerationStatus('Generating audio...');
257
+
258
  worker.postMessage({
259
  text,
260
  genConfig,
 
276
 
277
  const text = textArea.value.trim().substring(0, 100);
278
  const clipName = `${index} ${text} ...`;
279
+ const filename = `${sanitizeFilename(clipName)}.wav`;
280
  index += 1;
281
 
282
  const clipContainer = document.createElement('article');
283
  const clipLabel = document.createElement('p');
284
  const audio = document.createElement('audio');
285
+ const saveButton = document.createElement('button');
286
  const deleteButton = document.createElement('button');
287
  clipContainer.classList.add('clip');
288
  audio.setAttribute('controls', '');
289
+ saveButton.textContent = 'Save';
290
+ saveButton.className = 'save';
291
  deleteButton.textContent = 'Delete';
292
  deleteButton.className = 'delete';
293
 
 
296
  clipContainer.appendChild(audio);
297
 
298
  clipContainer.appendChild(clipLabel);
299
+ clipContainer.appendChild(saveButton);
300
  clipContainer.appendChild(deleteButton);
301
  soundClips.appendChild(clipContainer);
302
 
 
305
  const audioURL = window.URL.createObjectURL(blob);
306
  audio.src = audioURL;
307
 
308
+ saveButton.onclick = function() {
309
+ downloadBlob(blob, filename);
310
+ };
311
+
312
  deleteButton.onclick = function(e) {
313
  let evtTgt = e.target;
314
  evtTgt.parentNode.parentNode.removeChild(evtTgt.parentNode);
index.html CHANGED
@@ -44,9 +44,9 @@
44
  <br/>
45
  </div>
46
  <div id="referenceTextSection" class="hidden">
47
- <label for="referenceText">Reference text: </label>
48
  <br/>
49
- <textarea id="referenceText" rows="3" placeholder="Please enter the transcription of the reference audio"></textarea>
50
  <br/>
51
  <br/>
52
  </div>
@@ -59,6 +59,7 @@
59
  <br/>
60
  <br/>
61
  <button id="generateBtn" disabled>Generate</button>
 
62
  </div>
63
 
64
  <section flex="1" overflow="auto" id="sound-clips">
 
44
  <br/>
45
  </div>
46
  <div id="referenceTextSection" class="hidden">
47
+ <label for="referenceText">Reference transcript (must match the reference audio): </label>
48
  <br/>
49
+ <textarea id="referenceText" rows="3" placeholder="Please enter the transcript of the reference audio exactly"></textarea>
50
  <br/>
51
  <br/>
52
  </div>
 
59
  <br/>
60
  <br/>
61
  <button id="generateBtn" disabled>Generate</button>
62
+ <div id="generationStatus" style="display: none; margin-top: 0.75rem; font-size: 0.95rem; color: #6c757d;"></div>
63
  </div>
64
 
65
  <section flex="1" overflow="auto" id="sound-clips">
sherpa-onnx-tts.js CHANGED
@@ -429,16 +429,14 @@ function initSherpaOnnxOfflineTtsPocketModelConfig(config, Module) {
429
  function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
430
  const durationPredictorLen =
431
  Module.lengthBytesUTF8(config.durationPredictor || '') + 1;
432
- const textEncoderLen =
433
- Module.lengthBytesUTF8(config.textEncoder || '') + 1;
434
  const vectorEstimatorLen =
435
  Module.lengthBytesUTF8(config.vectorEstimator || '') + 1;
436
  const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1;
437
  const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1;
438
  const unicodeIndexerLen =
439
  Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1;
440
- const voiceStyleLen =
441
- Module.lengthBytesUTF8(config.voiceStyle || '') + 1;
442
 
443
  const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen +
444
  vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen;
@@ -471,8 +469,7 @@ function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
471
  config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen);
472
  offset += unicodeIndexerLen;
473
 
474
- Module.stringToUTF8(
475
- config.voiceStyle || '', buffer + offset, voiceStyleLen);
476
  offset += voiceStyleLen;
477
 
478
  offset = 0;
@@ -873,13 +870,27 @@ class OfflineTts {
873
  const textPtr = this.Module._malloc(textLen);
874
  this.Module.stringToUTF8(text, textPtr, textLen);
875
 
876
- const audioPtr = this.Module._SherpaOnnxOfflineTtsGenerateWithConfig(
877
- this.handle, textPtr, cfgWasm.ptr,
878
- 0, // callback
879
- 0 // callback arg
880
- );
881
- this.Module._free(textPtr);
882
- freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
883
 
884
  if (!audioPtr) {
885
  throw new Error('Failed to generate audio');
 
429
  function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
430
  const durationPredictorLen =
431
  Module.lengthBytesUTF8(config.durationPredictor || '') + 1;
432
+ const textEncoderLen = Module.lengthBytesUTF8(config.textEncoder || '') + 1;
 
433
  const vectorEstimatorLen =
434
  Module.lengthBytesUTF8(config.vectorEstimator || '') + 1;
435
  const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1;
436
  const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1;
437
  const unicodeIndexerLen =
438
  Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1;
439
+ const voiceStyleLen = Module.lengthBytesUTF8(config.voiceStyle || '') + 1;
 
440
 
441
  const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen +
442
  vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen;
 
469
  config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen);
470
  offset += unicodeIndexerLen;
471
 
472
+ Module.stringToUTF8(config.voiceStyle || '', buffer + offset, voiceStyleLen);
 
473
  offset += voiceStyleLen;
474
 
475
  offset = 0;
 
870
  const textPtr = this.Module._malloc(textLen);
871
  this.Module.stringToUTF8(text, textPtr, textLen);
872
 
873
+ let callbackPtr = 0;
874
+ if (genConfig.callback) {
875
+ callbackPtr = this.Module.addFunction((samplesPtr, n, progress, arg) => {
876
+ const heapSamples =
877
+ this.Module.HEAPF32.subarray(samplesPtr / 4, samplesPtr / 4 + n);
878
+ const samples = new Float32Array(heapSamples);
879
+ return genConfig.callback(samples, n, progress, arg);
880
+ }, 'iiifi');
881
+ }
882
+
883
+ let audioPtr = 0;
884
+ try {
885
+ audioPtr = this.Module._SherpaOnnxOfflineTtsGenerateWithConfig(
886
+ this.handle, textPtr, cfgWasm.ptr, callbackPtr, 0);
887
+ } finally {
888
+ this.Module._free(textPtr);
889
+ freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
890
+ if (callbackPtr) {
891
+ this.Module.removeFunction(callbackPtr);
892
+ }
893
+ }
894
 
895
  if (!audioPtr) {
896
  throw new Error('Failed to generate audio');
sherpa-onnx-tts.worker.js CHANGED
@@ -28,6 +28,18 @@ self.Module = {
28
  };
29
  importScripts("sherpa-onnx-wasm-main-tts.js");
30
  importScripts("sherpa-onnx-tts.js");
 
 
 
 
 
 
 
 
 
 
 
 
31
  self.onmessage = async (e) => {
32
  const { type, text, sid, speed, genConfig } = e.data;
33
  if (type === "generate") {
@@ -53,7 +65,7 @@ self.onmessage = async (e) => {
53
  } catch (err) {
54
  self.postMessage({
55
  type: "error",
56
- message: "Generation failed: " + err.message,
57
  });
58
  }
59
  } else if (type === "generateWithConfig") {
@@ -61,7 +73,16 @@ self.onmessage = async (e) => {
61
  return;
62
  }
63
  try {
64
- const audio = tts.generateWithConfig(text, genConfig || {});
 
 
 
 
 
 
 
 
 
65
  const samples = audio.samples;
66
  const sampleRate = audio.sampleRate;
67
  self.postMessage(
@@ -75,7 +96,7 @@ self.onmessage = async (e) => {
75
  } catch (err) {
76
  self.postMessage({
77
  type: "error",
78
- message: "Generation failed: " + err.message,
79
  });
80
  }
81
  }
 
28
  };
29
  importScripts("sherpa-onnx-wasm-main-tts.js");
30
  importScripts("sherpa-onnx-tts.js");
31
+
32
+ function getErrorMessage(err) {
33
+ if (err instanceof Error) {
34
+ if (err.stack) {
35
+ return `${err.message}\n${err.stack}`;
36
+ }
37
+ return err.message;
38
+ }
39
+
40
+ return `${err}`;
41
+ }
42
+
43
  self.onmessage = async (e) => {
44
  const { type, text, sid, speed, genConfig } = e.data;
45
  if (type === "generate") {
 
65
  } catch (err) {
66
  self.postMessage({
67
  type: "error",
68
+ message: "Generation failed: " + getErrorMessage(err),
69
  });
70
  }
71
  } else if (type === "generateWithConfig") {
 
73
  return;
74
  }
75
  try {
76
+ const config = Object.assign({}, genConfig || {});
77
+ config.callback = (samples, n, progress) => {
78
+ self.postMessage({
79
+ type: "sherpa-onnx-tts-generation-progress",
80
+ progress: progress,
81
+ });
82
+ return 1;
83
+ };
84
+
85
+ const audio = tts.generateWithConfig(text, config);
86
  const samples = audio.samples;
87
  const sampleRate = audio.sampleRate;
88
  self.postMessage(
 
96
  } catch (err) {
97
  self.postMessage({
98
  type: "error",
99
+ message: "Generation failed: " + getErrorMessage(err),
100
  });
101
  }
102
  }
sherpa-onnx-wasm-main-tts.js CHANGED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main-tts.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6edcd44a15b7c385405a142b473d9f83eff0fb5c1ca683e2a729addedb0bd21b
3
- size 11967286
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fafbc0801ce90199530579097ddb6db2e7fffefc2f6ca237f571db22956f15bc
3
+ size 11967283