csukuangfj committed on
Commit
d0d0f2d
·
1 Parent(s): 133b768

update model

Browse files
app-tts.js CHANGED
@@ -1,6 +1,11 @@
1
  const generateBtn = document.getElementById('generateBtn');
2
  const speakerIdLabel = document.getElementById('speakerIdLabel');
3
  const speakerIdInput = document.getElementById('speakerId');
 
 
 
 
 
4
  const speedInput = document.getElementById('speed');
5
  const speedValue = document.getElementById('speedValue');
6
  const textArea = document.getElementById('text');
@@ -13,6 +18,7 @@ let index = 0;
13
  let audioCtx = null;
14
  const worker = new Worker("/sherpa-onnx-tts.worker.js");
15
  let ttsInstanceInfo = {
 
16
  numSpeakers: 0,
17
  isReady: false,
18
  };
@@ -21,10 +27,12 @@ worker.onmessage = (e) => {
21
  Module.setStatus(e.data.status);
22
  }
23
  if (e.data.type === "sherpa-onnx-tts-ready") {
 
24
  ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
25
  ttsInstanceInfo.isReady = true;
26
  generateBtn.disabled = false;
27
  speakerIdLabel.innerHTML = `Speaker ID (0 - ${e.data.numSpeakers - 1}):`;
 
28
  return;
29
  }
30
  if (e.data.type === "sherpa-onnx-tts-result") {
@@ -92,23 +100,68 @@ speedInput.oninput = function() {
92
  speedValue.innerHTML = this.value;
93
  };
94
 
95
- generateBtn.onclick = function() {
96
- let speakerId = speakerIdInput.value;
97
- if (speakerId.trim().length == 0) {
98
- alert('Please input a speakerId');
99
- return;
 
 
 
 
 
100
  }
101
 
102
- if (!speakerId.match(/^\d+$/)) {
103
- alert(`Input speakerID ${
104
- speakerId} is not a number.\nPlease enter a number between 0 and ${
105
- ttsInstanceInfo.numSpeakers - 1}`);
106
- return;
 
107
  }
108
- speakerId = parseInt(speakerId, 10);
109
- if (speakerId > ttsInstanceInfo.numSpeakers - 1) {
110
- alert(`Pleaser enter a number between 0 and ${ttsInstanceInfo.numSpeakers - 1}`);
111
- return;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
112
  }
113
 
114
  let text = textArea.value.trim();
@@ -120,10 +173,43 @@ generateBtn.onclick = function() {
120
  console.log('speakerId', speakerId);
121
  console.log('speed', speedInput.value);
122
  console.log('text', text);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
123
  worker.postMessage({
124
  text,
125
  sid: speakerId,
126
- speed: speedInput.value,
127
  type: "generate",
128
  });
129
  };
 
1
  const generateBtn = document.getElementById('generateBtn');
2
  const speakerIdLabel = document.getElementById('speakerIdLabel');
3
  const speakerIdInput = document.getElementById('speakerId');
4
+ const speakerIdSection = document.getElementById('speakerIdSection');
5
+ const referenceAudioSection = document.getElementById('referenceAudioSection');
6
+ const referenceTextSection = document.getElementById('referenceTextSection');
7
+ const referenceAudioInput = document.getElementById('referenceAudio');
8
+ const referenceTextInput = document.getElementById('referenceText');
9
  const speedInput = document.getElementById('speed');
10
  const speedValue = document.getElementById('speedValue');
11
  const textArea = document.getElementById('text');
 
18
  let audioCtx = null;
19
  const worker = new Worker("/sherpa-onnx-tts.worker.js");
20
  let ttsInstanceInfo = {
21
+ modelType: 0,
22
  numSpeakers: 0,
23
  isReady: false,
24
  };
 
27
  Module.setStatus(e.data.status);
28
  }
29
  if (e.data.type === "sherpa-onnx-tts-ready") {
30
+ ttsInstanceInfo.modelType = e.data.modelType ?? 0;
31
  ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
32
  ttsInstanceInfo.isReady = true;
33
  generateBtn.disabled = false;
34
  speakerIdLabel.innerHTML = `Speaker ID (0 - ${e.data.numSpeakers - 1}):`;
35
+ updateUiForModelType();
36
  return;
37
  }
38
  if (e.data.type === "sherpa-onnx-tts-result") {
 
100
  speedValue.innerHTML = this.value;
101
  };
102
 
103
/**
 * Shows or hides the model-specific input sections.
 *
 * When the loaded model type is 4 (ZipVoice), the speaker-ID section receives
 * the `hidden` class and the reference-audio / reference-text sections lose
 * it. For every other model type the opposite happens.
 */
function updateUiForModelType() {
  const showReferenceInputs = ttsInstanceInfo.modelType === 4;

  // classList.toggle(name, force) adds the class when `force` is true and
  // removes it when `force` is false.
  speakerIdSection.classList.toggle('hidden', showReferenceInputs);
  for (const section of [referenceAudioSection, referenceTextSection]) {
    section.classList.toggle('hidden', !showReferenceInputs);
  }
}
109
+
110
/**
 * Down-mixes a decoded audio buffer to a single channel.
 *
 * @param {AudioBuffer} audioBuffer - Object exposing `numberOfChannels`,
 *     `length` and `getChannelData(index)`.
 * @returns {Float32Array} A newly allocated array of `audioBuffer.length`
 *     samples. For one channel it is a copy of that channel; otherwise each
 *     sample is the arithmetic mean of the corresponding samples of all
 *     channels.
 */
function getMonoSamples(audioBuffer) {
  const channelCount = audioBuffer.numberOfChannels;

  if (channelCount === 1) {
    // Copy rather than return the channel view itself, so the result owns
    // its own backing buffer.
    return new Float32Array(audioBuffer.getChannelData(0));
  }

  // Float32Array is zero-initialized, so it can be used as the accumulator.
  const mono = new Float32Array(audioBuffer.length);
  for (let c = 0; c < channelCount; ++c) {
    const channel = audioBuffer.getChannelData(c);
    for (let i = 0; i < channel.length; ++i) {
      mono[i] += channel[i];
    }
  }

  // Turn the per-sample sums into averages.
  for (let i = 0; i < mono.length; ++i) {
    mono[i] /= channelCount;
  }

  return mono;
}
129
+
130
/**
 * Decodes an audio file selected by the user into mono PCM samples.
 *
 * @param {Blob} file - Object exposing `arrayBuffer()`, e.g. an entry of an
 *     `<input type="file">` element's `files` list.
 * @returns {Promise<{samples: Float32Array, sampleRate: number}>} The
 *     down-mixed samples produced by `getMonoSamples` and the sample rate
 *     reported by the decoded buffer.
 * Rejects with whatever error `file.arrayBuffer()` or `decodeAudioData()`
 * raises; the temporary audio context is closed in either case.
 */
async function readReferenceAudio(file) {
  const encodedBytes = await file.arrayBuffer();
  const ctx = new AudioContext();
  try {
    // `encodedBytes` is not used again after this call, so it is handed to
    // the decoder directly instead of decoding a copy of it.
    const decoded = await ctx.decodeAudioData(encodedBytes);
    return {
      samples: getMonoSamples(decoded),
      sampleRate: decoded.sampleRate,
    };
  } finally {
    // The context exists only for decoding; always release it.
    await ctx.close();
  }
}
143
+
144
+ generateBtn.onclick = async function() {
145
+ const isZipVoice = ttsInstanceInfo.modelType === 4;
146
+
147
+ let speakerId = speakerIdInput.value;
148
+ if (!isZipVoice) {
149
+ if (speakerId.trim().length == 0) {
150
+ alert('Please input a speakerId');
151
+ return;
152
+ }
153
+
154
+ if (!speakerId.match(/^\d+$/)) {
155
+ alert(`Input speakerID ${
156
+ speakerId} is not a number.\nPlease enter a number between 0 and ${
157
+ ttsInstanceInfo.numSpeakers - 1}`);
158
+ return;
159
+ }
160
+ speakerId = parseInt(speakerId, 10);
161
+ if (speakerId > ttsInstanceInfo.numSpeakers - 1) {
162
+ alert(`Pleaser enter a number between 0 and ${ttsInstanceInfo.numSpeakers - 1}`);
163
+ return;
164
+ }
165
  }
166
 
167
  let text = textArea.value.trim();
 
173
  console.log('speakerId', speakerId);
174
  console.log('speed', speedInput.value);
175
  console.log('text', text);
176
+
177
+ if (isZipVoice) {
178
+ if (!referenceAudioInput.files || referenceAudioInput.files.length === 0) {
179
+ alert('Please select a reference audio file');
180
+ return;
181
+ }
182
+
183
+ const referenceText = referenceTextInput.value.trim();
184
+ if (referenceText.length === 0) {
185
+ alert('Please input the reference text');
186
+ return;
187
+ }
188
+
189
+ const referenceAudio = await readReferenceAudio(referenceAudioInput.files[0]);
190
+ const genConfig = {
191
+ speed: parseFloat(speedInput.value),
192
+ referenceAudio: referenceAudio.samples,
193
+ referenceSampleRate: referenceAudio.sampleRate,
194
+ referenceText: referenceText,
195
+ numSteps: 4,
196
+ extra: {
197
+ min_char_in_sentence: 30,
198
+ },
199
+ };
200
+
201
+ worker.postMessage({
202
+ text,
203
+ genConfig,
204
+ type: "generateWithConfig",
205
+ }, [genConfig.referenceAudio.buffer]);
206
+ return;
207
+ }
208
+
209
  worker.postMessage({
210
  text,
211
  sid: speakerId,
212
+ speed: parseFloat(speedInput.value),
213
  type: "generate",
214
  });
215
  };
index.html CHANGED
@@ -14,6 +14,9 @@
14
  .loading {
15
  display: none !important;
16
  }
 
 
 
17
  </style>
18
  </head>
19
 
@@ -27,10 +30,25 @@
27
  <div id="status">Loading...</div>
28
 
29
  <div id="singleAudioContent" class="tab-content loading">
30
- <label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
31
- <input type="text" id="speakerId" name="speakerId" value="0" />
32
- <br/>
33
- <br/>
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  <label for="speed" id="speedLabel">Speed: </label>
35
  <input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
36
  <span id="speedValue"></span>
 
14
  .loading {
15
  display: none !important;
16
  }
17
+ .hidden {
18
+ display: none !important;
19
+ }
20
  </style>
21
  </head>
22
 
 
30
  <div id="status">Loading...</div>
31
 
32
  <div id="singleAudioContent" class="tab-content loading">
33
+ <div id="speakerIdSection">
34
+ <label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
35
+ <input type="text" id="speakerId" name="speakerId" value="0" />
36
+ <br/>
37
+ <br/>
38
+ </div>
39
+ <div id="referenceAudioSection" class="hidden">
40
+ <label for="referenceAudio">Reference audio: </label>
41
+ <input type="file" id="referenceAudio" name="referenceAudio" accept="audio/*" />
42
+ <br/>
43
+ <br/>
44
+ </div>
45
+ <div id="referenceTextSection" class="hidden">
46
+ <label for="referenceText">Reference text: </label>
47
+ <br/>
48
+ <textarea id="referenceText" rows="3" placeholder="Please enter the transcription of the reference audio"></textarea>
49
+ <br/>
50
+ <br/>
51
+ </div>
52
  <label for="speed" id="speedLabel">Speed: </label>
53
  <input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
54
  <span id="speedValue"></span>
sherpa-onnx-tts.js CHANGED
@@ -916,6 +916,12 @@ class OfflineTts {
916
  }
917
  }
918
 
 
 
 
 
 
 
919
  function createOfflineTts(Module, myConfig) {
920
  const vits = {
921
  model: '',
@@ -955,10 +961,22 @@ function createOfflineTts(Module, myConfig) {
955
  lengthScale: 1.0,
956
  };
957
 
 
 
 
 
 
 
 
 
 
 
 
 
 
958
  let ruleFsts = '';
959
 
960
- let type = 0;
961
- switch (type) {
962
  case 0:
963
  // vits
964
  vits.model = './model.onnx';
@@ -992,6 +1010,16 @@ function createOfflineTts(Module, myConfig) {
992
  matcha.tokens = './tokens.txt';
993
  matcha.dataDir = './espeak-ng-data';
994
  break;
 
 
 
 
 
 
 
 
 
 
995
  }
996
 
997
  const offlineTtsModelConfig = {
@@ -999,6 +1027,7 @@ function createOfflineTts(Module, myConfig) {
999
  offlineTtsMatchaModelConfig: matcha,
1000
  offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
1001
  offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
 
1002
  numThreads: 1,
1003
  debug: 1,
1004
  provider: 'cpu',
@@ -1022,5 +1051,6 @@ if (typeof process == 'object' && typeof process.versions == 'object' &&
1022
  typeof process.versions.node == 'string') {
1023
  module.exports = {
1024
  createOfflineTts,
 
1025
  };
1026
  }
 
916
  }
917
  }
918
 
919
+ let modelType = 0;
920
+
921
/**
 * Reports the model type this file is configured for.
 *
 * @returns {number} The current value of the file-level `modelType` variable,
 *     the same value `createOfflineTts` switches on to choose a model config.
 */
function getDefaultOfflineTtsModelType() {
  const configuredType = modelType;
  return configuredType;
}
924
+
925
  function createOfflineTts(Module, myConfig) {
926
  const vits = {
927
  model: '',
 
961
  lengthScale: 1.0,
962
  };
963
 
964
+ const offlineTtsZipVoiceModelConfig = {
965
+ tokens: '',
966
+ encoder: '',
967
+ decoder: '',
968
+ vocoder: '',
969
+ dataDir: '',
970
+ lexicon: '',
971
+ featScale: 0.1,
972
+ tShift: 0.5,
973
+ targetRMS: 0.1,
974
+ guidanceScale: 1.0,
975
+ };
976
+
977
  let ruleFsts = '';
978
 
979
+ switch (modelType) {
 
980
  case 0:
981
  // vits
982
  vits.model = './model.onnx';
 
1010
  matcha.tokens = './tokens.txt';
1011
  matcha.dataDir = './espeak-ng-data';
1012
  break;
1013
+ case 4:
1014
+ // zipvoice zh-en
1015
+ // https://k2-fsa.github.io/sherpa/onnx/tts/zipvoice.html
1016
+ offlineTtsZipVoiceModelConfig.tokens = './tokens.txt';
1017
+ offlineTtsZipVoiceModelConfig.encoder = './encoder.int8.onnx';
1018
+ offlineTtsZipVoiceModelConfig.decoder = './decoder.int8.onnx';
1019
+ offlineTtsZipVoiceModelConfig.vocoder = './vocos_24khz.onnx';
1020
+ offlineTtsZipVoiceModelConfig.dataDir = './espeak-ng-data';
1021
+ offlineTtsZipVoiceModelConfig.lexicon = './lexicon.txt';
1022
+ break;
1023
  }
1024
 
1025
  const offlineTtsModelConfig = {
 
1027
  offlineTtsMatchaModelConfig: matcha,
1028
  offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
1029
  offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
1030
+ offlineTtsZipVoiceModelConfig: offlineTtsZipVoiceModelConfig,
1031
  numThreads: 1,
1032
  debug: 1,
1033
  provider: 'cpu',
 
1051
  typeof process.versions.node == 'string') {
1052
  module.exports = {
1053
  createOfflineTts,
1054
+ getDefaultOfflineTtsModelType,
1055
  };
1056
  }
sherpa-onnx-wasm-main-tts.data CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f7ce2d43070f87274774b8426e5ca7440a4fed2b1ecd8fe6dccb792f42a2016e
3
- size 96523617
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:619888141185f0c6dea5926ab9bb8a525383d1546b96f92d7346323297a73899
3
+ size 96524422
sherpa-onnx-wasm-main-tts.js CHANGED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main-tts.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:20ec17318118c73835b33cf44ed73e34e368c612104e6a584bbd3ea565eb0750
3
- size 11964249
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6edcd44a15b7c385405a142b473d9f83eff0fb5c1ca683e2a729addedb0bd21b
3
+ size 11967286