Commit ·
5c1c58f
1
Parent(s): ec2c459
update model
Browse files
- app-tts.js +22 -15
- sherpa-onnx-tts.js +23 -0
- sherpa-onnx-wasm-main-tts.data +2 -2
- sherpa-onnx-wasm-main-tts.js +0 -0
app-tts.js
CHANGED
|
@@ -123,8 +123,10 @@ speedInput.oninput = function() {
|
|
| 123 |
|
| 124 |
function updateUiForModelType() {
|
| 125 |
const isZipVoice = ttsInstanceInfo.modelType === 4;
|
| 126 |
-
|
| 127 |
-
|
|
|
|
|
|
|
| 128 |
referenceTextSection.classList.toggle('hidden', !isZipVoice);
|
| 129 |
}
|
| 130 |
|
|
@@ -193,9 +195,11 @@ function downloadBlob(blob, filename) {
|
|
| 193 |
|
| 194 |
generateBtn.onclick = async function() {
|
| 195 |
const isZipVoice = ttsInstanceInfo.modelType === 4;
|
|
|
|
|
|
|
| 196 |
|
| 197 |
let speakerId = speakerIdInput.value;
|
| 198 |
-
if (!isZipVoice) {
|
| 199 |
if (speakerId.trim().length == 0) {
|
| 200 |
alert('Please input a speakerId');
|
| 201 |
return;
|
|
@@ -224,7 +228,7 @@ generateBtn.onclick = async function() {
|
|
| 224 |
console.log('speed', speedInput.value);
|
| 225 |
console.log('text', text);
|
| 226 |
|
| 227 |
-
if (isZipVoice) {
|
| 228 |
if (!referenceAudioInput.files || referenceAudioInput.files.length === 0) {
|
| 229 |
alert('Please select a reference audio file');
|
| 230 |
return;
|
|
@@ -236,24 +240,27 @@ generateBtn.onclick = async function() {
|
|
| 236 |
return;
|
| 237 |
}
|
| 238 |
|
| 239 |
-
const referenceText = referenceTextInput.value.trim();
|
| 240 |
-
if (referenceText.length === 0) {
|
| 241 |
-
alert('Please input the transcript of the reference audio');
|
| 242 |
-
return;
|
| 243 |
-
}
|
| 244 |
-
|
| 245 |
const referenceAudio = await readReferenceAudio(referenceFile);
|
| 246 |
const genConfig = {
|
| 247 |
speed: parseFloat(speedInput.value),
|
| 248 |
referenceAudio: referenceAudio.samples,
|
| 249 |
referenceSampleRate: referenceAudio.sampleRate,
|
| 250 |
-
|
| 251 |
-
numSteps: 4,
|
| 252 |
-
extra: {
|
| 253 |
-
min_char_in_sentence: 10,
|
| 254 |
-
},
|
| 255 |
};
|
| 256 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 257 |
generateBtn.disabled = true;
|
| 258 |
setGenerationStatus('Generating audio...');
|
| 259 |
|
|
|
|
| 123 |
|
| 124 |
function updateUiForModelType() {
|
| 125 |
const isZipVoice = ttsInstanceInfo.modelType === 4;
|
| 126 |
+
const isPocketTts = ttsInstanceInfo.modelType === 5;
|
| 127 |
+
const useGenerationConfig = isZipVoice || isPocketTts;
|
| 128 |
+
speakerIdSection.classList.toggle('hidden', useGenerationConfig);
|
| 129 |
+
referenceAudioSection.classList.toggle('hidden', !useGenerationConfig);
|
| 130 |
referenceTextSection.classList.toggle('hidden', !isZipVoice);
|
| 131 |
}
|
| 132 |
|
|
|
|
| 195 |
|
| 196 |
generateBtn.onclick = async function() {
|
| 197 |
const isZipVoice = ttsInstanceInfo.modelType === 4;
|
| 198 |
+
const isPocketTts = ttsInstanceInfo.modelType === 5;
|
| 199 |
+
const useGenerationConfig = isZipVoice || isPocketTts;
|
| 200 |
|
| 201 |
let speakerId = speakerIdInput.value;
|
| 202 |
+
if (!useGenerationConfig) {
|
| 203 |
if (speakerId.trim().length == 0) {
|
| 204 |
alert('Please input a speakerId');
|
| 205 |
return;
|
|
|
|
| 228 |
console.log('speed', speedInput.value);
|
| 229 |
console.log('text', text);
|
| 230 |
|
| 231 |
+
if (useGenerationConfig) {
|
| 232 |
if (!referenceAudioInput.files || referenceAudioInput.files.length === 0) {
|
| 233 |
alert('Please select a reference audio file');
|
| 234 |
return;
|
|
|
|
| 240 |
return;
|
| 241 |
}
|
| 242 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 243 |
const referenceAudio = await readReferenceAudio(referenceFile);
|
| 244 |
const genConfig = {
|
| 245 |
speed: parseFloat(speedInput.value),
|
| 246 |
referenceAudio: referenceAudio.samples,
|
| 247 |
referenceSampleRate: referenceAudio.sampleRate,
|
| 248 |
+
numSteps: isPocketTts ? 5 : 4,
|
|
|
|
|
|
|
|
|
|
|
|
|
| 249 |
};
|
| 250 |
|
| 251 |
+
if (isZipVoice) {
|
| 252 |
+
const referenceText = referenceTextInput.value.trim();
|
| 253 |
+
if (referenceText.length === 0) {
|
| 254 |
+
alert('Please input the transcript of the reference audio');
|
| 255 |
+
return;
|
| 256 |
+
}
|
| 257 |
+
|
| 258 |
+
genConfig.referenceText = referenceText;
|
| 259 |
+
genConfig.extra = {
|
| 260 |
+
min_char_in_sentence: 10,
|
| 261 |
+
};
|
| 262 |
+
}
|
| 263 |
+
|
| 264 |
generateBtn.disabled = true;
|
| 265 |
setGenerationStatus('Generating audio...');
|
| 266 |
|
sherpa-onnx-tts.js
CHANGED
|
@@ -985,6 +985,17 @@ function createOfflineTts(Module, myConfig) {
|
|
| 985 |
guidanceScale: 1.0,
|
| 986 |
};
|
| 987 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 988 |
let ruleFsts = '';
|
| 989 |
|
| 990 |
switch (modelType) {
|
|
@@ -1031,6 +1042,17 @@ function createOfflineTts(Module, myConfig) {
|
|
| 1031 |
offlineTtsZipVoiceModelConfig.dataDir = './espeak-ng-data';
|
| 1032 |
offlineTtsZipVoiceModelConfig.lexicon = './lexicon.txt';
|
| 1033 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1034 |
}
|
| 1035 |
|
| 1036 |
const offlineTtsModelConfig = {
|
|
@@ -1039,6 +1061,7 @@ function createOfflineTts(Module, myConfig) {
|
|
| 1039 |
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
|
| 1040 |
offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
|
| 1041 |
offlineTtsZipVoiceModelConfig: offlineTtsZipVoiceModelConfig,
|
|
|
|
| 1042 |
numThreads: 1,
|
| 1043 |
debug: 1,
|
| 1044 |
provider: 'cpu',
|
|
|
|
| 985 |
guidanceScale: 1.0,
|
| 986 |
};
|
| 987 |
|
| 988 |
+
const offlineTtsPocketModelConfig = {
|
| 989 |
+
lmFlow: '',
|
| 990 |
+
lmMain: '',
|
| 991 |
+
encoder: '',
|
| 992 |
+
decoder: '',
|
| 993 |
+
textConditioner: '',
|
| 994 |
+
vocabJson: '',
|
| 995 |
+
tokenScoresJson: '',
|
| 996 |
+
voiceEmbeddingCacheCapacity: 50,
|
| 997 |
+
};
|
| 998 |
+
|
| 999 |
let ruleFsts = '';
|
| 1000 |
|
| 1001 |
switch (modelType) {
|
|
|
|
| 1042 |
offlineTtsZipVoiceModelConfig.dataDir = './espeak-ng-data';
|
| 1043 |
offlineTtsZipVoiceModelConfig.lexicon = './lexicon.txt';
|
| 1044 |
break;
|
| 1045 |
+
case 5:
|
| 1046 |
+
// pocket tts
|
| 1047 |
+
// https://k2-fsa.github.io/sherpa/onnx/tts/pocket.html
|
| 1048 |
+
offlineTtsPocketModelConfig.lmFlow = './lm_flow.int8.onnx';
|
| 1049 |
+
offlineTtsPocketModelConfig.lmMain = './lm_main.int8.onnx';
|
| 1050 |
+
offlineTtsPocketModelConfig.encoder = './encoder.onnx';
|
| 1051 |
+
offlineTtsPocketModelConfig.decoder = './decoder.int8.onnx';
|
| 1052 |
+
offlineTtsPocketModelConfig.textConditioner = './text_conditioner.onnx';
|
| 1053 |
+
offlineTtsPocketModelConfig.vocabJson = './vocab.json';
|
| 1054 |
+
offlineTtsPocketModelConfig.tokenScoresJson = './token_scores.json';
|
| 1055 |
+
break;
|
| 1056 |
}
|
| 1057 |
|
| 1058 |
const offlineTtsModelConfig = {
|
|
|
|
| 1061 |
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
|
| 1062 |
offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
|
| 1063 |
offlineTtsZipVoiceModelConfig: offlineTtsZipVoiceModelConfig,
|
| 1064 |
+
offlineTtsPocketModelConfig: offlineTtsPocketModelConfig,
|
| 1065 |
numThreads: 1,
|
| 1066 |
debug: 1,
|
| 1067 |
provider: 'cpu',
|
sherpa-onnx-wasm-main-tts.data
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:bcf45b1441eb0aa228a3c6de1ea62a25f5a691eb99fdae68f3d6dc10f5e995f7
|
| 3 |
+
size 96525193
|
sherpa-onnx-wasm-main-tts.js
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|