Commit ·
d0d0f2d
1
Parent(s): 133b768
update model
Browse files- app-tts.js +101 -15
- index.html +22 -4
- sherpa-onnx-tts.js +32 -2
- sherpa-onnx-wasm-main-tts.data +2 -2
- sherpa-onnx-wasm-main-tts.js +0 -0
- sherpa-onnx-wasm-main-tts.wasm +2 -2
app-tts.js
CHANGED
|
@@ -1,6 +1,11 @@
|
|
| 1 |
const generateBtn = document.getElementById('generateBtn');
|
| 2 |
const speakerIdLabel = document.getElementById('speakerIdLabel');
|
| 3 |
const speakerIdInput = document.getElementById('speakerId');
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
const speedInput = document.getElementById('speed');
|
| 5 |
const speedValue = document.getElementById('speedValue');
|
| 6 |
const textArea = document.getElementById('text');
|
|
@@ -13,6 +18,7 @@ let index = 0;
|
|
| 13 |
let audioCtx = null;
|
| 14 |
const worker = new Worker("/sherpa-onnx-tts.worker.js");
|
| 15 |
let ttsInstanceInfo = {
|
|
|
|
| 16 |
numSpeakers: 0,
|
| 17 |
isReady: false,
|
| 18 |
};
|
|
@@ -21,10 +27,12 @@ worker.onmessage = (e) => {
|
|
| 21 |
Module.setStatus(e.data.status);
|
| 22 |
}
|
| 23 |
if (e.data.type === "sherpa-onnx-tts-ready") {
|
|
|
|
| 24 |
ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
|
| 25 |
ttsInstanceInfo.isReady = true;
|
| 26 |
generateBtn.disabled = false;
|
| 27 |
speakerIdLabel.innerHTML = `Speaker ID (0 - ${e.data.numSpeakers - 1}):`;
|
|
|
|
| 28 |
return;
|
| 29 |
}
|
| 30 |
if (e.data.type === "sherpa-onnx-tts-result") {
|
|
@@ -92,23 +100,68 @@ speedInput.oninput = function() {
|
|
| 92 |
speedValue.innerHTML = this.value;
|
| 93 |
};
|
| 94 |
|
| 95 |
-
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
|
| 99 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 100 |
}
|
| 101 |
|
| 102 |
-
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
| 107 |
}
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
}
|
| 113 |
|
| 114 |
let text = textArea.value.trim();
|
|
@@ -120,10 +173,43 @@ generateBtn.onclick = function() {
|
|
| 120 |
console.log('speakerId', speakerId);
|
| 121 |
console.log('speed', speedInput.value);
|
| 122 |
console.log('text', text);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 123 |
worker.postMessage({
|
| 124 |
text,
|
| 125 |
sid: speakerId,
|
| 126 |
-
speed: speedInput.value,
|
| 127 |
type: "generate",
|
| 128 |
});
|
| 129 |
};
|
|
|
|
| 1 |
const generateBtn = document.getElementById('generateBtn');
|
| 2 |
const speakerIdLabel = document.getElementById('speakerIdLabel');
|
| 3 |
const speakerIdInput = document.getElementById('speakerId');
|
| 4 |
+
const speakerIdSection = document.getElementById('speakerIdSection');
|
| 5 |
+
const referenceAudioSection = document.getElementById('referenceAudioSection');
|
| 6 |
+
const referenceTextSection = document.getElementById('referenceTextSection');
|
| 7 |
+
const referenceAudioInput = document.getElementById('referenceAudio');
|
| 8 |
+
const referenceTextInput = document.getElementById('referenceText');
|
| 9 |
const speedInput = document.getElementById('speed');
|
| 10 |
const speedValue = document.getElementById('speedValue');
|
| 11 |
const textArea = document.getElementById('text');
|
|
|
|
| 18 |
let audioCtx = null;
|
| 19 |
const worker = new Worker("/sherpa-onnx-tts.worker.js");
|
| 20 |
let ttsInstanceInfo = {
|
| 21 |
+
modelType: 0,
|
| 22 |
numSpeakers: 0,
|
| 23 |
isReady: false,
|
| 24 |
};
|
|
|
|
| 27 |
Module.setStatus(e.data.status);
|
| 28 |
}
|
| 29 |
if (e.data.type === "sherpa-onnx-tts-ready") {
|
| 30 |
+
ttsInstanceInfo.modelType = e.data.modelType ?? 0;
|
| 31 |
ttsInstanceInfo.numSpeakers = e.data.numSpeakers;
|
| 32 |
ttsInstanceInfo.isReady = true;
|
| 33 |
generateBtn.disabled = false;
|
| 34 |
speakerIdLabel.innerHTML = `Speaker ID (0 - ${e.data.numSpeakers - 1}):`;
|
| 35 |
+
updateUiForModelType();
|
| 36 |
return;
|
| 37 |
}
|
| 38 |
if (e.data.type === "sherpa-onnx-tts-result") {
|
|
|
|
| 100 |
speedValue.innerHTML = this.value;
|
| 101 |
};
|
| 102 |
|
| 103 |
+
/**
 * Switches the form between the speaker-ID layout and the reference-audio
 * layout according to `ttsInstanceInfo.modelType`.
 *
 * When the model type is 4, the speaker-ID section receives the `hidden`
 * class and the reference audio / reference text sections lose it; for any
 * other model type the opposite happens. Reads only module-level state and
 * takes no arguments; returns nothing.
 */
function updateUiForModelType() {
|
| 104 |
+
// Model type 4 is treated as ZipVoice throughout this script.
const isZipVoice = ttsInstanceInfo.modelType === 4;
|
| 105 |
+
speakerIdSection.classList.toggle('hidden', isZipVoice);
|
| 106 |
+
referenceAudioSection.classList.toggle('hidden', !isZipVoice);
|
| 107 |
+
referenceTextSection.classList.toggle('hidden', !isZipVoice);
|
| 108 |
+
}
|
| 109 |
+
|
| 110 |
+
/**
 * Produces a single-channel sample array from a decoded audio buffer.
 *
 * A one-channel buffer is returned as a fresh copy of channel 0. For buffers
 * with more channels, the result is the per-sample arithmetic mean of all
 * channels.
 *
 * @param {AudioBuffer} audioBuffer - Object exposing `numberOfChannels`,
 *     `length` and `getChannelData(index)`.
 * @returns {Float32Array} A newly allocated array of mono samples.
 */
function getMonoSamples(audioBuffer) {
|
| 111 |
+
if (audioBuffer.numberOfChannels === 1) {
|
| 112 |
+
// Copy rather than return the channel data directly, so the caller owns
// the returned array.
return new Float32Array(audioBuffer.getChannelData(0));
|
| 113 |
}
|
| 114 |
|
| 115 |
+
const samples = new Float32Array(audioBuffer.length);
|
| 116 |
+
// First pass: accumulate the sum of every channel into `samples`.
for (let c = 0; c < audioBuffer.numberOfChannels; ++c) {
|
| 117 |
+
const channel = audioBuffer.getChannelData(c);
|
| 118 |
+
for (let i = 0; i < channel.length; ++i) {
|
| 119 |
+
samples[i] += channel[i];
|
| 120 |
+
}
|
| 121 |
}
|
| 122 |
+
|
| 123 |
+
// Second pass: divide the sums by the channel count to get the mean.
for (let i = 0; i < samples.length; ++i) {
|
| 124 |
+
samples[i] /= audioBuffer.numberOfChannels;
|
| 125 |
+
}
|
| 126 |
+
|
| 127 |
+
return samples;
|
| 128 |
+
}
|
| 129 |
+
|
| 130 |
+
/**
 * Decodes a user-selected audio file into mono samples.
 *
 * Reads the file's bytes, decodes them with a temporary `AudioContext`, and
 * down-mixes the result with `getMonoSamples`. The context is closed in the
 * `finally` clause, so it is released whether decoding succeeds or throws.
 * Decoding errors are not caught here and propagate to the caller.
 *
 * @param {File} file - Object exposing an async `arrayBuffer()` method.
 * @returns {Promise<{samples: Float32Array, sampleRate: number}>} The mono
 *     samples and the `sampleRate` reported by the decoded buffer.
 */
async function readReferenceAudio(file) {
|
| 131 |
+
const arrayBuffer = await file.arrayBuffer();
|
| 132 |
+
const ctx = new AudioContext();
|
| 133 |
+
try {
|
| 134 |
+
// A copy of the bytes is handed to the decoder.
// NOTE(review): presumably because decodeAudioData detaches the buffer it
// receives; the original is not reused afterwards, so confirm the copy is
// still needed.
const audioBuffer = await ctx.decodeAudioData(arrayBuffer.slice(0));
|
| 135 |
+
return {
|
| 136 |
+
samples: getMonoSamples(audioBuffer),
|
| 137 |
+
// NOTE(review): this is the decoded buffer's rate, which may be the
// context's rate rather than the file's native rate -- confirm.
sampleRate: audioBuffer.sampleRate,
|
| 138 |
+
};
|
| 139 |
+
} finally {
|
| 140 |
+
await ctx.close();
|
| 141 |
+
}
|
| 142 |
+
}
|
| 143 |
+
|
| 144 |
+
generateBtn.onclick = async function() {
|
| 145 |
+
const isZipVoice = ttsInstanceInfo.modelType === 4;
|
| 146 |
+
|
| 147 |
+
let speakerId = speakerIdInput.value;
|
| 148 |
+
if (!isZipVoice) {
|
| 149 |
+
if (speakerId.trim().length == 0) {
|
| 150 |
+
alert('Please input a speakerId');
|
| 151 |
+
return;
|
| 152 |
+
}
|
| 153 |
+
|
| 154 |
+
if (!speakerId.match(/^\d+$/)) {
|
| 155 |
+
alert(`Input speakerID ${
|
| 156 |
+
speakerId} is not a number.\nPlease enter a number between 0 and ${
|
| 157 |
+
ttsInstanceInfo.numSpeakers - 1}`);
|
| 158 |
+
return;
|
| 159 |
+
}
|
| 160 |
+
speakerId = parseInt(speakerId, 10);
|
| 161 |
+
if (speakerId > ttsInstanceInfo.numSpeakers - 1) {
|
| 162 |
+
alert(`Pleaser enter a number between 0 and ${ttsInstanceInfo.numSpeakers - 1}`);
|
| 163 |
+
return;
|
| 164 |
+
}
|
| 165 |
}
|
| 166 |
|
| 167 |
let text = textArea.value.trim();
|
|
|
|
| 173 |
console.log('speakerId', speakerId);
|
| 174 |
console.log('speed', speedInput.value);
|
| 175 |
console.log('text', text);
|
| 176 |
+
|
| 177 |
+
if (isZipVoice) {
|
| 178 |
+
if (!referenceAudioInput.files || referenceAudioInput.files.length === 0) {
|
| 179 |
+
alert('Please select a reference audio file');
|
| 180 |
+
return;
|
| 181 |
+
}
|
| 182 |
+
|
| 183 |
+
const referenceText = referenceTextInput.value.trim();
|
| 184 |
+
if (referenceText.length === 0) {
|
| 185 |
+
alert('Please input the reference text');
|
| 186 |
+
return;
|
| 187 |
+
}
|
| 188 |
+
|
| 189 |
+
const referenceAudio = await readReferenceAudio(referenceAudioInput.files[0]);
|
| 190 |
+
const genConfig = {
|
| 191 |
+
speed: parseFloat(speedInput.value),
|
| 192 |
+
referenceAudio: referenceAudio.samples,
|
| 193 |
+
referenceSampleRate: referenceAudio.sampleRate,
|
| 194 |
+
referenceText: referenceText,
|
| 195 |
+
numSteps: 4,
|
| 196 |
+
extra: {
|
| 197 |
+
min_char_in_sentence: 30,
|
| 198 |
+
},
|
| 199 |
+
};
|
| 200 |
+
|
| 201 |
+
worker.postMessage({
|
| 202 |
+
text,
|
| 203 |
+
genConfig,
|
| 204 |
+
type: "generateWithConfig",
|
| 205 |
+
}, [genConfig.referenceAudio.buffer]);
|
| 206 |
+
return;
|
| 207 |
+
}
|
| 208 |
+
|
| 209 |
worker.postMessage({
|
| 210 |
text,
|
| 211 |
sid: speakerId,
|
| 212 |
+
speed: parseFloat(speedInput.value),
|
| 213 |
type: "generate",
|
| 214 |
});
|
| 215 |
};
|
index.html
CHANGED
|
@@ -14,6 +14,9 @@
|
|
| 14 |
.loading {
|
| 15 |
display: none !important;
|
| 16 |
}
|
|
|
|
|
|
|
|
|
|
| 17 |
</style>
|
| 18 |
</head>
|
| 19 |
|
|
@@ -27,10 +30,25 @@
|
|
| 27 |
<div id="status">Loading...</div>
|
| 28 |
|
| 29 |
<div id="singleAudioContent" class="tab-content loading">
|
| 30 |
-
<
|
| 31 |
-
|
| 32 |
-
|
| 33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 34 |
<label for="speed" id="speedLabel">Speed: </label>
|
| 35 |
<input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
|
| 36 |
<span id="speedValue"></span>
|
|
|
|
| 14 |
.loading {
|
| 15 |
display: none !important;
|
| 16 |
}
|
| 17 |
+
.hidden {
|
| 18 |
+
display: none !important;
|
| 19 |
+
}
|
| 20 |
</style>
|
| 21 |
</head>
|
| 22 |
|
|
|
|
| 30 |
<div id="status">Loading...</div>
|
| 31 |
|
| 32 |
<div id="singleAudioContent" class="tab-content loading">
|
| 33 |
+
<div id="speakerIdSection">
|
| 34 |
+
<label for="speakerId" id="speakerIdLabel">Speaker ID: </label>
|
| 35 |
+
<input type="text" id="speakerId" name="speakerId" value="0" />
|
| 36 |
+
<br/>
|
| 37 |
+
<br/>
|
| 38 |
+
</div>
|
| 39 |
+
<div id="referenceAudioSection" class="hidden">
|
| 40 |
+
<label for="referenceAudio">Reference audio: </label>
|
| 41 |
+
<input type="file" id="referenceAudio" name="referenceAudio" accept="audio/*" />
|
| 42 |
+
<br/>
|
| 43 |
+
<br/>
|
| 44 |
+
</div>
|
| 45 |
+
<div id="referenceTextSection" class="hidden">
|
| 46 |
+
<label for="referenceText">Reference text: </label>
|
| 47 |
+
<br/>
|
| 48 |
+
<textarea id="referenceText" rows="3" placeholder="Please enter the transcription of the reference audio"></textarea>
|
| 49 |
+
<br/>
|
| 50 |
+
<br/>
|
| 51 |
+
</div>
|
| 52 |
<label for="speed" id="speedLabel">Speed: </label>
|
| 53 |
<input type="range" id="speed" name="speed" min="0.4" max="3.5" step="0.1" value="1.0" />
|
| 54 |
<span id="speedValue"></span>
|
sherpa-onnx-tts.js
CHANGED
|
@@ -916,6 +916,12 @@ class OfflineTts {
|
|
| 916 |
}
|
| 917 |
}
|
| 918 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 919 |
function createOfflineTts(Module, myConfig) {
|
| 920 |
const vits = {
|
| 921 |
model: '',
|
|
@@ -955,10 +961,22 @@ function createOfflineTts(Module, myConfig) {
|
|
| 955 |
lengthScale: 1.0,
|
| 956 |
};
|
| 957 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 958 |
let ruleFsts = '';
|
| 959 |
|
| 960 |
-
|
| 961 |
-
switch (type) {
|
| 962 |
case 0:
|
| 963 |
// vits
|
| 964 |
vits.model = './model.onnx';
|
|
@@ -992,6 +1010,16 @@ function createOfflineTts(Module, myConfig) {
|
|
| 992 |
matcha.tokens = './tokens.txt';
|
| 993 |
matcha.dataDir = './espeak-ng-data';
|
| 994 |
break;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 995 |
}
|
| 996 |
|
| 997 |
const offlineTtsModelConfig = {
|
|
@@ -999,6 +1027,7 @@ function createOfflineTts(Module, myConfig) {
|
|
| 999 |
offlineTtsMatchaModelConfig: matcha,
|
| 1000 |
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
|
| 1001 |
offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
|
|
|
|
| 1002 |
numThreads: 1,
|
| 1003 |
debug: 1,
|
| 1004 |
provider: 'cpu',
|
|
@@ -1022,5 +1051,6 @@ if (typeof process == 'object' && typeof process.versions == 'object' &&
|
|
| 1022 |
typeof process.versions.node == 'string') {
|
| 1023 |
module.exports = {
|
| 1024 |
createOfflineTts,
|
|
|
|
| 1025 |
};
|
| 1026 |
}
|
|
|
|
| 916 |
}
|
| 917 |
}
|
| 918 |
|
| 919 |
+
let modelType = 0;
|
| 920 |
+
|
| 921 |
+
/**
 * Returns the module-level `modelType` value, the same variable that the
 * `switch (modelType)` in `createOfflineTts` uses to pick model file paths.
 *
 * @returns {number} The current value of `modelType`.
 */
function getDefaultOfflineTtsModelType() {
|
| 922 |
+
return modelType;
|
| 923 |
+
}
|
| 924 |
+
|
| 925 |
function createOfflineTts(Module, myConfig) {
|
| 926 |
const vits = {
|
| 927 |
model: '',
|
|
|
|
| 961 |
lengthScale: 1.0,
|
| 962 |
};
|
| 963 |
|
| 964 |
+
const offlineTtsZipVoiceModelConfig = {
|
| 965 |
+
tokens: '',
|
| 966 |
+
encoder: '',
|
| 967 |
+
decoder: '',
|
| 968 |
+
vocoder: '',
|
| 969 |
+
dataDir: '',
|
| 970 |
+
lexicon: '',
|
| 971 |
+
featScale: 0.1,
|
| 972 |
+
tShift: 0.5,
|
| 973 |
+
targetRMS: 0.1,
|
| 974 |
+
guidanceScale: 1.0,
|
| 975 |
+
};
|
| 976 |
+
|
| 977 |
let ruleFsts = '';
|
| 978 |
|
| 979 |
+
switch (modelType) {
|
|
|
|
| 980 |
case 0:
|
| 981 |
// vits
|
| 982 |
vits.model = './model.onnx';
|
|
|
|
| 1010 |
matcha.tokens = './tokens.txt';
|
| 1011 |
matcha.dataDir = './espeak-ng-data';
|
| 1012 |
break;
|
| 1013 |
+
case 4:
|
| 1014 |
+
// zipvoice zh-en
|
| 1015 |
+
// https://k2-fsa.github.io/sherpa/onnx/tts/zipvoice.html
|
| 1016 |
+
offlineTtsZipVoiceModelConfig.tokens = './tokens.txt';
|
| 1017 |
+
offlineTtsZipVoiceModelConfig.encoder = './encoder.int8.onnx';
|
| 1018 |
+
offlineTtsZipVoiceModelConfig.decoder = './decoder.int8.onnx';
|
| 1019 |
+
offlineTtsZipVoiceModelConfig.vocoder = './vocos_24khz.onnx';
|
| 1020 |
+
offlineTtsZipVoiceModelConfig.dataDir = './espeak-ng-data';
|
| 1021 |
+
offlineTtsZipVoiceModelConfig.lexicon = './lexicon.txt';
|
| 1022 |
+
break;
|
| 1023 |
}
|
| 1024 |
|
| 1025 |
const offlineTtsModelConfig = {
|
|
|
|
| 1027 |
offlineTtsMatchaModelConfig: matcha,
|
| 1028 |
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
|
| 1029 |
offlineTtsKittenModelConfig: offlineTtsKittenModelConfig,
|
| 1030 |
+
offlineTtsZipVoiceModelConfig: offlineTtsZipVoiceModelConfig,
|
| 1031 |
numThreads: 1,
|
| 1032 |
debug: 1,
|
| 1033 |
provider: 'cpu',
|
|
|
|
| 1051 |
typeof process.versions.node == 'string') {
|
| 1052 |
module.exports = {
|
| 1053 |
createOfflineTts,
|
| 1054 |
+
getDefaultOfflineTtsModelType,
|
| 1055 |
};
|
| 1056 |
}
|
sherpa-onnx-wasm-main-tts.data
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:619888141185f0c6dea5926ab9bb8a525383d1546b96f92d7346323297a73899
|
| 3 |
+
size 96524422
|
sherpa-onnx-wasm-main-tts.js
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sherpa-onnx-wasm-main-tts.wasm
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6edcd44a15b7c385405a142b473d9f83eff0fb5c1ca683e2a729addedb0bd21b
|
| 3 |
+
size 11967286
|