Commit ·
133b768
1
Parent(s): 22bfcad
update model
Browse files- sherpa-onnx-tts.js +153 -31
- sherpa-onnx-wasm-main-tts.js +0 -0
- sherpa-onnx-wasm-main-tts.wasm +2 -2
sherpa-onnx-tts.js
CHANGED
|
@@ -28,7 +28,13 @@ function freeConfig(config, Module) {
|
|
| 28 |
freeConfig(config.pocket, Module)
|
| 29 |
}
|
| 30 |
|
| 31 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 32 |
}
|
| 33 |
|
| 34 |
// The user should free the returned pointers
|
|
@@ -405,8 +411,91 @@ function initSherpaOnnxOfflineTtsPocketModelConfig(config, Module) {
|
|
| 405 |
|
| 406 |
Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*');
|
| 407 |
offset += tokenScoresJsonLen;
|
| 408 |
-
|
| 409 |
-
Module.setValue(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 410 |
|
| 411 |
return {
|
| 412 |
buffer: buffer,
|
|
@@ -489,6 +578,17 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|
| 489 |
};
|
| 490 |
}
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
|
| 493 |
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
|
| 494 |
config.offlineTtsVitsModelConfig, Module);
|
|
@@ -508,9 +608,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|
| 508 |
const pocketModelConfig = initSherpaOnnxOfflineTtsPocketModelConfig(
|
| 509 |
config.offlineTtsPocketModelConfig, Module);
|
| 510 |
|
|
|
|
|
|
|
|
|
|
| 511 |
const len = vitsModelConfig.len + matchaModelConfig.len +
|
| 512 |
kokoroModelConfig.len + kittenModelConfig.len + zipVoiceModelConfig.len +
|
| 513 |
-
pocketModelConfig.len + 3 * 4;
|
| 514 |
|
| 515 |
const ptr = Module._malloc(len);
|
| 516 |
|
|
@@ -546,6 +649,10 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|
| 546 |
Module._CopyHeap(pocketModelConfig.ptr, pocketModelConfig.len, ptr + offset);
|
| 547 |
offset += pocketModelConfig.len;
|
| 548 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 549 |
return {
|
| 550 |
buffer: buffer,
|
| 551 |
ptr: ptr,
|
|
@@ -556,6 +663,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
|
|
| 556 |
kitten: kittenModelConfig,
|
| 557 |
zipvoice: zipVoiceModelConfig,
|
| 558 |
pocket: pocketModelConfig,
|
|
|
|
| 559 |
};
|
| 560 |
}
|
| 561 |
|
|
@@ -615,9 +723,6 @@ function initSherpaOnnxGenerationConfig(config, Module) {
|
|
| 615 |
const len = 9 * 4;
|
| 616 |
const ptr = Module._malloc(len);
|
| 617 |
|
| 618 |
-
// Zero-init for safety
|
| 619 |
-
Module.HEAPU8.fill(0, ptr, ptr + len);
|
| 620 |
-
|
| 621 |
// float silence_scale
|
| 622 |
Module.setValue(ptr + 0 * 4, config.silenceScale || 0.2, 'float');
|
| 623 |
|
|
@@ -709,6 +814,8 @@ class OfflineTts {
|
|
| 709 |
}
|
| 710 |
|
| 711 |
free() {
|
|
|
|
|
|
|
| 712 |
this.Module._SherpaOnnxDestroyOfflineTts(this.handle);
|
| 713 |
this.handle = 0
|
| 714 |
}
|
|
@@ -719,27 +826,47 @@ class OfflineTts {
|
|
| 719 |
// speed: 1.0
|
| 720 |
// }
|
| 721 |
generate(config) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 722 |
const textLen = this.Module.lengthBytesUTF8(config.text) + 1;
|
| 723 |
const textPtr = this.Module._malloc(textLen);
|
| 724 |
this.Module.stringToUTF8(config.text, textPtr, textLen);
|
| 725 |
|
| 726 |
const h = this.Module._SherpaOnnxOfflineTtsGenerate(
|
| 727 |
-
this.handle, textPtr, config.sid, config.speed);
|
| 728 |
|
| 729 |
-
|
| 730 |
-
const sampleRate = this.Module.HEAP32[h / 4 + 2];
|
| 731 |
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
for (let i = 0; i < numSamples; i++) {
|
| 735 |
-
samples[i] = this.Module.HEAPF32[samplesPtr + i];
|
| 736 |
}
|
| 737 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 738 |
this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h);
|
| 739 |
return {samples: samples, sampleRate: sampleRate};
|
| 740 |
}
|
| 741 |
|
| 742 |
generateWithConfig(text, genConfig) {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 743 |
const cfgWasm = initSherpaOnnxGenerationConfig(genConfig, this.Module);
|
| 744 |
|
| 745 |
const textLen = this.Module.lengthBytesUTF8(text) + 1;
|
|
@@ -751,28 +878,24 @@ class OfflineTts {
|
|
| 751 |
0, // callback
|
| 752 |
0 // callback arg
|
| 753 |
);
|
|
|
|
|
|
|
| 754 |
|
| 755 |
if (!audioPtr) {
|
| 756 |
-
this.Module._free(textPtr);
|
| 757 |
-
freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
|
| 758 |
throw new Error('Failed to generate audio');
|
| 759 |
}
|
| 760 |
|
| 761 |
-
const
|
| 762 |
-
const numSamples =
|
| 763 |
-
this.Module.HEAP32[audioPtr / 4 + 1]; // int32 num_samples
|
| 764 |
-
const sampleRate =
|
| 765 |
-
this.Module.HEAP32[audioPtr / 4 + 2]; // int32 sample_rate
|
| 766 |
|
| 767 |
-
|
| 768 |
-
const
|
| 769 |
-
|
| 770 |
-
|
| 771 |
-
|
|
|
|
|
|
|
| 772 |
|
| 773 |
this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(audioPtr);
|
| 774 |
-
this.Module._free(textPtr);
|
| 775 |
-
freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
|
| 776 |
|
| 777 |
return {samples, sampleRate};
|
| 778 |
}
|
|
@@ -781,9 +904,8 @@ class OfflineTts {
|
|
| 781 |
const samples = audio.samples;
|
| 782 |
const sampleRate = audio.sampleRate;
|
| 783 |
const ptr = this.Module._malloc(samples.length * 4);
|
| 784 |
-
|
| 785 |
-
|
| 786 |
-
}
|
| 787 |
|
| 788 |
const filenameLen = this.Module.lengthBytesUTF8(filename) + 1;
|
| 789 |
const buffer = this.Module._malloc(filenameLen);
|
|
|
|
| 28 |
freeConfig(config.pocket, Module)
|
| 29 |
}
|
| 30 |
|
| 31 |
+
if ('supertonic' in config) {
|
| 32 |
+
freeConfig(config.supertonic, Module)
|
| 33 |
+
}
|
| 34 |
+
|
| 35 |
+
if (config.ptr) {
|
| 36 |
+
Module._free(config.ptr);
|
| 37 |
+
}
|
| 38 |
}
|
| 39 |
|
| 40 |
// The user should free the returned pointers
|
|
|
|
| 411 |
|
| 412 |
Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*');
|
| 413 |
offset += tokenScoresJsonLen;
|
| 414 |
+
|
| 415 |
+
Module.setValue(
|
| 416 |
+
ptr + 7 * 4,
|
| 417 |
+
config.voiceEmbeddingCacheCapacity !== undefined ?
|
| 418 |
+
config.voiceEmbeddingCacheCapacity :
|
| 419 |
+
50,
|
| 420 |
+
'i32');
|
| 421 |
+
|
| 422 |
+
return {
|
| 423 |
+
buffer: buffer,
|
| 424 |
+
ptr: ptr,
|
| 425 |
+
len: len,
|
| 426 |
+
};
|
| 427 |
+
}
|
| 428 |
+
|
| 429 |
+
function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
|
| 430 |
+
const durationPredictorLen =
|
| 431 |
+
Module.lengthBytesUTF8(config.durationPredictor || '') + 1;
|
| 432 |
+
const textEncoderLen =
|
| 433 |
+
Module.lengthBytesUTF8(config.textEncoder || '') + 1;
|
| 434 |
+
const vectorEstimatorLen =
|
| 435 |
+
Module.lengthBytesUTF8(config.vectorEstimator || '') + 1;
|
| 436 |
+
const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1;
|
| 437 |
+
const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1;
|
| 438 |
+
const unicodeIndexerLen =
|
| 439 |
+
Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1;
|
| 440 |
+
const voiceStyleLen =
|
| 441 |
+
Module.lengthBytesUTF8(config.voiceStyle || '') + 1;
|
| 442 |
+
|
| 443 |
+
const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen +
|
| 444 |
+
vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen;
|
| 445 |
+
|
| 446 |
+
const buffer = Module._malloc(n);
|
| 447 |
+
|
| 448 |
+
const len = 7 * 4;
|
| 449 |
+
const ptr = Module._malloc(len);
|
| 450 |
+
|
| 451 |
+
let offset = 0;
|
| 452 |
+
Module.stringToUTF8(
|
| 453 |
+
config.durationPredictor || '', buffer + offset, durationPredictorLen);
|
| 454 |
+
offset += durationPredictorLen;
|
| 455 |
+
|
| 456 |
+
Module.stringToUTF8(
|
| 457 |
+
config.textEncoder || '', buffer + offset, textEncoderLen);
|
| 458 |
+
offset += textEncoderLen;
|
| 459 |
+
|
| 460 |
+
Module.stringToUTF8(
|
| 461 |
+
config.vectorEstimator || '', buffer + offset, vectorEstimatorLen);
|
| 462 |
+
offset += vectorEstimatorLen;
|
| 463 |
+
|
| 464 |
+
Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen);
|
| 465 |
+
offset += vocoderLen;
|
| 466 |
+
|
| 467 |
+
Module.stringToUTF8(config.ttsJson || '', buffer + offset, ttsJsonLen);
|
| 468 |
+
offset += ttsJsonLen;
|
| 469 |
+
|
| 470 |
+
Module.stringToUTF8(
|
| 471 |
+
config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen);
|
| 472 |
+
offset += unicodeIndexerLen;
|
| 473 |
+
|
| 474 |
+
Module.stringToUTF8(
|
| 475 |
+
config.voiceStyle || '', buffer + offset, voiceStyleLen);
|
| 476 |
+
offset += voiceStyleLen;
|
| 477 |
+
|
| 478 |
+
offset = 0;
|
| 479 |
+
Module.setValue(ptr + 0 * 4, buffer + offset, 'i8*');
|
| 480 |
+
offset += durationPredictorLen;
|
| 481 |
+
|
| 482 |
+
Module.setValue(ptr + 1 * 4, buffer + offset, 'i8*');
|
| 483 |
+
offset += textEncoderLen;
|
| 484 |
+
|
| 485 |
+
Module.setValue(ptr + 2 * 4, buffer + offset, 'i8*');
|
| 486 |
+
offset += vectorEstimatorLen;
|
| 487 |
+
|
| 488 |
+
Module.setValue(ptr + 3 * 4, buffer + offset, 'i8*');
|
| 489 |
+
offset += vocoderLen;
|
| 490 |
+
|
| 491 |
+
Module.setValue(ptr + 4 * 4, buffer + offset, 'i8*');
|
| 492 |
+
offset += ttsJsonLen;
|
| 493 |
+
|
| 494 |
+
Module.setValue(ptr + 5 * 4, buffer + offset, 'i8*');
|
| 495 |
+
offset += unicodeIndexerLen;
|
| 496 |
+
|
| 497 |
+
Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*');
|
| 498 |
+
offset += voiceStyleLen;
|
| 499 |
|
| 500 |
return {
|
| 501 |
buffer: buffer,
|
|
|
|
| 578 |
};
|
| 579 |
}
|
| 580 |
|
| 581 |
+
if (!('offlineTtsSupertonicModelConfig' in config)) {
|
| 582 |
+
config.offlineTtsSupertonicModelConfig = {
|
| 583 |
+
durationPredictor: '',
|
| 584 |
+
textEncoder: '',
|
| 585 |
+
vectorEstimator: '',
|
| 586 |
+
vocoder: '',
|
| 587 |
+
ttsJson: '',
|
| 588 |
+
unicodeIndexer: '',
|
| 589 |
+
voiceStyle: '',
|
| 590 |
+
};
|
| 591 |
+
}
|
| 592 |
|
| 593 |
const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
|
| 594 |
config.offlineTtsVitsModelConfig, Module);
|
|
|
|
| 608 |
const pocketModelConfig = initSherpaOnnxOfflineTtsPocketModelConfig(
|
| 609 |
config.offlineTtsPocketModelConfig, Module);
|
| 610 |
|
| 611 |
+
const supertonicModelConfig = initSherpaOnnxOfflineTtsSupertonicModelConfig(
|
| 612 |
+
config.offlineTtsSupertonicModelConfig, Module);
|
| 613 |
+
|
| 614 |
const len = vitsModelConfig.len + matchaModelConfig.len +
|
| 615 |
kokoroModelConfig.len + kittenModelConfig.len + zipVoiceModelConfig.len +
|
| 616 |
+
pocketModelConfig.len + supertonicModelConfig.len + 3 * 4;
|
| 617 |
|
| 618 |
const ptr = Module._malloc(len);
|
| 619 |
|
|
|
|
| 649 |
Module._CopyHeap(pocketModelConfig.ptr, pocketModelConfig.len, ptr + offset);
|
| 650 |
offset += pocketModelConfig.len;
|
| 651 |
|
| 652 |
+
Module._CopyHeap(
|
| 653 |
+
supertonicModelConfig.ptr, supertonicModelConfig.len, ptr + offset);
|
| 654 |
+
offset += supertonicModelConfig.len;
|
| 655 |
+
|
| 656 |
return {
|
| 657 |
buffer: buffer,
|
| 658 |
ptr: ptr,
|
|
|
|
| 663 |
kitten: kittenModelConfig,
|
| 664 |
zipvoice: zipVoiceModelConfig,
|
| 665 |
pocket: pocketModelConfig,
|
| 666 |
+
supertonic: supertonicModelConfig,
|
| 667 |
};
|
| 668 |
}
|
| 669 |
|
|
|
|
| 723 |
const len = 9 * 4;
|
| 724 |
const ptr = Module._malloc(len);
|
| 725 |
|
|
|
|
|
|
|
|
|
|
| 726 |
// float silence_scale
|
| 727 |
Module.setValue(ptr + 0 * 4, config.silenceScale || 0.2, 'float');
|
| 728 |
|
|
|
|
| 814 |
}
|
| 815 |
|
| 816 |
free() {
|
| 817 |
+
if (!this.handle) return;
|
| 818 |
+
|
| 819 |
this.Module._SherpaOnnxDestroyOfflineTts(this.handle);
|
| 820 |
this.handle = 0
|
| 821 |
}
|
|
|
|
| 826 |
// speed: 1.0
|
| 827 |
// }
|
| 828 |
generate(config) {
|
| 829 |
+
if (!this.handle) {
|
| 830 |
+
throw new Error('OfflineTts has been freed');
|
| 831 |
+
}
|
| 832 |
+
|
| 833 |
+
if (!config || !config.text) {
|
| 834 |
+
throw new Error('config.text is required');
|
| 835 |
+
}
|
| 836 |
+
|
| 837 |
const textLen = this.Module.lengthBytesUTF8(config.text) + 1;
|
| 838 |
const textPtr = this.Module._malloc(textLen);
|
| 839 |
this.Module.stringToUTF8(config.text, textPtr, textLen);
|
| 840 |
|
| 841 |
const h = this.Module._SherpaOnnxOfflineTtsGenerate(
|
| 842 |
+
this.handle, textPtr, config.sid ?? 0, config.speed ?? 1.0);
|
| 843 |
|
| 844 |
+
this.Module._free(textPtr);
|
|
|
|
| 845 |
|
| 846 |
+
if (!h) {
|
| 847 |
+
throw new Error('TTS generation failed');
|
|
|
|
|
|
|
| 848 |
}
|
| 849 |
|
| 850 |
+
const base = h / 4;
|
| 851 |
+
|
| 852 |
+
const samplesPtr = this.Module.HEAPU32[base];
|
| 853 |
+
const numSamples = this.Module.HEAP32[base + 1];
|
| 854 |
+
const sampleRate = this.Module.HEAP32[base + 2];
|
| 855 |
+
|
| 856 |
+
const heapSamples = this.Module.HEAPF32.subarray(
|
| 857 |
+
samplesPtr / 4, samplesPtr / 4 + numSamples);
|
| 858 |
+
|
| 859 |
+
const samples = new Float32Array(heapSamples);
|
| 860 |
+
|
| 861 |
this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h);
|
| 862 |
return {samples: samples, sampleRate: sampleRate};
|
| 863 |
}
|
| 864 |
|
| 865 |
generateWithConfig(text, genConfig) {
|
| 866 |
+
if (!this.handle) {
|
| 867 |
+
throw new Error('OfflineTts has been freed');
|
| 868 |
+
}
|
| 869 |
+
|
| 870 |
const cfgWasm = initSherpaOnnxGenerationConfig(genConfig, this.Module);
|
| 871 |
|
| 872 |
const textLen = this.Module.lengthBytesUTF8(text) + 1;
|
|
|
|
| 878 |
0, // callback
|
| 879 |
0 // callback arg
|
| 880 |
);
|
| 881 |
+
this.Module._free(textPtr);
|
| 882 |
+
freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
|
| 883 |
|
| 884 |
if (!audioPtr) {
|
|
|
|
|
|
|
| 885 |
throw new Error('Failed to generate audio');
|
| 886 |
}
|
| 887 |
|
| 888 |
+
const base = audioPtr / 4;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 889 |
|
| 890 |
+
const samplesPtr = this.Module.HEAPU32[base]; // float* samples
|
| 891 |
+
const numSamples = this.Module.HEAP32[base + 1]; // int32 num_samples
|
| 892 |
+
const sampleRate = this.Module.HEAP32[base + 2]; // int32 sample_rate
|
| 893 |
+
|
| 894 |
+
const heapSamples = this.Module.HEAPF32.subarray(
|
| 895 |
+
samplesPtr / 4, samplesPtr / 4 + numSamples);
|
| 896 |
+
const samples = new Float32Array(heapSamples);
|
| 897 |
|
| 898 |
this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(audioPtr);
|
|
|
|
|
|
|
| 899 |
|
| 900 |
return {samples, sampleRate};
|
| 901 |
}
|
|
|
|
| 904 |
const samples = audio.samples;
|
| 905 |
const sampleRate = audio.sampleRate;
|
| 906 |
const ptr = this.Module._malloc(samples.length * 4);
|
| 907 |
+
|
| 908 |
+
this.Module.HEAPF32.set(samples, ptr / 4);
|
|
|
|
| 909 |
|
| 910 |
const filenameLen = this.Module.lengthBytesUTF8(filename) + 1;
|
| 911 |
const buffer = this.Module._malloc(filenameLen);
|
sherpa-onnx-wasm-main-tts.js
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|
sherpa-onnx-wasm-main-tts.wasm
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:20ec17318118c73835b33cf44ed73e34e368c612104e6a584bbd3ea565eb0750
|
| 3 |
+
size 11964249
|