csukuangfj committed on
Commit
133b768
·
1 Parent(s): 22bfcad

update model

Browse files
sherpa-onnx-tts.js CHANGED
@@ -28,7 +28,13 @@ function freeConfig(config, Module) {
28
  freeConfig(config.pocket, Module)
29
  }
30
 
31
- Module._free(config.ptr);
 
 
 
 
 
 
32
  }
33
 
34
  // The user should free the returned pointers
@@ -405,8 +411,91 @@ function initSherpaOnnxOfflineTtsPocketModelConfig(config, Module) {
405
 
406
  Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*');
407
  offset += tokenScoresJsonLen;
408
-
409
- Module.setValue(ptr + 7 * 4, config.voiceEmbeddingCacheCapacity !== undefined ? config.voiceEmbeddingCacheCapacity : 50, 'i32');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
410
 
411
  return {
412
  buffer: buffer,
@@ -489,6 +578,17 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
489
  };
490
  }
491
 
 
 
 
 
 
 
 
 
 
 
 
492
 
493
  const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
494
  config.offlineTtsVitsModelConfig, Module);
@@ -508,9 +608,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
508
  const pocketModelConfig = initSherpaOnnxOfflineTtsPocketModelConfig(
509
  config.offlineTtsPocketModelConfig, Module);
510
 
 
 
 
511
  const len = vitsModelConfig.len + matchaModelConfig.len +
512
  kokoroModelConfig.len + kittenModelConfig.len + zipVoiceModelConfig.len +
513
- pocketModelConfig.len + 3 * 4;
514
 
515
  const ptr = Module._malloc(len);
516
 
@@ -546,6 +649,10 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
546
  Module._CopyHeap(pocketModelConfig.ptr, pocketModelConfig.len, ptr + offset);
547
  offset += pocketModelConfig.len;
548
 
 
 
 
 
549
  return {
550
  buffer: buffer,
551
  ptr: ptr,
@@ -556,6 +663,7 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
556
  kitten: kittenModelConfig,
557
  zipvoice: zipVoiceModelConfig,
558
  pocket: pocketModelConfig,
 
559
  };
560
  }
561
 
@@ -615,9 +723,6 @@ function initSherpaOnnxGenerationConfig(config, Module) {
615
  const len = 9 * 4;
616
  const ptr = Module._malloc(len);
617
 
618
- // Zero-init for safety
619
- Module.HEAPU8.fill(0, ptr, ptr + len);
620
-
621
  // float silence_scale
622
  Module.setValue(ptr + 0 * 4, config.silenceScale || 0.2, 'float');
623
 
@@ -709,6 +814,8 @@ class OfflineTts {
709
  }
710
 
711
  free() {
 
 
712
  this.Module._SherpaOnnxDestroyOfflineTts(this.handle);
713
  this.handle = 0
714
  }
@@ -719,27 +826,47 @@ class OfflineTts {
719
  // speed: 1.0
720
  // }
721
  generate(config) {
 
 
 
 
 
 
 
 
722
  const textLen = this.Module.lengthBytesUTF8(config.text) + 1;
723
  const textPtr = this.Module._malloc(textLen);
724
  this.Module.stringToUTF8(config.text, textPtr, textLen);
725
 
726
  const h = this.Module._SherpaOnnxOfflineTtsGenerate(
727
- this.handle, textPtr, config.sid, config.speed);
728
 
729
- const numSamples = this.Module.HEAP32[h / 4 + 1];
730
- const sampleRate = this.Module.HEAP32[h / 4 + 2];
731
 
732
- const samplesPtr = this.Module.HEAP32[h / 4] / 4;
733
- const samples = new Float32Array(numSamples);
734
- for (let i = 0; i < numSamples; i++) {
735
- samples[i] = this.Module.HEAPF32[samplesPtr + i];
736
  }
737
 
 
 
 
 
 
 
 
 
 
 
 
738
  this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h);
739
  return {samples: samples, sampleRate: sampleRate};
740
  }
741
 
742
  generateWithConfig(text, genConfig) {
 
 
 
 
743
  const cfgWasm = initSherpaOnnxGenerationConfig(genConfig, this.Module);
744
 
745
  const textLen = this.Module.lengthBytesUTF8(text) + 1;
@@ -751,28 +878,24 @@ class OfflineTts {
751
  0, // callback
752
  0 // callback arg
753
  );
 
 
754
 
755
  if (!audioPtr) {
756
- this.Module._free(textPtr);
757
- freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
758
  throw new Error('Failed to generate audio');
759
  }
760
 
761
- const samplesPtr = this.Module.HEAP32[audioPtr / 4]; // float* samples
762
- const numSamples =
763
- this.Module.HEAP32[audioPtr / 4 + 1]; // int32 num_samples
764
- const sampleRate =
765
- this.Module.HEAP32[audioPtr / 4 + 2]; // int32 sample_rate
766
 
767
- // 5️⃣ Copy samples to Float32Array
768
- const samples = new Float32Array(numSamples);
769
- for (let i = 0; i < numSamples; i++) {
770
- samples[i] = this.Module.HEAPF32[samplesPtr / 4 + i];
771
- }
 
 
772
 
773
  this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(audioPtr);
774
- this.Module._free(textPtr);
775
- freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
776
 
777
  return {samples, sampleRate};
778
  }
@@ -781,9 +904,8 @@ class OfflineTts {
781
  const samples = audio.samples;
782
  const sampleRate = audio.sampleRate;
783
  const ptr = this.Module._malloc(samples.length * 4);
784
- for (let i = 0; i < samples.length; i++) {
785
- this.Module.HEAPF32[ptr / 4 + i] = samples[i];
786
- }
787
 
788
  const filenameLen = this.Module.lengthBytesUTF8(filename) + 1;
789
  const buffer = this.Module._malloc(filenameLen);
 
28
  freeConfig(config.pocket, Module)
29
  }
30
 
31
+ if ('supertonic' in config) {
32
+ freeConfig(config.supertonic, Module)
33
+ }
34
+
35
+ if (config.ptr) {
36
+ Module._free(config.ptr);
37
+ }
38
  }
39
 
40
  // The user should free the returned pointers
 
411
 
412
  Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*');
413
  offset += tokenScoresJsonLen;
414
+
415
+ Module.setValue(
416
+ ptr + 7 * 4,
417
+ config.voiceEmbeddingCacheCapacity !== undefined ?
418
+ config.voiceEmbeddingCacheCapacity :
419
+ 50,
420
+ 'i32');
421
+
422
+ return {
423
+ buffer: buffer,
424
+ ptr: ptr,
425
+ len: len,
426
+ };
427
+ }
428
+
429
+ function initSherpaOnnxOfflineTtsSupertonicModelConfig(config, Module) {
430
+ const durationPredictorLen =
431
+ Module.lengthBytesUTF8(config.durationPredictor || '') + 1;
432
+ const textEncoderLen =
433
+ Module.lengthBytesUTF8(config.textEncoder || '') + 1;
434
+ const vectorEstimatorLen =
435
+ Module.lengthBytesUTF8(config.vectorEstimator || '') + 1;
436
+ const vocoderLen = Module.lengthBytesUTF8(config.vocoder || '') + 1;
437
+ const ttsJsonLen = Module.lengthBytesUTF8(config.ttsJson || '') + 1;
438
+ const unicodeIndexerLen =
439
+ Module.lengthBytesUTF8(config.unicodeIndexer || '') + 1;
440
+ const voiceStyleLen =
441
+ Module.lengthBytesUTF8(config.voiceStyle || '') + 1;
442
+
443
+ const n = durationPredictorLen + textEncoderLen + vectorEstimatorLen +
444
+ vocoderLen + ttsJsonLen + unicodeIndexerLen + voiceStyleLen;
445
+
446
+ const buffer = Module._malloc(n);
447
+
448
+ const len = 7 * 4;
449
+ const ptr = Module._malloc(len);
450
+
451
+ let offset = 0;
452
+ Module.stringToUTF8(
453
+ config.durationPredictor || '', buffer + offset, durationPredictorLen);
454
+ offset += durationPredictorLen;
455
+
456
+ Module.stringToUTF8(
457
+ config.textEncoder || '', buffer + offset, textEncoderLen);
458
+ offset += textEncoderLen;
459
+
460
+ Module.stringToUTF8(
461
+ config.vectorEstimator || '', buffer + offset, vectorEstimatorLen);
462
+ offset += vectorEstimatorLen;
463
+
464
+ Module.stringToUTF8(config.vocoder || '', buffer + offset, vocoderLen);
465
+ offset += vocoderLen;
466
+
467
+ Module.stringToUTF8(config.ttsJson || '', buffer + offset, ttsJsonLen);
468
+ offset += ttsJsonLen;
469
+
470
+ Module.stringToUTF8(
471
+ config.unicodeIndexer || '', buffer + offset, unicodeIndexerLen);
472
+ offset += unicodeIndexerLen;
473
+
474
+ Module.stringToUTF8(
475
+ config.voiceStyle || '', buffer + offset, voiceStyleLen);
476
+ offset += voiceStyleLen;
477
+
478
+ offset = 0;
479
+ Module.setValue(ptr + 0 * 4, buffer + offset, 'i8*');
480
+ offset += durationPredictorLen;
481
+
482
+ Module.setValue(ptr + 1 * 4, buffer + offset, 'i8*');
483
+ offset += textEncoderLen;
484
+
485
+ Module.setValue(ptr + 2 * 4, buffer + offset, 'i8*');
486
+ offset += vectorEstimatorLen;
487
+
488
+ Module.setValue(ptr + 3 * 4, buffer + offset, 'i8*');
489
+ offset += vocoderLen;
490
+
491
+ Module.setValue(ptr + 4 * 4, buffer + offset, 'i8*');
492
+ offset += ttsJsonLen;
493
+
494
+ Module.setValue(ptr + 5 * 4, buffer + offset, 'i8*');
495
+ offset += unicodeIndexerLen;
496
+
497
+ Module.setValue(ptr + 6 * 4, buffer + offset, 'i8*');
498
+ offset += voiceStyleLen;
499
 
500
  return {
501
  buffer: buffer,
 
578
  };
579
  }
580
 
581
+ if (!('offlineTtsSupertonicModelConfig' in config)) {
582
+ config.offlineTtsSupertonicModelConfig = {
583
+ durationPredictor: '',
584
+ textEncoder: '',
585
+ vectorEstimator: '',
586
+ vocoder: '',
587
+ ttsJson: '',
588
+ unicodeIndexer: '',
589
+ voiceStyle: '',
590
+ };
591
+ }
592
 
593
  const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
594
  config.offlineTtsVitsModelConfig, Module);
 
608
  const pocketModelConfig = initSherpaOnnxOfflineTtsPocketModelConfig(
609
  config.offlineTtsPocketModelConfig, Module);
610
 
611
+ const supertonicModelConfig = initSherpaOnnxOfflineTtsSupertonicModelConfig(
612
+ config.offlineTtsSupertonicModelConfig, Module);
613
+
614
  const len = vitsModelConfig.len + matchaModelConfig.len +
615
  kokoroModelConfig.len + kittenModelConfig.len + zipVoiceModelConfig.len +
616
+ pocketModelConfig.len + supertonicModelConfig.len + 3 * 4;
617
 
618
  const ptr = Module._malloc(len);
619
 
 
649
  Module._CopyHeap(pocketModelConfig.ptr, pocketModelConfig.len, ptr + offset);
650
  offset += pocketModelConfig.len;
651
 
652
+ Module._CopyHeap(
653
+ supertonicModelConfig.ptr, supertonicModelConfig.len, ptr + offset);
654
+ offset += supertonicModelConfig.len;
655
+
656
  return {
657
  buffer: buffer,
658
  ptr: ptr,
 
663
  kitten: kittenModelConfig,
664
  zipvoice: zipVoiceModelConfig,
665
  pocket: pocketModelConfig,
666
+ supertonic: supertonicModelConfig,
667
  };
668
  }
669
 
 
723
  const len = 9 * 4;
724
  const ptr = Module._malloc(len);
725
 
 
 
 
726
  // float silence_scale
727
  Module.setValue(ptr + 0 * 4, config.silenceScale || 0.2, 'float');
728
 
 
814
  }
815
 
816
  free() {
817
+ if (!this.handle) return;
818
+
819
  this.Module._SherpaOnnxDestroyOfflineTts(this.handle);
820
  this.handle = 0
821
  }
 
826
  // speed: 1.0
827
  // }
828
  generate(config) {
829
+ if (!this.handle) {
830
+ throw new Error('OfflineTts has been freed');
831
+ }
832
+
833
+ if (!config || !config.text) {
834
+ throw new Error('config.text is required');
835
+ }
836
+
837
  const textLen = this.Module.lengthBytesUTF8(config.text) + 1;
838
  const textPtr = this.Module._malloc(textLen);
839
  this.Module.stringToUTF8(config.text, textPtr, textLen);
840
 
841
  const h = this.Module._SherpaOnnxOfflineTtsGenerate(
842
+ this.handle, textPtr, config.sid ?? 0, config.speed ?? 1.0);
843
 
844
+ this.Module._free(textPtr);
 
845
 
846
+ if (!h) {
847
+ throw new Error('TTS generation failed');
 
 
848
  }
849
 
850
+ const base = h / 4;
851
+
852
+ const samplesPtr = this.Module.HEAPU32[base];
853
+ const numSamples = this.Module.HEAP32[base + 1];
854
+ const sampleRate = this.Module.HEAP32[base + 2];
855
+
856
+ const heapSamples = this.Module.HEAPF32.subarray(
857
+ samplesPtr / 4, samplesPtr / 4 + numSamples);
858
+
859
+ const samples = new Float32Array(heapSamples);
860
+
861
  this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(h);
862
  return {samples: samples, sampleRate: sampleRate};
863
  }
864
 
865
  generateWithConfig(text, genConfig) {
866
+ if (!this.handle) {
867
+ throw new Error('OfflineTts has been freed');
868
+ }
869
+
870
  const cfgWasm = initSherpaOnnxGenerationConfig(genConfig, this.Module);
871
 
872
  const textLen = this.Module.lengthBytesUTF8(text) + 1;
 
878
  0, // callback
879
  0 // callback arg
880
  );
881
+ this.Module._free(textPtr);
882
+ freeSherpaOnnxGenerationConfig(cfgWasm, this.Module);
883
 
884
  if (!audioPtr) {
 
 
885
  throw new Error('Failed to generate audio');
886
  }
887
 
888
+ const base = audioPtr / 4;
 
 
 
 
889
 
890
+ const samplesPtr = this.Module.HEAPU32[base]; // float* samples
891
+ const numSamples = this.Module.HEAP32[base + 1]; // int32 num_samples
892
+ const sampleRate = this.Module.HEAP32[base + 2]; // int32 sample_rate
893
+
894
+ const heapSamples = this.Module.HEAPF32.subarray(
895
+ samplesPtr / 4, samplesPtr / 4 + numSamples);
896
+ const samples = new Float32Array(heapSamples);
897
 
898
  this.Module._SherpaOnnxDestroyOfflineTtsGeneratedAudio(audioPtr);
 
 
899
 
900
  return {samples, sampleRate};
901
  }
 
904
  const samples = audio.samples;
905
  const sampleRate = audio.sampleRate;
906
  const ptr = this.Module._malloc(samples.length * 4);
907
+
908
+ this.Module.HEAPF32.set(samples, ptr / 4);
 
909
 
910
  const filenameLen = this.Module.lengthBytesUTF8(filename) + 1;
911
  const buffer = this.Module._malloc(filenameLen);
sherpa-onnx-wasm-main-tts.js CHANGED
The diff for this file is too large to render. See raw diff
 
sherpa-onnx-wasm-main-tts.wasm CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f39411c53164c66e47504532f6b19ffc7ab34be9df81b64dd27ac181d9bd7f43
3
- size 11900557
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:20ec17318118c73835b33cf44ed73e34e368c612104e6a584bbd3ea565eb0750
3
+ size 11964249