Skip to content

Commit

Permalink
Add JavaScript (WebAssembly) API for Kokoro TTS models. (#1726)
Browse files Browse the repository at this point in the history
  • Loading branch information
csukuangfj authored Jan 17, 2025
1 parent e8d499d commit 3a1de0b
Show file tree
Hide file tree
Showing 5 changed files with 154 additions and 6 deletions.
10 changes: 9 additions & 1 deletion .github/scripts/test-nodejs-npm.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,15 @@ ls -lh
ls -lh node_modules

# offline tts
#

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

node ./test-offline-tts-kokoro-en.js

ls -lh

curl -SL -O https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/matcha-icefall-zh-baker.tar.bz2
tar xvf matcha-icefall-zh-baker.tar.bz2
rm matcha-icefall-zh-baker.tar.bz2
Expand Down
16 changes: 16 additions & 0 deletions nodejs-examples/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,22 @@ node ./test-offline-speaker-diarization.js

In the following, we demonstrate how to run text-to-speech.

## ./test-offline-tts-kokoro-en.js

[./test-offline-tts-kokoro-en.js](./test-offline-tts-kokoro-en.js) shows how to use
[kokoro-en-v0_19](https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2)
for text-to-speech.

You can use the following commands to download the model and run the example:

```bash
wget https://github.com/k2-fsa/sherpa-onnx/releases/download/tts-models/kokoro-en-v0_19.tar.bz2
tar xf kokoro-en-v0_19.tar.bz2
rm kokoro-en-v0_19.tar.bz2

node ./test-offline-tts-kokoro-en.js
```

## ./test-offline-tts-matcha-zh.js

[./test-offline-tts-matcha-zh.js](./test-offline-tts-matcha-zh.js) shows how to use
Expand Down
37 changes: 37 additions & 0 deletions nodejs-examples/test-offline-tts-kokoro-en.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
// Copyright (c) 2025 Xiaomi Corporation (authors: Fangjun Kuang)

const sherpa_onnx = require('sherpa-onnx');

/**
 * Create an offline TTS object for the kokoro-en-v0_19 model.
 *
 * All model paths are relative to the current working directory, so the
 * model archive must already be extracted into ./kokoro-en-v0_19.
 *
 * @returns whatever sherpa_onnx.createOfflineTts() returns for this config.
 */
function createOfflineTts() {
  return sherpa_onnx.createOfflineTts({
    offlineTtsModelConfig: {
      offlineTtsKokoroModelConfig: {
        model: './kokoro-en-v0_19/model.onnx',
        voices: './kokoro-en-v0_19/voices.bin',
        tokens: './kokoro-en-v0_19/tokens.txt',
        dataDir: './kokoro-en-v0_19/espeak-ng-data',
        lengthScale: 1.0,
      },
      numThreads: 1,
      debug: 1,
      provider: 'cpu',
    },
    maxNumSentences: 1,
  });
}

// Synthesize one English passage with speaker id 0 at speed 1.0, write the
// result to a wave file, then free the TTS object.
const tts = createOfflineTts();

const request = {
  text:
      'Today as always, men fall into two groups: slaves and free men. Whoever does not have two-thirds of his day for himself, is a slave, whatever he may be: a statesman, a businessman, an official, or a scholar.',
  sid: 0,
  speed: 1.0,
};

const audio = tts.generate(request);
tts.save('./test-kokoro-en.wav', audio);
console.log('Saved to test-kokoro-en.wav successfully.');
tts.free();
85 changes: 81 additions & 4 deletions wasm/tts/sherpa-onnx-tts.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,12 @@ function freeConfig(config, Module) {
freeConfig(config.config, Module)
}

if ('config2' in config) {
freeConfig(config.config2, Module)
if ('matcha' in config) {
freeConfig(config.matcha, Module)
}

if ('kokoro' in config) {
freeConfig(config.kokoro, Module)
}

Module._free(config.ptr);
Expand Down Expand Up @@ -132,6 +136,52 @@ function initSherpaOnnxOfflineTtsMatchaModelConfig(config, Module) {
}
}

/**
 * Serialize a Kokoro TTS model config into the WebAssembly heap.
 *
 * Two allocations are made with Module._malloc:
 *   - `buffer`: the four NUL-terminated UTF-8 strings, packed back to back in
 *     the order model, voices, tokens, dataDir.
 *   - `ptr`: a 5 * 4 byte record holding four 'i8*' pointers into `buffer`
 *     (offsets 0, 4, 8, 12) followed by lengthScale as a 'float' (offset 16).
 *
 * Neither allocation is freed here; both are handed back to the caller.
 *
 * @param config Object with optional string fields `model`, `voices`,
 *     `tokens`, `dataDir` and an optional number `lengthScale`. A missing or
 *     empty string field is written as an empty string; a missing or falsy
 *     `lengthScale` is written as 1.0.
 * @param Module The Emscripten module; must provide lengthBytesUTF8,
 *     stringToUTF8, setValue and _malloc.
 * @returns {{buffer: number, ptr: number, len: number}} heap addresses of the
 *     string buffer and the record, plus the record size in bytes.
 */
function initSherpaOnnxOfflineTtsKokoroModelConfig(config, Module) {
  // Normalize every string exactly once so that the sizes used for the
  // allocation always agree with the strings that are actually written.
  // Previously model/voices were measured without a fallback, so a missing
  // field threw before anything was written.
  const strings = [
    config.model || '',
    config.voices || '',
    config.tokens || '',
    config.dataDir || '',
  ];

  const sizes = [];
  let n = 0;
  for (let i = 0; i < strings.length; ++i) {
    // +1 for the terminating NUL byte.
    const size = Module.lengthBytesUTF8(strings[i]) + 1;
    sizes.push(size);
    n += size;
  }

  const buffer = Module._malloc(n);

  // 4 pointers + 1 float, 4 bytes each (32-bit wasm).
  const len = 5 * 4;
  const ptr = Module._malloc(len);

  let offset = 0;
  for (let i = 0; i < strings.length; ++i) {
    Module.stringToUTF8(strings[i], buffer + offset, sizes[i]);
    Module.setValue(ptr + 4 * i, buffer + offset, 'i8*');
    offset += sizes[i];
  }

  Module.setValue(ptr + 16, config.lengthScale || 1.0, 'float');

  return {
    buffer: buffer,
    ptr: ptr,
    len: len,
  };
}

function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
if (!('offlineTtsVitsModelConfig' in config)) {
config.offlineTtsVitsModelConfig = {
Expand Down Expand Up @@ -159,14 +209,29 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
};
}

if (!('offlineTtsKokoroModelConfig' in config)) {
config.offlineTtsKokoroModelConfig = {
model: '',
voices: '',
tokens: '',
lengthScale: 1.0,
dataDir: '',
};
}


const vitsModelConfig = initSherpaOnnxOfflineTtsVitsModelConfig(
config.offlineTtsVitsModelConfig, Module);

const matchaModelConfig = initSherpaOnnxOfflineTtsMatchaModelConfig(
config.offlineTtsMatchaModelConfig, Module);

const len = vitsModelConfig.len + matchaModelConfig.len + 3 * 4;
const kokoroModelConfig = initSherpaOnnxOfflineTtsKokoroModelConfig(
config.offlineTtsKokoroModelConfig, Module);

const len = vitsModelConfig.len + matchaModelConfig.len +
kokoroModelConfig.len + 3 * 4;

const ptr = Module._malloc(len);

let offset = 0;
Expand All @@ -188,9 +253,12 @@ function initSherpaOnnxOfflineTtsModelConfig(config, Module) {
Module._CopyHeap(matchaModelConfig.ptr, matchaModelConfig.len, ptr + offset);
offset += matchaModelConfig.len;

Module._CopyHeap(kokoroModelConfig.ptr, kokoroModelConfig.len, ptr + offset);
offset += kokoroModelConfig.len;

return {
buffer: buffer, ptr: ptr, len: len, config: vitsModelConfig,
config2: matchaModelConfig
matcha: matchaModelConfig, kokoro: kokoroModelConfig,
}
}

Expand Down Expand Up @@ -308,9 +376,18 @@ function createOfflineTts(Module, myConfig) {
lengthScale: 1.0,
};

const offlineTtsKokoroModelConfig = {
model: '',
voices: '',
tokens: '',
dataDir: '',
lengthScale: 1.0,
};

const offlineTtsModelConfig = {
offlineTtsVitsModelConfig: offlineTtsVitsModelConfig,
offlineTtsMatchaModelConfig: offlineTtsMatchaModelConfig,
offlineTtsKokoroModelConfig: offlineTtsKokoroModelConfig,
numThreads: 1,
debug: 1,
provider: 'cpu',
Expand Down
12 changes: 11 additions & 1 deletion wasm/tts/sherpa-onnx-wasm-main-tts.cc
Original file line number Diff line number Diff line change
Expand Up @@ -15,9 +15,11 @@ extern "C" {

static_assert(sizeof(SherpaOnnxOfflineTtsVitsModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) == 8 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) == 5 * 4, "");
static_assert(sizeof(SherpaOnnxOfflineTtsModelConfig) ==
sizeof(SherpaOnnxOfflineTtsVitsModelConfig) +
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) + 3 * 4,
sizeof(SherpaOnnxOfflineTtsMatchaModelConfig) +
sizeof(SherpaOnnxOfflineTtsKokoroModelConfig) + 3 * 4,
"");
static_assert(sizeof(SherpaOnnxOfflineTtsConfig) ==
sizeof(SherpaOnnxOfflineTtsModelConfig) + 3 * 4,
Expand All @@ -27,6 +29,7 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
auto tts_model_config = &tts_config->model;
auto vits_model_config = &tts_model_config->vits;
auto matcha_model_config = &tts_model_config->matcha;
auto kokoro = &tts_model_config->kokoro;
fprintf(stdout, "----------vits model config----------\n");
fprintf(stdout, "model: %s\n", vits_model_config->model);
fprintf(stdout, "lexicon: %s\n", vits_model_config->lexicon);
Expand All @@ -47,6 +50,13 @@ void MyPrint(SherpaOnnxOfflineTtsConfig *tts_config) {
fprintf(stdout, "length scale: %.3f\n", matcha_model_config->length_scale);
fprintf(stdout, "dict_dir: %s\n", matcha_model_config->dict_dir);

fprintf(stdout, "----------kokoro model config----------\n");
fprintf(stdout, "model: %s\n", kokoro->model);
fprintf(stdout, "voices: %s\n", kokoro->voices);
fprintf(stdout, "tokens: %s\n", kokoro->tokens);
fprintf(stdout, "data_dir: %s\n", kokoro->data_dir);
fprintf(stdout, "length scale: %.3f\n", kokoro->length_scale);

fprintf(stdout, "----------tts model config----------\n");
fprintf(stdout, "num threads: %d\n", tts_model_config->num_threads);
fprintf(stdout, "debug: %d\n", tts_model_config->debug);
Expand Down

0 comments on commit 3a1de0b

Please sign in to comment.