From bf3330c90667fcd4ae677138935f3074ad18860e Mon Sep 17 00:00:00 2001 From: Fangjun Kuang Date: Fri, 3 Jan 2025 17:09:29 +0800 Subject: [PATCH] Add HarmonyOS examples for MatchaTTS. (#1678) --- .../sherpa_onnx/BuildProfile.ets | 2 +- .../SherpaOnnxHar/sherpa_onnx/Index.ets | 10 +-- .../main/ets/components/NonStreamingTts.ets | 12 ++++ .../ets/workers/NonStreamingTtsWorker.ets | 53 ++++++++++++++- sherpa-onnx/c-api/c-api.cc | 2 +- sherpa-onnx/c-api/c-api.h | 2 +- sherpa-onnx/csrc/jieba-lexicon.cc | 64 +++++++++++++++++++ sherpa-onnx/csrc/jieba-lexicon.h | 6 ++ sherpa-onnx/csrc/offline-tts-matcha-impl.h | 5 +- 9 files changed, 141 insertions(+), 15 deletions(-) diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/BuildProfile.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/BuildProfile.ets index 8eb22c9a71..4cc33f4e97 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/BuildProfile.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/BuildProfile.ets @@ -1,7 +1,7 @@ /** * Use these variables when you tailor your ArkTS code. They must be of the const type. 
*/ -export const HAR_VERSION = '1.10.35'; +export const HAR_VERSION = '1.10.37'; export const BUILD_MODE_NAME = 'debug'; export const DEBUG = true; export const TARGET_NAME = 'default'; diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets index 16c6279e12..56deee0ce1 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/Index.ets @@ -1,11 +1,6 @@ export { listRawfileDir, readWave, readWaveFromBinary, } from "libsherpa_onnx.so"; -export { CircularBuffer, - SileroVadConfig, - SpeechSegment, - Vad, - VadConfig, -} from './src/main/ets/components/Vad'; +export { CircularBuffer, SileroVadConfig, SpeechSegment, Vad, VadConfig, } from './src/main/ets/components/Vad'; export { Samples, @@ -36,7 +31,8 @@ export { OnlineStream, OnlineRecognizer, } from './src/main/ets/components/StreamingAsr'; -export { OfflineTtsVitsModelConfig, +export { OfflineTtsMatchaModelConfig, + OfflineTtsVitsModelConfig, OfflineTtsModelConfig, OfflineTtsConfig, OfflineTts, diff --git a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets index 556877489e..814d6e2677 100644 --- a/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets +++ b/harmony-os/SherpaOnnxHar/sherpa_onnx/src/main/ets/components/NonStreamingTts.ets @@ -17,8 +17,20 @@ export class OfflineTtsVitsModelConfig { public lengthScale: number = 1.0; } +export class OfflineTtsMatchaModelConfig { + public acousticModel: string = ''; + public vocoder: string = ''; + public lexicon: string = ''; + public tokens: string = ''; + public dataDir: string = ''; + public dictDir: string = ''; + public noiseScale: number = 0.667; + public lengthScale: number = 1.0; +} + export class OfflineTtsModelConfig { public vits: OfflineTtsVitsModelConfig = new OfflineTtsVitsModelConfig(); + public matcha: 
OfflineTtsMatchaModelConfig = new OfflineTtsMatchaModelConfig(); public numThreads: number = 1; public debug: boolean = false; public provider: string = 'cpu'; diff --git a/harmony-os/SherpaOnnxTts/entry/src/main/ets/workers/NonStreamingTtsWorker.ets b/harmony-os/SherpaOnnxTts/entry/src/main/ets/workers/NonStreamingTtsWorker.ets index bd5c7a5b89..cf841cbe1e 100644 --- a/harmony-os/SherpaOnnxTts/entry/src/main/ets/workers/NonStreamingTtsWorker.ets +++ b/harmony-os/SherpaOnnxTts/entry/src/main/ets/workers/NonStreamingTtsWorker.ets @@ -73,7 +73,16 @@ function initTts(context: Context): OfflineTts { // for details let modelDir = ''; + + // for VITS begin let modelName = ''; + // for VITS end + + // for Matcha begin + let acousticModelName = ''; + let vocoder = ''; + // for Matcha end + let ruleFsts = ''; let ruleFars = ''; let lexicon = ''; @@ -134,15 +143,47 @@ function initTts(context: Context): OfflineTts { // dictDir = 'dict'; // ruleFsts = `date.fst,phone.fst,number.fst`; + // Example 8 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-zh-baker-chinese-1-female-speaker + // modelDir = 'matcha-icefall-zh-baker' + // acousticModelName = 'model-steps-3.onnx' + // vocoder = 'hifigan_v2.onnx' + // lexicon = 'lexicon.txt' + // dictDir = 'dict'; + // ruleFsts = `date.fst,phone.fst,number.fst`; + + // Example 9 + // https://github.com/k2-fsa/sherpa-onnx/releases/tag/tts-models + // https://k2-fsa.github.io/sherpa/onnx/tts/pretrained_models/matcha.html#matcha-icefall-en-us-ljspeech-american-english-1-female-speaker + // modelDir = 'matcha-icefall-en_US-ljspeech' + // acousticModelName = 'model-steps-3.onnx' + // vocoder = 'hifigan_v2.onnx' + // dataDir = 'espeak-ng-data'; + // ============================================================ // Please don't change the remaining part of this function // ============================================================ - if 
(modelName == '') { + if (modelName == '' && acousticModelName == '') { throw new Error('You are supposed to select a model by changing the code before you run the app'); } - modelName = modelDir + '/' + modelName; + if (modelName != '' && acousticModelName != '') { + throw new Error('Please select either VITS or Matcha, not both'); + } + + if (acousticModelName != '' && vocoder == '') { + throw new Error('Please provide vocoder for matcha tts models'); + } + + if (modelName != '') { + modelName = modelDir + '/' + modelName; + } + + if (acousticModelName != '') { + acousticModelName = modelDir + '/' + acousticModelName; + } if (ruleFsts != '') { let fsts = ruleFsts.split(',') @@ -186,6 +227,14 @@ function initTts(context: Context): OfflineTts { config.model.vits.tokens = tokens; config.model.vits.dataDir = dataDir; config.model.vits.dictDir = dictDir; + + config.model.matcha.acousticModel = acousticModelName; + config.model.matcha.vocoder = vocoder; + config.model.matcha.lexicon = lexicon; + config.model.matcha.tokens = tokens; + config.model.matcha.dataDir = dataDir; + config.model.matcha.dictDir = dictDir; + config.model.numThreads = 2; config.model.debug = true; config.ruleFsts = ruleFsts; diff --git a/sherpa-onnx/c-api/c-api.cc b/sherpa-onnx/c-api/c-api.cc index 584f933217..30c87823a9 100644 --- a/sherpa-onnx/c-api/c-api.cc +++ b/sherpa-onnx/c-api/c-api.cc @@ -2098,7 +2098,7 @@ SherpaOnnxCreateSpeakerEmbeddingExtractorOHOS( } #if SHERPA_ONNX_ENABLE_TTS == 1 -SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( +const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr) { if (!mgr) { return SherpaOnnxCreateOfflineTts(config); diff --git a/sherpa-onnx/c-api/c-api.h b/sherpa-onnx/c-api/c-api.h index 691b92a3bd..a669b50dc8 100644 --- a/sherpa-onnx/c-api/c-api.h +++ b/sherpa-onnx/c-api/c-api.h @@ -1618,7 +1618,7 @@ SherpaOnnxCreateVoiceActivityDetectorOHOS( const 
SherpaOnnxVadModelConfig *config, float buffer_size_in_seconds, NativeResourceManager *mgr); -SHERPA_ONNX_API SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( +SHERPA_ONNX_API const SherpaOnnxOfflineTts *SherpaOnnxCreateOfflineTtsOHOS( const SherpaOnnxOfflineTtsConfig *config, NativeResourceManager *mgr); SHERPA_ONNX_API const SherpaOnnxSpeakerEmbeddingExtractor * diff --git a/sherpa-onnx/csrc/jieba-lexicon.cc b/sherpa-onnx/csrc/jieba-lexicon.cc index 57b77666b9..189520c4d5 100644 --- a/sherpa-onnx/csrc/jieba-lexicon.cc +++ b/sherpa-onnx/csrc/jieba-lexicon.cc @@ -6,12 +6,23 @@ #include #include // NOLINT +#include #include #include +#if __ANDROID_API__ >= 9 +#include "android/asset_manager.h" +#include "android/asset_manager_jni.h" +#endif + +#if __OHOS__ +#include "rawfile/raw_file_manager.h" +#endif + #include "cppjieba/Jieba.hpp" #include "sherpa-onnx/csrc/file-utils.h" #include "sherpa-onnx/csrc/macros.h" +#include "sherpa-onnx/csrc/onnx-utils.h" #include "sherpa-onnx/csrc/symbol-table.h" #include "sherpa-onnx/csrc/text-utils.h" @@ -56,6 +67,39 @@ class JiebaLexicon::Impl { } } + template + Impl(Manager *mgr, const std::string &lexicon, const std::string &tokens, + const std::string &dict_dir, bool debug) + : debug_(debug) { + std::string dict = dict_dir + "/jieba.dict.utf8"; + std::string hmm = dict_dir + "/hmm_model.utf8"; + std::string user_dict = dict_dir + "/user.dict.utf8"; + std::string idf = dict_dir + "/idf.utf8"; + std::string stop_word = dict_dir + "/stop_words.utf8"; + + AssertFileExists(dict); + AssertFileExists(hmm); + AssertFileExists(user_dict); + AssertFileExists(idf); + AssertFileExists(stop_word); + + jieba_ = + std::make_unique(dict, hmm, user_dict, idf, stop_word); + + { + auto buf = ReadFile(mgr, tokens); + std::istrstream is(buf.data(), buf.size()); + + InitTokens(is); + } + + { + auto buf = ReadFile(mgr, lexicon); + std::istrstream is(buf.data(), buf.size()); + InitLexicon(is); + } + } + std::vector ConvertTextToTokenIds(const 
std::string &text) const { // see // https://github.com/Plachtaa/VITS-fast-fine-tuning/blob/main/text/mandarin.py#L244 @@ -279,9 +323,29 @@ JiebaLexicon::JiebaLexicon(const std::string &lexicon, const std::string &dict_dir, bool debug) : impl_(std::make_unique(lexicon, tokens, dict_dir, debug)) {} +template +JiebaLexicon::JiebaLexicon(Manager *mgr, const std::string &lexicon, + const std::string &tokens, + const std::string &dict_dir, bool debug) + : impl_(std::make_unique(mgr, lexicon, tokens, dict_dir, debug)) {} + std::vector JiebaLexicon::ConvertTextToTokenIds( const std::string &text, const std::string & /*unused_voice = ""*/) const { return impl_->ConvertTextToTokenIds(text); } +#if __ANDROID_API__ >= 9 +template JiebaLexicon::JiebaLexicon(AAssetManager *mgr, + const std::string &lexicon, + const std::string &tokens, + const std::string &dict_dir, bool debug); +#endif + +#if __OHOS__ +template JiebaLexicon::JiebaLexicon(NativeResourceManager *mgr, + const std::string &lexicon, + const std::string &tokens, + const std::string &dict_dir, bool debug); +#endif + } // namespace sherpa_onnx diff --git a/sherpa-onnx/csrc/jieba-lexicon.h b/sherpa-onnx/csrc/jieba-lexicon.h index 9de1043576..b810a084cf 100644 --- a/sherpa-onnx/csrc/jieba-lexicon.h +++ b/sherpa-onnx/csrc/jieba-lexicon.h @@ -17,9 +17,15 @@ namespace sherpa_onnx { class JiebaLexicon : public OfflineTtsFrontend { public: ~JiebaLexicon() override; + JiebaLexicon(const std::string &lexicon, const std::string &tokens, const std::string &dict_dir, bool debug); + template + JiebaLexicon(Manager *mgr, const std::string &lexicon, + const std::string &tokens, const std::string &dict_dir, + bool debug); + std::vector ConvertTextToTokenIds( const std::string &text, const std::string &unused_voice = "") const override; diff --git a/sherpa-onnx/csrc/offline-tts-matcha-impl.h b/sherpa-onnx/csrc/offline-tts-matcha-impl.h index a4f47fadb2..7bd45fede1 100644 --- a/sherpa-onnx/csrc/offline-tts-matcha-impl.h +++ 
b/sherpa-onnx/csrc/offline-tts-matcha-impl.h @@ -327,13 +327,12 @@ class OfflineTtsMatchaImpl : public OfflineTtsImpl { // from assets to disk // // for jieba - // we require that you copy tokens.txt, lexicon.txt and dict - // from assets to disk + // we require that you copy dict from assets to disk const auto &meta_data = model_->GetMetaData(); if (meta_data.jieba && !meta_data.has_espeak) { frontend_ = std::make_unique( - config_.model.matcha.lexicon, config_.model.matcha.tokens, + mgr, config_.model.matcha.lexicon, config_.model.matcha.tokens, config_.model.matcha.dict_dir, config_.model.debug); } else if (meta_data.has_espeak && !meta_data.jieba) { frontend_ = std::make_unique(