forked from k2-fsa/sherpa-onnx
-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add jieba for Chinese TTS models (k2-fsa#797)
- Loading branch information
1 parent
2e0ee0e
commit 6b353bf
Showing
14 changed files
with
513 additions
and
8 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,45 @@ | ||
# Fetches the cppjieba source archive with FetchContent and adds it to the | ||
# build via add_subdirectory(). If a pre-downloaded archive exists in one of | ||
# the locations listed below, that local file is used in place of the URLs. | ||
function(download_cppjieba) | ||
include(FetchContent) | ||
|
||
# Two URLs for the same tagged archive, plus the checksum given to URL_HASH. | ||
set(cppjieba_URL "https://github.com/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz") | ||
set(cppjieba_URL2 "https://hub.nuaa.cf/csukuangfj/cppjieba/archive/refs/tags/sherpa-onnx-2024-04-19.tar.gz") | ||
set(cppjieba_HASH "SHA256=03e5264687f0efaef05487a07d49c3f4c0f743347bfbf825df4b30cc75ac5288") | ||
|
||
# If you don't have access to the Internet, | ||
# please pre-download cppjieba | ||
set(possible_file_locations | ||
$ENV{HOME}/Downloads/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
${CMAKE_SOURCE_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
${CMAKE_BINARY_DIR}/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
/tmp/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
/star-fj/fangjun/download/github/cppjieba-sherpa-onnx-2024-04-19.tar.gz | ||
) | ||
|
||
# Use the first candidate that exists, in the order listed above. | ||
foreach(f IN LISTS possible_file_locations) | ||
if(EXISTS ${f}) | ||
set(cppjieba_URL "${f}") | ||
file(TO_CMAKE_PATH "${cppjieba_URL}" cppjieba_URL) | ||
message(STATUS "Found local downloaded cppjieba: ${cppjieba_URL}") | ||
# Clear the second URL so that only the local file is passed to FetchContent. | ||
set(cppjieba_URL2) | ||
break() | ||
endif() | ||
endforeach() | ||
|
||
FetchContent_Declare(cppjieba | ||
URL | ||
${cppjieba_URL} | ||
${cppjieba_URL2} | ||
URL_HASH | ||
${cppjieba_HASH} | ||
) | ||
|
||
# Populate only if cppjieba has not been populated already. | ||
FetchContent_GetProperties(cppjieba) | ||
if(NOT cppjieba_POPULATED) | ||
message(STATUS "Downloading cppjieba ${cppjieba_URL}") | ||
FetchContent_Populate(cppjieba) | ||
endif() | ||
message(STATUS "cppjieba is downloaded to ${cppjieba_SOURCE_DIR}") | ||
# EXCLUDE_FROM_ALL keeps cppjieba's targets out of the default "all" target. | ||
add_subdirectory(${cppjieba_SOURCE_DIR} ${cppjieba_BINARY_DIR} EXCLUDE_FROM_ALL) | ||
endfunction() | ||
|
||
# Invoke the function as soon as this file is processed. | ||
download_cppjieba() |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,144 @@ | ||
// sherpa-onnx/csrc/cppjieba-test.cc | ||
// | ||
// Copyright (c) 2024 Xiaomi Corporation | ||
#include <iostream> | ||
#include <regex> // NOLINT | ||
#include <string> | ||
#include <vector> | ||
|
||
#include "cppjieba/Jieba.hpp" | ||
#include "gtest/gtest.h" | ||
#include "sherpa-onnx/csrc/file-utils.h" | ||
#include "sherpa-onnx/csrc/macros.h" | ||
|
||
namespace sherpa_onnx { | ||
|
||
// Please download dict files from | ||
// https://github.com/csukuangfj/cppjieba/releases/download/sherpa-onnx-2024-04-19/dict.tar.bz2 | ||
// All paths below are relative paths under ./dict. | ||
const char *const kDictPath = "./dict/jieba.dict.utf8"; | ||
const char *const kHmmPath = "./dict/hmm_model.utf8"; | ||
const char *const kUserDictPath = "./dict/user.dict.utf8"; | ||
const char *const kIdfPath = "./dict/idf.utf8"; | ||
const char *const kStopWordPath = "./dict/stop_words.utf8"; | ||
|
||
// Demo-style test for cppjieba: runs Cut (with HMM), CutForSearch, | ||
// InsertUserWord and CutForSearch with offsets, printing each result to | ||
// stdout. It contains no assertions; the block comments after each step | ||
// appear to show the expected output. Returns early when kDictPath is missing. | ||
TEST(CppJieBa, Case1) { | ||
// Only kDictPath is checked; the other dictionary files are not checked here. | ||
if (!FileExists(kDictPath)) { | ||
SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath); | ||
return; | ||
} | ||
|
||
cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath, | ||
kStopWordPath); | ||
|
||
// words receives plain segments; jiebawords receives segments with offsets. | ||
std::vector<std::string> words; | ||
std::vector<cppjieba::Word> jiebawords; | ||
|
||
std::string s = "他来到了网易杭研大厦"; | ||
std::cout << s << std::endl; | ||
std::cout << "[demo] Cut With HMM" << std::endl; | ||
jieba.Cut(s, words, true); | ||
std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
/* | ||
他来到了网易杭研大厦 | ||
[demo] Cut With HMM | ||
他/来到/了/网易/杭研/大厦 | ||
*/ | ||
s = "小明硕士毕业于中国科学院计算所,后在日本京都大学深造"; | ||
std::cout << s << std::endl; | ||
std::cout << "[demo] CutForSearch" << std::endl; | ||
jieba.CutForSearch(s, words); | ||
std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
/* | ||
小明硕士毕业于中国科学院计算所,后在日本京都大学深造 | ||
[demo] CutForSearch | ||
小明/硕士/毕业/于/中国/科学/学院/科学院/中国科学院/计算/计算所/,/后/在/日本/京都/大学/日本京都大学/深造 | ||
*/ | ||
// Segment the same phrase before and after inserting it as a user word. | ||
std::cout << "[demo] Insert User Word" << std::endl; | ||
jieba.Cut("男默女泪", words); | ||
std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
jieba.InsertUserWord("男默女泪"); | ||
jieba.Cut("男默女泪", words); | ||
std::cout << limonp::Join(words.begin(), words.end(), "/") << std::endl; | ||
/* | ||
[demo] Insert User Word | ||
男默/女泪 | ||
男默女泪 | ||
*/ | ||
// s still holds the second sentence assigned above. | ||
std::cout << "[demo] CutForSearch Word With Offset" << std::endl; | ||
jieba.CutForSearch(s, jiebawords, true); | ||
std::cout << jiebawords << std::endl; | ||
/* | ||
[demo] CutForSearch Word With Offset | ||
[{"word": "小明", "offset": 0}, {"word": "硕士", "offset": 6}, {"word": "毕业", | ||
"offset": 12}, {"word": "于", "offset": 18}, {"word": "中国", "offset": 21}, | ||
{"word": "科学", "offset": 27}, {"word": "学院", "offset": 30}, {"word": | ||
"科学院", "offset": 27}, {"word": "中国科学院", "offset": 21}, {"word": "计算", | ||
"offset": 36}, {"word": "计算所", "offset": 36}, {"word": ",", "offset": 45}, | ||
{"word": "后", "offset": 48}, {"word": "在", "offset": 51}, {"word": "日本", | ||
"offset": 54}, {"word": "京都", "offset": 60}, {"word": "大学", "offset": 66}, | ||
{"word": "日本京都大学", "offset": 54}, {"word": " 深造", "offset": 72}] | ||
*/ | ||
// see more test at | ||
// https://github.com/yanyiwu/cppjieba/blob/master/test/demo.cpp | ||
} | ||
|
||
// Segments two sentences with Cut (is_hmm = true) and prints the resulting | ||
// words joined by "_". Before the second sentence is segmented, several | ||
// punctuation marks are rewritten with std::regex_replace. The test contains | ||
// no assertions and returns early when kDictPath is missing. | ||
TEST(CppJieBa, Case2) { | ||
if (!FileExists(kDictPath)) { | ||
SHERPA_ONNX_LOGE("%s does not exist. Skipping test", kDictPath); | ||
return; | ||
} | ||
|
||
cppjieba::Jieba jieba(kDictPath, kHmmPath, kUserDictPath, kIdfPath, | ||
kStopWordPath); | ||
std::string s = | ||
"当夜幕降临,星光点点,伴随着微风拂面,我在静谧中感受着时光的流转,思念如" | ||
"涟漪荡漾,梦境如画卷展开,我与自然融为一体,沉静在这片宁静的美丽之中,感" | ||
"受着生命的奇迹与温柔"; | ||
std::vector<std::string> words; | ||
bool is_hmm = true; | ||
jieba.Cut(s, words, is_hmm); | ||
// Join the words with "_"; sep is empty for the first word only. | ||
{ | ||
std::ostringstream os; | ||
std::string sep = ""; | ||
for (const auto &w : words) { | ||
os << sep << w; | ||
sep = "_"; | ||
} | ||
|
||
std::cout << os.str() << "\n"; | ||
} | ||
/* | ||
当_夜幕降临_,_星光点点_,_伴随_着_微风_拂面_, | ||
_我_在_静谧_中_感受_着_时光_的_流转_, | ||
_思念_如_涟漪_荡漾_,_梦境_如_画卷_展开_,_我_与_自然_融为一体_, | ||
_沉静_在_这_片_宁静_的_美丽_之中_,_感受_着_生命_的_奇迹_与_温柔 | ||
*/ | ||
// Rewrite punctuation in four regex_replace passes; s2 holds the result. | ||
s = "这里有:红的、绿的、蓝的;各种各样的颜色都有!你想要什么呢?测试."; | ||
std::regex punct_re(":|、|;"); | ||
std::string s2 = std::regex_replace(s, punct_re, ","); | ||
|
||
std::regex punct_re2("[.]"); | ||
s2 = std::regex_replace(s2, punct_re2, "。"); | ||
|
||
std::regex punct_re3("[?]"); | ||
s2 = std::regex_replace(s2, punct_re3, "?"); | ||
|
||
std::regex punct_re4("[!]"); | ||
s2 = std::regex_replace(s2, punct_re4, "!"); | ||
// Print the sentence before and after the replacements. | ||
std::cout << s << "\n" << s2 << "\n"; | ||
|
||
// Segment the rewritten sentence and print it in the same "_"-joined form. | ||
words.clear(); | ||
jieba.Cut(s2, words, is_hmm); | ||
{ | ||
std::ostringstream os; | ||
std::string sep = ""; | ||
for (const auto &w : words) { | ||
os << sep << w; | ||
sep = "_"; | ||
} | ||
|
||
std::cout << os.str() << "\n"; | ||
} | ||
} | ||
|
||
} // namespace sherpa_onnx |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.