From 8958c4cbf9810342f36962b8e154851eab6fa3ff Mon Sep 17 00:00:00 2001 From: Jason Lee Date: Wed, 18 May 2022 20:22:42 +0800 Subject: [PATCH] Improve Spellcheck for only change words near the CJK or Fullwidth punctuations. #42 --- README.md | 12 +++++------ autocorrect/benches/example.rs | 2 +- autocorrect/src/code/html.rs | 4 ++-- autocorrect/src/code/markdown.rs | 2 +- autocorrect/src/spellcheck.rs | 31 +++++++++++++++++++-------- autocorrect/tests/.autocorrectrc.test | 2 ++ 6 files changed, 34 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index 6524fd4b..6957e9f3 100644 --- a/README.md +++ b/README.md @@ -332,9 +332,9 @@ test bench_format_html ... bench: 156,654 ns/iter (+/- 4,773) test bench_format_javascript ... bench: 89,387 ns/iter (+/- 8,365) test bench_format_json ... bench: 29,356 ns/iter (+/- 718) test bench_format_json_with_2k_lines ... bench: 3,829,479 ns/iter (+/- 76,499) -test bench_spellcheck_50 ... bench: 25,350 ns/iter (+/- 287) -test bench_spellcheck_100 ... bench: 39,242 ns/iter (+/- 607) -test bench_spellcheck_400 ... bench: 151,864 ns/iter (+/- 1,640) +test bench_spellcheck_50 ... bench: 37,371 ns/iter (+/- 844) +test bench_spellcheck_100 ... bench: 57,835 ns/iter (+/- 745) +test bench_spellcheck_400 ... bench: 195,606 ns/iter (+/- 2,996) ``` | Type | Total chars | Duration | @@ -342,9 +342,9 @@ test bench_spellcheck_400 ... 
bench: 151,864 ns/iter (+/- 1,640) | format | 50 | 0.014 ms | | format | 100 | 0.019 ms | | format | 400 | 0.045 ms | -| spellcheck | 50 | 0.025 ms | -| spellcheck | 100 | 0.039 ms | -| spellcheck | 400 | 0.151 ms | +| spellcheck | 50 | 0.037 ms | +| spellcheck | 100 | 0.057 ms | +| spellcheck | 400 | 0.195 ms | ## TODO diff --git a/autocorrect/benches/example.rs b/autocorrect/benches/example.rs index d7fe03c2..075857da 100644 --- a/autocorrect/benches/example.rs +++ b/autocorrect/benches/example.rs @@ -48,7 +48,7 @@ fn bench_format_400(b: &mut Bencher) { fn bench_format_html(b: &mut Bencher) { let raw = r###" -bad html +bad HTML <% a = 1 %> {% hello = a %} diff --git a/autocorrect/src/code/html.rs b/autocorrect/src/code/html.rs index b6067a97..95877bb6 100644 --- a/autocorrect/src/code/html.rs +++ b/autocorrect/src/code/html.rs @@ -32,7 +32,7 @@ mod tests { #[test] fn test_format_html() { let html = r###" - bad html + bad HTML <% a = 1 %> {% hello = a %} @@ -74,7 +74,7 @@ mod tests { "###; let expected = r###" - bad html + bad HTML <% a = 1 %> {% hello = a %} diff --git a/autocorrect/src/code/markdown.rs b/autocorrect/src/code/markdown.rs index 1edf788a..2d5aaecb 100644 --- a/autocorrect/src/code/markdown.rs +++ b/autocorrect/src/code/markdown.rs @@ -20,7 +20,7 @@ mod tests { let example = r###" --- - title: IPAD 和 Ios 接入的不同点 + title: iPad 和 Ios 接入的不同点 id: h slug: /appstore/ipad_and_ios --- diff --git a/autocorrect/src/spellcheck.rs b/autocorrect/src/spellcheck.rs index 2bd68a19..7fdc2fa2 100644 --- a/autocorrect/src/spellcheck.rs +++ b/autocorrect/src/spellcheck.rs @@ -3,9 +3,13 @@ use regex::Regex; use crate::config::Config; pub(crate) fn word_regexp(word: &str) -> Regex { + let prefix = r#"([^\W]|[\p{Han}?!:,。;、]|$|^)"#; + regexp!( - r#"(?im)([\s,。、?!]|^)+({})([\s,。、?!]|$)+"#, - word.replace('-', r"\-").replace('.', r"\.") + r#"(?im){}([\s?!:,。;、]|^)+({})([\s?!:,。;、]|$)+{}"#, + prefix, + word.replace('-', r"\-").replace('.', r"\."), + prefix ) } @@ -25,7 +29,7 
@@ pub fn spellcheck(text: &str) -> String { let new_word = spellcheck_dict.get(word).unwrap_or(word); out = re .replace_all(&out, |cap: &regex::Captures| { - cap[0].replace(&cap[2], new_word) + cap[0].replace(&cap[3], new_word) }) .to_string(); } @@ -53,7 +57,7 @@ mod tests { let cases = map! [ "ios" => "iOS", - "this is ipad ios website, and the IOS download url" => "this is iPad iOS website, and the iOS download url", + "this is ipad ios website, and the IOS download url" => "this is iPad iOS website, and the iOS download URL", "Ios download" => "iOS download", "Download iOs" => "Download iOS", "hello_ios" => "hello_ios", @@ -70,21 +74,30 @@ mod tests { "开放接口 IOS!" => "开放接口 iOS!", "开放接口 IOS," => "开放接口 iOS,", "开放,ios 接口" => "开放,iOS 接口", - // r#""ios 发布新版本 ios""# => r#""iOS 发布新版本 iOS""#, - // r#"'ios 发布新版本 ios'"# => r#"'iOS 发布新版本 iOS'"#, - r#"key: "ios", value: "ipad""# => r#"key: "ios", value: "ipad""# "打开 wifi 并找到就近的 WIFI,点击输入 wi-fi 密码" => "打开 Wi-Fi 并找到就近的 Wi-Fi,点击输入 Wi-Fi 密码" ]; assert_spellcheck_cases(cases); } #[test] - fn test_speelcheck_cases() { + fn test_spellcheck_for_special_cases() { crate::config::setup_test(); let cases = map! [ - "打开 wifi 并找到就近的 WIFI,点击输入 wi-fi 密码" => "打开 Wi-Fi 并找到就近的 Wi-Fi,点击输入 Wi-Fi 密码" + "var ios = '1.0.0'" => "var ios = '1.0.0'", + "let wifi = ios" => "let wifi = ios", + "ipad + ios" => "ipad + ios", + "html { color: #999; }" => "html { color: #999; }", + "> IOS" => "> IOS", + "ios => {}" => "ios => {}", + "if ios > 0" => "if ios > 0", + r#""IOS""# => r#""IOS""#, + r#"'IOS'"# => r#"'IOS'"#, + r#""IOS 11""# => r#""IOS 11""#, + r#"key: "ios", value: "ipad""# => r#"key: "ios", value: "ipad""# ]; + assert_spellcheck_cases(cases); } diff --git a/autocorrect/tests/.autocorrectrc.test b/autocorrect/tests/.autocorrectrc.test index 84ee5f14..47a7e16a 100644 --- a/autocorrect/tests/.autocorrectrc.test +++ b/autocorrect/tests/.autocorrectrc.test @@ -1,6 +1,8 @@ spellcheck: mode: 1 words: + - HTML + - URL - iOS - iPad - iPhone