Skip to content

Commit

Permalink
Improve Spellcheck to only change words near the CJK or Fullwidth pu…
Browse files Browse the repository at this point in the history
…nctuations. #42
  • Loading branch information
huacnlee committed May 18, 2022
1 parent 8dde6a2 commit 8958c4c
Show file tree
Hide file tree
Showing 6 changed files with 34 additions and 19 deletions.
12 changes: 6 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -332,19 +332,19 @@ test bench_format_html ... bench: 156,654 ns/iter (+/- 4,773)
test bench_format_javascript ... bench: 89,387 ns/iter (+/- 8,365)
test bench_format_json ... bench: 29,356 ns/iter (+/- 718)
test bench_format_json_with_2k_lines ... bench: 3,829,479 ns/iter (+/- 76,499)
test bench_spellcheck_50 ... bench: 25,350 ns/iter (+/- 287)
test bench_spellcheck_100 ... bench: 39,242 ns/iter (+/- 607)
test bench_spellcheck_400 ... bench: 151,864 ns/iter (+/- 1,640)
test bench_spellcheck_50 ... bench: 37,371 ns/iter (+/- 844)
test bench_spellcheck_100 ... bench: 57,835 ns/iter (+/- 745)
test bench_spellcheck_400 ... bench: 195,606 ns/iter (+/- 2,996)
```

| Type | Total chars | Duration |
| ---------- | ----------- | -------- |
| format | 50 | 0.014 ms |
| format | 100 | 0.019 ms |
| format | 400 | 0.045 ms |
| spellcheck | 50 | 0.025 ms |
| spellcheck | 100 | 0.039 ms |
| spellcheck | 400 | 0.151 ms |
| spellcheck | 50 | 0.037 ms |
| spellcheck | 100 | 0.057 ms |
| spellcheck | 400 | 0.195 ms |

## TODO

Expand Down
2 changes: 1 addition & 1 deletion autocorrect/benches/example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ fn bench_format_400(b: &mut Bencher) {

fn bench_format_html(b: &mut Bencher) {
let raw = r###"
bad html
bad HTML
<% a = 1 %>
{% hello = a %}
<!DOCTYPE html>
Expand Down
4 changes: 2 additions & 2 deletions autocorrect/src/code/html.rs
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ mod tests {
#[test]
fn test_format_html() {
let html = r###"
bad html
bad HTML
<% a = 1 %>
{% hello = a %}
<!DOCTYPE html>
Expand Down Expand Up @@ -74,7 +74,7 @@ mod tests {
"###;

let expected = r###"
bad html
bad HTML
<% a = 1 %>
{% hello = a %}
<!DOCTYPE html>
Expand Down
2 changes: 1 addition & 1 deletion autocorrect/src/code/markdown.rs
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ mod tests {

let example = r###"
---
title: IPAD 和 Ios 接入的不同点
title: iPad 和 Ios 接入的不同点
id: h
slug: /appstore/ipad_and_ios
---
Expand Down
31 changes: 22 additions & 9 deletions autocorrect/src/spellcheck.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@ use regex::Regex;
use crate::config::Config;

pub(crate) fn word_regexp(word: &str) -> Regex {
let prefix = r#"([^\W]|[\p{Han}?!:,。;、]|$|^)"#;

regexp!(
r#"(?im)([\s,。、?!]|^)+({})([\s,。、?!]|$)+"#,
word.replace('-', r"\-").replace('.', r"\.")
r#"(?im){}([\s?!:,。;、]|^)+({})([\s?!:,。;、]|$)+{}"#,
prefix,
word.replace('-', r"\-").replace('.', r"\."),
prefix
)
}

Expand All @@ -25,7 +29,7 @@ pub fn spellcheck(text: &str) -> String {
let new_word = spellcheck_dict.get(word).unwrap_or(word);
out = re
.replace_all(&out, |cap: &regex::Captures| {
cap[0].replace(&cap[2], new_word)
cap[0].replace(&cap[3], new_word)
})
.to_string();
}
Expand Down Expand Up @@ -53,7 +57,7 @@ mod tests {

let cases = map! [
"ios" => "iOS",
"this is ipad ios website, and the IOS download url" => "this is iPad iOS website, and the iOS download url",
"this is ipad ios website, and the IOS download url" => "this is iPad iOS website, and the iOS download URL",
"Ios download" => "iOS download",
"Download iOs" => "Download iOS",
"hello_ios" => "hello_ios",
Expand All @@ -70,21 +74,30 @@ mod tests {
"开放接口 IOS!" => "开放接口 iOS!",
"开放接口 IOS," => "开放接口 iOS,",
"开放,ios 接口" => "开放,iOS 接口",
// r#""ios 发布新版本 ios""# => r#""iOS 发布新版本 iOS""#,
// r#"'ios 发布新版本 ios'"# => r#"'iOS 发布新版本 iOS'"#,
r#"key: "ios", value: "ipad""# => r#"key: "ios", value: "ipad""#
"打开 wifi 并找到就近的 WIFI,点击输入 wi-fi 密码" => "打开 Wi-Fi 并找到就近的 Wi-Fi,点击输入 Wi-Fi 密码"
];

assert_spellcheck_cases(cases);
}

#[test]
fn test_speelcheck_cases() {
fn test_spellcheck_for_special_cases() {
crate::config::setup_test();

let cases = map! [
"打开 wifi 并找到就近的 WIFI,点击输入 wi-fi 密码" => "打开 Wi-Fi 并找到就近的 Wi-Fi,点击输入 Wi-Fi 密码"
"var ios = '1.0.0'" => "var ios = '1.0.0'",
"let wifi = ios" => "let wifi = ios",
"ipad + ios" => "ipad + ios",
"html { color: #999; }" => "html { color: #999; }",
"> IOS" => "> IOS",
"ios => {}" => "ios => {}",
"if ios > 0" => "if ios > 0",
r#""IOS""# => r#""IOS""#,
r#"'IOS'"# => r#"'IOS'"#,
r#""IOS 11""# => r#""IOS 11""#,
r#"key: "ios", value: "ipad""# => r#"key: "ios", value: "ipad""#
];

assert_spellcheck_cases(cases);
}

Expand Down
2 changes: 2 additions & 0 deletions autocorrect/tests/.autocorrectrc.test
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
spellcheck:
mode: 1
words:
- HTML
- URL
- iOS
- iPad
- iPhone
Expand Down

0 comments on commit 8958c4c

Please sign in to comment.