Skip to content

Commit

Permalink
Add Aho-Corasick algorithm for spellcheck to match keywords. (#171)
Browse files Browse the repository at this point in the history
* Add Aho-Corasick algorithm for spellcheck to match keywords.

* Update spellcheck replace

* Fix test and update benchmark result
  • Loading branch information
huacnlee authored Dec 5, 2023
1 parent e332310 commit 0cb15a0
Show file tree
Hide file tree
Showing 8 changed files with 401 additions and 65 deletions.
6 changes: 3 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -419,9 +419,9 @@ format_javascript time: [108.63 µs 108.70 µs 108.79 µs]
format_json_2k time: [9.3879 ms 9.4871 ms 9.6541 ms]
format_jupyter time: [86.660 µs 86.743 µs 86.830 µs]
format_markdown time: [1.0007 ms 1.0123 ms 1.0285 ms]
spellcheck_50 time: [1.9177 µs 1.9422 µs 1.9766 µs]
spellcheck_100 time: [3.5868 µs 3.5909 µs 3.5950 µs]
spellcheck_400 time: [14.873 µs 14.974 µs 15.110 µs]
spellcheck_50 time: [1.6012 µs 1.6122 µs 1.6306 µs]
spellcheck_100 time: [3.0968 µs 3.1696 µs 3.2653 µs]
spellcheck_400 time: [10.136 µs 10.478 µs 10.898 µs]
lint_markdown time: [1.1195 ms 1.1205 ms 1.1215 ms]
lint_json time: [67.764 µs 68.341 µs 69.137 µs]
lint_html time: [280.87 µs 281.44 µs 282.20 µs]
Expand Down
11 changes: 7 additions & 4 deletions autocorrect/benches/example.rs
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,7 @@ fn bench_halfwidth_full_english_100(c: &mut Criterion) {
fn bench_spellcheck(c: &mut Criterion) {
setup();

// [1.6012 µs 1.6122 µs 1.6306 µs]
c.bench_function("spellcheck_50", |b| {
b.iter(|| {
spellcheck::format(
Expand All @@ -88,10 +89,12 @@ fn bench_spellcheck(c: &mut Criterion) {
});
});

// [3.0968 µs 3.1696 µs 3.2653 µs]
c.bench_function("spellcheck_100", |b| {
b.iter(|| spellcheck::format("探索 apple 充满创新的世界,选购各式iphone、ipad、apple watch 和 mac、娱乐产品了,iphone 13 新款 - iphone SE 新款 ,并获得相关产品的专家支持服务。"));
});

// [10.136 µs 10.478 µs 10.898 µs]
c.bench_function("spellcheck_400", |b| {
b.iter(|| spellcheck::format("探索 apple 充满创新的世界,选购各式 iphone、ipad、apple watch 和 mac、娱乐产品了,iphone 13 新款 - iphone SE 新款 ,并获得相关产品的专家支持服务。通过 apple Trade In 换购计划,你可以用符合条件的智能手机来换购新 iphone,享受折抵优惠5。这样一来,你受益,地球也受益。现可在线加入 iphone 年年焕新计划,年年用上新 iphone,享受 AppleCare+ 服务计划,还可选择分期付款*。AirTag 是能帮你轻松追踪各种物品的高手。只要给钥匙串上
挂一个,往背包里塞一个,在打开查找 app 时,除了能追踪自己的 Apple 设备之外,你还能看到钥匙和背包这些物品在哪里。只要放一个 AirTag,钱包在哪里这类问题会迎刃而解。通过查找 app 的全新“物品”标签页,都能让 AirTag 来指示物品位置。"));
Expand Down Expand Up @@ -148,9 +151,9 @@ criterion_group!(
bench_format_json_with_2k_lines,
bench_format_jupyter,
bench_markdown,
bench_spellcheck,
bench_lint,
bench_lint_output
);

criterion_main!(format_benches);
criterion_group!(spellcheck_benches, bench_spellcheck);
criterion_group!(lint_benches, bench_lint, bench_lint_output);

criterion_main!(format_benches, spellcheck_benches, lint_benches);
10 changes: 2 additions & 8 deletions autocorrect/src/config/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@ use std::{
fs,
path::Path,
sync::{Arc, RwLock, RwLockReadGuard},
vec,
};

use crate::serde_any;
Expand Down Expand Up @@ -66,12 +65,7 @@ impl Default for Config {
Config {
rules: HashMap::new(),
text_rules: HashMap::new(),
spellcheck: SpellcheckConfig {
mode: None,
words: vec![],
dict: HashMap::new(),
dict_re: HashMap::new(),
},
spellcheck: SpellcheckConfig::default(),
file_types: HashMap::new(),
}
}
Expand Down Expand Up @@ -350,7 +344,7 @@ mod tests {

assert_eq!(None, config.spellcheck.mode);
assert!(!config.spellcheck.words.is_empty());
assert!(!config.spellcheck.dict.is_empty());
assert!(!config.spellcheck.word_map.is_empty());
}

#[test]
Expand Down
33 changes: 24 additions & 9 deletions autocorrect/src/config/spellcheck.rs
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
use crate::keyword;

use super::severity::*;
use regex::Regex;
use serde::{Deserialize, Serialize};
use std::collections::HashMap;

Expand All @@ -14,14 +15,31 @@ pub struct SpellcheckConfig {
pub mode: Option<SeverityMode>,
#[serde(default)]
pub words: Vec<SpellcheckWord>,
/// Key is the left-hand word of an entry, always lowercased.
/// Value is the right-hand side of the entry (the text after `=`), with its original casing.
#[serde(skip)]
pub dict: HashMap<String, String>,
pub word_map: HashMap<String, String>,
/// A tree to match words
#[serde(skip)]
pub dict_re: HashMap<String, Regex>,
pub matcher: keyword::Node,
}

impl SpellcheckConfig {
pub fn prepare(&mut self) {
self.matcher = keyword::Node::new(true);
self.matcher.add_keywords(
self.words
.iter()
.map(|w| {
// Take the part before the `=` (if any) and trim it, e.g.:
// ios
// wifi = Wi-Fi
w.split('=').next().unwrap().trim()
})
.collect::<Vec<_>>(),
);
self.matcher.build();

if !self.words.is_empty() {
let mut lines = self.words.clone();

Expand Down Expand Up @@ -57,12 +75,9 @@ impl SpellcheckConfig {
left_str = left_str.trim();
right_str = right_str.trim();

self.dict
.insert(left_str.to_string(), right_str.to_string());
self.dict_re.insert(
left_str.to_string(),
crate::rule::spellcheck::word_regexp(left_str),
);
let key = left_str.to_lowercase();

self.word_map.insert(key.clone(), right_str.to_string());
}
}
}
Expand Down
Loading

0 comments on commit 0cb15a0

Please sign in to comment.