Skip to content

Commit

Permalink
Fix min chars so it counts graphemes not bytes
Browse files Browse the repository at this point in the history
  • Loading branch information
havenwood committed Sep 9, 2024
1 parent dcb9640 commit 5544cd7
Show file tree
Hide file tree
Showing 2 changed files with 19 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,9 @@ impl WordTally {

for line in lines.map_while(Result::ok) {
line.unicode_words()
.filter(|word| min_chars.inapplicable() || word.len() >= min_chars.0)
.filter(|word| {
min_chars.inapplicable() || word.graphemes(true).count() >= min_chars.0
})
.for_each(|word| {
*tally.entry(Self::normalize_case(word, case)).or_insert(0) += 1;
});
Expand Down
16 changes: 16 additions & 0 deletions tests/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -355,6 +355,22 @@ fn test_words_only_from() {
assert_eq!(WordsOnly::from(only.clone()), WordsOnly(Some(only)));
}

/// A word's length for the `min_chars` filter is measured in grapheme
/// clusters, not in bytes or `char`s.
#[test]
fn test_min_count_graphemes() {
    // "e" followed by U+0301 (combining acute accent) renders as a single
    // "é": one grapheme cluster, yet two `char`s and three UTF-8 bytes.
    let input: &[u8] = "e\u{301}".as_bytes();

    // Require at least two graphemes per word; every other filter keeps its default.
    let filters = Filters {
        min_chars: MinChars(2),
        ..Filters::default()
    };

    let words = WordTally::new(input, Case::default(), Sort::default(), filters);

    // The lone one-grapheme word must be filtered out, leaving nothing tallied.
    assert_eq!(words.count(), 0);
}

#[cfg(feature = "serde")]
#[test]
fn test_to_json() {
Expand Down

0 comments on commit 5544cd7

Please sign in to comment.