facet_tokenizer.rs
use super::{Token, TokenStream, Tokenizer};
use crate::schema::FACET_SEP_BYTE;

/// The `FacetTokenizer` processes a `Facet`'s binary representation
/// and emits one token for the facet itself and one for each of its
/// parents, including the root facet.
///
/// For instance, `/america/north_america/canada`
/// will emit the following four tokens:
/// - `/`
/// - `/america`
/// - `/america/north_america`
/// - `/america/north_america/canada`
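///
/// # Example
///
/// A minimal usage sketch; the `tantivy::tokenizer` re-export path is an
/// assumption, so the snippet is marked `ignore`:
///
/// ```ignore
/// use tantivy::schema::Facet;
/// use tantivy::tokenizer::{FacetTokenizer, TokenStream, Tokenizer};
///
/// let facet = Facet::from("/america/north_america/canada");
/// let mut tokenizer = FacetTokenizer::default();
/// let mut stream = tokenizer.token_stream(facet.encoded_str());
/// while stream.advance() {
///     // The token text holds the encoded prefix; decode it back into a facet.
///     let prefix = Facet::from_encoded(stream.token().text.as_bytes().to_owned()).unwrap();
///     println!("{prefix}"); // "/", "/america", "/america/north_america", ...
/// }
/// ```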
#[derive(Clone, Default)]
pub struct FacetTokenizer {
    token: Token,
}

#[derive(Debug)]
enum State {
    RootFacetNotEmitted,
    UpToPosition(usize), //< we already emitted the facet prefixes up to `&text[..cursor]`
    Terminated,
}

pub struct FacetTokenStream<'a> {
    text: &'a str,
    state: State,
    token: &'a mut Token,
}
impl Tokenizer for FacetTokenizer {
    type TokenStream<'a> = FacetTokenStream<'a>;

    fn token_stream<'a>(&'a mut self, text: &'a str) -> FacetTokenStream<'a> {
        self.token.reset();
        // Every emitted facet prefix shares position 0.
        self.token.position = 0;
        FacetTokenStream {
            text,
            state: State::RootFacetNotEmitted,
            token: &mut self.token,
        }
    }
}
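
// State-machine walk-through: for the encoded form of `/top/a/b`
// ("top\0a\0b"), successive calls to `advance()` grow the token text
// through the encoded prefixes "", "top", "top\0a" and "top\0a\0b",
// which decode to the facets `/`, `/top`, `/top/a` and `/top/a/b`
// (this is exactly what `test_facet_tokenizer` below asserts).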
impl<'a> TokenStream for FacetTokenStream<'a> {
    fn advance(&mut self) -> bool {
        match self.state {
            State::RootFacetNotEmitted => {
                // Emit the root facet `/` (the still-empty token text).
                self.state = if self.text.is_empty() {
                    State::Terminated
                } else {
                    State::UpToPosition(0)
                };
                true
            }
            State::UpToPosition(cursor) => {
                let bytes: &[u8] = self.text.as_bytes();
                // Find the next separator, skipping the byte at `cursor`
                // (which is itself a separator, except on the first call).
                if let Some(next_sep_pos) = bytes[cursor + 1..]
                    .iter()
                    .copied()
                    .position(|b| b == FACET_SEP_BYTE)
                    .map(|pos| cursor + 1 + pos)
                {
                    // Append the next segment (with its leading separator, if any).
                    let facet_part = &self.text[cursor..next_sep_pos];
                    self.token.text.push_str(facet_part);
                    self.state = State::UpToPosition(next_sep_pos);
                } else {
                    // No separator left: append the final segment and stop.
                    let facet_part = &self.text[cursor..];
                    self.token.text.push_str(facet_part);
                    self.state = State::Terminated;
                }
                true
            }
            State::Terminated => false,
        }
    }

    fn token(&self) -> &Token {
        self.token
    }

    fn token_mut(&mut self) -> &mut Token {
        self.token
    }
}

#[cfg(test)]
mod tests {
use super::FacetTokenizer;
use crate::schema::Facet;
use crate::tokenizer::{Token, TokenStream, Tokenizer};

#[test]
fn test_facet_tokenizer() {
let facet = Facet::from_path(vec!["top", "a", "b"]);
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str())
.process(&mut add_token);
}
assert_eq!(tokens.len(), 4);
assert_eq!(tokens[0], "/");
assert_eq!(tokens[1], "/top");
assert_eq!(tokens[2], "/top/a");
assert_eq!(tokens[3], "/top/a/b");
}

#[test]
fn test_facet_tokenizer_root_facets() {
let facet = Facet::root();
let mut tokens = vec![];
{
let mut add_token = |token: &Token| {
let facet = Facet::from_encoded(token.text.as_bytes().to_owned()).unwrap();
tokens.push(format!("{}", facet));
};
FacetTokenizer::default()
.token_stream(facet.encoded_str())
.process(&mut add_token);
}
assert_eq!(tokens.len(), 1);
assert_eq!(tokens[0], "/");
}
}