diff --git a/CHANGELOG.md b/CHANGELOG.md index f710901..2a5b78f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,10 @@ - Lexers can now use `pub(crate)` visibility, and other visibilities supported by Rust and the `syn` crate. Previously only `pub` was supported. +- Eliminate redundant `backtrack` calls in generated code, improving code size + and runtime performance. Runtime performance improved 13% in a benchmark. + (#69) + # 2023/09/03: 0.15.0 - Lexer type declarations can now have outer attributes other than just diff --git a/crates/lexgen/src/dfa.rs b/crates/lexgen/src/dfa.rs index 4c1fa73..6a0d974 100644 --- a/crates/lexgen/src/dfa.rs +++ b/crates/lexgen/src/dfa.rs @@ -1,3 +1,4 @@ +mod backtrack; pub mod codegen; pub mod simplify; @@ -7,6 +8,7 @@ pub mod simulate; use crate::collections::{Map, Set}; use crate::nfa::AcceptingState; use crate::range_map::{Range, RangeMap}; +pub(crate) use backtrack::update_backtracks; use std::convert::TryFrom; use std::iter::{FromIterator, IntoIterator}; @@ -44,6 +46,7 @@ pub struct State { /// Predecessors of the state, used to inline code for a state with one predecessor in the /// predecessor's code. predecessors: Set, + backtrack: bool, } impl State { @@ -56,6 +59,7 @@ impl State { end_of_input_transition: None, accepting: vec![], predecessors: Default::default(), + backtrack: false, } } @@ -93,7 +97,6 @@ impl DFA { new_state_idx } - #[cfg(test)] pub fn is_accepting_state(&self, state: StateIdx) -> bool { !self.states[state.0].accepting.is_empty() } @@ -179,6 +182,7 @@ impl DFA { end_of_input_transition, accepting, predecessors, + backtrack, } in other.states { let mut new_char_transitions: Map = Default::default(); @@ -213,6 +217,7 @@ impl DFA { end_of_input_transition: new_end_of_input_transition, accepting, predecessors, + backtrack, }); } @@ -239,6 +244,7 @@ impl Display for DFA { end_of_input_transition, accepting, predecessors: _, + backtrack, } = state; if !accepting.is_empty() { @@ -301,6 +307,14 @@ impl Display for DFA { writeln!(f, "$ -> {}", next)?; } + if *backtrack { + if !first { + write!(f, " ")?; + } + + writeln!(f, "backtrack")?; + } + if char_transitions.is_empty() && range_transitions.is_empty() && any_transition.is_none() diff --git a/crates/lexgen/src/dfa/backtrack.rs b/crates/lexgen/src/dfa/backtrack.rs new file mode 100644 index 0000000..aaa97ee --- /dev/null +++ b/crates/lexgen/src/dfa/backtrack.rs @@ -0,0 +1,64 @@ +use crate::collections::Map; +use crate::dfa::{StateIdx, DFA}; + +use std::collections::hash_map::Entry; + +pub(crate) fn update_backtracks(dfa: &mut DFA) { + // State and whether the state is an accepting state. + let mut work_list: Vec<(StateIdx, bool)> = dfa + .states + .iter() + .enumerate() + .filter_map(|(state_idx, state)| { + if state.initial { + Some((StateIdx(state_idx), false)) + } else { + None + } + }) + .collect(); + + // Set of visited nodes, with their backtrack state when visited. If a state's backtrack + // property changes, we visit it again to make its successors backtrack. + let mut visited: Map = Default::default(); + + while let Some((state, backtrack)) = work_list.pop() { + // Did we visit the state, with the right backtrack state? + match visited.entry(state) { + Entry::Occupied(mut entry) => { + if *entry.get() == backtrack { + continue; + } + entry.insert(backtrack); + } + Entry::Vacant(entry) => { + entry.insert(backtrack); + } + } + + // Whether the successor states should backtrack. + let successor_backtrack = backtrack || dfa.is_accepting_state(state); + + for (_, next) in &dfa.states[state.0].char_transitions { + work_list.push((*next, successor_backtrack)); + } + + for next_range in dfa.states[state.0].range_transitions.iter() { + work_list.push((next_range.value, successor_backtrack)); + } + + if let Some(next) = dfa.states[state.0].any_transition { + work_list.push((next, successor_backtrack)); + } + + if let Some(next) = dfa.states[state.0].end_of_input_transition { + work_list.push((next, successor_backtrack)); + } + } + + assert_eq!(visited.len(), dfa.states.len()); + + for (state, backtrack) in visited { + dfa.states[state.0].backtrack = backtrack; + } +} diff --git a/crates/lexgen/src/dfa/codegen.rs b/crates/lexgen/src/dfa/codegen.rs index f4fb919..fef25fa 100644 --- a/crates/lexgen/src/dfa/codegen.rs +++ b/crates/lexgen/src/dfa/codegen.rs @@ -393,17 +393,30 @@ fn generate_state( end_of_input_transition, accepting, predecessors: _, + backtrack, } = state; let fail = || -> TokenStream { - let action = generate_semantic_action_call("e!(semantic_action)); - quote!(match self.0.backtrack() { - Err(err) => { + if *backtrack || !accepting.is_empty() { + let action = generate_semantic_action_call("e!(semantic_action)); + quote!(match self.0.backtrack() { + Err(err) => { + self.reset_match(); + return Some(Err(err)) + } + Ok(semantic_action) => #action, + }) + } else { + quote!({ + let location = self.match_loc().0; self.reset_match(); - return Some(Err(err)) - } - Ok(semantic_action) => #action, - }) + self.0.__state = 0; + return Some(Err(::lexgen_util::LexerError { + location, + kind: ::lexgen_util::LexerErrorKind::InvalidToken, + })); + }) + } }; // When we can't take char or range transitions, take the 'any' transition if it exists, or @@ -797,6 +810,7 @@ fn generate_right_ctx_state_arm( end_of_input_transition, accepting, predecessors: _, + backtrack: _, } = state; let state_char_arms = diff --git a/crates/lexgen/src/dfa/simplify.rs b/crates/lexgen/src/dfa/simplify.rs index 0847577..c276c6d 100644 --- a/crates/lexgen/src/dfa/simplify.rs +++ b/crates/lexgen/src/dfa/simplify.rs @@ -50,6 +50,7 @@ pub fn simplify( end_of_input_transition, accepting, predecessors, + backtrack, } = state; let char_transitions = char_transitions @@ -83,6 +84,7 @@ pub fn simplify( end_of_input_transition, accepting, predecessors, + backtrack, } }) .collect(); diff --git a/crates/lexgen/src/lib.rs b/crates/lexgen/src/lib.rs index 7faa5f6..d1a1eef 100644 --- a/crates/lexgen/src/lib.rs +++ b/crates/lexgen/src/lib.rs @@ -140,11 +140,13 @@ pub fn lexer(input: TokenStream) -> TokenStream { } } - let dfa = match init_dfa { + let mut dfa = match init_dfa { Some(init_dfa) => init_dfa, None => nfa_to_dfa(&unnamed_nfa), }; + dfa::update_backtracks(&mut dfa); + let dfa = dfa::simplify::simplify(dfa, &mut dfas); dfa::codegen::generate(