Skip to content

Commit

Permalink
Fix redundant backtrack calls (#69)
Browse files Browse the repository at this point in the history
Improves [this lexgen benchmark][1] from 167 MB/s to 190 MB/s, a 13% improvement.

Fixes #68. All of the `backtrack` calls in the repro in the issue are
eliminated with this change.

[1]:
https://github.com/osa1/how-to-parse/blob/main/part1/bin/bench.rs#L76-L89
  • Loading branch information
osa1 authored Dec 3, 2024
1 parent 004e101 commit a397d00
Show file tree
Hide file tree
Showing 6 changed files with 109 additions and 9 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,10 @@
- Lexers can now use `pub(crate)` visibility, and other visibilities supported
by Rust and the `syn` crate. Previously only `pub` was supported.

- Eliminate redundant `backtrack` calls in generated code, improving code size
and runtime performance. Runtime performance improved 13% in a benchmark.
(#69)

# 2023/09/03: 0.15.0

- Lexer type declarations can now have outer attributes other than just
Expand Down
16 changes: 15 additions & 1 deletion crates/lexgen/src/dfa.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod backtrack;
pub mod codegen;
pub mod simplify;

Expand All @@ -7,6 +8,7 @@ pub mod simulate;
use crate::collections::{Map, Set};
use crate::nfa::AcceptingState;
use crate::range_map::{Range, RangeMap};
pub(crate) use backtrack::update_backtracks;

use std::convert::TryFrom;
use std::iter::{FromIterator, IntoIterator};
Expand Down Expand Up @@ -44,6 +46,7 @@ pub struct State<T, A> {
/// Predecessors of the state, used to inline code for a state with one predecessor in the
/// predecessor's code.
predecessors: Set<StateIdx>,
backtrack: bool,
}

impl<T, A> State<T, A> {
Expand All @@ -56,6 +59,7 @@ impl<T, A> State<T, A> {
end_of_input_transition: None,
accepting: vec![],
predecessors: Default::default(),
backtrack: false,
}
}

Expand Down Expand Up @@ -93,7 +97,6 @@ impl<A> DFA<StateIdx, A> {
new_state_idx
}

#[cfg(test)]
/// Returns `true` when the state at index `state` has a non-empty `accepting` list.
pub fn is_accepting_state(&self, state: StateIdx) -> bool {
    let accepting = &self.states[state.0].accepting;
    !accepting.is_empty()
}
Expand Down Expand Up @@ -179,6 +182,7 @@ impl<A> DFA<StateIdx, A> {
end_of_input_transition,
accepting,
predecessors,
backtrack,
} in other.states
{
let mut new_char_transitions: Map<char, StateIdx> = Default::default();
Expand Down Expand Up @@ -213,6 +217,7 @@ impl<A> DFA<StateIdx, A> {
end_of_input_transition: new_end_of_input_transition,
accepting,
predecessors,
backtrack,
});
}

Expand All @@ -239,6 +244,7 @@ impl<A> Display for DFA<StateIdx, A> {
end_of_input_transition,
accepting,
predecessors: _,
backtrack,
} = state;

if !accepting.is_empty() {
Expand Down Expand Up @@ -301,6 +307,14 @@ impl<A> Display for DFA<StateIdx, A> {
writeln!(f, "$ -> {}", next)?;
}

if *backtrack {
if !first {
write!(f, " ")?;
}

writeln!(f, "backtrack")?;
}

if char_transitions.is_empty()
&& range_transitions.is_empty()
&& any_transition.is_none()
Expand Down
64 changes: 64 additions & 0 deletions crates/lexgen/src/dfa/backtrack.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
use crate::collections::Map;
use crate::dfa::{StateIdx, DFA};

use std::collections::hash_map::Entry;

/// Computes the `backtrack` flag of every state in `dfa`.
///
/// A state gets `backtrack = true` when at least one path from an initial state reaches it
/// after passing through an accepting state. The state's own accepting status is not part of
/// the flag; it only affects the flag of its successors.
///
/// # Panics
///
/// Panics if some state of `dfa` is not reachable from an initial state.
pub(crate) fn update_backtracks<A>(dfa: &mut DFA<StateIdx, A>) {
    // Pending visits: a state, and whether an accepting state was seen on the path that led to
    // it. Initial states start with no accepting state behind them.
    let mut work_list: Vec<(StateIdx, bool)> = dfa
        .states
        .iter()
        .enumerate()
        .filter(|(_, state)| state.initial)
        .map(|(state_idx, _)| (StateIdx(state_idx), false))
        .collect();

    // Visited states with the backtrack flag computed for them so far. The flag only ever goes
    // from `false` to `true`; when that happens the state is processed again so that its
    // successors become backtracking too.
    let mut visited: Map<StateIdx, bool> = Default::default();

    while let Some((state, backtrack)) = work_list.pop() {
        match visited.entry(state) {
            Entry::Occupied(mut entry) => {
                // Already visited. Only an upgrade from `false` to `true` carries new
                // information. A `false` arriving after a `true` must NOT clear the flag: the
                // state is still reachable through an accepting state via the earlier path,
                // and clearing it would make the result depend on traversal order.
                if *entry.get() || !backtrack {
                    continue;
                }
                entry.insert(true);
            }
            Entry::Vacant(entry) => {
                entry.insert(backtrack);
            }
        }

        // Successors need to backtrack if this state does, or if this state is accepting.
        let successor_backtrack = backtrack || dfa.is_accepting_state(state);

        let current = &dfa.states[state.0];

        for next in current.char_transitions.values() {
            work_list.push((*next, successor_backtrack));
        }

        for next_range in current.range_transitions.iter() {
            work_list.push((next_range.value, successor_backtrack));
        }

        if let Some(next) = current.any_transition {
            work_list.push((next, successor_backtrack));
        }

        if let Some(next) = current.end_of_input_transition {
            work_list.push((next, successor_backtrack));
        }
    }

    // Every state is expected to be reachable from an initial state.
    assert_eq!(visited.len(), dfa.states.len());

    for (state, backtrack) in visited {
        dfa.states[state.0].backtrack = backtrack;
    }
}
28 changes: 21 additions & 7 deletions crates/lexgen/src/dfa/codegen.rs
Original file line number Diff line number Diff line change
Expand Up @@ -393,17 +393,30 @@ fn generate_state(
end_of_input_transition,
accepting,
predecessors: _,
backtrack,
} = state;

let fail = || -> TokenStream {
let action = generate_semantic_action_call(&quote!(semantic_action));
quote!(match self.0.backtrack() {
Err(err) => {
if *backtrack || !accepting.is_empty() {
let action = generate_semantic_action_call(&quote!(semantic_action));
quote!(match self.0.backtrack() {
Err(err) => {
self.reset_match();
return Some(Err(err))
}
Ok(semantic_action) => #action,
})
} else {
quote!({
let location = self.match_loc().0;
self.reset_match();
return Some(Err(err))
}
Ok(semantic_action) => #action,
})
self.0.__state = 0;
return Some(Err(::lexgen_util::LexerError {
location,
kind: ::lexgen_util::LexerErrorKind::InvalidToken,
}));
})
}
};

// When we can't take char or range transitions, take the 'any' transition if it exists, or
Expand Down Expand Up @@ -797,6 +810,7 @@ fn generate_right_ctx_state_arm(
end_of_input_transition,
accepting,
predecessors: _,
backtrack: _,
} = state;

let state_char_arms =
Expand Down
2 changes: 2 additions & 0 deletions crates/lexgen/src/dfa/simplify.rs
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ pub fn simplify<K, A: Clone>(
end_of_input_transition,
accepting,
predecessors,
backtrack,
} = state;

let char_transitions = char_transitions
Expand Down Expand Up @@ -83,6 +84,7 @@ pub fn simplify<K, A: Clone>(
end_of_input_transition,
accepting,
predecessors,
backtrack,
}
})
.collect();
Expand Down
4 changes: 3 additions & 1 deletion crates/lexgen/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -140,11 +140,13 @@ pub fn lexer(input: TokenStream) -> TokenStream {
}
}

let dfa = match init_dfa {
let mut dfa = match init_dfa {
Some(init_dfa) => init_dfa,
None => nfa_to_dfa(&unnamed_nfa),
};

dfa::update_backtracks(&mut dfa);

let dfa = dfa::simplify::simplify(dfa, &mut dfas);

dfa::codegen::generate(
Expand Down

0 comments on commit a397d00

Please sign in to comment.