Skip to content

Commit

Permalink
Speed-up attribute parsing by splitting tokenizing and verification
Browse files Browse the repository at this point in the history
This split allows us to use memchr to find delimiters and should also
allow the compiler to better auto-vectorize the XML character verification.
  • Loading branch information
adamreichold committed Jan 25, 2025
1 parent 1205d2e commit 3a1ab33
Showing 1 changed file with 65 additions and 2 deletions.
67 changes: 65 additions & 2 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
use core::ops::Range;
use core::str;

use memchr::memchr2;

use crate::{Error, TextPos};

type Result<T> = core::result::Result<T, Error>;
Expand Down Expand Up @@ -92,6 +94,10 @@ trait XmlByteExt {
/// Checks if byte is an ASCII
/// [NameChar](https://www.w3.org/TR/xml/#NT-NameChar).
fn is_xml_name(&self) -> bool;

/// Checks if the value is within the
/// [Char](https://www.w3.org/TR/xml/#NT-Char) range.
fn is_xml_char(&self) -> bool;
}

impl XmlByteExt for u8 {
Expand All @@ -109,6 +115,41 @@ impl XmlByteExt for u8 {
/// Reports whether this byte is one of the ASCII characters accepted in an
/// XML name: a letter, a digit, or one of `:`, `_`, `-` and `.`.
fn is_xml_name(&self) -> bool {
    let byte = *self;
    // Letters and digits first, then the handful of permitted punctuation.
    byte.is_ascii_alphanumeric() || matches!(byte, b':' | b'_' | b'-' | b'.')
}

/// Reports whether this byte is accepted as an XML character.
///
/// Bytes recognised by `is_xml_space` are accepted, as is every byte above
/// `0x20`; all remaining bytes at or below `0x20` are rejected.
#[inline]
fn is_xml_char(&self) -> bool {
    if self.is_xml_space() {
        return true;
    }

    *self > 0x20
}
}

#[inline]
fn is_xml_str(s: &str, stream: &mut Stream<'_>) -> Result<()> {
if s.as_bytes().is_ascii() {
for (i, b) in s.as_bytes().iter().enumerate() {
if !b.is_xml_char() {
stream.rewind(s.len() - i);
return Err(Error::NonXmlChar(*b as char, stream.gen_text_pos()));
}
}

Ok(())
} else {
is_xml_str_unicode(s, stream)
}
}


#[cold]
#[inline(never)]
fn is_xml_str_unicode(s: &str, stream: &mut Stream<'_>) -> Result<()> {
for (i, ch) in s.char_indices() {
if !ch.is_xml_char() {
stream.rewind(s.len() - i);
return Err(Error::NonXmlChar(ch, stream.gen_text_pos()));
}
}

Ok(())
}

/// A string slice.
Expand Down Expand Up @@ -567,11 +608,11 @@ fn parse_element<'input>(s: &mut Stream<'input>, events: &mut impl XmlEvents<'in
s.consume_eq()?;
let eq_len = u8::try_from(s.pos() - qname_end).unwrap_or(u8::MAX);
let quote = s.consume_quote()?;
let quote_c = quote as char;
// The attribute value must not contain the < character.
let value_start = s.pos();
s.skip_chars(|_, c| c != quote_c && c != '<')?;
s.advance_until2(quote, b'<')?;
let value = s.slice_back_span(value_start);
is_xml_str(value.as_str(), s)?;
s.consume_byte(quote)?;
let end = s.pos();
events.token(Token::Attribute(start..end, qname_len, eq_len, prefix, local, value))?;
Expand Down Expand Up @@ -759,12 +800,23 @@ impl<'input> Stream<'input> {
Ok(self.span.as_str().as_bytes()[self.pos + 1])
}

/// Returns the bytes of the underlying text between the current position
/// and `end`.
#[inline]
fn as_bytes(&self) -> &[u8] {
    let text = self.span.text.as_bytes();
    &text[self.pos..self.end]
}

/// Moves the current position forward by `n` bytes.
///
/// Debug builds assert that the new position does not go past `end`;
/// no check is performed when debug assertions are disabled.
#[inline]
pub fn advance(&mut self, n: usize) {
debug_assert!(self.pos + n <= self.end);
self.pos += n;
}

/// Moves the current position backward by `n` bytes.
///
/// Debug builds assert that `n` does not exceed the current position;
/// no check is performed when debug assertions are disabled.
#[inline]
pub fn rewind(&mut self, n: usize) {
debug_assert!(self.pos >= n);
self.pos -= n;
}

#[inline]
fn starts_with(&self, text: &[u8]) -> bool {
self.span.text.as_bytes()[self.pos..self.end].starts_with(text)
Expand Down Expand Up @@ -846,6 +898,17 @@ impl<'input> Stream<'input> {
Ok(())
}

#[inline]
fn advance_until2(&mut self, needle1: u8, needle2: u8) -> Result<()> {
match memchr2(needle1, needle2, self.as_bytes()) {
Some(pos) => {
self.advance(pos);
Ok(())
}
None => Err(Error::UnexpectedEndOfStream),
}
}

#[inline]
fn chars(&self) -> str::Chars<'input> {
self.span.as_str()[self.pos..self.end].chars()
Expand Down

0 comments on commit 3a1ab33

Please sign in to comment.