Block-wise attribute normalization using memchr and avoiding UTF-8 verification.
adamreichold committed Jan 14, 2025
1 parent 251084e commit 1761229
Showing 2 changed files with 62 additions and 43 deletions.
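Below is a minimal standalone sketch of the idea in the commit title (illustrative names, not the crate's API, and assuming the `memchr` crate as a dependency): instead of inspecting the attribute value byte by byte, jump to the next `\t`, `\r`, or `\n` with `memchr3`, copy the clean run before it with a single `push_str`, emit one space, and treat `\r\n` as one line end. Since the input is already a valid `&str`, `push_str` needs no further UTF-8 verification.

```rust
use memchr::memchr3;

// XML attribute-value whitespace normalization: '\t', '\r' and '\n' become ' ',
// and "\r\n" collapses into a single space. Clean runs are copied wholesale.
fn normalize_whitespace(mut rest: &str, out: &mut String) {
    while let Some(pos) = memchr3(b'\t', b'\r', b'\n', rest.as_bytes()) {
        // Everything before the match needs no per-character inspection.
        out.push_str(&rest[..pos]);
        out.push(' ');
        // "\r\n" is one line end, so skip both bytes.
        let skip = if rest[pos..].starts_with("\r\n") { 2 } else { 1 };
        rest = &rest[pos + skip..];
    }
    out.push_str(rest);
}

fn main() {
    let mut out = String::new();
    normalize_whitespace("a\tb\r\nc", &mut out);
    assert_eq!(out, "a b c");
}
```

The same loop shape appears twice in the patch: once for the text preceding each `&` reference and once for the tail after the last one.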
src/parse.rs (100 changes: 57 additions & 43 deletions)
@@ -1,7 +1,7 @@
use alloc::string::{String, ToString};
use alloc::{vec, vec::Vec};
use core::ops::Range;
use memchr::{memchr, memchr2, memchr_iter};
use memchr::{memchr, memchr2, memchr3, memchr_iter};

use crate::{
AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
@@ -1110,54 +1110,67 @@ fn normalize_attribute<'input>(
) -> Result<StringStorage<'input>> {
// We assume that `&` indicates an entity or a character reference.
// But in rare cases it can be just another character.
if memchr2(b'&', b'\t', text.as_str().as_bytes()).is_some() || memchr2(b'\n', b'\r', text.as_str().as_bytes()).is_some() {
let mut text_buffer = TextBuffer::new();
_normalize_attribute(text, &mut text_buffer, ctx)?;
Ok(StringStorage::new_owned(text_buffer.finish()))
if memchr(b'&', text.as_str().as_bytes()).is_some() || memchr3(b'\t', b'\r', b'\n', text.as_str().as_bytes()).is_some() {
let mut buf = String::new();
_normalize_attribute(text, &mut buf, ctx)?;
Ok(StringStorage::new_owned(buf))
} else {
Ok(StringStorage::Borrowed(text.as_str()))
}
}

fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Context) -> Result<()> {
fn _normalize_attribute<'input>(text: StrSpan<'input>, buf: &mut String, ctx: &mut Context<'input>) -> Result<()> {
let mut stream = Stream::from_substr(ctx.doc.text, text.range());
while !stream.at_end() {
// Safe, because we already checked that the stream is not at the end.
let c = stream.curr_byte_unchecked();

if c != b'&' {
stream.advance(1);
buffer.push_from_attr(c, stream.curr_byte().ok());
continue;
while let Some(mut pos) = memchr(b'&', stream.as_str().as_bytes()) {
while let Some(pos1) = memchr3(b'\t', b'\r', b'\n', &stream.as_str().as_bytes()[..pos]) {
let (before, after) = stream.as_str().split_at(pos1);

buf.push_str(before);
buf.push(' ');

let skip = if after.starts_with("\r\n") {
2
} else {
1
};

stream.advance(pos1 + skip);
pos -= pos1 + skip;
}

buf.push_str(&stream.as_str()[..pos]);
stream.advance(pos);

// Check for character/entity references.
let start = stream.pos();
match stream.consume_reference() {
Some(Reference::Char(ch)) => {
for b in CharToBytes::new(ch) {
if ctx.loop_detector.depth > 0 {
// Escaped `<` inside an ENTITY is an error.
// Escaped `<` outside an ENTITY is ok.
if b == b'<' {
return Err(Error::InvalidAttributeValue(
stream.gen_text_pos_from(start),
));
}

buffer.push_from_attr(b, None);
if ctx.loop_detector.depth > 0 {
// Escaped `<` inside an ENTITY is an error.
// Escaped `<` outside an ENTITY is ok.
if ch == '<' {
return Err(Error::InvalidAttributeValue(
stream.gen_text_pos_from(start),
));
}

if matches!(ch, '\t' | '\r' | '\n') {
buf.push(' ');
} else {
// Characters not from entity should be added as is.
// Not sure why... At least `lxml` produces the same results.
buffer.push_raw(b);
buf.push(ch);
}
} else {
// Characters not from entity should be added as is.
// Not sure why... At least `lxml` produces the same results.
buf.push(ch);
}
}
Some(Reference::Entity(name)) => match ctx.entities.iter().find(|e| e.name == name) {
Some(entity) => {
ctx.loop_detector.inc_references(&stream)?;
ctx.loop_detector.inc_depth(&stream)?;
_normalize_attribute(entity.value, buffer, ctx)?;
_normalize_attribute(entity.value, buf, ctx)?;
ctx.loop_detector.dec_depth();
}
None => {
@@ -1172,6 +1185,22 @@ fn _normalize_attribute(text: StrSpan, buffer: &mut TextBuffer, ctx: &mut Contex
}
}

while let Some(pos) = memchr3(b'\t', b'\r', b'\n', stream.as_str().as_bytes()) {
let (before, after) = stream.as_str().split_at(pos);

buf.push_str(before);
buf.push(' ');

let skip = if after.starts_with("\r\n") {
2
} else {
1
};

stream.advance(pos + skip);
}

buf.push_str(stream.as_str());
Ok(())
}

@@ -1279,21 +1308,6 @@ impl TextBuffer {
self.buffer.push(c);
}

fn push_from_attr(&mut self, mut current: u8, next: Option<u8>) {
// \r in \r\n should be ignored.
if current == b'\r' && next == Some(b'\n') {
return;
}

// \n, \r and \t should be converted into spaces.
current = match current {
b'\n' | b'\r' | b'\t' => b' ',
_ => current,
};

self.buffer.push(current);
}

// Translate \r\n and any \r that is not followed by \n into a single \n character.
//
// https://www.w3.org/TR/xml/#sec-line-ends
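For context, a rough sketch of the zero-copy decision that the `memchr`/`memchr3` checks at the top of `normalize_attribute` implement, with `Cow` standing in for the crate's `StringStorage`. Entity and character references are left out, and the owned path is a simplified character-by-character stand-in for the block-wise loop shown above.

```rust
use std::borrow::Cow;

use memchr::{memchr, memchr3};

fn normalize_attribute(text: &str) -> Cow<'_, str> {
    let bytes = text.as_bytes();
    // Fast path: no reference and no whitespace to rewrite, so borrow the input as-is.
    if memchr(b'&', bytes).is_none() && memchr3(b'\t', b'\r', b'\n', bytes).is_none() {
        return Cow::Borrowed(text);
    }

    // Slow path: build an owned, normalized copy.
    let mut out = String::with_capacity(text.len());
    let mut chars = text.chars().peekable();
    while let Some(ch) = chars.next() {
        match ch {
            '\r' => {
                // "\r\n" counts as a single line end.
                if chars.peek() == Some(&'\n') {
                    chars.next();
                }
                out.push(' ');
            }
            '\t' | '\n' => out.push(' '),
            _ => out.push(ch),
        }
    }
    Cow::Owned(out)
}

fn main() {
    assert!(matches!(normalize_attribute("plain"), Cow::Borrowed(_)));
    assert_eq!(normalize_attribute("a\r\nb").as_ref(), "a b");
}
```

Keeping the borrowed fast path means an owned buffer is only allocated for values that actually contain a reference or whitespace that must be rewritten.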
src/tokenizer.rs (5 changes: 5 additions & 0 deletions)
@@ -736,6 +736,11 @@ impl<'input> Stream<'input> {
self.pos >= self.end
}

#[inline]
pub fn as_str(&self) -> &str {
&self.span.text[self.pos..self.end]
}

#[inline]
pub fn curr_byte(&self) -> Result<u8> {
if self.at_end() {
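The small `tokenizer.rs` addition is what enables the block-wise scanning: the stream can now hand out its unread remainder as a `&str`, so callers can run `memchr` over those bytes and copy whole substrings without re-checking UTF-8. A hedged sketch of the same pattern with made-up type and field names:

```rust
use memchr::memchr;

// A cursor over text that is already known to be valid UTF-8.
struct Cursor<'a> {
    text: &'a str,
    pos: usize,
}

impl<'a> Cursor<'a> {
    // The unread remainder; borrowing it as &str preserves the UTF-8 guarantee,
    // so later push_str calls need no re-validation.
    fn as_str(&self) -> &'a str {
        &self.text[self.pos..]
    }

    // `n` must land on a char boundary; indices returned by memchr for an ASCII
    // needle always do.
    fn advance(&mut self, n: usize) {
        self.pos += n;
    }
}

fn main() {
    let mut cur = Cursor { text: "a&amp;b", pos: 0 };
    let mut out = String::new();
    // Jump straight to the next '&' and copy the clean prefix in one call.
    if let Some(pos) = memchr(b'&', cur.as_str().as_bytes()) {
        out.push_str(&cur.as_str()[..pos]);
        cur.advance(pos);
    }
    assert_eq!(out, "a");
    assert!(cur.as_str().starts_with('&'));
}
```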
