Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

perf: apply memory optimizations to TokenReader #27

Merged
merged 1 commit into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion bench/src/token_reader.zig
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,8 @@ pub fn runBench(data: []const u8) !void {
var token_reader = xml.tokenReader(data_stream.reader(), .{
.DecoderType = xml.encoding.Utf8Decoder,
});
while (try token_reader.next()) |_| {}
while (true) {
const token = try token_reader.next();
if (token == .eof) break;
}
}
4 changes: 2 additions & 2 deletions src/Scanner.zig
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
//! - Not extensively tested/fuzzed.

/// The data for the most recently returned token.
token_data: Token.Data = .{ .ok = {} },
token_data: Token.Data = undefined,
/// The current state of the scanner.
state: State = .start,
/// Data associated with the current state of the scanner.
Expand Down Expand Up @@ -97,7 +97,7 @@ pub const Range = struct {
/// content or there are other necessary intervening factors, such as CDATA
/// in the middle of normal (non-CDATA) element content.
///
/// For efficiency (avoiding copying when passing around tokens), `Token` is
/// For efficiency (avoiding copying when passing around tokens), this is
/// merely an enum specifying the token type. The actual token data is available
/// in `Token.Data`, in the scanner's `token_data` field. The `fullToken`
/// function can be used to get a `Token.Full`, which is a tagged union type and
Expand Down
80 changes: 45 additions & 35 deletions src/reader.zig
Original file line number Diff line number Diff line change
Expand Up @@ -468,50 +468,52 @@ pub fn Reader(comptime ReaderType: type, comptime options: ReaderOptions) type {
_ = self.event_arena.reset(.retain_capacity);
const event_allocator = self.event_arena.allocator();
while (true) {
const token = (try self.nextToken()) orelse return null;
switch (token) {
.xml_declaration => |xml_declaration| return .{ .xml_declaration = .{
.version = xml_declaration.version,
.encoding = xml_declaration.encoding,
.standalone = xml_declaration.standalone,
switch (try self.nextToken()) {
.eof => return null,
.xml_declaration => return .{ .xml_declaration = .{
.version = self.token_reader.token_data.xml_declaration.version,
.encoding = self.token_reader.token_data.xml_declaration.encoding,
.standalone = self.token_reader.token_data.xml_declaration.standalone,
} },
.element_start => |element_start| {
.element_start => {
if (try self.finalizePendingEvent()) |event| {
self.pending_token = token;
self.pending_token = .element_start;
return event;
}
const name = try self.allocator.dupe(u8, element_start.name);
const name = try self.allocator.dupe(u8, self.token_reader.token_data.element_start.name);
errdefer self.allocator.free(name);
try self.element_names.append(self.allocator, name);
errdefer _ = self.element_names.pop();
try self.namespace_context.startScope(self.allocator);
self.pending_event = .{ .element_start = .{ .name = name } };
},
.element_content => |element_content| {
.element_content => {
if (try self.finalizePendingEvent()) |event| {
self.pending_token = token;
self.pending_token = .element_content;
return event;
}
return .{ .element_content = .{ .content = try self.contentText(element_content.content) } };
return .{ .element_content = .{
.content = try self.contentText(self.token_reader.token_data.element_content.content),
} };
},
.element_end => |element_end| {
.element_end => {
if (try self.finalizePendingEvent()) |event| {
self.pending_token = token;
self.pending_token = .element_end;
return event;
}
const expected_name = self.element_names.pop();
defer self.allocator.free(expected_name);
if (!mem.eql(u8, expected_name, element_end.name)) {
if (!mem.eql(u8, expected_name, self.token_reader.token_data.element_end.name)) {
return error.MismatchedEndTag;
}
var qname = try self.namespace_context.parseName(element_end.name, true);
var qname = try self.namespace_context.parseName(self.token_reader.token_data.element_end.name, true);
try qname.dupNs(event_allocator);
self.namespace_context.endScope(self.allocator);
return .{ .element_end = .{ .name = qname } };
},
.element_end_empty => {
if (try self.finalizePendingEvent()) |event| {
self.pending_token = token;
self.pending_token = .element_end_empty;
return event;
}
const name = self.element_names.pop();
Expand All @@ -522,51 +524,59 @@ pub fn Reader(comptime ReaderType: type, comptime options: ReaderOptions) type {
self.namespace_context.endScope(self.allocator);
return .{ .element_end = .{ .name = qname } };
},
.attribute_start => |attribute_start| {
var attr_entry = try self.pending_event.element_start.attributes.getOrPut(event_allocator, attribute_start.name);
.attribute_start => {
var attr_entry = try self.pending_event.element_start.attributes.getOrPut(
event_allocator,
self.token_reader.token_data.attribute_start.name,
);
if (attr_entry.found_existing) {
return error.DuplicateAttribute;
}
// The attribute name will be invalidated after we get
// the next token, so we have to duplicate it here.
// The duplicate has the same bytes as the original key,
// so the key's hash is unchanged and overwriting it in
// place is safe.
attr_entry.key_ptr.* = try event_allocator.dupe(u8, attribute_start.name);
attr_entry.key_ptr.* = try event_allocator.dupe(u8, self.token_reader.token_data.attribute_start.name);
attr_entry.value_ptr.* = .{};
},
.attribute_content => |attribute_content| {
.attribute_content => {
const attributes = self.pending_event.element_start.attributes.values();
try attributes[attributes.len - 1].appendSlice(event_allocator, try self.contentText(attribute_content.content));
try attributes[attributes.len - 1].appendSlice(event_allocator, try self.contentText(self.token_reader.token_data.attribute_content.content));
},
.comment_start => {
if (try self.finalizePendingEvent()) |event| {
self.pending_token = token;
self.pending_token = .comment_start;
return event;
}
self.pending_event = .{ .comment = .{} };
},
.comment_content => |comment_content| {
try self.pending_event.comment.content.appendSlice(event_allocator, comment_content.content);
if (comment_content.final) {
.comment_content => {
try self.pending_event.comment.content.appendSlice(event_allocator, self.token_reader.token_data.comment_content.content);
if (self.token_reader.token_data.comment_content.final) {
const event = Event{ .comment = .{ .content = self.pending_event.comment.content.items } };
self.pending_event = .none;
return event;
}
},
.pi_start => |pi_start| {
.pi_start => {
if (try self.finalizePendingEvent()) |event| {
self.pending_token = token;
self.pending_token = .pi_start;
return event;
}
if (options.namespace_aware and mem.indexOfScalar(u8, pi_start.target, ':') != null) {
if (options.namespace_aware and mem.indexOfScalar(u8, self.token_reader.token_data.pi_start.target, ':') != null) {
return error.QNameNotAllowed;
}
self.pending_event = .{ .pi = .{ .target = try event_allocator.dupe(u8, pi_start.target) } };
self.pending_event = .{ .pi = .{
.target = try event_allocator.dupe(u8, self.token_reader.token_data.pi_start.target),
} };
},
.pi_content => |pi_content| {
try self.pending_event.pi.content.appendSlice(event_allocator, pi_content.content);
if (pi_content.final) {
const event = Event{ .pi = .{ .target = self.pending_event.pi.target, .content = self.pending_event.pi.content.items } };
.pi_content => {
try self.pending_event.pi.content.appendSlice(event_allocator, self.token_reader.token_data.pi_content.content);
if (self.token_reader.token_data.pi_content.final) {
const event = Event{ .pi = .{
.target = self.pending_event.pi.target,
.content = self.pending_event.pi.content.items,
} };
self.pending_event = .none;
return event;
}
Expand All @@ -575,7 +585,7 @@ pub fn Reader(comptime ReaderType: type, comptime options: ReaderOptions) type {
}
}

fn nextToken(self: *Self) !?Token {
fn nextToken(self: *Self) !Token {
if (self.pending_token) |token| {
self.pending_token = null;
return token;
Expand Down
Loading