Skip to content

Commit

Permalink
Improve parsing performance by sharing a global parser
Browse files: browse the repository at this point in the history
  • Loading branch information
zyc9012 committed Sep 1, 2023
1 parent 18f1c26 commit ed38fb7
Show file tree
Hide file tree
Showing 2 changed files with 29 additions and 12 deletions.
20 changes: 17 additions & 3 deletions bench/bench.rb
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,34 @@

html_file = File.join(current_dir, 'coffee.html')
html = File.open(html_file, "r:UTF-8", &:read)
html_small = "<div>Hello world</div>" * 50

Benchmark.ips do |x|
x.warmup = 5
x.time = 20
x.report("Nokolexbor parse") do

x.report("Nokolexbor parse (#{html.size / 1024} KB)") do
Nokolexbor::HTML(html)
end
x.report("Nokogiri parse") do
x.report("Nokogiri parse (#{html.size / 1024} KB)") do
Nokogiri::HTML(html)
end
x.compare!
end

Benchmark.ips do |x|
x.warmup = 5
x.time = 20

x.report("Nokolexbor parse (#{html_small.size} B)") do
Nokolexbor::HTML(html_small)
end
x.report("Nokogiri parse (#{html_small.size} B)") do
Nokogiri::HTML(html_small)
end
x.compare!
end

nokolex = Nokolexbor::HTML(html)
nokogiri = Nokogiri::HTML(html)
css_selector = 'div.g div[data-ved] a[href]:not([href="#"])'
Expand Down
21 changes: 12 additions & 9 deletions ext/nokolexbor/nl_document.c
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@ extern VALUE mNokolexbor;
extern VALUE cNokolexborNode;
VALUE cNokolexborDocument;

lxb_html_parser_t *g_parser;

static void
free_nl_document(lxb_html_document_t *document)
{
Expand Down Expand Up @@ -44,18 +46,19 @@ nl_document_parse(VALUE self, VALUE rb_string_or_io)
const char *html_c = StringValuePtr(rb_html);
size_t html_len = RSTRING_LEN(rb_html);

lxb_html_document_t *document;

document = lxb_html_document_create();
if (document == NULL) {
rb_raise(rb_eRuntimeError, "Error creating document");
if (g_parser == NULL) {
g_parser = lxb_html_parser_create();
lxb_status_t status = lxb_html_parser_init(g_parser);
if (status != LXB_STATUS_OK) {
nl_raise_lexbor_error(status);
}
g_parser->tree->scripting = true;
}

lxb_dom_document_scripting_set(lxb_dom_interface_document(document), true);
lxb_html_document_t *document = lxb_html_parse(g_parser, (const lxb_char_t *)html_c, html_len);

lxb_status_t status = lxb_html_document_parse(document, (const lxb_char_t *)html_c, html_len);
if (status != LXB_STATUS_OK) {
nl_raise_lexbor_error(status);
if (document == NULL) {
rb_raise(rb_eRuntimeError, "Error parsing document");
}

return TypedData_Wrap_Struct(cNokolexborDocument, &nl_document_type, document);
Expand Down

0 comments on commit ed38fb7

Please sign in to comment.