From ed38fb7f06c7e1f7c11ac29692c03765805a8115 Mon Sep 17 00:00:00 2001 From: zyc9012 Date: Fri, 25 Aug 2023 21:32:19 +0800 Subject: [PATCH] Improve parsing performance by sharing a global parser --- bench/bench.rb | 20 +++++++++++++++++--- ext/nokolexbor/nl_document.c | 21 ++++++++++++--------- 2 files changed, 29 insertions(+), 12 deletions(-) diff --git a/bench/bench.rb b/bench/bench.rb index 1cdfe74..2dae62e 100644 --- a/bench/bench.rb +++ b/bench/bench.rb @@ -15,20 +15,34 @@ html_file = File.join(current_dir, 'coffee.html') html = File.open(html_file, "r:UTF-8", &:read) +html_small = "
Hello world
" * 50 Benchmark.ips do |x| x.warmup = 5 x.time = 20 - - x.report("Nokolexbor parse") do + + x.report("Nokolexbor parse (#{html.size / 1024} KB)") do Nokolexbor::HTML(html) end - x.report("Nokogiri parse") do + x.report("Nokogiri parse (#{html.size / 1024} KB)") do Nokogiri::HTML(html) end x.compare! end +Benchmark.ips do |x| + x.warmup = 5 + x.time = 20 + + x.report("Nokolexbor parse (#{html_small.size} B)") do + Nokolexbor::HTML(html_small) + end + x.report("Nokogiri parse (#{html_small.size} B)") do + Nokogiri::HTML(html_small) + end + x.compare! +end + nokolex = Nokolexbor::HTML(html) nokogiri = Nokogiri::HTML(html) css_selector = 'div.g div[data-ved] a[href]:not([href="#"])' diff --git a/ext/nokolexbor/nl_document.c b/ext/nokolexbor/nl_document.c index a2d4cec..6ba957e 100644 --- a/ext/nokolexbor/nl_document.c +++ b/ext/nokolexbor/nl_document.c @@ -4,6 +4,8 @@ extern VALUE mNokolexbor; extern VALUE cNokolexborNode; VALUE cNokolexborDocument; +lxb_html_parser_t *g_parser; + static void free_nl_document(lxb_html_document_t *document) { @@ -44,18 +46,19 @@ nl_document_parse(VALUE self, VALUE rb_string_or_io) const char *html_c = StringValuePtr(rb_html); size_t html_len = RSTRING_LEN(rb_html); - lxb_html_document_t *document; - - document = lxb_html_document_create(); - if (document == NULL) { - rb_raise(rb_eRuntimeError, "Error creating document"); + if (g_parser == NULL) { + g_parser = lxb_html_parser_create(); + lxb_status_t status = lxb_html_parser_init(g_parser); + if (status != LXB_STATUS_OK) { + nl_raise_lexbor_error(status); + } + g_parser->tree->scripting = true; } - lxb_dom_document_scripting_set(lxb_dom_interface_document(document), true); + lxb_html_document_t *document = lxb_html_parse(g_parser, (const lxb_char_t *)html_c, html_len); - lxb_status_t status = lxb_html_document_parse(document, (const lxb_char_t *)html_c, html_len); - if (status != LXB_STATUS_OK) { - nl_raise_lexbor_error(status); + if (document == NULL) { + rb_raise(rb_eRuntimeError, "Error parsing document"); } return TypedData_Wrap_Struct(cNokolexborDocument, &nl_document_type, document);