From 79794c4053df3a2dd3832b49343c93826edd7065 Mon Sep 17 00:00:00 2001 From: "Victor M. Alvarez" Date: Mon, 15 Jan 2024 16:51:11 +0100 Subject: [PATCH] Fix issue with regexps not matching empty strings. Some regexps like `/(abc|)/` were not matching the empty string even though they should. Matching an empty string is not very useful, as such a regexp will match on every offset of every file, but this is fixed for correctness. Also fixes another issue with the `\b` metacharacter in regexps, which was exposed after fixing the main issue. --- libyara/re.c | 38 ++++++++++++++++++++++++-------------- libyara/scan.c | 4 ++-- tests/test-rules.c | 8 +++++--- 3 files changed, 31 insertions(+), 19 deletions(-) diff --git a/libyara/re.c b/libyara/re.c index e128e3065a..4edf441529 100644 --- a/libyara/re.c +++ b/libyara/re.c @@ -214,7 +214,11 @@ void yr_re_ast_destroy(RE_AST* re_ast) // Parses a regexp but don't emit its code. A further call to // yr_re_ast_emit_code is required to get the code. // -int yr_re_parse(const char* re_string, RE_AST** re_ast, RE_ERROR* error, int flags) +int yr_re_parse( + const char* re_string, + RE_AST** re_ast, + RE_ERROR* error, + int flags) { return yr_parse_re_string(re_string, re_ast, error, flags); } @@ -1725,6 +1729,9 @@ int yr_re_exec( int kill; int action; + bool prev_is_word_char = false; + bool input_is_word_char = false; + #define ACTION_NONE 0 #define ACTION_CONTINUE 1 #define ACTION_KILL 2 @@ -1940,27 +1947,30 @@ int yr_re_exec( case RE_OPCODE_WORD_BOUNDARY: case RE_OPCODE_NON_WORD_BOUNDARY: - - if (bytes_matched == 0 && input_backwards_size < character_size) + if (input - input_incr + character_size <= input_data + input_forwards_size && + input - input_incr >= input_data - input_backwards_size) { - match = true; + prev_is_word_char = _yr_re_is_word_char( + input - input_incr, character_size); } - else if (bytes_matched >= max_bytes_matched) + else { - match = true; + prev_is_word_char = false; + } + + if (input + character_size <=
input_data + input_forwards_size && + input >= input_data - input_backwards_size) + { + input_is_word_char = _yr_re_is_word_char(input, character_size); } else { - assert(input < input_data + input_forwards_size); - assert(input >= input_data - input_backwards_size); - - assert(input - input_incr < input_data + input_forwards_size); - assert(input - input_incr >= input_data - input_backwards_size); - - match = _yr_re_is_word_char(input, character_size) != - _yr_re_is_word_char(input - input_incr, character_size); + input_is_word_char = false; } + match = (prev_is_word_char && !input_is_word_char) || + (!prev_is_word_char && input_is_word_char); + if (*ip == RE_OPCODE_NON_WORD_BOUNDARY) match = !match; diff --git a/libyara/scan.c b/libyara/scan.c index b1ff00fcc5..5e4c8dc308 100644 --- a/libyara/scan.c +++ b/libyara/scan.c @@ -859,7 +859,7 @@ static int _yr_scan_verify_re_match( (void*) &callback_args, NULL)); } - else if (callback_args.forward_matches > 0) + else if (callback_args.forward_matches >= 0) { FAIL_ON_ERROR( _yr_scan_match_callback(data + offset, 0, flags, &callback_args)); @@ -892,7 +892,7 @@ static int _yr_scan_verify_re_match( (void*) &callback_args, NULL)); } - else if (callback_args.forward_matches > 0) + else if (callback_args.forward_matches >= 0) { FAIL_ON_ERROR( _yr_scan_match_callback(data + offset, 0, flags, &callback_args)); diff --git a/tests/test-rules.c b/tests/test-rules.c index 5fd928fb1b..5545df330f 100644 --- a/tests/test-rules.c +++ b/tests/test-rules.c @@ -2435,15 +2435,15 @@ void test_re() "rule test { strings: $a = /abc\\b/ wide condition: $a }", TEXT_1024_BYTES "a\0b\0c\0b\t"); - assert_false_rule_blob( + assert_false_rule( "rule test { strings: $a = /\\b/ wide condition: $a }", TEXT_1024_BYTES "abc"); - assert_true_rule_blob( + assert_true_rule( "rule test { condition: \"avb\" matches /a\\vb/ }", TEXT_1024_BYTES "rule test { condition: \"avb\" matches /a\\vb/ }"); - assert_false_rule_blob( + assert_false_rule( "rule test { 
condition: \"ab\" matches /a\\vb/ }", TEXT_1024_BYTES "rule test { condition: \"ab\" matches /a\\vb/ }"); @@ -2638,8 +2638,10 @@ void test_re() assert_false_regexp("^(ab|cd)e", "abcde"); assert_true_regexp("(abc|)ef", "abcdef", "ef"); assert_true_regexp("(abc|)ef", "abcef", "abcef"); + assert_true_regexp("(abc|)", "foo", ""); assert_true_regexp("\\babc", "abc", "abc"); assert_true_regexp("abc\\b", "abc", "abc"); + assert_true_regexp("\\b", "abc", ""); assert_false_regexp("\\babc", "1abc"); assert_false_regexp("abc\\b", "abc1"); assert_true_regexp("abc\\s\\b", "abc x", "abc ");