From ced142d97b317f945a9fd50680a10241e79070ee Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 16 Jan 2024 11:00:05 +0800 Subject: [PATCH 1/5] Migrate preprocessor directive handling to parser unit --- src/lexer.c | 148 +++++------------------------ src/parser.c | 258 +++++++++++++++++++++++++++++++++++++++++---------- 2 files changed, 231 insertions(+), 175 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 243429c5..91bec448 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -58,10 +58,6 @@ typedef enum { T_while, T_for, T_do, - T_define, - T_undef, - T_error, - T_include, T_typedef, T_enum, T_struct, @@ -71,7 +67,16 @@ typedef enum { T_case, T_break, T_default, - T_continue + T_continue, + T_preproc_include, + T_preproc_define, + T_preproc_undef, + T_preproc_error, + T_preproc_if, + T_preproc_elif, + T_preproc_else, + T_preproc_endif, + T_preproc_ifdef } token_t; char token_str[MAX_TOKEN_LEN]; @@ -189,39 +194,6 @@ char peek_char(int offset) return SOURCE[source_idx + offset]; } -void if_elif_skip_lines() -{ - char peek_c; - int i; - - do { - skip_whitespace(); - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - read_char(1); - peek_c = peek_char(1); - } while (next_char != '#' || (next_char == '#' && peek_c == 'd')); - skip_whitespace(); -} - -void ifdef_else_skip_lines() -{ - int i; - - do { - skip_whitespace(); - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - } while (strcmp(token_str, "#else") && strcmp(token_str, "#endif")); - skip_whitespace(); -} - /* check alias defined or not */ void chk_def(int defined) { @@ -253,108 +225,31 @@ token_t get_next_token() skip_whitespace(); if (!strcmp(token_str, "#include")) { - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - skip_whitespace(); - return T_include; + return T_preproc_include; } if (!strcmp(token_str, "#define")) { - skip_whitespace(); - return T_define; + return T_preproc_define; } if (!strcmp(token_str, "#undef")) { - skip_whitespace(); - return T_undef; + return T_preproc_undef; } if (!strcmp(token_str, "#error")) { - skip_whitespace(); - return T_error; + return T_preproc_error; } if (!strcmp(token_str, "#if")) { - preproc_match = 0; - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - - if (!strncmp(token_str, "defined", 7)) { - chk_def(1); - if (preproc_match) { - skip_whitespace(); - return get_next_token(); - } - - /* skip lines until #elif or #else or #endif */ - if_elif_skip_lines(); - return get_next_token(); - } + return T_preproc_if; } if (!strcmp(token_str, "#elif")) { - if (preproc_match) { - do { - skip_whitespace(); - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - } while (strcmp(token_str, "#endif")); - skip_whitespace(); - return get_next_token(); - } - - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - - if (!strncmp(token_str, "defined", 7)) { - chk_def(1); - if (preproc_match) { - skip_whitespace(); - return get_next_token(); - } - /* skip lines until #elif or #else or #endif */ - if_elif_skip_lines(); - return get_next_token(); - } + return T_preproc_elif; } if (!strcmp(token_str, "#ifdef")) { - preproc_match = 0; - i = 0; - do { - token_str[i++] = next_char; - } while (read_char(0) != '\n'); - token_str[i] = 0; - chk_def(0); - if (preproc_match) { - skip_whitespace(); - return get_next_token(); - } - /* skip lines until #else or #endif */ - ifdef_else_skip_lines(); - return get_next_token(); + return T_preproc_ifdef; } if (!strcmp(token_str, "#else")) { - /* reach here has 2 possible cases: - * 1. reach #ifdef preprocessor directive - * 2. conditional expression in #elif is false - */ - if (!preproc_match) { - skip_whitespace(); - return get_next_token(); - } - /* skip lines until #else or #endif */ - ifdef_else_skip_lines(); - return get_next_token(); + return T_preproc_else; } if (!strcmp(token_str, "#endif")) { - preproc_match = 0; - skip_whitespace(); - return get_next_token(); + return T_preproc_endif; } error("Unknown directive"); } @@ -709,7 +604,12 @@ void skip_macro_body() int lex_accept(token_t token) { if (next_token == token) { + /* FIXME: this is a hack, fix aggressive aliasing first */ + if (token == T_preproc_ifdef) + preproc_aliasing = 0; next_token = get_next_token(); + if (token == T_preproc_ifdef) + preproc_aliasing = 1; return 1; } return 0; diff --git a/src/parser.c b/src/parser.c index d544bd2b..afd0eab0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -62,6 +62,208 @@ int get_size(var_t *var, type_t *type) return type->size; } +/* abort when invalidate is true and the line contains character other than whitespace */ +void skip_line(int invalidate) +{ + skip_whitespace(); + do { + if (invalidate && !is_whitespace(peek_char(0)) && !is_newline(peek_char(0))) { + error("Expects whitespace after preprocessor directive"); + } + } while (read_char(0) != '\n'); +} + +void if_elif_skip_lines() +{ + char peek_c; + int i; + + do { + skip_whitespace(); + i = 0; + do { + token_str[i++] = next_char; + } while (read_char(0) != '\n'); + token_str[i] = 0; + read_char(1); + peek_c = peek_char(1); + } while (next_char != '#' || (next_char == '#' && peek_c == 'd')); + skip_whitespace(); +} + +void ifdef_else_skip_lines() +{ + while (!lex_peek(T_preproc_else, NULL) && !lex_peek(T_preproc_endif, NULL)) { + next_token = get_next_token(); + } + skip_whitespace(); +} + +void check_def(char *alias) +{ + if (find_alias(alias)) + preproc_match = 1; +} + +void read_defined_macro() +{ + char lookup_alias[MAX_TOKEN_LEN]; + + preproc_aliasing = 0; /* to prevent aggressive aliasing */ + lex_expect(T_identifier); /* defined */ + lex_expect(T_open_bracket); + lex_ident(T_identifier, lookup_alias); + lex_expect(T_close_bracket); + preproc_aliasing = 1; + + check_def(lookup_alias); +} + +/* read preprocessor directive at each potential positions: + * e.g. global statement / body statement + */ +int read_preproc_directive() +{ + char token[MAX_ID_LEN]; + + if (lex_peek(T_preproc_include, token)) { + skip_line(0); /* FIXME: remove this line after syntax parsing is implemented */ + lex_expect(T_preproc_include); + /* TODO: parse include syntax here */ + return 1; + } + if (lex_accept(T_preproc_define)) { + char alias[MAX_VAR_LEN]; + char value[MAX_VAR_LEN]; + + lex_ident(T_identifier, alias); + + if (lex_peek(T_numeric, value)) { + lex_expect(T_numeric); + add_alias(alias, value); + } else if (lex_peek(T_string, value)) { + lex_expect(T_string); + add_alias(alias, value); + } else if (lex_accept(T_open_bracket)) { /* function-like macro */ + macro_t *macro = add_macro(alias); + + skip_newline = 0; + while (lex_peek(T_identifier, alias)) { + lex_expect(T_identifier); + strcpy(macro->param_defs[macro->num_param_defs++].var_name, + alias); + lex_accept(T_comma); + } + if (lex_accept(T_elipsis)) + macro->is_variadic = 1; + + macro->start_source_idx = source_idx; + skip_macro_body(); + } + + return 1; + } + if (lex_peek(T_preproc_undef, token)) { + char alias[MAX_VAR_LEN]; + + preproc_aliasing = 0; + lex_expect(T_preproc_undef); + lex_peek(T_identifier, alias); + preproc_aliasing = 1; + lex_expect(T_identifier); + + remove_alias(alias); + remove_macro(alias); + return 1; + } + if (lex_peek(T_preproc_error, NULL)) { + int i = 0; + char error_diagnostic[MAX_LINE_LEN]; + + do { + error_diagnostic[i++] = next_char; + } while (read_char(0) != '\n'); + error_diagnostic[i] = 0; + + error(error_diagnostic); + } + if (lex_accept(T_preproc_if)) { + preproc_match = 0; + + if (lex_peek(T_identifier, token) && !strcmp(token, "defined")) { + read_defined_macro(); + + if (preproc_match) { + skip_whitespace(); + return 1; + } + + if_elif_skip_lines(); + } else { + /* TODO: parse and evaluate constant expression here */ + } + return 1; + } + if (lex_accept(T_preproc_elif)) { + if (preproc_match) { + while (!lex_peek(T_preproc_endif, NULL)) { + next_token = get_next_token(); + } + return 1; + } + + if (lex_peek(T_identifier, token) && !strcmp(token, "defined")) { + read_defined_macro(); + + if (preproc_match) { + skip_whitespace(); + return 1; + } + + if_elif_skip_lines(); + } else { + /* TODO: parse and evaluate constant expression here */ + } + + return 1; + } + if (lex_accept(T_preproc_else)) { + /* reach here has 2 possible cases: + * 1. reach #ifdef preprocessor directive + * 2. conditional expression in #elif is false + */ + if (!preproc_match) { + skip_whitespace(); + return 1; + } + + /* skip lines until #else or #endif */ + ifdef_else_skip_lines(); + return 1; + } + if (lex_accept(T_preproc_endif)) { + preproc_match = 0; + skip_whitespace(); + return 1; + } + if (lex_accept(T_preproc_ifdef)) { + preproc_match = 0; + lex_ident(T_identifier, token); + check_def(token); + + if (preproc_match) { + skip_whitespace(); + return 1; + } + + /* skip lines until #else or #endif */ + ifdef_else_skip_lines(); + return 1; + } + + return 0; +} + int read_numeric_constant(char buffer[]) { int i = 0; @@ -2298,6 +2500,8 @@ basic_block_t *read_code_block(func_t *func, lex_expect(T_open_curly); while (!lex_accept(T_close_curly)) { + if (read_preproc_directive()) + continue; bb = read_body_statement(blk, bb); perform_side_effect(blk, bb); } @@ -2385,58 +2589,10 @@ void read_global_statement() char token[MAX_ID_LEN]; block_t *block = &BLOCKS[0]; /* global block */ - if (lex_peek(T_include, token)) { - lex_expect(T_include); - } else if (lex_accept(T_define)) { - char alias[MAX_VAR_LEN]; - char value[MAX_VAR_LEN]; - - lex_peek(T_identifier, alias); - lex_expect(T_identifier); - if (lex_peek(T_numeric, value)) { - lex_expect(T_numeric); - add_alias(alias, value); - } else if (lex_peek(T_string, value)) { - lex_expect(T_string); - add_alias(alias, value); - } else if (lex_accept(T_open_bracket)) { /* function-like macro */ - macro_t *macro = add_macro(alias); - - skip_newline = 0; - while (lex_peek(T_identifier, alias)) { - lex_expect(T_identifier); - strcpy(macro->param_defs[macro->num_param_defs++].var_name, - alias); - lex_accept(T_comma); - } - if (lex_accept(T_elipsis)) - macro->is_variadic = 1; - - macro->start_source_idx = source_idx; - skip_macro_body(); - } - } else if (lex_peek(T_undef, token)) { - char alias[MAX_VAR_LEN]; - - preproc_aliasing = 0; - lex_expect(T_undef); - lex_peek(T_identifier, alias); - preproc_aliasing = 1; - lex_expect(T_identifier); - - remove_alias(alias); - remove_macro(alias); - } else if (lex_peek(T_error, NULL)) { - int i = 0; - char error_diagnostic[MAX_LINE_LEN]; - - do { - error_diagnostic[i++] = next_char; - } while (read_char(0) != '\n'); - error_diagnostic[i] = 0; + if (read_preproc_directive()) + return; - error(error_diagnostic); - } else if (lex_accept(T_struct)) { + if (lex_accept(T_struct)) { int i = 0, size = 0; lex_ident(T_identifier, token); From 20d3f8d040098efe37b5be374843ee0da28558bb Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 16 Jan 2024 11:15:45 +0800 Subject: [PATCH 2/5] Fix code style --- src/parser.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/src/parser.c b/src/parser.c index afd0eab0..2141a599 100644 --- a/src/parser.c +++ b/src/parser.c @@ -62,12 +62,14 @@ int get_size(var_t *var, type_t *type) return type->size; } -/* abort when invalidate is true and the line contains character other than whitespace */ +/* abort when invalidate is true and the line contains character other than + * whitespace */ void skip_line(int invalidate) { skip_whitespace(); do { - if (invalidate && !is_whitespace(peek_char(0)) && !is_newline(peek_char(0))) { + if (invalidate && !is_whitespace(peek_char(0)) && + !is_newline(peek_char(0))) { error("Expects whitespace after preprocessor directive"); } } while (read_char(0) != '\n'); @@ -93,7 +95,8 @@ void if_elif_skip_lines() void ifdef_else_skip_lines() { - while (!lex_peek(T_preproc_else, NULL) && !lex_peek(T_preproc_endif, NULL)) { + while (!lex_peek(T_preproc_else, NULL) && + !lex_peek(T_preproc_endif, NULL)) { next_token = get_next_token(); } skip_whitespace(); @@ -109,7 +112,7 @@ void read_defined_macro() { char lookup_alias[MAX_TOKEN_LEN]; - preproc_aliasing = 0; /* to prevent aggressive aliasing */ + preproc_aliasing = 0; /* to prevent aggressive aliasing */ lex_expect(T_identifier); /* defined */ lex_expect(T_open_bracket); lex_ident(T_identifier, lookup_alias); @@ -127,7 +130,8 @@ int read_preproc_directive() char token[MAX_ID_LEN]; if (lex_peek(T_preproc_include, token)) { - skip_line(0); /* FIXME: remove this line after syntax parsing is implemented */ + skip_line(0); /* FIXME: remove this line after syntax parsing is + implemented */ lex_expect(T_preproc_include); /* TODO: parse include syntax here */ return 1; From e8f111fd3afe35eddb6de7da1549d44333a68aa5 Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 16 Jan 2024 11:24:16 +0800 Subject: [PATCH 3/5] Moves read_preproc_directive to loop This is for consistency for both body statement and global statement parsing. --- src/parser.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/parser.c b/src/parser.c index 2141a599..7cff3bf4 100644 --- a/src/parser.c +++ b/src/parser.c @@ -2593,9 +2593,6 @@ void read_global_statement() char token[MAX_ID_LEN]; block_t *block = &BLOCKS[0]; /* global block */ - if (read_preproc_directive()) - return; - if (lex_accept(T_struct)) { int i = 0, size = 0; @@ -2756,6 +2753,8 @@ void parse_internal() lex_expect(T_start); do { + if (read_preproc_directive()) + continue; read_global_statement(); } while (!lex_accept(T_eof)); } From c72583ea62df2cf724461e52eb64fe757d234241 Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 16 Jan 2024 11:50:36 +0800 Subject: [PATCH 4/5] Rename prefix `preproc` to `cppd` --- src/lexer.c | 40 ++++++++++++++++++++-------------------- src/parser.c | 28 ++++++++++++++-------------- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/src/lexer.c b/src/lexer.c index 91bec448..e26aca4d 100644 --- a/src/lexer.c +++ b/src/lexer.c @@ -68,15 +68,15 @@ typedef enum { T_break, T_default, T_continue, - T_preproc_include, - T_preproc_define, - T_preproc_undef, - T_preproc_error, - T_preproc_if, - T_preproc_elif, - T_preproc_else, - T_preproc_endif, - T_preproc_ifdef + T_cppd_include, + T_cppd_define, + T_cppd_undef, + T_cppd_error, + T_cppd_if, + T_cppd_elif, + T_cppd_else, + T_cppd_endif, + T_cppd_ifdef } token_t; char token_str[MAX_TOKEN_LEN]; @@ -225,31 +225,31 @@ token_t get_next_token() skip_whitespace(); if (!strcmp(token_str, "#include")) { - return T_preproc_include; + return T_cppd_include; } if (!strcmp(token_str, "#define")) { - return T_preproc_define; + return T_cppd_define; } if (!strcmp(token_str, "#undef")) { - return T_preproc_undef; + return T_cppd_undef; } if (!strcmp(token_str, "#error")) { - return T_preproc_error; + return T_cppd_error; } if (!strcmp(token_str, "#if")) { - return T_preproc_if; + return T_cppd_if; } if (!strcmp(token_str, "#elif")) { - return T_preproc_elif; + return T_cppd_elif; } if (!strcmp(token_str, "#ifdef")) { - return T_preproc_ifdef; + return T_cppd_ifdef; } if (!strcmp(token_str, "#else")) { - return T_preproc_else; + return T_cppd_else; } if (!strcmp(token_str, "#endif")) { - return T_preproc_endif; + return T_cppd_endif; } error("Unknown directive"); } @@ -605,10 +605,10 @@ int lex_accept(token_t token) { if (next_token == token) { /* FIXME: this is a hack, fix aggressive aliasing first */ - if (token == T_preproc_ifdef) + if (token == T_cppd_ifdef) preproc_aliasing = 0; next_token = get_next_token(); - if (token == T_preproc_ifdef) + if (token == T_cppd_ifdef) preproc_aliasing = 1; return 1; } diff --git a/src/parser.c b/src/parser.c index 7cff3bf4..768a2af4 100644 --- a/src/parser.c +++ b/src/parser.c @@ -95,8 +95,8 @@ void if_elif_skip_lines() void ifdef_else_skip_lines() { - while (!lex_peek(T_preproc_else, NULL) && - !lex_peek(T_preproc_endif, NULL)) { + while (!lex_peek(T_cppd_else, NULL) && + !lex_peek(T_cppd_endif, NULL)) { next_token = get_next_token(); } skip_whitespace(); @@ -129,14 +129,14 @@ int read_preproc_directive() { char token[MAX_ID_LEN]; - if (lex_peek(T_preproc_include, token)) { + if (lex_peek(T_cppd_include, token)) { skip_line(0); /* FIXME: remove this line after syntax parsing is implemented */ - lex_expect(T_preproc_include); + lex_expect(T_cppd_include); /* TODO: parse include syntax here */ return 1; } - if (lex_accept(T_preproc_define)) { + if (lex_accept(T_cppd_define)) { char alias[MAX_VAR_LEN]; char value[MAX_VAR_LEN]; @@ -167,11 +167,11 @@ int read_preproc_directive() return 1; } - if (lex_peek(T_preproc_undef, token)) { + if (lex_peek(T_cppd_undef, token)) { char alias[MAX_VAR_LEN]; preproc_aliasing = 0; - lex_expect(T_preproc_undef); + lex_expect(T_cppd_undef); lex_peek(T_identifier, alias); preproc_aliasing = 1; lex_expect(T_identifier); @@ -180,7 +180,7 @@ int read_preproc_directive() remove_macro(alias); return 1; } - if (lex_peek(T_preproc_error, NULL)) { + if (lex_peek(T_cppd_error, NULL)) { int i = 0; char error_diagnostic[MAX_LINE_LEN]; @@ -191,7 +191,7 @@ int read_preproc_directive() error(error_diagnostic); } - if (lex_accept(T_preproc_if)) { + if (lex_accept(T_cppd_if)) { preproc_match = 0; if (lex_peek(T_identifier, token) && !strcmp(token, "defined")) { @@ -208,9 +208,9 @@ int read_preproc_directive() } return 1; } - if (lex_accept(T_preproc_elif)) { + if (lex_accept(T_cppd_elif)) { if (preproc_match) { - while (!lex_peek(T_preproc_endif, NULL)) { + while (!lex_peek(T_cppd_endif, NULL)) { next_token = get_next_token(); } return 1; @@ -231,7 +231,7 @@ int read_preproc_directive() return 1; } - if (lex_accept(T_preproc_else)) { + if (lex_accept(T_cppd_else)) { /* reach here has 2 possible cases: * 1. reach #ifdef preprocessor directive * 2. conditional expression in #elif is false @@ -245,12 +245,12 @@ int read_preproc_directive() ifdef_else_skip_lines(); return 1; } - if (lex_accept(T_preproc_endif)) { + if (lex_accept(T_cppd_endif)) { preproc_match = 0; skip_whitespace(); return 1; } - if (lex_accept(T_preproc_ifdef)) { + if (lex_accept(T_cppd_ifdef)) { preproc_match = 0; lex_ident(T_identifier, token); check_def(token); From 3e708f5f5dbd858d88a3db9bbc01842720231d25 Mon Sep 17 00:00:00 2001 From: Kyle Lin Date: Tue, 16 Jan 2024 11:52:03 +0800 Subject: [PATCH 5/5] Fix code style --- src/parser.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/parser.c b/src/parser.c index 768a2af4..d20394f0 100644 --- a/src/parser.c +++ b/src/parser.c @@ -95,8 +95,7 @@ void if_elif_skip_lines() void ifdef_else_skip_lines() { - while (!lex_peek(T_cppd_else, NULL) && - !lex_peek(T_cppd_endif, NULL)) { + while (!lex_peek(T_cppd_else, NULL) && !lex_peek(T_cppd_endif, NULL)) { next_token = get_next_token(); } skip_whitespace();