From 4ee1e28205b61c6ec6b3a01194b5784a7c1d175d Mon Sep 17 00:00:00 2001 From: Kate F Date: Wed, 8 May 2024 14:38:44 +0100 Subject: [PATCH] First cut at introducing rx, a tool to compile a set of regular expressions. --- Makefile | 1 + src/rx/Makefile | 14 + src/rx/main.c | 1196 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 1211 insertions(+) create mode 100644 src/rx/Makefile create mode 100644 src/rx/main.c diff --git a/Makefile b/Makefile index 499239fd9..31b8c6769 100644 --- a/Makefile +++ b/Makefile @@ -101,6 +101,7 @@ SUBDIR += src/libre/print SUBDIR += src/libre SUBDIR += src/fsm SUBDIR += src/re +SUBDIR += src/rx SUBDIR += src/retest SUBDIR += src/lx/print SUBDIR += src/lx diff --git a/src/rx/Makefile b/src/rx/Makefile new file mode 100644 index 000000000..2facbfd6c --- /dev/null +++ b/src/rx/Makefile @@ -0,0 +1,14 @@ +.include "../../share/mk/top.mk" + +SRC += src/rx/main.c + +PROG += rx + +.for lib in ${LIB:Mlibfsm} ${LIB:Mlibre} +${BUILD}/bin/rx: ${BUILD}/lib/${lib:R}.a +.endfor + +.for src in ${SRC:Msrc/rx/*.c} +${BUILD}/bin/rx: ${BUILD}/${src:R}.o +.endfor + diff --git a/src/rx/main.c b/src/rx/main.c new file mode 100644 index 000000000..3b521c1b9 --- /dev/null +++ b/src/rx/main.c @@ -0,0 +1,1196 @@ +/* + * Copyright 2024 Katherine Flavel + * + * See LICENCE for the full copyright terms. + */ + +#define _POSIX_C_SOURCE 200809L + +#ifdef __linux__ +#include +#endif + +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +struct literal { + const void *p; + size_t n; + fsm_end_id_t id; +}; + +// TODO: no need to be global now +static size_t declined_count, general_count; +static size_t literals_none_count, literals_left_count, literals_right_count, literals_both_count; +static fsm_end_id_t *declined; +static fsm_end_id_t *general; + +// XXX: xalloc()ing these conservatively is large, i don't like it +static struct literal *literals_none; +static struct literal *literals_left; +static struct literal *literals_right; +static struct literal *literals_both; + +static void * +xalloc(size_t size) +{ + void *p; + + p = malloc(size); + if (p == NULL) { + perror("malloc"); + exit(EXIT_FAILURE); + } + + return p; +} +static char * +xstrndup(const char *s, size_t n) +{ + char *new = strndup(s, n); + if (new == NULL) { + perror("strndup"); + exit(EXIT_FAILURE); + } + + return new; +} + +static bool +permitted_chars(const char *p, size_t n, const char *accept, const char *reject) +{ + char *s; + bool r = true; + + assert(p != NULL); + assert(accept != NULL || reject != NULL); + + /* + * p,n is a slice into a \n-terminated buffer, so we can't strcspn() here. + * We could provide our own implementation (i.e. memcspn), but until it's + * actually a problem I'd prefer to allocate and \0-terminate here just + * for the sake of using strcspn(). + */ + s = xstrndup(p, n); + + if (accept != NULL) { + r &= s[strspn(s, accept)] == '\0'; + } + + if (reject != NULL) { + r &= s[strcspn(s, reject)] == '\0'; + } + + free(s); + + return r; +} + +int +nlgetc(void *opaque) +{ + const char **s = opaque; + char c; + + assert(s != NULL); + assert(*s != NULL); + + c = **s; + + /* skip the newline too */ + (*s)++; + + if (c == '\n') { + return EOF; + } + + return (unsigned char) c; +} + +static void +append_id(fsm_end_id_t *a, size_t *count, fsm_end_id_t id) +{ + assert(a != NULL); + + a[*count] = id; + (*count)++; +} + +static void +append_literal(struct literal *a, size_t *count, fsm_end_id_t id, const char *p, size_t n) +{ + assert(a != NULL); + assert(p != NULL); + + a[*count].id = id; + a[*count].p = p; + a[*count].n = n; + (*count)++; +} + +/* ^[abc..]*$ */ +// XXX: this interface doesn't allow us to have \0 in the character set +static struct fsm * +intersect_charset(const struct fsm_options *opt, bool show_stats, + const char *charset, struct fsm *b) +{ + struct fsm *a, *q; + + if (charset == NULL) { + return b; + } + + /* + * Since intersection is destructive, there's no point in making + * the charset FSM in advance and then fsm_clone()ing it. + * We may as well just make a new one each time. + */ + { + fsm_state_t state; + + // TODO: pass .statealloc explicitly, we know it's 1. the default is overkill + a = fsm_new(opt); + if (a == NULL) { + perror("fsm_new"); + exit(EXIT_FAILURE); + } + + if (!fsm_addstate(a, &state)) { + perror("fsm_addstate"); + exit(EXIT_FAILURE); + } + + for (const char *p = charset; *p != '\0'; p++) { + if (!fsm_addedge_literal(a, state, state, *p)) { + perror("fsm_addedge_literal"); + exit(EXIT_FAILURE); + } + } + + fsm_setend(a, state, true); + fsm_setstart(a, state); + } + + assert(fsm_all(a, fsm_isdfa)); + assert(fsm_all(b, fsm_isdfa)); + + size_t pre = fsm_countstates(b); + if (show_stats) { + fprintf(stderr, "pre-intersection dfa: %zu states\n", pre); + } + + /* + * Here I would call: + * + * q = fsm_walk2(a, b, FSM_WALK2_BOTH, FSM_WALK2_BOTH); + * + * This is equivalent to fsm_intersect(). fsm_intersect() asserts that + * both operands are DFA at runtime. But in this program we know this + * empirically from our callers. So we would call fsm_walk2() directly + * to avoid needlessly running the DFA predicate in DNDEBUG builds. + * + * This is intersection implemented by walking sets of states through + * both FSM simultaneously, as described by Hopcroft, Motwani and Ullman + * (2001, 2nd ed.) 4.2, Closure under Intersection. + * + * The intersection of two FSM consists of only those items which + * are present in _BOTH. + * + * Unfortunately fsm_walk2() isn't in the public API, so we don't do that. + */ + q = fsm_intersect(a, b); + if (q == NULL) { + perror("fsm_intersect"); + exit(EXIT_FAILURE); + } + + /* walking two DFA produces a DFA */ + assert(fsm_all(q, fsm_isdfa)); + + size_t post = fsm_countstates(q); + if (show_stats) { + fprintf(stderr, "post-intersection dfa: %zu states\n", post); + } + + /* intersecting with the charset FSM should never introduce new states */ + assert(post <= pre); + + return q; +} + +static void +categorize(const struct fsm_options *opt, enum re_dialect dialect, enum re_flags flags, + bool strict, bool verbose, + const char *charset, const char *reject, fsm_end_id_t id, const char *s) +{ + enum re_literal_category category; + struct re_err err; + char *lit_s; + size_t lit_n; + int r; + + /* + * -1: Error + * 1: Literal, *s may or may not be NULL (i.e. for an empty string), *n >= 0 + * and *category is set. + */ + const char *q = s; + r = re_is_literal(dialect, nlgetc, &q, opt, flags, &err, &category, + &lit_s, &lit_n); + + /* unsupported */ + if (r == -1 && err.e == RE_EUNSUPPORTED) { + if (verbose) { + fprintf(stderr, "declined (unsupported) #%u: /%.*s/\n", + id, (int) strcspn(s, "\n"), s); + } + + append_id(declined, &declined_count, id); + return; + } + + /* parse error */ +// we consider a parse error here a failure, rather than declining the pattern +// TOOD: pass cli flag to make this non-fatal, quietly decline. also for empty strings and unsatisfiable expressions + if (r == -1) { + if (!strict) { + if (verbose) { +// XXX: rephrase error. also re_perror() for this too + fprintf(stderr, "XXX: would decline (parse error) #%u: /%.*s/\n", + id, (int) strcspn(s, "\n"), s); + } + + append_id(declined, &declined_count, id); + return; + } + + const char *p; + + /* s is \n-terminated per pattern. + * We allocate here so we can \0-terminate for re_perror() */ + p = xstrndup(s, strcspn(s, "\n")); + re_perror(dialect, &err, NULL, p); + free((void *) p); + + exit(EXIT_FAILURE); + } + + /* empty string */ + if (r == 1 && lit_n == 0) { + fprintf(stdout, "re_is_literal: /%.*s/ regex matches empty string only\n", + (int) strcspn(s, "\n"), s); +// TODO: cli option to decline instead + exit(EXIT_FAILURE); + } + + /* unsatisfiable */ + if (r == 1 && category == RE_LITERAL_UNSATISFIABLE) { + fprintf(stdout, "re_is_literal: /%.*s/ regex is unsatisfiable\n", + (int) strcspn(s, "\n"), s); +// TODO: cli option to decline instead + exit(EXIT_FAILURE); + } + + /* is a literal */ + if (r == 1) { + assert(lit_n >= 0); + assert(lit_s != NULL); + + for (size_t i = 0; i < lit_n; i++) { + if (lit_s[i] == '\0') { +// TODO: cli option to decline instead + fprintf(stderr, "re_is_literal: '%.*s' literal contains \\0\n", + (int) strcspn(s, "\n"), s); + exit(EXIT_FAILURE); + } + } + + if (category == RE_LITERAL_ANCHOR_END || category == RE_LITERAL_ANCHOR_BOTH) { + /* + * This newline is appended in ast_is_literal() as an attempt to + * account for the PCRE2_DOTALL behaviour. I'm not interested in + * newlines for this program (optional or legitimate), + * so I'm just cutting it off. + */ +// TODO: maybe only if it's not in the character set? + if (lit_n > 0 && lit_s[lit_n - 1] == '\n') { + lit_n--; + } + } + + if (verbose) { + fprintf(stderr, "literal #%u '%.*s'\n", id, (int) lit_n, lit_s); + } + +// TODO: explain reject does not apply + if (!permitted_chars(lit_s, lit_n, charset, NULL)) { +// TODO: cli option to decline instead +//fprintf(stderr, "declined literal (charset) #%u: /%.*s/\n", id, (int) strcspn(s, "\n"), s); + append_id(declined, &declined_count, id); + return; + } + +//fprintf(stderr, "re_is_literal found: '%.*s'\n", (int) lit_n, lit_s); + + switch (category) { + case RE_LITERAL_UNANCHORED: + append_literal(literals_none, &literals_none_count, id, lit_s, lit_n); + break; + + case RE_LITERAL_ANCHOR_START: + append_literal(literals_left, &literals_left_count, id, lit_s, lit_n); + break; + + case RE_LITERAL_ANCHOR_END: + append_literal(literals_right, &literals_right_count, id, lit_s, lit_n); + break; + + case RE_LITERAL_ANCHOR_BOTH: + append_literal(literals_both, &literals_both_count, id, lit_s, lit_n); + break; + + default: + assert(!"unreached"); + abort(); + } + + return; + } + + /* + * Reject regexps that contain characters that may be expensive. + * + * If the regex contains a character that's not in the charset, + * we can also save some compilation time and decline it up front. + * + * This isn't perfect, for example consider hex encoded values. + * My design assumption here is that it's okay to pre-filter strings + * down to a character set. This means /./ doesn't match any character, + * it means any character within the charset. + * + * So we're relying on that here, and this is just a best-effort help. + */ + if (!permitted_chars(s, strcspn(s, "\n"), charset, reject)) { +//fprintf(stderr, "declined general (charset or reject) #%u: /%.*s/\n", id, (int) strcspn(s, "\n"), s); + append_id(declined, &declined_count, id); + return; + } + +//fprintf(stderr, "general #%u: /%.*s/\n", id, (int) strcspn(s, "\n"), s); + /* not a literal */ + append_id(general, &general_count, id); +} + +/* + * This is re_strings() but for an array of struct literal + */ +static struct fsm * +literal_strings(const struct fsm_options *opt, + const struct literal a[], size_t n, enum re_strings_flags flags) +{ + struct re_strings *g; + struct fsm *fsm; + + assert(a != NULL); + + g = re_strings_new(); + if (g == NULL) { + return NULL; + perror("re_strings_new"); + exit(EXIT_FAILURE); + } + + for (size_t i = 0; i < n; i++) { + if (!re_strings_add_raw(g, a[i].p, a[i].n, & (fsm_end_id_t) { a[i].id })) { + perror("re_strings_add_raw"); + exit(EXIT_FAILURE); + } + } + + fsm = re_strings_build(g, opt, flags); + re_strings_free(g); + +// XXX: this fsm has unreachable states boo +//fsm_print_dot(stderr, fsm); + + /* this is already a DFA, courtesy of re_strings() */ + assert(fsm_all(fsm, fsm_isdfa)); + + return fsm; +} + +static struct fsm * +build_literals_fsm(const struct fsm_options *opt, bool show_stats, + const char *charset, + struct literal *a, size_t n, enum re_strings_flags flags) +{ + struct fsm *fsm; + + fsm = literal_strings(opt, a, n, flags); +// XXX: we do produce an empty fsm for 0 literals assert(!fsm_empty(fsm)); + + for (size_t i = 0; i < n; i++) { +// TODO: when we switch to libfsm alloc hooks, check if re_is_literal() calls malloc proper or not for *lit_s + free((void *) a[i].p); + } +// TODO: also free a[j].a when we malloc for the 20k-element arrays + + fsm = intersect_charset(opt, show_stats, charset, fsm); + if (fsm == NULL) { + perror("intersect_charset"); + exit(EXIT_FAILURE); + } + + /* We don't minimise here because this fsm has multiple endids, + * and the resulting FSM would be very similar to the current DFA */ + + if (fsm_countstates(fsm) == 0) { + fprintf(stderr, "literals produced empty intersection\n"); + exit(EXIT_FAILURE); + } + +#ifndef NDEBUG +// TODO: could test to see that the fsm isn't any different. for general regexps, lots will be, because of /./ +// TODO: but literal strings should be no different. except for the anchoring +#endif + + return fsm; +} + +static struct fsm * +build_regex_fsm(const struct fsm_options *opt, bool show_stats, + const char *charset, + enum re_dialect dialect, bool strict, + const char *s, enum re_flags flags, fsm_end_id_t id) +{ + struct fsm *fsm; + struct re_err err; + + assert(s != NULL); + +// TODO: want to compile these as if anchored, then do the unanchored parts during unioning +// then we can also construct the unanchored parts from the charset + + const char *q = s; +// TODO: RE_SINGLE here? i think it shouldn't have any effect when \n isn't present in the charset + fsm = re_comp(dialect, nlgetc, &q, opt, flags, &err); + + if (strict && fsm == NULL) { + const char *p; + + /* patterns[] is \n-terminated per pattern. + * We allocate here so we can \0-terminate for re_perror() */ + p = xstrndup(s, strcspn(s, "\n")); + re_perror(dialect, &err, NULL, p); + exit(EXIT_FAILURE); + } + + if (fsm == NULL) { + switch (err.e) { + case RE_EUNSUPCAPTUR: + case RE_EUNSUPPPCRE: + case RE_EUNSUPPORTED: + return NULL; + + default: + break; + } + } + + /* + * An unsatisfiable regex compiles to an FSM that matches nothing, + * so that unioning it with other regexes will still work. + */ + if (strict && fsm_empty(fsm)) { + fprintf(stdout, "re_comp: /%.*s/ regex is unsatisfiable\n", + (int) strcspn(s, "\n"), s); + exit(EXIT_FAILURE); + } + +// TODO: note we don't minimise rather than determinise, explain why + if (!fsm_determinise(fsm)) { + perror("fsm_determinise"); + exit(EXIT_FAILURE); + } + + if (!fsm_setendid(fsm, id)) { + perror("fsm_setendid"); + exit(EXIT_FAILURE); + } + + fsm = intersect_charset(opt, show_stats, charset, fsm); + if (fsm == NULL) { + perror("intersect_charset"); + exit(EXIT_FAILURE); + } + +// XXX: unsure why this gives 0 states, even with endids +// TODO: explain we minimise again after intersect_charset because this fsm has a single endid, so it's (potentially) a worthwhile difference + +// TODO: we do minimise now after intersection, explain it's because only one end id + if (!fsm_minimise(fsm)) { + perror("fsm_minimise"); + exit(EXIT_FAILURE); + } + + return fsm; +} + +static int +endleaf_c(FILE *f, const struct fsm_end_ids *ids, + const void *endleaf_opaque) +{ + assert(endleaf_opaque == NULL); + assert(ids != NULL); + assert(endleaf_opaque == NULL); + + (void) f; + (void) endleaf_opaque; + + /* morally an assertion, but I feel better leaving this in for various user data */ + if (ids->count == 0) { + fprintf(stderr, "no IDs attached to one accepting state\n"); + exit(EXIT_FAILURE); + } + + /* exactly one end id means no ambiguious patterns */ +// TODO: want to ensure at compile time that the endids set only has one element, reject ambigious patterns +// XXX: i'm not convinced we do. we could just as well emit code that returns a list of IDs +// and have a cli option to error about ambiguities +// TODO: this isn't strict checking. this is the opposite to -u +// TODO: give examples, lx style +#if 0 + if (ids->count > 1) { +// TODO: explain this more clearly + fprintf(stderr, "ambigious patterns:"); + + for (fsm_end_id_t i = 0; i < ids->count; i++) { + fprintf(stderr, " %u", ids->ids[i]); + } + + fprintf(stderr, "\n"); + +// we consider this a compilation error for the purposes of this program + exit(EXIT_FAILURE); + } +#endif + + /* + * Here I would like to emit (static unsigned []) { 1, 2, 3 } + * but specifying a storage duration for compound literals + * is a compiler extension. + * So I'm emitting a static const variable declaration instead. + */ + + fprintf(f, "{\n"); + fprintf(f, "\t\tstatic const unsigned a[] = { "); + for (fsm_end_id_t i = 0; i < ids->count; i++) { + fprintf(f, "%u", ids->ids[i]); + if (i + 1 < ids->count) { + fprintf(f, ", "); + } + } + fprintf(f, " };\n"); + fprintf(f, "\t\t*ids = a;\n"); + fprintf(f, "\t\t*count = %u;\n", ids->count); + fprintf(f, "\t\treturn 0;\n"); + fprintf(f, "\t}"); + +// TODO: override return -1 + + return 0; +} + +void +print_fsm_c(struct fsm *fsm, const char *prefix) +{ + const struct fsm_options *old; + struct fsm_options tmp; + + if (prefix == NULL) { + prefix = "fsm"; + } + + fprintf(stdout, "/* generated */\n"); + fprintf(stdout, "\n"); + + fprintf(stdout, "#include \n"); + fprintf(stdout, "#include \n"); + fprintf(stdout, "\n"); + + fprintf(stdout, "bool\n"); + fprintf(stdout, "%s_main(const char *s, size_t n,\n", prefix); + fprintf(stdout, "\tconst unsigned **ids, size_t *count)\n"); + fprintf(stdout, "{\n"); + fprintf(stdout, "\tconst char *b = s, *e = s + n;\n"); + + old = fsm_getoptions(fsm); + assert(old != NULL); + + tmp = *old; + tmp.fragment = true, + tmp.endleaf_opaque = NULL, + tmp.endleaf = endleaf_c, + + fsm_setoptions(fsm, &tmp); + fsm_print_vmc(stdout, fsm); + fsm_setoptions(fsm, old); + + fprintf(stdout, "}\n"); +} + +static enum re_dialect +dialect_name(const char *name) +{ + size_t i; + + struct { + const char *name; + enum re_dialect dialect; + } a[] = { + { "like", RE_LIKE }, + { "literal", RE_LITERAL }, + { "glob", RE_GLOB }, + { "native", RE_NATIVE }, + { "pcre", RE_PCRE }, + { "sql", RE_SQL } + }; + + assert(name != NULL); + + for (i = 0; i < sizeof a / sizeof *a; i++) { + if (0 == strcmp(a[i].name, name)) { + return a[i].dialect; + } + } + + fprintf(stderr, "unrecognised regexp dialect \"%s\"; valid dialects are: ", name); + + for (i = 0; i < sizeof a / sizeof *a; i++) { + fprintf(stderr, "%s%s", + a[i].name, + i + 1 < sizeof a / sizeof *a ? ", " : "\n"); + } + + exit(EXIT_FAILURE); +} + +static +void usage(const char *name) +{ + + if (name == NULL) { + name = ""; + } else { + const char *p = strrchr(name, '/'); + name = p != NULL ? p + 1 : name; + } + + printf("usage: %s: [-ciQqv] [-C charset] [-r dialect] [-R reject] input-file [declined-file]\n", name); + printf(" %s -h\n", name); +} + +int +main(int argc, char *argv[]) +{ + bool quiet = false; + bool strict = false; + bool verbose = false; + bool show_stats = false; + + size_t general_limit = 0; + + const char *charset = NULL; + const char *input_file = NULL; + const char *declined_file = NULL; + + enum re_dialect dialect = RE_PCRE; +// TODO: we need to provide a way to disable RE_END_NL +// TODO: RE_SINGLE. also be careful, flags disrupt re_is_literal() + const enum re_flags flags = 0; // TODO: notably not RE_END_NL + + // TODO: explain we decline these at compile time because NFA->DFA is too costly + const char *reject = "*+{"; + + const struct fsm_options opt = { + .anonymous_states = true, + .consolidate_edges = true, + .comments = false, + .case_ranges = true, + .always_hex = false, + .group_edges = true, + .io = FSM_IO_PAIR, +// TODO: cli option to set io api + }; + +// XXX: actual character set in practice +charset = + /* RFC9110 field-value stuff... */ + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwzyx" + "0123456789" + "!#$%&'*+-.^_`|~" + + /* ...plus some extras found in practice */ + "'\"/()@,:; "; + +// TODO: -l cli option for codegen lang +// TODO: optional extra argv[1] to output declined patterns (we could iteratively munch down on the list) +// TODO: cli option for numeric limit for general_count to decline patterns +// TODO: cli option for memory limit to decline patterns (implement via allocation hooks) +// TODO: cli option for worker pool threads +// TODO: make this a whole tool, alongside re(1) and fsm(1) +// TODO: manpage +// TODO: probably handle each of fsms[] in a separate thread, use worker pool +// TODO: cli flag to set prefix + + { + const char *name = argv[0]; + int c; + + while (c = getopt(argc, argv, "h" "C:cin:r:R:Qqv"), c != -1) { + switch (c) { + case 'C': + charset = optarg; + break; + + case 'c': + /* + * https://www.rfc-editor.org/rfc/rfc9110#name-tokens + * + * tchar = "!" / "#" / "$" / "%" / "&" / "'" / "*" + * / "+" / "-" / "." / "^" / "_" / "`" / "|" / "~" + * / DIGIT / ALPHA + * ; any VCHAR, except delimiters + * + * https://www.rfc-editor.org/rfc/rfc9110#appendix-A + * field-name = token + * token = 1*tchar + * + * So a valid header field-name has at least one character. + * But we don't enforce that here. + */ +// TODO: although we could, with the charset FSM as [a-z]+ rather than [a-z]* + charset = + "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwzyx" + "0123456789" + "!#$%&'*+-.^_`|~"; + break; + + case 'h': + usage(name); + exit(EXIT_SUCCESS); + + case 'i': + strict = true; + break; + + case 'n': + general_limit = atoi(optarg); + break; + + case 'R': + reject = optarg; + break; + + case 'r': + dialect = dialect_name(optarg); + break; + + case 'Q': + /* one-off stats on resource consumption */ + show_stats = true; + break; + + case 'q': + /* "quiet" applies to the generated code only */ + quiet = true; + break; + + case 'v': + /* "verbose" means showing information about every pattern and FSM, + * this is O(n) output to the length of the input file */ + verbose = true; + break; + + case '?': + default: + usage(name); + exit(EXIT_FAILURE); + } + } + + argc -= optind; + argv += optind; + + switch (argc) { + case 2: + declined_file = argv[1]; + case 1: + input_file = argv[0]; + break; + + default: + usage(name); + exit(EXIT_FAILURE); + } + } + +// TODO: error if charset contains \n (it also can't contain \0) + + if (show_stats) { + fprintf(stderr, "charset: [%s]\n", charset); + fprintf(stderr, "reject: [%s]\n", reject); + } + + struct stat sb; + char *addr; + int fd; + + size_t patterns_count; + + /* + * Phases: + * + * 1. per byte, delimiting + * 2. per pattern, categorization + * 3. per set of literals/general regex to FSMs + * 4. union FSMs, codegen + */ + + { + fd = open(input_file, O_RDONLY); + if (fd == -1) { + perror(input_file); + exit(EXIT_FAILURE); + } + + if (fstat(fd, &sb) == -1) { + perror(input_file); + exit(EXIT_FAILURE); + } + + addr = mmap(NULL, sb.st_size, PROT_READ, MAP_PRIVATE, fd, 0); + if (addr == MAP_FAILED) { + perror("mmap"); + exit(EXIT_FAILURE); + } + + /* we assume a trailing newline per pattern in the getc callback */ + if (sb.st_size == 0 || addr[sb.st_size - 1] != '\n') { + fprintf(stderr, "%s: missing newline at end of file\n", input_file); + exit(EXIT_FAILURE); + } + + /* + * Here we iterate per byte, finding the \n delimiter for patterns. + * Henceforth we iterate per pattern id, not per byte. + */ + patterns_count = 0; + for (const char *p = addr; p - addr < sb.st_size; p++) { + if (*p == '\n') { + patterns_count++; + } + } + + if (show_stats) { + fprintf(stderr, "patterns: %zu\n", patterns_count); + } + } + + if (general_limit == 0) { + general_limit = patterns_count; + } + +// we're keying id -> spelling +// we don't use patterns[] just for debugging, we use it for passing to re_comp(). but not to re_strings(), we pass lit_s there +// note this is \n-terminated, not \0-terminated. need %.*s for printing +// TODO: explain the index here is an endid + + const char **patterns; + + patterns = xalloc(patterns_count * sizeof *patterns); + + // TODO: explain we use the worst case of patterns_count for all these, prior to categorization + declined = xalloc(patterns_count * sizeof *declined); + general = xalloc(patterns_count * sizeof *general); + +// TODO: allocate literals etc, based on worst case of pattern size +// XXX: xalloc()ing these conservatively is large, i don't like it +// literals arrays store lit_s \0-terminated strings. and have parallel arrays for respective endids per literal +// XXX: what to do about literals? we need the lit_s strings for re_strings(). and we need an array of endids for re_strings() + literals_none = xalloc(patterns_count * sizeof *literals_none); + literals_left = xalloc(patterns_count * sizeof *literals_left); + literals_right = xalloc(patterns_count * sizeof *literals_right); + literals_both = xalloc(patterns_count * sizeof *literals_both); + + /* + * On end id numbering: The UI must be one pattern per line, because + * it's the only format that doesn't introduce a syntax with escaping. + * So that means we need to track the line number associated with each + * regex. And that means the line number is neccessarily the end id. + * So patterns[] is indexed by the end id. + * + * We have four whole FSMs from re_strings(), each potentially contains + * a set of end ids. And each regex in general[i] is a single end id. + * So general[i] is the index into patterns[]. + */ + + /* + * Categorize patterns + */ + { + const char *q = addr; + + for (fsm_end_id_t id = 0; id < patterns_count; id++) { + patterns[id] = q; + if (verbose) { + fprintf(stderr, "pattern[%u]: /%.*s/\n", + id, (int) strcspn(q, "\n"), patterns[id]); + } + + categorize(&opt, dialect, flags, strict, verbose, + charset, reject, id, patterns[id]); + + /* + * On error the parser may not have consumed all the input. + * So we can't use the current value of q because it potentially + * points somewhere mid-way along the regex. + */ + q += strcspn(patterns[id], "\n") + 1; + } + } + + /* + * This is conservative allocation, the declined list may still grow yet. + * We track actual usage in fsm_count. + */ + struct fsm **fsms = xalloc((patterns_count - declined_count) * sizeof *fsms); + size_t fsm_count = 0; + + /* + * The order of processing literals and regexps here is arbitrary. + * If this were a threaded program, each set of literals and each + * individual general regex could be handled in parallel. + * + * Also, the order of FSMs in the fsms[] array is not significant, + * they could be populated in any order. They're just going to get + * unioned anyway. + */ + + /* sets of literals */ + { + const struct { + struct literal *a; + size_t n; + enum re_strings_flags flags; + } a[] = { + { literals_none, literals_none_count, 0 }, + { literals_left, literals_left_count, RE_STRINGS_ANCHOR_LEFT }, + { literals_right, literals_right_count, RE_STRINGS_ANCHOR_RIGHT }, + { literals_both, literals_both_count, RE_STRINGS_ANCHOR_LEFT | RE_STRINGS_ANCHOR_RIGHT }, + }; + + for (size_t j = 0; j < sizeof a / sizeof *a; j++) { + if (show_stats) { + fprintf(stderr, "literals[%zu].n = %zu\n", j, a[j].n); + } + + struct fsm *fsm = build_literals_fsm(&opt, show_stats, charset, + a[j].a, a[j].n, a[j].flags); + assert(fsm != NULL); + + fsms[fsm_count] = fsm; + fsm_count++; + } + } + + free(literals_none); + free(literals_left); + free(literals_right); + free(literals_both); + + /* individual regexps */ + { + for (size_t i = 0; i < general_count; i++) { + if (i > general_limit) { + append_id(declined, &declined_count, i); + continue; + } + + if (verbose) { + fprintf(stderr, "general[%zu]: #%u /%.*s/\n", + i, general[i], (int) strcspn(patterns[general[i]], "\n"), patterns[general[i]]); + } + + struct fsm *fsm = build_regex_fsm(&opt, show_stats, charset, + dialect, strict, + patterns[general[i]], flags, general[i]); + if (fsm == NULL) { + append_id(declined, &declined_count, i); + continue; + } + + fsms[fsm_count] = fsm; + fsm_count++; + } + + if (general_count > general_limit) { + general_count = general_limit; + } + } + + free(general); + + if (show_stats) { + fprintf(stderr, "declined: %zu patterns\n", + declined_count); + fprintf(stderr, "literals_none: %zu patterns, %u states\n", + literals_none_count, fsm_countstates(fsms[0])); + fprintf(stderr, "literals_left: %zu patterns, %u states\n", + literals_left_count, fsm_countstates(fsms[1])); + fprintf(stderr, "literals_right: %zu patterns, %u states\n", + literals_right_count, fsm_countstates(fsms[2])); + fprintf(stderr, "literals_both: %zu patterns, %u states\n", + literals_both_count, fsm_countstates(fsms[3])); + fprintf(stderr, "general: %zu patterns (limit %zu)\n", + general_count, general_limit); + } + + if (verbose) { + for (size_t i = 0; i < declined_count; i++) { + fprintf(stderr, "declined[%zu]: #%u /%.*s/\n", + i, declined[i], + (int) strcspn(patterns[declined[i]], "\n"), patterns[declined[i]]); + } + } + +// TODO: dump to file, handle these with pcre or such +// TODO: explain you're expected to disambiguate by pattern spelling, the endids are not relevant here + if (declined_file != NULL) { + FILE *f; + + f = fopen(declined_file, "w"); + if (f == NULL) { + perror(declined_file); + exit(EXIT_FAILURE); + } + + for (size_t i = 0; i < declined_count; i++) { + fprintf(f, "%.*s\n", (int) strcspn(patterns[declined[i]], "\n"), patterns[declined[i]]); + } + + fclose(f); + } + + free(declined); + free(patterns); + + if (-1 == munmap(addr, sb.st_size)) { + perror("munmap"); + exit(EXIT_FAILURE); + } + + if (-1 == close(fd)) { + perror("close"); + exit(EXIT_FAILURE); + } + + if (show_stats) { + fprintf(stderr, "fsm_count = %zu FSMs prior to union\n", fsm_count); + } + + if (verbose) { + for (size_t i = 0; i < fsm_count; i++) { + fprintf(stderr, "fsm[%zu] = %u states\n", i, fsm_countstates(fsms[i])); + } + } + + /* + * Union down and codegen. + */ + { + struct fsm *fsm; + + /* + * fsm_union_array introduces epsilons. + * If we construct anchored DFA and deal with ^.* ourselves here, + * we can union union without introducing epislons. + * This should reduce pressure on fsm_determinise() below. + */ + + fsm = fsm_union_array(fsm_count, fsms, NULL); + if (fsm == NULL) { + perror("fsm_union_array"); + exit(EXIT_FAILURE); + } + + free(fsms); + + if (show_stats) { + fprintf(stderr, "nfa: %u states\n", fsm_countstates(fsm)); + } + + /* + * We determinise but do not minimise the single DFA, like in lx. + * This keeps end states (and therefore end ids) unique. But unlike lx, + * end ids don't overlap between patterns (because they represent lines + * in the input file). So there isn't an advantage to minimising here. + */ + if (!fsm_determinise(fsm)) { + perror("fsm_determinise"); + exit(EXIT_FAILURE); + } + + if (show_stats) { + fprintf(stderr, "dfa: %u states\n", fsm_countstates(fsm)); + } + + if (!quiet) { + print_fsm_c(fsm, opt.prefix); + } + + fsm_free(fsm); + } + +#ifdef __linux__ + if (show_stats) { + struct rusage ru; + + if (getrusage(RUSAGE_SELF, &ru) != 0) { + perror("getrusage"); + exit(EXIT_FAILURE); + } + + fprintf(stderr, "rusage.utime: %g\n", (double) ru.ru_utime.tv_sec + (double) ru.ru_utime.tv_usec / 1000000); + fprintf(stderr, "rusage.stime: %g\n", (double) ru.ru_stime.tv_sec + (double) ru.ru_stime.tv_usec / 1000000); + fprintf(stderr, "rusage.maxrss: %zu MiB\n", (size_t) ru.ru_maxrss / 1024); + } +#endif + +// TODO: also free lit_s +} +