From e2b97a6bc049ea6f157ed18455c85859def5990d Mon Sep 17 00:00:00 2001 From: Kate F Date: Sat, 25 May 2024 11:13:44 +0100 Subject: [PATCH] First cut at introducing fsm_intersect_charset(), exposed as fsm -U. --- include/fsm/bool.h | 8 ++++ man/fsm.1/fsm.1.xml | 21 ++++++++++ src/fsm/main.c | 32 +++++++++++++- src/libfsm/Makefile | 1 + src/libfsm/charset.c | 95 ++++++++++++++++++++++++++++++++++++++++++ src/libfsm/libfsm.syms | 1 + 6 files changed, 157 insertions(+), 1 deletion(-) create mode 100644 src/libfsm/charset.c diff --git a/include/fsm/bool.h b/include/fsm/bool.h index d35d1c425..d92518297 100644 --- a/include/fsm/bool.h +++ b/include/fsm/bool.h @@ -55,6 +55,14 @@ fsm_union_array(size_t fsm_count, struct fsm * fsm_intersect(struct fsm *a, struct fsm *b); +/* + * A convenience to intersect against a character set, rather than + * a pre-existing FSM. Unlike fsm_intersect(), the FSM is required + * to be a DFA. + */ +struct fsm * +fsm_intersect_charset(struct fsm *a, size_t n, const char *charset); + /* * Subtract b from a. This is not commutative. */ diff --git a/man/fsm.1/fsm.1.xml b/man/fsm.1/fsm.1.xml index 117e8966f..6df32e186 100644 --- a/man/fsm.1/fsm.1.xml +++ b/man/fsm.1/fsm.1.xml @@ -22,6 +22,7 @@ io"> iterations"> length"> + charset"> -a"> -w"> @@ -32,6 +33,7 @@ -G &length.arg;"> -k &io.arg;"> -i &iterations.arg;"> + -U &charset.arg;"> -X"> -d"> @@ -75,6 +77,7 @@ fsm &x.opt; + &U.opt; &str.arg; @@ -83,6 +86,7 @@ fsm &p.opt; + &U.opt; &l.opt; &a.opt; @@ -103,6 +107,7 @@ &r.opt; &t.opt; + &U.opt; &i.opt; @@ -111,6 +116,7 @@ fsm &q.opt; + &U.opt; &file.arg; @@ -190,6 +196,21 @@ + + &U.opt; + + + Intersect the resulting state machine down to a + given character set. + This is done after any transformations are applied + (e.g. by ). + + The default character set is a byte. + It is not possible to specify a particular character set + that includes a literal \0. 
+
+
+
 &w.opt;
diff --git a/src/fsm/main.c b/src/fsm/main.c
index 8e668b779..4e0038bcf 100644
--- a/src/fsm/main.c
+++ b/src/fsm/main.c
@@ -368,6 +368,7 @@ main(int argc, char *argv[])
 	double elapsed;
 	fsm_print *print;
 	enum op op;
+	const char *charset = NULL; /* stays NULL (no intersection) unless -U is given */
 	struct fsm *fsm;
 	int xfiles;
 	int r;
@@ -395,7 +396,7 @@ main(int argc, char *argv[])
 	{
 		int c;
 
-		while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:EW:"), c != -1) {
+		while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:EU:W:"), c != -1) {
 			switch (c) {
 			case 'a': opt.anonymous_states = 1; break;
 			case 'c': opt.consolidate_edges = 1; break;
@@ -421,6 +422,11 @@ main(int argc, char *argv[])
 			case 'r': op = op_name("reverse"); break;
 			case 't': op = op_name(optarg); break;
 			case 'E': op = op_name("remove_epsilons"); break;
+
+			case 'U':
+				charset = optarg;
+				break;
+
 			case 'W':
 				/* print = gen_words; */
 				/* num_words = strtoul(optarg, NULL, 10); */
@@ -428,6 +434,7 @@ main(int argc, char *argv[])
 				fprintf(stderr, "not yet implemented.\n");
 				exit(EXIT_FAILURE);
 				break;
+
 			case 'G':
 				generate_bounds = strtoul(optarg, NULL, 10);
 				if (generate_bounds == 0) {
@@ -548,6 +555,29 @@ main(int argc, char *argv[])
 			q = NULL;
 		}
 
+		/*
+		 * It might be more efficient to intersect the character set for each
+		 * operand, but that gives a different result for some operations
+		 * (complement especially). So since we'd also need to intersect the
+		 * result here too, I'm just doing it in the one place for simplicity.
+		 *
+		 * Passing a NULL charset is a no-op, so the default charset is a byte.
+		 * We can't include \0 here because optarg is a string, and I don't
+		 * want to invent a syntax for character sets.
+		 */
+		if (charset != NULL) {
+			if (!fsm_determinise(q)) {
+				perror("fsm_determinise");
+				exit(EXIT_FAILURE);
+			}
+
+			q = fsm_intersect_charset(q, strlen(charset), charset);
+			if (q == NULL) {
+				perror("fsm_intersect_charset");
+				exit(EXIT_FAILURE);
+			}
+		}
+
 		fsm_to_cleanup = q;
 
 		if (-1 == clock_gettime(CLOCK_MONOTONIC, &post)) {
diff --git a/src/libfsm/Makefile b/src/libfsm/Makefile
index 9af51a5a4..005f7d4c6 100644
--- a/src/libfsm/Makefile
+++ b/src/libfsm/Makefile
@@ -1,6 +1,7 @@
 .include "../../share/mk/top.mk"
 
 SRC += src/libfsm/capture.c
+SRC += src/libfsm/charset.c
 SRC += src/libfsm/collate.c
 SRC += src/libfsm/complete.c
 SRC += src/libfsm/consolidate.c
diff --git a/src/libfsm/charset.c b/src/libfsm/charset.c
new file mode 100644
index 000000000..45f2cac20
--- /dev/null
+++ b/src/libfsm/charset.c
@@ -0,0 +1,95 @@
+/*
+ * Copyright 2024 Katherine Flavel
+ *
+ * See LICENCE for the full copyright terms.
+ */
+
+#include <assert.h>
+#include <stddef.h>
+#include <stdbool.h>
+
+#include <fsm/fsm.h>
+#include <fsm/bool.h>
+#include <fsm/pred.h>
+#include <fsm/walk.h>
+
+#include "internal.h"
+#include "walk2.h"
+
+/*
+ * Intersect a with ^[abc..]*$, where charset holds the n permitted bytes.
+ * a must be a DFA. A NULL charset is a no-op and returns a unchanged.
+ * On success a is freed and a new DFA is returned; on failure NULL is
+ * returned and a is not freed by this function.
+ */
+struct fsm *
+fsm_intersect_charset(struct fsm *a, size_t n, const char *charset)
+{
+	struct fsm *b, *q;
+
+	assert(a != NULL);
+
+	if (charset == NULL) {
+		return a;
+	}
+
+	/*
+	 * Since intersection is destructive, there's no point in making
+	 * the charset FSM in advance and then fsm_clone()ing it.
+	 * We may as well just make a new one each time.
+	 */
+	{
+		fsm_state_t state;
+
+		// TODO: pass .statealloc explicitly, we know it's 1. the default is overkill
+		b = fsm_new(a->opt);
+		if (b == NULL) {
+			return NULL;
+		}
+
+		if (!fsm_addstate(b, &state)) {
+			goto error;
+		}
+
+		for (size_t i = 0; i < n; i++) {
+			if (!fsm_addedge_literal(b, state, state, charset[i])) {
+				goto error;
+			}
+		}
+
+		fsm_setend(b, state, true);
+		fsm_setstart(b, state);
+	}
+
+	assert(fsm_all(a, fsm_isdfa));
+	assert(fsm_all(b, fsm_isdfa));
+
+	/*
+	 * This is equivalent to fsm_intersect(). fsm_intersect() asserts that
+	 * both operands are DFA at runtime. But we know this empirically from
+	 * our callers. So we call fsm_walk2() directly to avoid needlessly
+	 * running the DFA predicate in NDEBUG builds.
+	 *
+	 * This is intersection implemented by walking sets of states through
+	 * both FSM simultaneously, as described by Hopcroft, Motwani and Ullman
+	 * (2001, 2nd ed.) 4.2, Closure under Intersection. The intersection of
+	 * two FSM consists of only those items which are present in _BOTH.
+	 */
+	q = fsm_walk2(a, b, FSM_WALK2_BOTH, FSM_WALK2_BOTH);
+	if (q == NULL) {
+		goto error; /* b was built here, so free it; a stays with the caller */
+	}
+
+	fsm_free(a);
+	fsm_free(b);
+
+	/* walking two DFA produces a DFA */
+	assert(fsm_all(q, fsm_isdfa));
+
+	return q;
+
+error:
+	fsm_free(b);
+	return NULL;
+}
+
diff --git a/src/libfsm/libfsm.syms b/src/libfsm/libfsm.syms
index a2570b8c9..f905e1f04 100644
--- a/src/libfsm/libfsm.syms
+++ b/src/libfsm/libfsm.syms
@@ -3,6 +3,7 @@ fsm_complement
 fsm_union
 fsm_union_array
 fsm_intersect
+fsm_intersect_charset
 
 # <fsm/cost.h>
 fsm_cost_legible