Skip to content

Commit

Permalink
First cut at introducing fsm_intersect_charset(), exposed as fsm -U.
Browse files Browse the repository at this point in the history
  • Loading branch information
katef committed May 25, 2024
1 parent 18816b1 commit 99271af
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 1 deletion.
8 changes: 8 additions & 0 deletions include/fsm/bool.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,14 @@ fsm_union_array(size_t fsm_count,
struct fsm *
fsm_intersect(struct fsm *a, struct fsm *b);

/*
 * A convenience to intersect against a character set, rather than
 * a pre-existing FSM. Unlike fsm_intersect(), the FSM is required
 * to be a DFA.
 *
 * charset points to n characters; exactly n are read, so the array
 * need not be \0-terminated. The result accepts only those strings
 * accepted by a which are made up entirely of characters in charset.
 *
 * Passing a NULL charset is a no-op, and a is returned as-is.
 *
 * On success a is freed and a newly constructed FSM is returned.
 * On error NULL is returned and a is not freed.
 */
struct fsm *
fsm_intersect_charset(struct fsm *a, size_t n, const char *charset);

/*
* Subtract b from a. This is not commutative.
*/
Expand Down
21 changes: 21 additions & 0 deletions man/fsm.1/fsm.1.xml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
<!ENTITY io.arg "<replaceable>io</replaceable>">
<!ENTITY iterations.arg "<replaceable>iterations</replaceable>">
<!ENTITY length.arg "<replaceable>length</replaceable>">
<!ENTITY charset.arg "<replaceable>charset</replaceable>">

<!ENTITY a.opt "<option>-a</option>">
<!ENTITY w.opt "<option>-w</option>">
Expand All @@ -32,6 +33,7 @@
<!ENTITY G.opt "<option>-G</option>&nbsp;&length.arg;">
<!ENTITY k.opt "<option>-k</option>&nbsp;&io.arg;">
<!ENTITY i.opt "<option>-i</option>&nbsp;&iterations.arg;">
<!ENTITY U.opt "<option>-U</option>&nbsp;&charset.arg;">
<!ENTITY X.opt "<option>-X</option>">

<!ENTITY d.opt "<option>-d</option>">
Expand Down Expand Up @@ -75,6 +77,7 @@
<command>fsm</command>

<arg choice="opt">&x.opt;</arg>
<arg choice="opt">&U.opt;</arg>

<arg choice="plain" rep="repeat">&str.arg;</arg>
</cmdsynopsis>
Expand All @@ -83,6 +86,7 @@
<command>fsm</command>

<arg choice="plain">&p.opt;</arg>
<arg choice="opt">&U.opt;</arg>

<arg choice="opt">&l.opt;</arg>
<arg choice="opt">&a.opt;</arg>
Expand All @@ -103,6 +107,7 @@
<arg choice="plain">&r.opt;</arg>
<arg choice="plain">&t.opt;</arg>
</group>
<arg choice="opt">&U.opt;</arg>

<arg choice="opt">&i.opt;</arg>
</cmdsynopsis>
Expand All @@ -111,6 +116,7 @@
<command>fsm</command>

<arg choice="req" rep="repeat">&q.opt;</arg>
<arg choice="opt">&U.opt;</arg>

<arg choice="plain" rep="repeat">&file.arg;</arg>
</cmdsynopsis>
Expand Down Expand Up @@ -190,6 +196,21 @@
</listitem>
</varlistentry>

<varlistentry>
<term>&U.opt;</term>

<listitem>
<para>Intersect the resulting state machine down to a
given character set.
This is done after any transformations are applied
(e.g. by <option>-t</option>).</para>

<para>When this option is not given, the character set defaults to every possible byte value, and no intersection is performed.
It is not possible to specify a particular character set
that includes a literal <literal>\0</literal>.</para>
</listitem>
</varlistentry>

<varlistentry>
<term>&w.opt;</term>

Expand Down
27 changes: 26 additions & 1 deletion src/fsm/main.c
Original file line number Diff line number Diff line change
Expand Up @@ -368,6 +368,7 @@ main(int argc, char *argv[])
double elapsed;
fsm_print *print;
enum op op;
const char *charset = NULL; /* NULL unless -U is given; tested before intersecting, so it must never be left indeterminate */
struct fsm *fsm;
int xfiles;
int r;
Expand Down Expand Up @@ -395,7 +396,7 @@ main(int argc, char *argv[])
{
int c;

while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:EW:"), c != -1) {
while (c = getopt(argc, argv, "h" "aCcgwXe:k:i:" "xpq:l:dG:mrt:EU:W:"), c != -1) {
switch (c) {
case 'a': opt.anonymous_states = 1; break;
case 'c': opt.consolidate_edges = 1; break;
Expand All @@ -421,13 +422,19 @@ main(int argc, char *argv[])
case 'r': op = op_name("reverse"); break;
case 't': op = op_name(optarg); break;
case 'E': op = op_name("remove_epsilons"); break;

case 'U':
charset = optarg;
break;

case 'W':
/* print = gen_words; */
/* num_words = strtoul(optarg, NULL, 10); */
/* XXX: error handling */
fprintf(stderr, "not yet implemented.\n");
exit(EXIT_FAILURE);
break;

case 'G':
generate_bounds = strtoul(optarg, NULL, 10);
if (generate_bounds == 0) {
Expand Down Expand Up @@ -548,6 +555,24 @@ main(int argc, char *argv[])
q = NULL;
}

/*
* It might be more efficient to intersect the character set for each
* operand, but that gives a different result for some operations
* (complement especially). So since we'd also need to intersect the
* result here too, I'm just doing it in the one place for simplicity.
*
* Passing a NULL charset is a no-op, so the default charset is a byte.
* We can't include \0 here because optarg is a string, and I don't
* want to invent a syntax for character sets.
*/
if (charset != NULL) {
q = fsm_intersect_charset(q, strlen(charset), charset);
if (q == NULL) {
perror("fsm_intersect_charset");
exit(EXIT_FAILURE);
}
}

fsm_to_cleanup = q;

if (-1 == clock_gettime(CLOCK_MONOTONIC, &post)) {
Expand Down
1 change: 1 addition & 0 deletions src/libfsm/Makefile
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
.include "../../share/mk/top.mk"

SRC += src/libfsm/capture.c
SRC += src/libfsm/charset.c
SRC += src/libfsm/collate.c
SRC += src/libfsm/complete.c
SRC += src/libfsm/consolidate.c
Expand Down
95 changes: 95 additions & 0 deletions src/libfsm/charset.c
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/*
* Copyright 2024 Katherine Flavel
*
* See LICENCE for the full copyright terms.
*/

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

#include <fsm/fsm.h>
#include <fsm/bool.h>
#include <fsm/pred.h>
#include <fsm/walk.h>

#include "internal.h"
#include "walk2.h"

/*
 * Intersect the DFA a against a character set. The result accepts only
 * those strings of a which consist entirely of the n characters listed
 * in charset; that is, a is intersected with ^[abc..]*$
 *
 * Exactly n characters are read from charset, so it need not be
 * \0-terminated.
 *
 * A NULL charset is a no-op, and a is returned as-is.
 *
 * On success a is freed and a newly constructed FSM is returned.
 * On error NULL is returned; a is left untouched for the caller to
 * deal with, and the temporary charset FSM is freed here.
 */
struct fsm *
fsm_intersect_charset(struct fsm *a, size_t n, const char *charset)
{
	struct fsm *b, *q;

	assert(a != NULL);

	if (charset == NULL) {
		return a;
	}

	/*
	 * Since intersection is destructive, there's no point in making
	 * the charset FSM in advance and then fsm_clone()ing it.
	 * We may as well just make a new one each time.
	 *
	 * The charset FSM is a single state which is both the start and
	 * an end state, with one self-edge per character in the set.
	 */
	{
		fsm_state_t state;

		// TODO: pass .statealloc explicitly, we know it's 1. the default is overkill
		b = fsm_new(a->opt);
		if (b == NULL) {
			return NULL;
		}

		if (!fsm_addstate(b, &state)) {
			goto error;
		}

		for (size_t i = 0; i < n; i++) {
			if (!fsm_addedge_literal(b, state, state, charset[i])) {
				goto error;
			}
		}

		fsm_setend(b, state, true);
		fsm_setstart(b, state);
	}

	// TODO: document prerequisites
	assert(fsm_all(a, fsm_isdfa));
	assert(fsm_all(b, fsm_isdfa));

	/*
	 * This is equivalent to fsm_intersect(). fsm_intersect() asserts that
	 * both operands are DFA at runtime. But we know this empirically from
	 * our callers. So we call fsm_walk2() directly to avoid needlessly
	 * running the DFA predicate in NDEBUG builds.
	 *
	 * This is intersection implemented by walking sets of states through
	 * both FSM simultaneously, as described by Hopcroft, Motwani and Ullman
	 * (2001, 2nd ed.) 4.2, Closure under Intersection.
	 *
	 * The intersection of two FSM consists of only those items which
	 * are present in _BOTH.
	 */
	q = fsm_walk2(a, b, FSM_WALK2_BOTH, FSM_WALK2_BOTH);
	if (q == NULL) {
		/*
		 * b is private to this function and the caller has no way
		 * to reach it, so it must be freed here rather than leaked.
		 */
		goto error;
	}

	/* both operands are consumed only once the walk has succeeded */
	fsm_free(a);
	fsm_free(b);

	/* walking two DFA produces a DFA */
	assert(fsm_all(q, fsm_isdfa));

	return q;

error:

	fsm_free(b);

	return NULL;
}

1 change: 1 addition & 0 deletions src/libfsm/libfsm.syms
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ fsm_complement
fsm_union
fsm_union_array
fsm_intersect
fsm_intersect_charset

# <fsm/cost.h>
fsm_cost_legible
Expand Down

0 comments on commit 99271af

Please sign in to comment.