From 78642c36ae0b48ee2f51100e37e3cb2e1e95eab1 Mon Sep 17 00:00:00 2001
From: Scott Vokes <svokes@fastly.com>
Date: Wed, 3 Jan 2024 16:05:40 -0500
Subject: [PATCH 1/6] hash_id: Rather than adding the IDs, combine into one
 uint64_t.

- hash_id should take a uint64_t argument, rather than unsigned.

- Instead of adding them or hashing them separately and combining,
  pack both into the uint64_t argument for hash_id, since each is
  a 32-bit ID. Further experimentation supports that this has
  better collision behavior.
---
 include/adt/hash.h       |  2 +-
 src/libfsm/determinise.c | 13 +++----------
 2 files changed, 4 insertions(+), 11 deletions(-)

diff --git a/include/adt/hash.h b/include/adt/hash.h
index 74ee4a890..c2b7a0683 100644
--- a/include/adt/hash.h
+++ b/include/adt/hash.h
@@ -17,7 +17,7 @@
 
 SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW()
 static __inline__ uint64_t
-hash_id(unsigned id)
+hash_id(uint64_t id)
 {
 	/* xorshift* A1(12,25,27),
 	 * from http://vigna.di.unimi.it/ftp/papers/xorshift.pdf */
diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c
index b6c688ac4..5f6a984bc 100644
--- a/src/libfsm/determinise.c
+++ b/src/libfsm/determinise.c
@@ -1645,16 +1645,9 @@ hash_pair(fsm_state_t a, fsm_state_t b)
 	assert(b & RESULT_BIT);
 	a &=~ RESULT_BIT;
 	b &=~ RESULT_BIT;
-
-	/* Don't hash a and b separately and combine them with
-	 * hash_id, because it's common to have adjacent pairs of
-	 * result IDs, and with how hash_id works that leads to
-	 * multiples of similar hash values bunching up.
-	 *
-	 * This could be replaced with a better hash function later,
-	 * but use LOG_CACHE_HTAB to ensure there aren't visually obvious
-	 * runs of collisions appearing in the tables. */
-	const uint64_t res = hash_id(a + b);
+	assert(a != b);
+	const uint64_t ab = ((uint64_t)a << 32) | (uint64_t)b;
+	const uint64_t res = hash_id(ab);
 	/* fprintf(stderr, "%s: a %d, b %d -> %016lx\n", __func__, a, b, res); */
 	return res;
 }

From 78e0082d67ddd7eb49da54d66c9335b4f1ba61cf Mon Sep 17 00:00:00 2001
From: Scott Vokes <svokes@fastly.com>
Date: Thu, 4 Jan 2024 14:14:46 -0500
Subject: [PATCH 2/6] stateset: Remove unused `state_set_hash`.

---
 include/adt/stateset.h |  3 ---
 src/adt/stateset.c     | 16 ----------------
 2 files changed, 19 deletions(-)

diff --git a/include/adt/stateset.h b/include/adt/stateset.h
index 83e835467..3ec91794b 100644
--- a/include/adt/stateset.h
+++ b/include/adt/stateset.h
@@ -72,8 +72,5 @@ state_set_rebase(struct state_set **set, fsm_state_t base);
 void
 state_set_replace(struct state_set **set, fsm_state_t old, fsm_state_t new);
 
-unsigned long
-state_set_hash(const struct state_set *set);
-
 #endif
 
diff --git a/src/adt/stateset.c b/src/adt/stateset.c
index 8d73a9b5f..c2cc8b8c0 100644
--- a/src/adt/stateset.c
+++ b/src/adt/stateset.c
@@ -668,19 +668,3 @@ state_set_replace(struct state_set **setp, fsm_state_t old, fsm_state_t new)
 		}
 	}
 }
-
-unsigned long
-state_set_hash(const struct state_set *set)
-{
-	if (set == NULL) {
-		return 0;	/* empty */
-	}
-
-	if (IS_SINGLETON(set)) {
-		fsm_state_t state;
-		state = SINGLETON_DECODE(set);
-		return hashrec(&state, sizeof state);
-	}
-
-	return hashrec(set->a, set->i * sizeof *set->a);
-}

From dfab0deb71ae48b86c8ff60bfae1fe224f4bbf22 Mon Sep 17 00:00:00 2001
From: Scott Vokes <svokes@fastly.com>
Date: Thu, 4 Jan 2024 14:42:42 -0500
Subject: [PATCH 3/6] Add include/adt/hash.h `hash_ids` and consolidate uses.

- Add a function to hash a series of IDs, rather than doing it with
  a loop combining the intermediate hashes in inconsistent ways.

- Add a #define to log a probe count and a limit to assert, and
  rework the hashing code control flow for tracking/asserting on
  the probe count.

- Remove `hash_fnv1a_64` from a logging statement (normally not
  compiled), since that's the only remaining use remove the FNV1a
  functions from hash.h.

- Remove FSM_PHI_32 and FSM_PHI_64, now unused.

- parser.act still uses 32-bit FNV1a, but that's for character-by-
  character hashing, so that still makes sense, and it has its
  own copy of fnv1a.
---
 include/adt/hash.h         | 39 +++++++----------------------------
 src/adt/idmap.c            | 27 +++++++++++++++++-------
 src/adt/internedstateset.c | 18 ++++++++--------
 src/libfsm/determinise.c   | 42 ++++++++++++++++++++++++++------------
 4 files changed, 65 insertions(+), 61 deletions(-)

diff --git a/include/adt/hash.h b/include/adt/hash.h
index c2b7a0683..bd1cf2907 100644
--- a/include/adt/hash.h
+++ b/include/adt/hash.h
@@ -10,10 +10,10 @@
 #include <stdint.h>
 
 #include "adt/common.h"
+#include "fsm/fsm.h"
 
-/* 32 and 64-bit approximations of the golden ratio. */
-#define FSM_PHI_32 0x9e3779b9UL
-#define FSM_PHI_64 (uint64_t)0x9e3779b97f4a7c15UL
+#define HASH_LOG_PROBES 0
+/* #define HASH_PROBE_LIMIT 100 */
 
 SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW()
 static __inline__ uint64_t
@@ -28,38 +28,13 @@ hash_id(uint64_t id)
 	return x * 2685821657736338717LLU;
 }
 
-/* FNV-1a hash function, 32 and 64 bit versions. This is in the public
- * domain. For details, see:
- *
- *     http://www.isthe.com/chongo/tech/comp/fnv/index.html
- */
-
-SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW()
-static __inline__ uint32_t
-hash_fnv1a_32(const uint8_t *buf, size_t length)
-{
-#define FNV1a_32_OFFSET_BASIS 	0x811c9dc5UL
-#define FNV1a_32_PRIME		0x01000193UL
-	uint32_t h = FNV1a_32_OFFSET_BASIS;
-	size_t i;
-	for (i = 0; i < length; i++) {
-		h ^= buf[i];
-		h *= FNV1a_32_PRIME;
-	}
-	return h;
-}
-
 SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW()
 static __inline__ uint64_t
-hash_fnv1a_64(const uint8_t *buf, size_t length)
+hash_ids(size_t count, const fsm_state_t *ids)
 {
-#define FNV1a_64_OFFSET_BASIS   0xcbf29ce484222325UL
-#define FNV1a_64_PRIME		0x100000001b3UL
-	uint64_t h = FNV1a_64_OFFSET_BASIS;
-	size_t i;
-	for (i = 0; i < length; i++) {
-		h ^= buf[i];
-		h *= FNV1a_64_PRIME;
+	uint64_t h = 0;
+	for (size_t i = 0; i < count; i++) {
+		h = hash_id(h ^ ids[i]);
 	}
 	return h;
 }
diff --git a/src/adt/idmap.c b/src/adt/idmap.c
index 342ecd136..208af3c5a 100644
--- a/src/adt/idmap.c
+++ b/src/adt/idmap.c
@@ -203,12 +203,15 @@ idmap_set(struct idmap *m, fsm_state_t state_id,
 	}
 
 	const uint64_t mask = m->bucket_count - 1;
-	for (size_t b_i = 0; b_i < m->bucket_count; b_i++) {
+	int res = 0;
+	size_t b_i;
+	for (b_i = 0; b_i < m->bucket_count; b_i++) {
 		struct idmap_bucket *b = &m->buckets[(h + b_i) & mask];
 		if (b->state == state_id) {
 			assert(b->values != NULL);
 			u64bitset_set(b->values, value);
-			return 1;
+			res = 1;
+			goto done;
 		} else if (b->state == NO_STATE) {
 			b->state = state_id;
 			assert(b->values == NULL);
@@ -217,20 +220,30 @@ idmap_set(struct idmap *m, fsm_state_t state_id,
 			b->values = f_calloc(m->alloc,
 			    vw, sizeof(b->values[0]));
 			if (b->values == NULL) {
-				return 0;
+				goto done;
 			}
 			m->buckets_used++;
 
 			u64bitset_set(b->values, value);
-			return 1;
+			res = 1;
+			goto done;
 		} else {
 			continue; /* collision */
 		}
-
 	}
 
-	assert(!"unreachable");
-	return 0;
+done:
+#if HASH_LOG_PROBES
+	fprintf(stderr, "%s: res %d after %zu probes (%u buckets)\n",
+	    __func__, res, b_i, m->bucket_count);
+#endif
+
+#ifdef HASH_PROBE_LIMIT
+	assert(b_i < HASH_PROBE_LIMIT);
+#endif
+	assert(b_i < m->bucket_count);
+
+	return res;
 }
 
 static const struct idmap_bucket *
diff --git a/src/adt/internedstateset.c b/src/adt/internedstateset.c
index a39f09a19..9daf50ae7 100644
--- a/src/adt/internedstateset.c
+++ b/src/adt/internedstateset.c
@@ -197,15 +197,10 @@ dump_tables(FILE *f, const struct interned_state_set_pool *pool)
 #endif
 }
 
-SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW()
 static uint64_t
 hash_state_ids(size_t count, const fsm_state_t *ids)
 {
-	uint64_t h = 0;
-	for (size_t i = 0; i < count; i++) {
-		h ^= hash_id(ids[i]);
-	}
-	return h;
+	return hash_ids(count, ids);
 }
 
 static bool
@@ -329,8 +324,13 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool,
 		fprintf(stderr, "%s: htab[(0x%lx + %lu) & 0x%lx => %d\n",
 		    __func__, h, b_i, mask, *b);
 #endif
+
+#ifdef HASH_PROBE_LIMIT
+		assert(probes < HASH_PROBE_LIMIT);
+#endif
+
 		if (*b == NO_ID) {
-#if LOG_ISS > 3
+#if LOG_ISS > 3 || HASH_LOG_PROBES
 			fprintf(stderr, "%s: empty bucket (%zd probes)\n", __func__, probes);
 #endif
 			dst_bucket = b;
@@ -351,7 +351,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool,
 
 		if (0 == memcmp(states, buf, s->length * sizeof(buf[0]))) {
 			*result = id;
-#if LOG_ISS > 3
+#if LOG_ISS > 3 || HASH_LOG_PROBES
 			fprintf(stderr, "%s: reused %u (%zd probes)\n", __func__, id, probes);
 #endif
 			return true;
@@ -362,7 +362,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool,
 	}
 	assert(dst_bucket != NULL);
 
-#if LOG_ISS > 3
+#if LOG_ISS > 3 || HASH_LOG_PROBES
 	fprintf(stderr, "%s: miss after %zd probes\n", __func__, probes);
 #else
         (void)probes;
diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c
index 5f6a984bc..582d769f6 100644
--- a/src/libfsm/determinise.c
+++ b/src/libfsm/determinise.c
@@ -396,7 +396,7 @@ hash_iss(interned_state_set_id iss)
 {
 	/* Just hashing the ID directly is fine here -- since they're
 	 * interned, they're identified by pointer equality. */
-	return FSM_PHI_64 * (uintptr_t)iss;
+	return hash_id((uintptr_t)iss);
 }
 
 static struct mapping *
@@ -1309,15 +1309,10 @@ build_output_from_cached_analysis(struct analyze_closures_env *env, fsm_state_t
 
 #define LOG_TO_SET_HTAB 0
 
-SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW()
 static uint64_t
 to_set_hash(size_t count, const fsm_state_t *ids)
 {
-	uint64_t h = hash_id(count);
-	for (size_t i = 0; i < count; i++) {
-		h ^= hash_id(ids[i]);
-	}
-	return h;
+	return hash_ids(count, ids);
 }
 
 static int
@@ -1333,15 +1328,14 @@ to_set_htab_check(struct analyze_closures_env *env,
 		return 0;
 	}
 
-
-#if LOG_TO_SET_HTAB
+#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT)
 	size_t probes = 0;
 #endif
 	int res = 0;
 
 	const uint64_t mask = bcount - 1;
 	for (size_t b_i = 0; b_i < bcount; b_i++) {
-#if LOG_TO_SET_HTAB
+#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT)
 		probes++;
 #endif
 		const struct to_set_bucket *b = &htab->buckets[(h + b_i) & mask];
@@ -1364,10 +1358,16 @@ to_set_htab_check(struct analyze_closures_env *env,
 	}
 
 done:
-#if LOG_TO_SET_HTAB
+#if LOG_TO_SET_HTAB || HASH_LOG_PROBES
 	fprintf(stderr, "%s: result %d, %zu probes, htab: used %zu/%zu ceil\n",
 	    __func__, res, probes, htab->buckets_used, htab->bucket_count);
 #endif
+#ifdef HASH_PROBE_LIMIT
+	if (probes >= HASH_PROBE_LIMIT) {
+		fprintf(stderr, "-- %zd probes, limit exceeded\n", probes);
+	}
+	assert(probes < HASH_PROBE_LIMIT);
+#endif
 
 	return res;
 }
@@ -1431,6 +1431,22 @@ to_set_htab_save(struct analyze_closures_env *env,
 			b->count = count;
 			b->offset = offset;
 			htab->buckets_used++;
+#if HASH_LOG_PROBES
+			fprintf(stderr, "%s: [", __func__);
+			const fsm_state_t *ids = &env->to_sets.buf[offset];
+			for (size_t i = 0; i < count; i++) {
+				fprintf(stderr, "%s%d",
+				    i > 0 ? " " : "", ids[i]);
+			}
+
+			fprintf(stderr, "] -> hash %lx -> b %zu (%zu/%zu used), %zd probes\n",
+			    hash, (hash + b_i) & mask,
+			    htab->buckets_used, htab->bucket_count, b_i);
+#endif
+#if HASH_PROBE_LIMIT
+			assert(b_i < HASH_PROBE_LIMIT);
+#endif
+
 			return 1;
 		} else if (b->count == count) {
 			assert(b->offset != offset); /* no duplicates */
@@ -1586,8 +1602,8 @@ commit_buffered_result(struct analyze_closures_env *env, uint32_t *cache_result_
 		memcpy(&nr->entries[0], env->results.buffer.entries,
 		    nr->count * sizeof(env->results.buffer.entries[0]));
 #if LOG_GROUPING || LOG_COMMIT_BUFFERED_GROUP
-		fprintf(stderr, "%s: alloc %zu, hash 0x%016lx\n",
-		    __func__, alloc_sz, hash_fnv1a_64((const uint8_t *)nr, alloc_sz));
+		fprintf(stderr, "%s: alloc %zu, count %u\n",
+		    __func__, alloc_sz, nr->count));
 #endif
 	}
 

From 9cba5601a60145312eda4798f8fe7891119977b2 Mon Sep 17 00:00:00 2001
From: Scott Vokes <svokes@fastly.com>
Date: Fri, 5 Jan 2024 09:26:53 -0500
Subject: [PATCH 4/6] hash_pair: Ensure pair order is consistent for hashing.

---
 src/libfsm/determinise.c | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c
index 582d769f6..c0bca3b64 100644
--- a/src/libfsm/determinise.c
+++ b/src/libfsm/determinise.c
@@ -1659,10 +1659,14 @@ hash_pair(fsm_state_t a, fsm_state_t b)
 	assert(a != b);
 	assert(a & RESULT_BIT);
 	assert(b & RESULT_BIT);
-	a &=~ RESULT_BIT;
-	b &=~ RESULT_BIT;
-	assert(a != b);
-	const uint64_t ab = ((uint64_t)a << 32) | (uint64_t)b;
+	const uint64_t ma = (uint64_t)(a & ~RESULT_BIT); /* m: masked */
+	const uint64_t mb = (uint64_t)(b & ~RESULT_BIT);
+	assert(ma != mb);
+
+	/* Left-shift the smaller ID, so the pair order is consistent for hashing. */
+	const uint64_t ab = (ma < mb)
+	    ? ((ma << 32) | mb)
+	    : ((mb << 32) | ma);
 	const uint64_t res = hash_id(ab);
 	/* fprintf(stderr, "%s: a %d, b %d -> %016lx\n", __func__, a, b, res); */
 	return res;

From f39390bbd766f78e801b7028c8866f8a1e14439b Mon Sep 17 00:00:00 2001
From: Scott Vokes <svokes@fastly.com>
Date: Fri, 5 Jan 2024 09:30:36 -0500
Subject: [PATCH 5/6] hash_ids: Add EXPENSIVE_CHECK for ids[] being in
 ascending order.

---
 include/adt/hash.h | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/include/adt/hash.h b/include/adt/hash.h
index bd1cf2907..65c8dc897 100644
--- a/include/adt/hash.h
+++ b/include/adt/hash.h
@@ -12,6 +12,10 @@
 #include "adt/common.h"
 #include "fsm/fsm.h"
 
+#if EXPENSIVE_CHECKS
+#include <assert.h>
+#endif
+
 #define HASH_LOG_PROBES 0
 /* #define HASH_PROBE_LIMIT 100 */
 
@@ -35,6 +39,11 @@ hash_ids(size_t count, const fsm_state_t *ids)
 	uint64_t h = 0;
 	for (size_t i = 0; i < count; i++) {
 		h = hash_id(h ^ ids[i]);
+#if EXPENSIVE_CHECKS
+		if (i > 0) {
+			assert(ids[i-1] <= ids[i]);
+		}
+#endif
 	}
 	return h;
 }

From 89e3b88bf8715b9594897bf135cbc68435f7daf4 Mon Sep 17 00:00:00 2001
From: Scott Vokes <svokes@fastly.com>
Date: Fri, 5 Jan 2024 14:17:45 -0500
Subject: [PATCH 6/6] determinise: Ensure that the analysis_cache pair ordering
 is consistent.

---
 src/libfsm/determinise.c | 29 ++++++++++++++++++++++++++---
 1 file changed, 26 insertions(+), 3 deletions(-)

diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c
index c0bca3b64..7c5612b36 100644
--- a/src/libfsm/determinise.c
+++ b/src/libfsm/determinise.c
@@ -1672,6 +1672,15 @@ hash_pair(fsm_state_t a, fsm_state_t b)
 	return res;
 }
 
+static int eq_bucket_pair(const struct result_pair_bucket *b, fsm_state_t pa, fsm_state_t pb)
+{
+	if ((pa & ~RESULT_BIT) < (pb & ~RESULT_BIT)) {
+		return b->ids[0] == pa && b->ids[1] == pb;
+	} else {
+		return b->ids[0] == pb && b->ids[1] == pa;
+	}
+}
+
 static int
 analysis_cache_check_pair(struct analyze_closures_env *env, fsm_state_t pa, fsm_state_t pb,
 	fsm_state_t *result_id)
@@ -1697,7 +1706,7 @@ analysis_cache_check_pair(struct analyze_closures_env *env, fsm_state_t pa, fsm_
 		struct result_pair_bucket *b = &htab->buckets[(h + i) & mask];
 		if (b->t == RPBT_UNUSED) {
 			break;	/* not found */
-		} else if (b->t == RPBT_PAIR && b->ids[0] == pa && b->ids[1] == pb) {
+		} else if (b->t == RPBT_PAIR && eq_bucket_pair(b, pa, pb)) {
 			*result_id = b->result_id; /* hit */
 #if LOG_GROUPING > 1
 			fprintf(stderr, "%s: cache hit for pair { %d_R, %d_R } => %d\n",
@@ -1831,6 +1840,7 @@ pair_cache_save(struct analyze_closures_env *env,
 		assert(pa != pb);
 		assert(pa & RESULT_BIT);
 		assert(pb & RESULT_BIT);
+		assert((pa & ~RESULT_BIT) < (pb & ~RESULT_BIT));
 	}
 
 #if LOG_GROUPING > 1
@@ -1868,6 +1878,8 @@ pair_cache_save(struct analyze_closures_env *env,
 				    : hash_pair(ob->ids[0], ob->ids[1]));
 				if (ob->t == RPBT_SINGLE_STATE) {
 					assert(ob->ids[0] == ob->ids[1]);
+				} else {
+					assert((ob->ids[0] & ~RESULT_BIT) < (ob->ids[1] & ~RESULT_BIT));
 				}
 
 				for (size_t nb_i = 0; nb_i < ncount; nb_i++) {
@@ -1902,8 +1914,9 @@ pair_cache_save(struct analyze_closures_env *env,
 #endif
 
 		if (b->t == RPBT_UNUSED) { /* empty */
-			b->ids[0] = pa;
-			b->ids[1] = pb;
+			/* Ensure pair ordering is consistent */
+			b->ids[0] = pa < pb ? pa : pb;
+			b->ids[1] = pa < pb ? pb : pa;
 			b->t = type;
 			b->result_id = result_id;
 			htab->buckets_used++;
@@ -1947,6 +1960,16 @@ analysis_cache_save_pair(struct analyze_closures_env *env,
 	fsm_state_t a, fsm_state_t b, fsm_state_t result_id)
 {
 	assert(a != b);
+	assert((a & RESULT_BIT) == 0);
+	assert((b & RESULT_BIT) == 0);
+
+	if (a > b) {		/* if necessary, swap to ensure a < b */
+		const fsm_state_t tmp = a;
+		a = b;
+		b = tmp;
+	}
+	assert(a < b);
+
 	return pair_cache_save(env, RPBT_PAIR, a | RESULT_BIT, b | RESULT_BIT, result_id);
 }