From 78642c36ae0b48ee2f51100e37e3cb2e1e95eab1 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Wed, 3 Jan 2024 16:05:40 -0500 Subject: [PATCH 1/6] hash_id: Rather than adding the IDs, combine into one uint64_t. - hash_id should take a uint64_t argument, rather than unsigned. - Instead of adding them or hashing them separately and combining, pack both into the uint64_t argument for hash_id, since each is a 32-bit ID. Further experimentation supports that this has better collision behavior. --- include/adt/hash.h | 2 +- src/libfsm/determinise.c | 13 +++---------- 2 files changed, 4 insertions(+), 11 deletions(-) diff --git a/include/adt/hash.h b/include/adt/hash.h index 74ee4a890..c2b7a0683 100644 --- a/include/adt/hash.h +++ b/include/adt/hash.h @@ -17,7 +17,7 @@ SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t -hash_id(unsigned id) +hash_id(uint64_t id) { /* xorshift* A1(12,25,27), * from http://vigna.di.unimi.it/ftp/papers/xorshift.pdf */ diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index b6c688ac4..5f6a984bc 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -1645,16 +1645,9 @@ hash_pair(fsm_state_t a, fsm_state_t b) assert(b & RESULT_BIT); a &=~ RESULT_BIT; b &=~ RESULT_BIT; - - /* Don't hash a and b separately and combine them with - * hash_id, because it's common to have adjacent pairs of - * result IDs, and with how hash_id works that leads to - * multiples of similar hash values bunching up. - * - * This could be replaced with a better hash function later, - * but use LOG_CACHE_HTAB to ensure there aren't visually obvious - * runs of collisions appearing in the tables. */ - const uint64_t res = hash_id(a + b); + assert(a != b); + const uint64_t ab = ((uint64_t)a << 32) | (uint64_t)b; + const uint64_t res = hash_id(ab); /* fprintf(stderr, "%s: a %d, b %d -> %016lx\n", __func__, a, b, res); */ return res; } From 78e0082d67ddd7eb49da54d66c9335b4f1ba61cf Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 4 Jan 2024 14:14:46 -0500 Subject: [PATCH 2/6] stateset: Remove unused `state_set_hash`. --- include/adt/stateset.h | 3 --- src/adt/stateset.c | 16 ---------------- 2 files changed, 19 deletions(-) diff --git a/include/adt/stateset.h b/include/adt/stateset.h index 83e835467..3ec91794b 100644 --- a/include/adt/stateset.h +++ b/include/adt/stateset.h @@ -72,8 +72,5 @@ state_set_rebase(struct state_set **set, fsm_state_t base); void state_set_replace(struct state_set **set, fsm_state_t old, fsm_state_t new); -unsigned long -state_set_hash(const struct state_set *set); - #endif diff --git a/src/adt/stateset.c b/src/adt/stateset.c index 8d73a9b5f..c2cc8b8c0 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -668,19 +668,3 @@ state_set_replace(struct state_set **setp, fsm_state_t old, fsm_state_t new) } } } - -unsigned long -state_set_hash(const struct state_set *set) -{ - if (set == NULL) { - return 0; /* empty */ - } - - if (IS_SINGLETON(set)) { - fsm_state_t state; - state = SINGLETON_DECODE(set); - return hashrec(&state, sizeof state); - } - - return hashrec(set->a, set->i * sizeof *set->a); -} From dfab0deb71ae48b86c8ff60bfae1fe224f4bbf22 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 4 Jan 2024 14:42:42 -0500 Subject: [PATCH 3/6] Add include/adt/hash.h `hash_ids` and consolidate uses. - Add a function to hash a series of IDs, rather than doing it with a loop combining the intermediate hashes in inconsistent ways. - Add a #define to log a probe count and a limit to assert, and rework the hashing code control flow for tracking/asserting on the probe count. - Remove `hash_fnv1a_64` from a logging statement (normally not compiled), since that's the only remaining use remove the FNV1a functions from hash.h. - Remove FSM_PHI_32 and FSM_PHI_64, now unused. - parser.act still uses 32-bit FNV1a, but that's for character-by- character hashing, so that still makes sense, and it has its own copy of fnv1a. --- include/adt/hash.h | 39 +++++++---------------------------- src/adt/idmap.c | 27 +++++++++++++++++------- src/adt/internedstateset.c | 18 ++++++++-------- src/libfsm/determinise.c | 42 ++++++++++++++++++++++++++------------ 4 files changed, 65 insertions(+), 61 deletions(-) diff --git a/include/adt/hash.h b/include/adt/hash.h index c2b7a0683..bd1cf2907 100644 --- a/include/adt/hash.h +++ b/include/adt/hash.h @@ -10,10 +10,10 @@ #include #include "adt/common.h" +#include "fsm/fsm.h" -/* 32 and 64-bit approximations of the golden ratio. */ -#define FSM_PHI_32 0x9e3779b9UL -#define FSM_PHI_64 (uint64_t)0x9e3779b97f4a7c15UL +#define HASH_LOG_PROBES 0 +/* #define HASH_PROBE_LIMIT 100 */ SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t @@ -28,38 +28,13 @@ hash_id(uint64_t id) return x * 2685821657736338717LLU; } -/* FNV-1a hash function, 32 and 64 bit versions. This is in the public - * domain. For details, see: - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - */ - -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() -static __inline__ uint32_t -hash_fnv1a_32(const uint8_t *buf, size_t length) -{ -#define FNV1a_32_OFFSET_BASIS 0x811c9dc5UL -#define FNV1a_32_PRIME 0x01000193UL - uint32_t h = FNV1a_32_OFFSET_BASIS; - size_t i; - for (i = 0; i < length; i++) { - h ^= buf[i]; - h *= FNV1a_32_PRIME; - } - return h; -} - SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t -hash_fnv1a_64(const uint8_t *buf, size_t length) +hash_ids(size_t count, const fsm_state_t *ids) { -#define FNV1a_64_OFFSET_BASIS 0xcbf29ce484222325UL -#define FNV1a_64_PRIME 0x100000001b3UL - uint64_t h = FNV1a_64_OFFSET_BASIS; - size_t i; - for (i = 0; i < length; i++) { - h ^= buf[i]; - h *= FNV1a_64_PRIME; + uint64_t h = 0; + for (size_t i = 0; i < count; i++) { + h = hash_id(h ^ ids[i]); } return h; } diff --git a/src/adt/idmap.c b/src/adt/idmap.c index 342ecd136..208af3c5a 100644 --- a/src/adt/idmap.c +++ b/src/adt/idmap.c @@ -203,12 +203,15 @@ idmap_set(struct idmap *m, fsm_state_t state_id, } const uint64_t mask = m->bucket_count - 1; - for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + int res = 0; + size_t b_i; + for (b_i = 0; b_i < m->bucket_count; b_i++) { struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; if (b->state == state_id) { assert(b->values != NULL); u64bitset_set(b->values, value); - return 1; + res = 1; + goto done; } else if (b->state == NO_STATE) { b->state = state_id; assert(b->values == NULL); @@ -217,20 +220,30 @@ idmap_set(struct idmap *m, fsm_state_t state_id, b->values = f_calloc(m->alloc, vw, sizeof(b->values[0])); if (b->values == NULL) { - return 0; + goto done; } m->buckets_used++; u64bitset_set(b->values, value); - return 1; + res = 1; + goto done; } else { continue; /* collision */ } - } - assert(!"unreachable"); - return 0; +done: +#if HASH_LOG_PROBES + fprintf(stderr, "%s: res %d after %zu probes (%u buckets)\n", + __func__, res, b_i, m->bucket_count); +#endif + +#ifdef HASH_PROBE_LIMIT + assert(b_i < HASH_PROBE_LIMIT); +#endif + assert(b_i < m->bucket_count); + + return res; } static const struct idmap_bucket * diff --git a/src/adt/internedstateset.c b/src/adt/internedstateset.c index a39f09a19..9daf50ae7 100644 --- a/src/adt/internedstateset.c +++ b/src/adt/internedstateset.c @@ -197,15 +197,10 @@ dump_tables(FILE *f, const struct interned_state_set_pool *pool) #endif } -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t hash_state_ids(size_t count, const fsm_state_t *ids) { - uint64_t h = 0; - for (size_t i = 0; i < count; i++) { - h ^= hash_id(ids[i]); - } - return h; + return hash_ids(count, ids); } static bool @@ -329,8 +324,13 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, fprintf(stderr, "%s: htab[(0x%lx + %lu) & 0x%lx => %d\n", __func__, h, b_i, mask, *b); #endif + +#ifdef HASH_PROBE_LIMIT + assert(probes < HASH_PROBE_LIMIT); +#endif + if (*b == NO_ID) { -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: empty bucket (%zd probes)\n", __func__, probes); #endif dst_bucket = b; @@ -351,7 +351,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, if (0 == memcmp(states, buf, s->length * sizeof(buf[0]))) { *result = id; -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: reused %u (%zd probes)\n", __func__, id, probes); #endif return true; @@ -362,7 +362,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, } assert(dst_bucket != NULL); -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: miss after %zd probes\n", __func__, probes); #else (void)probes; diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 5f6a984bc..582d769f6 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -396,7 +396,7 @@ hash_iss(interned_state_set_id iss) { /* Just hashing the ID directly is fine here -- since they're * interned, they're identified by pointer equality. */ - return FSM_PHI_64 * (uintptr_t)iss; + return hash_id((uintptr_t)iss); } static struct mapping * @@ -1309,15 +1309,10 @@ build_output_from_cached_analysis(struct analyze_closures_env *env, fsm_state_t #define LOG_TO_SET_HTAB 0 -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t to_set_hash(size_t count, const fsm_state_t *ids) { - uint64_t h = hash_id(count); - for (size_t i = 0; i < count; i++) { - h ^= hash_id(ids[i]); - } - return h; + return hash_ids(count, ids); } static int @@ -1333,15 +1328,14 @@ to_set_htab_check(struct analyze_closures_env *env, return 0; } - -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT) size_t probes = 0; #endif int res = 0; const uint64_t mask = bcount - 1; for (size_t b_i = 0; b_i < bcount; b_i++) { -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT) probes++; #endif const struct to_set_bucket *b = &htab->buckets[(h + b_i) & mask]; @@ -1364,10 +1358,16 @@ to_set_htab_check(struct analyze_closures_env *env, } done: -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES fprintf(stderr, "%s: result %d, %zu probes, htab: used %zu/%zu ceil\n", __func__, res, probes, htab->buckets_used, htab->bucket_count); #endif +#ifdef HASH_PROBE_LIMIT + if (probes >= HASH_PROBE_LIMIT) { + fprintf(stderr, "-- %zd probes, limit exceeded\n", probes); + } + assert(probes < HASH_PROBE_LIMIT); +#endif return res; } @@ -1431,6 +1431,22 @@ to_set_htab_save(struct analyze_closures_env *env, b->count = count; b->offset = offset; htab->buckets_used++; +#if HASH_LOG_PROBES + fprintf(stderr, "%s: [", __func__); + const fsm_state_t *ids = &env->to_sets.buf[offset]; + for (size_t i = 0; i < count; i++) { + fprintf(stderr, "%s%d", + i > 0 ? " " : "", ids[i]); + } + + fprintf(stderr, "] -> hash %lx -> b %zu (%zu/%zu used), %zd probes\n", + hash, (hash + b_i) & mask, + htab->buckets_used, htab->bucket_count, b_i); +#endif +#if HASH_PROBE_LIMIT + assert(b_i < HASH_PROBE_LIMIT); +#endif + return 1; } else if (b->count == count) { assert(b->offset != offset); /* no duplicates */ @@ -1586,8 +1602,8 @@ commit_buffered_result(struct analyze_closures_env *env, uint32_t *cache_result_ memcpy(&nr->entries[0], env->results.buffer.entries, nr->count * sizeof(env->results.buffer.entries[0])); #if LOG_GROUPING || LOG_COMMIT_BUFFERED_GROUP - fprintf(stderr, "%s: alloc %zu, hash 0x%016lx\n", - __func__, alloc_sz, hash_fnv1a_64((const uint8_t *)nr, alloc_sz)); + fprintf(stderr, "%s: alloc %zu, count %u\n", + __func__, alloc_sz, nr->count)); #endif } From 9cba5601a60145312eda4798f8fe7891119977b2 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 5 Jan 2024 09:26:53 -0500 Subject: [PATCH 4/6] hash_pair: Ensure pair order is consistent for hashing. --- src/libfsm/determinise.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 582d769f6..c0bca3b64 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -1659,10 +1659,14 @@ hash_pair(fsm_state_t a, fsm_state_t b) assert(a != b); assert(a & RESULT_BIT); assert(b & RESULT_BIT); - a &=~ RESULT_BIT; - b &=~ RESULT_BIT; - assert(a != b); - const uint64_t ab = ((uint64_t)a << 32) | (uint64_t)b; + const uint64_t ma = (uint64_t)(a & ~RESULT_BIT); /* m: masked */ + const uint64_t mb = (uint64_t)(b & ~RESULT_BIT); + assert(ma != mb); + + /* Left-shift the smaller ID, so the pair order is consistent for hashing. */ + const uint64_t ab = (ma < mb) + ? ((ma << 32) | mb) + : ((mb << 32) | ma); const uint64_t res = hash_id(ab); /* fprintf(stderr, "%s: a %d, b %d -> %016lx\n", __func__, a, b, res); */ return res; From f39390bbd766f78e801b7028c8866f8a1e14439b Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 5 Jan 2024 09:30:36 -0500 Subject: [PATCH 5/6] hash_ids: Add EXPENSIVE_CHECK for ids[] being in ascending order. --- include/adt/hash.h | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/include/adt/hash.h b/include/adt/hash.h index bd1cf2907..65c8dc897 100644 --- a/include/adt/hash.h +++ b/include/adt/hash.h @@ -12,6 +12,10 @@ #include "adt/common.h" #include "fsm/fsm.h" +#if EXPENSIVE_CHECKS +#include +#endif + #define HASH_LOG_PROBES 0 /* #define HASH_PROBE_LIMIT 100 */ @@ -35,6 +39,11 @@ hash_ids(size_t count, const fsm_state_t *ids) uint64_t h = 0; for (size_t i = 0; i < count; i++) { h = hash_id(h ^ ids[i]); +#if EXPENSIVE_CHECKS + if (i > 0) { + assert(ids[i-1] <= ids[i]); + } +#endif } return h; } From 89e3b88bf8715b9594897bf135cbc68435f7daf4 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Fri, 5 Jan 2024 14:17:45 -0500 Subject: [PATCH 6/6] determinise: Ensure that the analysis_cache pair ordering is consistent. --- src/libfsm/determinise.c | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index c0bca3b64..7c5612b36 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -1672,6 +1672,15 @@ hash_pair(fsm_state_t a, fsm_state_t b) return res; } +static int eq_bucket_pair(const struct result_pair_bucket *b, fsm_state_t pa, fsm_state_t pb) +{ + if ((pa & ~RESULT_BIT) < (pb & ~RESULT_BIT)) { + return b->ids[0] == pa && b->ids[1] == pb; + } else { + return b->ids[0] == pb && b->ids[1] == pa; + } +} + static int analysis_cache_check_pair(struct analyze_closures_env *env, fsm_state_t pa, fsm_state_t pb, fsm_state_t *result_id) @@ -1697,7 +1706,7 @@ analysis_cache_check_pair(struct analyze_closures_env *env, fsm_state_t pa, fsm_ struct result_pair_bucket *b = &htab->buckets[(h + i) & mask]; if (b->t == RPBT_UNUSED) { break; /* not found */ - } else if (b->t == RPBT_PAIR && b->ids[0] == pa && b->ids[1] == pb) { + } else if (b->t == RPBT_PAIR && eq_bucket_pair(b, pa, pb)) { *result_id = b->result_id; /* hit */ #if LOG_GROUPING > 1 fprintf(stderr, "%s: cache hit for pair { %d_R, %d_R } => %d\n", @@ -1831,6 +1840,7 @@ pair_cache_save(struct analyze_closures_env *env, assert(pa != pb); assert(pa & RESULT_BIT); assert(pb & RESULT_BIT); + assert((pa & ~RESULT_BIT) < (pb & ~RESULT_BIT)); } #if LOG_GROUPING > 1 @@ -1868,6 +1878,8 @@ pair_cache_save(struct analyze_closures_env *env, : hash_pair(ob->ids[0], ob->ids[1])); if (ob->t == RPBT_SINGLE_STATE) { assert(ob->ids[0] == ob->ids[1]); + } else { + assert((ob->ids[0] & ~RESULT_BIT) < (ob->ids[1] & ~RESULT_BIT)); } for (size_t nb_i = 0; nb_i < ncount; nb_i++) { @@ -1902,8 +1914,9 @@ pair_cache_save(struct analyze_closures_env *env, #endif if (b->t == RPBT_UNUSED) { /* empty */ - b->ids[0] = pa; - b->ids[1] = pb; + /* Ensure pair ordering is consistent */ + b->ids[0] = pa < pb ? pa : pb; + b->ids[1] = pa < pb ? pb : pa; b->t = type; b->result_id = result_id; htab->buckets_used++; @@ -1947,6 +1960,16 @@ analysis_cache_save_pair(struct analyze_closures_env *env, fsm_state_t a, fsm_state_t b, fsm_state_t result_id) { assert(a != b); + assert((a & RESULT_BIT) == 0); + assert((b & RESULT_BIT) == 0); + + if (a > b) { /* if necessary, swap to ensure a < b */ + const fsm_state_t tmp = a; + a = b; + b = tmp; + } + assert(a < b); + return pair_cache_save(env, RPBT_PAIR, a | RESULT_BIT, b | RESULT_BIT, result_id); }