From dfab0deb71ae48b86c8ff60bfae1fe224f4bbf22 Mon Sep 17 00:00:00 2001 From: Scott Vokes Date: Thu, 4 Jan 2024 14:42:42 -0500 Subject: [PATCH] Add include/adt/hash.h `hash_ids` and consolidate uses. - Add a function to hash a series of IDs, rather than doing it with a loop combining the intermediate hashes in inconsistent ways. - Add a #define to log a probe count and a limit to assert, and rework the hashing code control flow for tracking/asserting on the probe count. - Remove `hash_fnv1a_64` from a logging statement (normally not compiled), since that's the only remaining use remove the FNV1a functions from hash.h. - Remove FSM_PHI_32 and FSM_PHI_64, now unused. - parser.act still uses 32-bit FNV1a, but that's for character-by- character hashing, so that still makes sense, and it has its own copy of fnv1a. --- include/adt/hash.h | 39 +++++++---------------------------- src/adt/idmap.c | 27 +++++++++++++++++------- src/adt/internedstateset.c | 18 ++++++++-------- src/libfsm/determinise.c | 42 ++++++++++++++++++++++++++------------ 4 files changed, 65 insertions(+), 61 deletions(-) diff --git a/include/adt/hash.h b/include/adt/hash.h index c2b7a0683..bd1cf2907 100644 --- a/include/adt/hash.h +++ b/include/adt/hash.h @@ -10,10 +10,10 @@ #include #include "adt/common.h" +#include "fsm/fsm.h" -/* 32 and 64-bit approximations of the golden ratio. */ -#define FSM_PHI_32 0x9e3779b9UL -#define FSM_PHI_64 (uint64_t)0x9e3779b97f4a7c15UL +#define HASH_LOG_PROBES 0 +/* #define HASH_PROBE_LIMIT 100 */ SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t @@ -28,38 +28,13 @@ hash_id(uint64_t id) return x * 2685821657736338717LLU; } -/* FNV-1a hash function, 32 and 64 bit versions. This is in the public - * domain. For details, see: - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - */ - -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() -static __inline__ uint32_t -hash_fnv1a_32(const uint8_t *buf, size_t length) -{ -#define FNV1a_32_OFFSET_BASIS 0x811c9dc5UL -#define FNV1a_32_PRIME 0x01000193UL - uint32_t h = FNV1a_32_OFFSET_BASIS; - size_t i; - for (i = 0; i < length; i++) { - h ^= buf[i]; - h *= FNV1a_32_PRIME; - } - return h; -} - SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t -hash_fnv1a_64(const uint8_t *buf, size_t length) +hash_ids(size_t count, const fsm_state_t *ids) { -#define FNV1a_64_OFFSET_BASIS 0xcbf29ce484222325UL -#define FNV1a_64_PRIME 0x100000001b3UL - uint64_t h = FNV1a_64_OFFSET_BASIS; - size_t i; - for (i = 0; i < length; i++) { - h ^= buf[i]; - h *= FNV1a_64_PRIME; + uint64_t h = 0; + for (size_t i = 0; i < count; i++) { + h = hash_id(h ^ ids[i]); } return h; } diff --git a/src/adt/idmap.c b/src/adt/idmap.c index 342ecd136..208af3c5a 100644 --- a/src/adt/idmap.c +++ b/src/adt/idmap.c @@ -203,12 +203,15 @@ idmap_set(struct idmap *m, fsm_state_t state_id, } const uint64_t mask = m->bucket_count - 1; - for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + int res = 0; + size_t b_i; + for (b_i = 0; b_i < m->bucket_count; b_i++) { struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; if (b->state == state_id) { assert(b->values != NULL); u64bitset_set(b->values, value); - return 1; + res = 1; + goto done; } else if (b->state == NO_STATE) { b->state = state_id; assert(b->values == NULL); @@ -217,20 +220,30 @@ idmap_set(struct idmap *m, fsm_state_t state_id, b->values = f_calloc(m->alloc, vw, sizeof(b->values[0])); if (b->values == NULL) { - return 0; + goto done; } m->buckets_used++; u64bitset_set(b->values, value); - return 1; + res = 1; + goto done; } else { continue; /* collision */ } - } - assert(!"unreachable"); - return 0; +done: +#if HASH_LOG_PROBES + fprintf(stderr, "%s: res %d after %zu probes (%u buckets)\n", + __func__, res, b_i, m->bucket_count); +#endif + +#ifdef HASH_PROBE_LIMIT + assert(b_i < HASH_PROBE_LIMIT); +#endif + assert(b_i < m->bucket_count); + + return res; } static const struct idmap_bucket * diff --git a/src/adt/internedstateset.c b/src/adt/internedstateset.c index a39f09a19..9daf50ae7 100644 --- a/src/adt/internedstateset.c +++ b/src/adt/internedstateset.c @@ -197,15 +197,10 @@ dump_tables(FILE *f, const struct interned_state_set_pool *pool) #endif } -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t hash_state_ids(size_t count, const fsm_state_t *ids) { - uint64_t h = 0; - for (size_t i = 0; i < count; i++) { - h ^= hash_id(ids[i]); - } - return h; + return hash_ids(count, ids); } static bool @@ -329,8 +324,13 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, fprintf(stderr, "%s: htab[(0x%lx + %lu) & 0x%lx => %d\n", __func__, h, b_i, mask, *b); #endif + +#ifdef HASH_PROBE_LIMIT + assert(probes < HASH_PROBE_LIMIT); +#endif + if (*b == NO_ID) { -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: empty bucket (%zd probes)\n", __func__, probes); #endif dst_bucket = b; @@ -351,7 +351,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, if (0 == memcmp(states, buf, s->length * sizeof(buf[0]))) { *result = id; -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: reused %u (%zd probes)\n", __func__, id, probes); #endif return true; @@ -362,7 +362,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, } assert(dst_bucket != NULL); -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: miss after %zd probes\n", __func__, probes); #else (void)probes; diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index 5f6a984bc..582d769f6 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -396,7 +396,7 @@ hash_iss(interned_state_set_id iss) { /* Just hashing the ID directly is fine here -- since they're * interned, they're identified by pointer equality. */ - return FSM_PHI_64 * (uintptr_t)iss; + return hash_id((uintptr_t)iss); } static struct mapping * @@ -1309,15 +1309,10 @@ build_output_from_cached_analysis(struct analyze_closures_env *env, fsm_state_t #define LOG_TO_SET_HTAB 0 -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t to_set_hash(size_t count, const fsm_state_t *ids) { - uint64_t h = hash_id(count); - for (size_t i = 0; i < count; i++) { - h ^= hash_id(ids[i]); - } - return h; + return hash_ids(count, ids); } static int @@ -1333,15 +1328,14 @@ to_set_htab_check(struct analyze_closures_env *env, return 0; } - -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT) size_t probes = 0; #endif int res = 0; const uint64_t mask = bcount - 1; for (size_t b_i = 0; b_i < bcount; b_i++) { -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT) probes++; #endif const struct to_set_bucket *b = &htab->buckets[(h + b_i) & mask]; @@ -1364,10 +1358,16 @@ to_set_htab_check(struct analyze_closures_env *env, } done: -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES fprintf(stderr, "%s: result %d, %zu probes, htab: used %zu/%zu ceil\n", __func__, res, probes, htab->buckets_used, htab->bucket_count); #endif +#ifdef HASH_PROBE_LIMIT + if (probes >= HASH_PROBE_LIMIT) { + fprintf(stderr, "-- %zd probes, limit exceeded\n", probes); + } + assert(probes < HASH_PROBE_LIMIT); +#endif return res; } @@ -1431,6 +1431,22 @@ to_set_htab_save(struct analyze_closures_env *env, b->count = count; b->offset = offset; htab->buckets_used++; +#if HASH_LOG_PROBES + fprintf(stderr, "%s: [", __func__); + const fsm_state_t *ids = &env->to_sets.buf[offset]; + for (size_t i = 0; i < count; i++) { + fprintf(stderr, "%s%d", + i > 0 ? " " : "", ids[i]); + } + + fprintf(stderr, "] -> hash %lx -> b %zu (%zu/%zu used), %zd probes\n", + hash, (hash + b_i) & mask, + htab->buckets_used, htab->bucket_count, b_i); +#endif +#if HASH_PROBE_LIMIT + assert(b_i < HASH_PROBE_LIMIT); +#endif + return 1; } else if (b->count == count) { assert(b->offset != offset); /* no duplicates */ @@ -1586,8 +1602,8 @@ commit_buffered_result(struct analyze_closures_env *env, uint32_t *cache_result_ memcpy(&nr->entries[0], env->results.buffer.entries, nr->count * sizeof(env->results.buffer.entries[0])); #if LOG_GROUPING || LOG_COMMIT_BUFFERED_GROUP - fprintf(stderr, "%s: alloc %zu, hash 0x%016lx\n", - __func__, alloc_sz, hash_fnv1a_64((const uint8_t *)nr, alloc_sz)); + fprintf(stderr, "%s: alloc %zu, count %u\n", + __func__, alloc_sz, nr->count)); #endif }