diff --git a/include/adt/hash.h b/include/adt/hash.h index 74ee4a890..65c8dc897 100644 --- a/include/adt/hash.h +++ b/include/adt/hash.h @@ -10,14 +10,18 @@ #include #include "adt/common.h" +#include "fsm/fsm.h" -/* 32 and 64-bit approximations of the golden ratio. */ -#define FSM_PHI_32 0x9e3779b9UL -#define FSM_PHI_64 (uint64_t)0x9e3779b97f4a7c15UL +#if EXPENSIVE_CHECKS +#include +#endif + +#define HASH_LOG_PROBES 0 +/* #define HASH_PROBE_LIMIT 100 */ SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t -hash_id(unsigned id) +hash_id(uint64_t id) { /* xorshift* A1(12,25,27), * from http://vigna.di.unimi.it/ftp/papers/xorshift.pdf */ @@ -28,38 +32,18 @@ hash_id(unsigned id) return x * 2685821657736338717LLU; } -/* FNV-1a hash function, 32 and 64 bit versions. This is in the public - * domain. For details, see: - * - * http://www.isthe.com/chongo/tech/comp/fnv/index.html - */ - -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() -static __inline__ uint32_t -hash_fnv1a_32(const uint8_t *buf, size_t length) -{ -#define FNV1a_32_OFFSET_BASIS 0x811c9dc5UL -#define FNV1a_32_PRIME 0x01000193UL - uint32_t h = FNV1a_32_OFFSET_BASIS; - size_t i; - for (i = 0; i < length; i++) { - h ^= buf[i]; - h *= FNV1a_32_PRIME; - } - return h; -} - SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static __inline__ uint64_t -hash_fnv1a_64(const uint8_t *buf, size_t length) +hash_ids(size_t count, const fsm_state_t *ids) { -#define FNV1a_64_OFFSET_BASIS 0xcbf29ce484222325UL -#define FNV1a_64_PRIME 0x100000001b3UL - uint64_t h = FNV1a_64_OFFSET_BASIS; - size_t i; - for (i = 0; i < length; i++) { - h ^= buf[i]; - h *= FNV1a_64_PRIME; + uint64_t h = 0; + for (size_t i = 0; i < count; i++) { + h = hash_id(h ^ ids[i]); +#if EXPENSIVE_CHECKS + if (i > 0) { + assert(ids[i-1] <= ids[i]); + } +#endif } return h; } diff --git a/include/adt/stateset.h b/include/adt/stateset.h index 83e835467..3ec91794b 100644 --- a/include/adt/stateset.h +++ b/include/adt/stateset.h @@ -72,8 +72,5 @@ state_set_rebase(struct state_set **set, fsm_state_t base); void state_set_replace(struct state_set **set, fsm_state_t old, fsm_state_t new); -unsigned long -state_set_hash(const struct state_set *set); - #endif diff --git a/src/adt/idmap.c b/src/adt/idmap.c index 342ecd136..208af3c5a 100644 --- a/src/adt/idmap.c +++ b/src/adt/idmap.c @@ -203,12 +203,15 @@ idmap_set(struct idmap *m, fsm_state_t state_id, } const uint64_t mask = m->bucket_count - 1; - for (size_t b_i = 0; b_i < m->bucket_count; b_i++) { + int res = 0; + size_t b_i; + for (b_i = 0; b_i < m->bucket_count; b_i++) { struct idmap_bucket *b = &m->buckets[(h + b_i) & mask]; if (b->state == state_id) { assert(b->values != NULL); u64bitset_set(b->values, value); - return 1; + res = 1; + goto done; } else if (b->state == NO_STATE) { b->state = state_id; assert(b->values == NULL); @@ -217,20 +220,30 @@ idmap_set(struct idmap *m, fsm_state_t state_id, b->values = f_calloc(m->alloc, vw, sizeof(b->values[0])); if (b->values == NULL) { - return 0; + goto done; } m->buckets_used++; u64bitset_set(b->values, value); - return 1; + res = 1; + goto done; } else { continue; /* collision */ } - } - assert(!"unreachable"); - return 0; +done: +#if HASH_LOG_PROBES + fprintf(stderr, "%s: res %d after %zu probes (%u buckets)\n", + __func__, res, b_i, m->bucket_count); +#endif + +#ifdef HASH_PROBE_LIMIT + assert(b_i < HASH_PROBE_LIMIT); +#endif + assert(b_i < m->bucket_count); + + return res; } static const struct idmap_bucket * diff --git a/src/adt/internedstateset.c b/src/adt/internedstateset.c index a39f09a19..9daf50ae7 100644 --- a/src/adt/internedstateset.c +++ b/src/adt/internedstateset.c @@ -197,15 +197,10 @@ dump_tables(FILE *f, const struct interned_state_set_pool *pool) #endif } -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t hash_state_ids(size_t count, const fsm_state_t *ids) { - uint64_t h = 0; - for (size_t i = 0; i < count; i++) { - h ^= hash_id(ids[i]); - } - return h; + return hash_ids(count, ids); } static bool @@ -329,8 +324,13 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, fprintf(stderr, "%s: htab[(0x%lx + %lu) & 0x%lx => %d\n", __func__, h, b_i, mask, *b); #endif + +#ifdef HASH_PROBE_LIMIT + assert(probes < HASH_PROBE_LIMIT); +#endif + if (*b == NO_ID) { -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: empty bucket (%zd probes)\n", __func__, probes); #endif dst_bucket = b; @@ -351,7 +351,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, if (0 == memcmp(states, buf, s->length * sizeof(buf[0]))) { *result = id; -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: reused %u (%zd probes)\n", __func__, id, probes); #endif return true; @@ -362,7 +362,7 @@ interned_state_set_intern_set(struct interned_state_set_pool *pool, } assert(dst_bucket != NULL); -#if LOG_ISS > 3 +#if LOG_ISS > 3 || HASH_LOG_PROBES fprintf(stderr, "%s: miss after %zd probes\n", __func__, probes); #else (void)probes; diff --git a/src/adt/stateset.c b/src/adt/stateset.c index 8d73a9b5f..c2cc8b8c0 100644 --- a/src/adt/stateset.c +++ b/src/adt/stateset.c @@ -668,19 +668,3 @@ state_set_replace(struct state_set **setp, fsm_state_t old, fsm_state_t new) } } } - -unsigned long -state_set_hash(const struct state_set *set) -{ - if (set == NULL) { - return 0; /* empty */ - } - - if (IS_SINGLETON(set)) { - fsm_state_t state; - state = SINGLETON_DECODE(set); - return hashrec(&state, sizeof state); - } - - return hashrec(set->a, set->i * sizeof *set->a); -} diff --git a/src/libfsm/determinise.c b/src/libfsm/determinise.c index b6c688ac4..7c5612b36 100644 --- a/src/libfsm/determinise.c +++ b/src/libfsm/determinise.c @@ -396,7 +396,7 @@ hash_iss(interned_state_set_id iss) { /* Just hashing the ID directly is fine here -- since they're * interned, they're identified by pointer equality. */ - return FSM_PHI_64 * (uintptr_t)iss; + return hash_id((uintptr_t)iss); } static struct mapping * @@ -1309,15 +1309,10 @@ build_output_from_cached_analysis(struct analyze_closures_env *env, fsm_state_t #define LOG_TO_SET_HTAB 0 -SUPPRESS_EXPECTED_UNSIGNED_INTEGER_OVERFLOW() static uint64_t to_set_hash(size_t count, const fsm_state_t *ids) { - uint64_t h = hash_id(count); - for (size_t i = 0; i < count; i++) { - h ^= hash_id(ids[i]); - } - return h; + return hash_ids(count, ids); } static int @@ -1333,15 +1328,14 @@ to_set_htab_check(struct analyze_closures_env *env, return 0; } - -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT) size_t probes = 0; #endif int res = 0; const uint64_t mask = bcount - 1; for (size_t b_i = 0; b_i < bcount; b_i++) { -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES || defined(HASH_PROBE_LIMIT) probes++; #endif const struct to_set_bucket *b = &htab->buckets[(h + b_i) & mask]; @@ -1364,10 +1358,16 @@ to_set_htab_check(struct analyze_closures_env *env, } done: -#if LOG_TO_SET_HTAB +#if LOG_TO_SET_HTAB || HASH_LOG_PROBES fprintf(stderr, "%s: result %d, %zu probes, htab: used %zu/%zu ceil\n", __func__, res, probes, htab->buckets_used, htab->bucket_count); #endif +#ifdef HASH_PROBE_LIMIT + if (probes >= HASH_PROBE_LIMIT) { + fprintf(stderr, "-- %zd probes, limit exceeded\n", probes); + } + assert(probes < HASH_PROBE_LIMIT); +#endif return res; } @@ -1431,6 +1431,22 @@ to_set_htab_save(struct analyze_closures_env *env, b->count = count; b->offset = offset; htab->buckets_used++; +#if HASH_LOG_PROBES + fprintf(stderr, "%s: [", __func__); + const fsm_state_t *ids = &env->to_sets.buf[offset]; + for (size_t i = 0; i < count; i++) { + fprintf(stderr, "%s%d", + i > 0 ? " " : "", ids[i]); + } + + fprintf(stderr, "] -> hash %lx -> b %zu (%zu/%zu used), %zd probes\n", + hash, (hash + b_i) & mask, + htab->buckets_used, htab->bucket_count, b_i); +#endif +#if HASH_PROBE_LIMIT + assert(b_i < HASH_PROBE_LIMIT); +#endif + return 1; } else if (b->count == count) { assert(b->offset != offset); /* no duplicates */ @@ -1586,8 +1602,8 @@ commit_buffered_result(struct analyze_closures_env *env, uint32_t *cache_result_ memcpy(&nr->entries[0], env->results.buffer.entries, nr->count * sizeof(env->results.buffer.entries[0])); #if LOG_GROUPING || LOG_COMMIT_BUFFERED_GROUP - fprintf(stderr, "%s: alloc %zu, hash 0x%016lx\n", - __func__, alloc_sz, hash_fnv1a_64((const uint8_t *)nr, alloc_sz)); + fprintf(stderr, "%s: alloc %zu, count %u\n", + __func__, alloc_sz, nr->count)); #endif } @@ -1643,22 +1659,28 @@ hash_pair(fsm_state_t a, fsm_state_t b) assert(a != b); assert(a & RESULT_BIT); assert(b & RESULT_BIT); - a &=~ RESULT_BIT; - b &=~ RESULT_BIT; - - /* Don't hash a and b separately and combine them with - * hash_id, because it's common to have adjacent pairs of - * result IDs, and with how hash_id works that leads to - * multiples of similar hash values bunching up. - * - * This could be replaced with a better hash function later, - * but use LOG_CACHE_HTAB to ensure there aren't visually obvious - * runs of collisions appearing in the tables. */ - const uint64_t res = hash_id(a + b); + const uint64_t ma = (uint64_t)(a & ~RESULT_BIT); /* m: masked */ + const uint64_t mb = (uint64_t)(b & ~RESULT_BIT); + assert(ma != mb); + + /* Left-shift the smaller ID, so the pair order is consistent for hashing. */ + const uint64_t ab = (ma < mb) + ? ((ma << 32) | mb) + : ((mb << 32) | ma); + const uint64_t res = hash_id(ab); /* fprintf(stderr, "%s: a %d, b %d -> %016lx\n", __func__, a, b, res); */ return res; } +static int eq_bucket_pair(const struct result_pair_bucket *b, fsm_state_t pa, fsm_state_t pb) +{ + if ((pa & ~RESULT_BIT) < (pb & ~RESULT_BIT)) { + return b->ids[0] == pa && b->ids[1] == pb; + } else { + return b->ids[0] == pb && b->ids[1] == pa; + } +} + static int analysis_cache_check_pair(struct analyze_closures_env *env, fsm_state_t pa, fsm_state_t pb, fsm_state_t *result_id) @@ -1684,7 +1706,7 @@ analysis_cache_check_pair(struct analyze_closures_env *env, fsm_state_t pa, fsm_ struct result_pair_bucket *b = &htab->buckets[(h + i) & mask]; if (b->t == RPBT_UNUSED) { break; /* not found */ - } else if (b->t == RPBT_PAIR && b->ids[0] == pa && b->ids[1] == pb) { + } else if (b->t == RPBT_PAIR && eq_bucket_pair(b, pa, pb)) { *result_id = b->result_id; /* hit */ #if LOG_GROUPING > 1 fprintf(stderr, "%s: cache hit for pair { %d_R, %d_R } => %d\n", @@ -1818,6 +1840,7 @@ pair_cache_save(struct analyze_closures_env *env, assert(pa != pb); assert(pa & RESULT_BIT); assert(pb & RESULT_BIT); + assert((pa & ~RESULT_BIT) < (pb & ~RESULT_BIT)); } #if LOG_GROUPING > 1 @@ -1855,6 +1878,8 @@ pair_cache_save(struct analyze_closures_env *env, : hash_pair(ob->ids[0], ob->ids[1])); if (ob->t == RPBT_SINGLE_STATE) { assert(ob->ids[0] == ob->ids[1]); + } else { + assert((ob->ids[0] & ~RESULT_BIT) < (ob->ids[1] & ~RESULT_BIT)); } for (size_t nb_i = 0; nb_i < ncount; nb_i++) { @@ -1889,8 +1914,9 @@ pair_cache_save(struct analyze_closures_env *env, #endif if (b->t == RPBT_UNUSED) { /* empty */ - b->ids[0] = pa; - b->ids[1] = pb; + /* Ensure pair ordering is consistent */ + b->ids[0] = pa < pb ? pa : pb; + b->ids[1] = pa < pb ? pb : pa; b->t = type; b->result_id = result_id; htab->buckets_used++; @@ -1934,6 +1960,16 @@ analysis_cache_save_pair(struct analyze_closures_env *env, fsm_state_t a, fsm_state_t b, fsm_state_t result_id) { assert(a != b); + assert((a & RESULT_BIT) == 0); + assert((b & RESULT_BIT) == 0); + + if (a > b) { /* if necessary, swap to ensure a < b */ + const fsm_state_t tmp = a; + a = b; + b = tmp; + } + assert(a < b); + return pair_cache_save(env, RPBT_PAIR, a | RESULT_BIT, b | RESULT_BIT, result_id); }