From 66a53bdd491364b4b659612df542675fe70e633a Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Tue, 27 Aug 2024 15:52:33 +0800 Subject: [PATCH 1/3] dmu_objset: replace dnode_hash impl with cityhash4 As mentioned in PR #16131, replacing CRC-based hash with cityhash4 could slightly improve the performance by eliminating memory access. Replacing algorightm is safe since the hash result is not persisted. See: openzfs/zfs#16131 Signed-off-by: Shengqi Chen --- module/zfs/dmu_objset.c | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index f030fba22669..dfcd7459e6a6 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -66,6 +66,7 @@ #include "zfs_namecheck.h" #include #include +#include /* * Needed to close a window in dnode_move() that allows the objset to be freed @@ -404,27 +405,13 @@ dmu_objset_byteswap(void *buf, size_t size) } /* - * The hash is a CRC-based hash of the objset_t pointer and the object number. + * Runs cityhash4 on the objset_t pointer and the object number. */ static uint64_t dnode_hash(const objset_t *os, uint64_t obj) { uintptr_t osv = (uintptr_t)os; - uint64_t crc = -1ULL; - - ASSERT(zfs_crc64_table[128] == ZFS_CRC64_POLY); - /* - * The lower 11 bits of the pointer don't have much entropy, because - * the objset_t is more than 1KB long and so likely aligned to 2KB. - */ - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (osv >> 11)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 0)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 8)) & 0xFF]; - crc = (crc >> 8) ^ zfs_crc64_table[(crc ^ (obj >> 16)) & 0xFF]; - - crc ^= (osv>>14) ^ (obj>>24); - - return (crc); + return (cityhash4((uint64_t)osv, obj, 0, 0)); } static unsigned int From f14d9d6f06a66bd66f814b94b4cc96e5ec9ce128 Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 7 Sep 2024 21:55:03 +0800 Subject: [PATCH 2/3] zcommon: add specialized versions of cityhash4 Specializing cityhash4 on 32-bit architectures can reduce the size of stack frames as well as instruction count. This is a tiny but useful optimization, since some callers invoke it frequently. When specializing into 1/2/3/4-arg versions, the stack usage (in bytes) on some 32-bit arches are listed as follows: - x86: 32, 32, 32, 40 - arm-v7a: 20, 20, 28, 36 - riscv: 0, 0, 0, 16 - power: 16, 16, 16, 32 - mipsel: 8, 8, 8, 24 And each actual argument (even if passing 0) contributes evenly to the number of multiplication instructions generated: - x86: 9, 12, 15 ,18 - arm-v7a: 6, 8, 10, 12 - riscv / power: 12, 18, 20, 24 - mipsel: 9, 12, 15, 19 On 64-bit architectures, the tendencies are similar. But both stack sizes and instruction counts are significantly smaller thus negligible. See more discussion at https://github.com/openzfs/zfs/pull/16483. Acked-by: Alexander Motin Signed-off-by: Shengqi Chen --- include/cityhash.h | 7 +++++++ lib/libzfs/libzfs.abi | 18 ++++++++++++++++++ module/zcommon/cityhash.c | 33 +++++++++++++++++++++++++++++++-- 3 files changed, 56 insertions(+), 2 deletions(-) diff --git a/include/cityhash.h b/include/cityhash.h index 3b2d1e84b5b3..346fb673a07d 100644 --- a/include/cityhash.h +++ b/include/cityhash.h @@ -32,6 +32,13 @@ extern "C" { #endif +/* + * Define 1/2/3-argument specialized versions of cityhash4, which can reduce + * instruction count (especially multiplication) on some 32-bit arches. + */ +_SYS_CITYHASH_H uint64_t cityhash1(uint64_t); +_SYS_CITYHASH_H uint64_t cityhash2(uint64_t, uint64_t); +_SYS_CITYHASH_H uint64_t cityhash3(uint64_t, uint64_t, uint64_t); _SYS_CITYHASH_H uint64_t cityhash4(uint64_t, uint64_t, uint64_t, uint64_t); #ifdef __cplusplus diff --git a/lib/libzfs/libzfs.abi b/lib/libzfs/libzfs.abi index 5b0dffb03f49..1f1f2fdffb15 100644 --- a/lib/libzfs/libzfs.abi +++ b/lib/libzfs/libzfs.abi @@ -154,6 +154,9 @@ + + + @@ -9241,6 +9244,21 @@ + + + + + + + + + + + + + + + diff --git a/module/zcommon/cityhash.c b/module/zcommon/cityhash.c index 413a96df2cda..c758ec47d1f6 100644 --- a/module/zcommon/cityhash.c +++ b/module/zcommon/cityhash.c @@ -49,8 +49,8 @@ cityhash_helper(uint64_t u, uint64_t v, uint64_t mul) return (b); } -uint64_t -cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) +static inline uint64_t +cityhash_impl(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) { uint64_t mul = HASH_K2 + 64; uint64_t a = w1 * HASH_K1; @@ -59,9 +59,38 @@ cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) uint64_t d = w3 * HASH_K2; return (cityhash_helper(rotate(a + b, 43) + rotate(c, 30) + d, a + rotate(b + HASH_K2, 18) + c, mul)); +} +/* + * Passing w as the 2nd argument could save one 64-bit multiplication. + */ +uint64_t +cityhash1(uint64_t w) +{ + return (cityhash_impl(0, w, 0, 0)); +} + +uint64_t +cityhash2(uint64_t w1, uint64_t w2) +{ + return (cityhash_impl(w1, w2, 0, 0)); +} + +uint64_t +cityhash3(uint64_t w1, uint64_t w2, uint64_t w3) +{ + return (cityhash_impl(w1, w2, w3, 0)); +} + +uint64_t +cityhash4(uint64_t w1, uint64_t w2, uint64_t w3, uint64_t w4) +{ + return (cityhash_impl(w1, w2, w3, w4)); } #if defined(_KERNEL) +EXPORT_SYMBOL(cityhash1); +EXPORT_SYMBOL(cityhash2); +EXPORT_SYMBOL(cityhash3); EXPORT_SYMBOL(cityhash4); #endif From e958069e611bba347d918e571fab7ae43edeaccc Mon Sep 17 00:00:00 2001 From: Shengqi Chen Date: Sat, 7 Sep 2024 22:07:14 +0800 Subject: [PATCH 3/3] cityhash: replace invocations with specialized versions when possible So that we can get actual benefit from last commit. See more discussion at https://github.com/openzfs/zfs/pull/16483. Acked-by: Alexander Motin Signed-off-by: Shengqi Chen --- cmd/zstream/zstream_redup.c | 4 ++-- module/os/linux/zfs/zvol_os.c | 4 ++-- module/zfs/dmu_objset.c | 4 ++-- module/zfs/zio.c | 4 ++-- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/cmd/zstream/zstream_redup.c b/cmd/zstream/zstream_redup.c index dccd325d4cfa..51266b0b6629 100644 --- a/cmd/zstream/zstream_redup.c +++ b/cmd/zstream/zstream_redup.c @@ -132,7 +132,7 @@ static void rdt_insert(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t stream_offset) { - uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t ch = cityhash3(guid, object, offset); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); redup_entry_t **rdepp; @@ -152,7 +152,7 @@ rdt_lookup(redup_table_t *rdt, uint64_t guid, uint64_t object, uint64_t offset, uint64_t *stream_offsetp) { - uint64_t ch = cityhash4(guid, object, offset, 0); + uint64_t ch = cityhash3(guid, object, offset); uint64_t hashcode = BF64_GET(ch, 0, rdt->numhashbits); for (redup_entry_t *rde = rdt->redup_hash_array[hashcode]; diff --git a/module/os/linux/zfs/zvol_os.c b/module/os/linux/zfs/zvol_os.c index 303aafc95dd7..2396690b40fd 100644 --- a/module/os/linux/zfs/zvol_os.c +++ b/module/os/linux/zfs/zvol_os.c @@ -551,8 +551,8 @@ zvol_request_impl(zvol_state_t *zv, struct bio *bio, struct request *rq, blk_mq_hw_queue = rq->q->queue_hw_ctx[rq->q->mq_map[rq->cpu]]->queue_num; #endif - taskq_hash = cityhash4((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, - blk_mq_hw_queue, 0); + taskq_hash = cityhash3((uintptr_t)zv, offset >> ZVOL_TASKQ_OFFSET_SHIFT, + blk_mq_hw_queue); tq_idx = taskq_hash % ztqs->tqs_cnt; if (rw == WRITE) { diff --git a/module/zfs/dmu_objset.c b/module/zfs/dmu_objset.c index dfcd7459e6a6..9f336918e6fd 100644 --- a/module/zfs/dmu_objset.c +++ b/module/zfs/dmu_objset.c @@ -405,13 +405,13 @@ dmu_objset_byteswap(void *buf, size_t size) } /* - * Runs cityhash4 on the objset_t pointer and the object number. + * Runs cityhash on the objset_t pointer and the object number. */ static uint64_t dnode_hash(const objset_t *os, uint64_t obj) { uintptr_t osv = (uintptr_t)os; - return (cityhash4((uint64_t)osv, obj, 0, 0)); + return (cityhash2((uint64_t)osv, obj)); } static unsigned int diff --git a/module/zfs/zio.c b/module/zfs/zio.c index 66a8a9fefd8c..2e38d47c99b1 100644 --- a/module/zfs/zio.c +++ b/module/zfs/zio.c @@ -4182,8 +4182,8 @@ zio_alloc_zil(spa_t *spa, objset_t *os, uint64_t txg, blkptr_t *new_bp, * some parallelism. */ int flags = METASLAB_ZIL; - int allocator = (uint_t)cityhash4(0, 0, 0, - os->os_dsl_dataset->ds_object) % spa->spa_alloc_count; + int allocator = (uint_t)cityhash1(os->os_dsl_dataset->ds_object) + % spa->spa_alloc_count; error = metaslab_alloc(spa, spa_log_class(spa), size, new_bp, 1, txg, NULL, flags, &io_alloc_list, NULL, allocator); *slog = (error == 0);