From d30efc9b8ca0ebd57827cf3f60c1491043d35a61 Mon Sep 17 00:00:00 2001 From: Bill Traynor Date: Thu, 3 Aug 2023 12:06:55 -0400 Subject: [PATCH] Adding software optimization guide to bitmanip Adding the software optimization guide to bitmanip chapter. Note that in the original document this was included as an appendix. --- src/b-st-ext.adoc | 145 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 144 insertions(+), 1 deletion(-) diff --git a/src/b-st-ext.adoc b/src/b-st-ext.adoc index de8317eda..f8e3999b8 100644 --- a/src/b-st-ext.adoc +++ b/src/b-st-ext.adoc @@ -1137,4 +1137,147 @@ include::insns/xpermn.adoc[] <<< include::insns/zext_h.adoc[] <<< -include::insns/zip.adoc[] \ No newline at end of file +include::insns/zip.adoc[] + +=== Software optimization guide + +==== strlen + +The *orc.b* instruction allows for the efficient detection of *NUL* bytes in an XLEN-sized chunk of data: + + * the result of *orc.b* on a chunk that does not contain any *NUL* bytes will be all-ones, and + * after a bitwise-negation of the result of *orc.b*, the number of data bytes before the first *NUL* byte (if any) can be detected by *ctz*/*clz* (depending on the endianness of data). + +A full example of a *strlen* function, which uses these techniques and also demonstrates the use of it for unaligned/partial data, is the following: + +[source,asm] +-- +#include + + .text + .globl strlen + .type strlen, @function +strlen: + andi a3, a0, (SZREG-1) // offset + andi a1, a0, -SZREG // align pointer +.Lprologue: + li a4, SZREG + sub a4, a4, a3 // XLEN - offset + slli a3, a3, PTRLOG // offset * 8 + REG_L a2, 0(a1) // chunk + /* + * Shift the partial/unaligned chunk we loaded to remove the bytes + * from before the start of the string, adding NUL bytes at the end. + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + srl a2, a2 ,a3 // chunk >> (offset * 8) +#else + sll a2, a2, a3 +#endif + orc.b a2, a2 + not a2, a2 + /* + * Non-NUL bytes in the string have been expanded to 0x00, while + * NUL bytes have become 0xff. Search for the first set bit + * (corresponding to a NUL byte in the original chunk). + */ +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + ctz a2, a2 +#else + clz a2, a2 +#endif + /* + * The first chunk is special: compare against the number of valid + * bytes in this chunk. + */ + srli a0, a2, 3 + bgtu a4, a0, .Ldone + addi a3, a1, SZREG + li a4, -1 + .align 2 + /* + * Our critical loop is 4 instructions and processes data in 4 byte + * or 8 byte chunks. + */ +.Lloop: + REG_L a2, SZREG(a1) + addi a1, a1, SZREG + orc.b a2, a2 + beq a2, a4, .Lloop + +.Lepilogue: + not a2, a2 +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + ctz a2, a2 +#else + clz a2, a2 +#endif + sub a1, a1, a3 + add a0, a0, a1 + srli a2, a2, 3 + add a0, a0, a2 +.Ldone: + ret +-- + +==== strcmp + +[source,asm] +-- +#include + + .text + .globl strcmp + .type strcmp, @function +strcmp: + or a4, a0, a1 + li t2, -1 + and a4, a4, SZREG-1 + bnez a4, .Lsimpleloop + + # Main loop for aligned strings +.Lloop: + REG_L a2, 0(a0) + REG_L a3, 0(a1) + orc.b t0, a2 + bne t0, t2, .Lfoundnull + addi a0, a0, SZREG + addi a1, a1, SZREG + beq a2, a3, .Lloop + + # Words don't match, and no null byte in first word. + # Get bytes in big-endian order and compare. +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ + rev8 a2, a2 + rev8 a3, a3 +#endif + # Synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence. + sltu a0, a2, a3 + neg a0, a0 + ori a0, a0, 1 + ret + +.Lfoundnull: + # Found a null byte. + # If words don't match, fall back to simple loop. + bne a2, a3, .Lsimpleloop + + # Otherwise, strings are equal. + li a0, 0 + ret + + # Simple loop for misaligned strings +.Lsimpleloop: + lbu a2, 0(a0) + lbu a3, 0(a1) + addi a0, a0, 1 + addi a1, a1, 1 + bne a2, a3, 1f + bnez a2, .Lsimpleloop + +1: + sub a0, a2, a3 + ret + +.size strcmp, .-strcmp +-- \ No newline at end of file