Skip to content

Commit

Permalink
Adding software optimization guide to bitmanip
Browse files Browse the repository at this point in the history
Adding the software optimization guide to bitmanip chapter.
Note that in the original document this was included as an appendix.
  • Loading branch information
wmat committed Aug 3, 2023
1 parent efe57a0 commit d30efc9
Showing 1 changed file with 144 additions and 1 deletion.
145 changes: 144 additions & 1 deletion src/b-st-ext.adoc
Original file line number Diff line number Diff line change
Expand Up @@ -1137,4 +1137,147 @@ include::insns/xpermn.adoc[]
<<<
include::insns/zext_h.adoc[]
<<<
include::insns/zip.adoc[]
include::insns/zip.adoc[]

=== Software optimization guide

==== strlen

The *orc.b* instruction allows for the efficient detection of *NUL* bytes in an XLEN-sized chunk of data:

* the result of *orc.b* on a chunk that does not contain any *NUL* bytes will be all-ones, and
* after a bitwise-negation of the result of *orc.b*, the number of data bytes before the first *NUL* byte (if any) can be detected by *ctz*/*clz* (depending on the endianness of data).

A full example of a *strlen* function, which uses these techniques and also demonstrates the use of it for unaligned/partial data, is the following:

[source,asm]
--
#include <sys/asm.h>

.text
.globl strlen
.type strlen, @function
strlen:
andi a3, a0, (SZREG-1) // offset
andi a1, a0, -SZREG // align pointer
.Lprologue:
li a4, SZREG
sub a4, a4, a3 // XLEN - offset
slli a3, a3, PTRLOG // offset * 8
REG_L a2, 0(a1) // chunk
/*
* Shift the partial/unaligned chunk we loaded to remove the bytes
* from before the start of the string, adding NUL bytes at the end.
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
srl a2, a2 ,a3 // chunk >> (offset * 8)
#else
sll a2, a2, a3
#endif
orc.b a2, a2
not a2, a2
/*
* Non-NUL bytes in the string have been expanded to 0x00, while
* NUL bytes have become 0xff. Search for the first set bit
* (corresponding to a NUL byte in the original chunk).
*/
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
ctz a2, a2
#else
clz a2, a2
#endif
/*
* The first chunk is special: compare against the number of valid
* bytes in this chunk.
*/
srli a0, a2, 3
bgtu a4, a0, .Ldone
addi a3, a1, SZREG
li a4, -1
.align 2
/*
* Our critical loop is 4 instructions and processes data in 4 byte
* or 8 byte chunks.
*/
.Lloop:
REG_L a2, SZREG(a1)
addi a1, a1, SZREG
orc.b a2, a2
beq a2, a4, .Lloop

.Lepilogue:
not a2, a2
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
ctz a2, a2
#else
clz a2, a2
#endif
sub a1, a1, a3
add a0, a0, a1
srli a2, a2, 3
add a0, a0, a2
.Ldone:
ret
--

==== strcmp

[source,asm]
--
#include <sys/asm.h>

.text
.globl strcmp
.type strcmp, @function
strcmp:
or a4, a0, a1
li t2, -1
and a4, a4, SZREG-1
bnez a4, .Lsimpleloop

# Main loop for aligned strings
.Lloop:
REG_L a2, 0(a0)
REG_L a3, 0(a1)
orc.b t0, a2
bne t0, t2, .Lfoundnull
addi a0, a0, SZREG
addi a1, a1, SZREG
beq a2, a3, .Lloop

# Words don't match, and no null byte in first word.
# Get bytes in big-endian order and compare.
#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
rev8 a2, a2
rev8 a3, a3
#endif
# Synthesize (a2 >= a3) ? 1 : -1 in a branchless sequence.
sltu a0, a2, a3
neg a0, a0
ori a0, a0, 1
ret

.Lfoundnull:
# Found a null byte.
# If words don't match, fall back to simple loop.
bne a2, a3, .Lsimpleloop

# Otherwise, strings are equal.
li a0, 0
ret

# Simple loop for misaligned strings
.Lsimpleloop:
lbu a2, 0(a0)
lbu a3, 0(a1)
addi a0, a0, 1
addi a1, a1, 1
bne a2, a3, 1f
bnez a2, .Lsimpleloop

1:
sub a0, a2, a3
ret

.size strcmp, .-strcmp
--

0 comments on commit d30efc9

Please sign in to comment.