#-----------------------------------------------------------------------
# strlen -- length of a zero-terminated byte string.
#
# Syntax: GNU as, Intel operand order. Assumes `.intel_syntax noprefix`
#         is in effect -- TODO confirm, the directive is not visible here.
# In:     rdi = address of the string
# Out:    rax = number of bytes that precede the first zero byte
# Clobb:  rcx, rdi, r8, r9, r10, flags
#
# Register roles:
#   rdi  current read position
#   r8   count of bytes already known to be non-zero
#   r9   constant -0x0101010101010101 (adding it subtracts 1 per byte)
#   r10  constant  0x8080808080808080 (top bit of every byte)
#
# Phase one walks byte by byte until rdi is a multiple of sixteen.
# Phase two reads eight bytes per iteration; because rdi is aligned,
# such a load stays inside one aligned 8-byte word even when it reads
# past the terminator.
#
# The prefetches use prefetcht0 (SSE, part of baseline x86-64) rather
# than the AMD 3DNow!-family `prefetch`, whose availability is a
# separate CPUID feature. Both are hints only and do not change results.
#-----------------------------------------------------------------------
strlen:
        xor     r8d, r8d                # r8 = 0; a 32-bit write zero-extends

        # ---- phase one: byte-wise until the address is 16-byte aligned ----
.Lalignlong:
        test    rdi, 0xf                # low four address bits all clear?
        je      .LfindNull              # yes: rdi is a multiple of sixteen
        prefetcht0 [rdi + 8]
        cmp     byte ptr [rdi], 0
        je      .LansNoAdd              # terminator found before alignment
        inc     r8
        inc     rdi
        jmp     .Lalignlong

        # ---- phase two: eight bytes at a time ----
        # The first word is tested before the loop so the loop itself can
        # be a do-while, which needs fewer jumps than a while loop (Agner).
.LfindNull:
        mov     r9, 0xFEFEFEFEFEFEFEFF
        mov     r10, 0x8080808080808080 # citation: Bit Twiddling Hacks, Sean Eron Anderson
        prefetcht0 [rdi + 192]
        mov     rcx, [rdi]              # rcx = x, the next eight bytes
        lea     rax, [rcx + r9]         # rax = x - 0x0101010101010101
        not     rcx                     # rcx = ~x
        and     rcx, rax
        and     rcx, r10                # non-zero iff x contains a zero byte
        jne     .Lanswer
        # Original author's observation: this nop made the routine 2 cycles
        # faster (findloop changes from 4a -> 4b).
        # NOTE(review): presumably a code-alignment effect; the replaced
        # instructions encode to the same lengths as before, so the layout
        # should be unchanged -- confirm with a benchmark.
        nop
.Lfindloop:
        prefetcht0 [rdi + 420]
        mov     rcx, [rdi + 8]          # load the following word
        add     rdi, 8
        add     r8, 8                   # previous word held no zero byte
        lea     rax, [rcx + r9]
        not     rcx
        and     rcx, rax
        and     rcx, r10
        je      .Lfindloop

        # rcx has bit 7 set in the first zero byte of the word; any lower
        # byte is non-zero and therefore contributes no set bit.
.Lanswer:
        bsf     rcx, rcx                # bit index of the lowest set bit
        shr     rcx, 3                  # bit index -> byte offset in word
        lea     rax, [rcx + r8]         # length = bytes counted + offset
        ret

.LansNoAdd:
        mov     rax, r8                 # terminator hit during phase one
        ret
This should be x86-64 (64-bit) assembly code that computes the length of a char string; the address of the string is passed in RDI.
I don't understand the first part, .Lalignlong:
does it perform the data alignment?
If so, how is it supposed to work? In particular, the line test rdi, 0xf
confuses me very much.