Optimized strlen function for ARM CPU.
For the fun of it, I tried to write in assembly an optimized strlen function for ARM CPU.
I used for this my development board based on ARM920T (ARM v4T @200MHz) running GNU/Linux.
The compiler used is CodeSourcery's GCC. Its version is not very important since the functions are written in ASM.
My goal was to get better results than gnu builtin strlen function. I once found this function being :
.global gcc_strlen
.func gcc_strlen
gcc_strlen:
bic r1, r0, $3 @ addr of word containing first byte
ldr r2, [r1], $4 @ get the first word
ands r3, r0, $3 @ how many bytes are duff?
rsb r0, r3, $0 @ get - that number into counter.
beq Laligned @ skip into main check routine if no @ more
orr r2, r2, $0x000000ff @ set this byte to non-zero
subs r3, r3, $1 @ any more to do?
orrgt r2, r2, $0x0000ff00 @ if so, set this byte
subs r3, r3, $1 @ more?
orrgt r2, r2, $0x00ff0000 @ then set.
Laligned: @ here, we have a word in r2. Does it
tst r2, $0x000000ff @ contain any zeroes?
tstne r2, $0x0000ff00 @
tstne r2, $0x00ff0000 @
tstne r2, $0xff000000 @
addne r0, r0, $4 @ if not, the string is 4 bytes longer
ldrne r2, [r1], $4 @ and we continue to the next word
bne Laligned @
Llastword: @ drop through to here once we find a
tst r2, $0x000000ff @
addne r0, r0, $1 @
tstne r2, $0x0000ff00 @ and add up to 3 bytes on to it
addne r0, r0, $1 @
tstne r2, $0x00ff0000 @ (if first three all non-zero, 4th
addne r0, r0, $1 @ must be zero)
bx lr
.endfunc
After many tries, I finally wrote the following function:
;@ ARM optimized strlen function
;@ -----------------------------
;@ prototype:
;@ unsigned int mystrlen(char *s)
;@ (C) Yann Poupet
;@ LICENSE: GPL V2
.globl mystrlen
.code 32
.func mystrlen
.equ L_ENDIAN,0
.equ B_ENDIAN,1
.equ ENDIAN,L_ENDIAN
;@ if((((data-0x01010101)^data)&0x80808080)==0) data does not contain zero byte
;@ TODO: check R0 != NULL ?
mystrlen:
BIC R3,R0,#0x03 ;@ R3=R0&~3
LDR R1,[R3] ;@ R1=*R3
ANDS R2,R0,#0x03 ;@ R2=R0&3
MOV R12,#32 ;@ R12=0x20
;@ if (R2 != 0)
SUBNE R2,R12,R2,LSL #3 ;@ R2=32-(8*R2)
ORR R12,R12,R12,LSL #8 ;@ R12=0x00002020
ORR R12,R12,R12,LSL #16 ;@ R12=0x20202020
;@ if (R2 != 0)
.if ENDIAN == L_ENDIAN
ORRNE R1,R1,R12,LSR R2 ;@ R1 |= (R12>>R2)
.else
ORRNE R1,R1,R12,LSL R2 ;@ R1 |= (R12<<R2)
.endif
LOOPSTART:
SUB R2,R1,R12,LSR #5 ;@ R2=R1-0x0101010101
EOR R2,R2,R1 ;@ R2=R2^R1
ANDS R2,R2,R12,LSL #2 ;@ R2=R2&0x80808080
LDREQ R1,[R3,#4]!
.equ UNROLL_LEVEL,0
.rept UNROLL_LEVEL ;@ Do loop unrolling
SUBEQ R2,R1,R12,LSR #5 ;@ R2=R1-0x0101010101
EOREQ R2,R2,R1 ;@ R2=R2^R1
ANDEQS R2,R2,R12,LSL #2 ;@ R2=R2&0x80808080
LDREQ R1,[R3,#4]!
.endr
BEQ LOOPSTART
SUB R0,R3,R0
.if ENDIAN == L_ENDIAN
TST R1,#0x000000FF
ADDNE R0,R0,#1
TSTNE R1,#0x0000FF00
ADDNE R0,R0,#1
TSTNE R1,#0x00FF0000
ADDNE R0,R0,#1
.else
ANDS R2,R1,#0xFF000000
ADDNE R0,R0,#1
ANDNES R2,R1,#0x00FF0000
ADDNE R0,R0,#1
ANDNES R2,R1,#0x0000FF00
ADDNE R0,R0,#1
.endif
BX LR
.endfunc
.end
I wrote an application to test the performances:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
extern int mystrlen(char *s);
extern int _strlen(char *s);
#define LOOPS 512
#define BUFLEN 512
void print_time(struct timeval *a,struct timeval *b)
{
unsigned int sec,usec;
sec = (unsigned int)(a->tv_sec-b->tv_sec)-((a->tv_usec<b->tv_usec)?1:0);
usec= (unsigned int)(a->tv_usec+((a->tv_usec<b->tv_usec)?1000000:0)-b->tv_usec);
printf("time: %u.%06us\n",sec,usec);
}
int main(int argc,char *argv[])
{
int i,j;
char string[BUFLEN];
struct timeval b,a;
unsigned l,k=0;
memset(string,'x',BUFLEN);// should fill dcache
string[BUFLEN-1] = 0;
printf("Using strlen ...\n");
// measure time to process with gcc's builtin strlen
gettimeofday(&b,NULL);
for (i=0;i<LOOPS;i++)
{
for (j=0;j<BUFLEN;j++)
{
l = __builtin_strlen(&string[BUFLEN - 1 - j]);
}
}
gettimeofday(&a,NULL);
print_time(&a,&b);
printf("len=%u\n",l+k);
printf("Using arm's strlen ...\n");
gettimeofday(&b,NULL);
for (i=0;i<LOOPS;i++)
{
for (j=0;j<BUFLEN;j++)
{
l = _strlen(&string[BUFLEN - 1 - j]);
}
}
gettimeofday(&a,NULL);
print_time(&a,&b);
printf("len=%u\n",l+k);
printf("Using mystrlen ...\n");
// measure time to process with new strlen
gettimeofday(&b,NULL);
for (i=0;i<LOOPS;i++)
{
for (j=0;j<BUFLEN;j++)
{
l = mystrlen(&string[BUFLEN - 1 - j]);
}
}
gettimeofday(&a,NULL);
print_time(&a,&b);
printf("len=%u\n",l+k);
// check mystrlen accuracy
for (i=0;i<BUFLEN;i++)
{
if ((l=mystrlen(&string[i])) != BUFLEN - i - 1)
{
printf("ERROR1:mystrlen(&string[%d])=%u\n",i,l);
break;
}
}
for (i=0;i<BUFLEN;i++)
{
// set byte before start of string to zero in order to check
// it does not disturb the function
if (i > 1) string[i-1] = 0;
if ((l=mystrlen(&string[i])) != BUFLEN - i - 1)
{
printf("ERROR2:mystrlen(&string[%d])=%u\n",i,l);
break;
}
}
exit (0);
}
_strlen is an optimized function I found in newlib, written by ARM's Richard Earnshaw, cf here
His function is quite similar to mine; I however finished writting my function before I found this one written for Newlib.
The results on my board are the following:
# /strlen_test
Using strlen ...
time: 0.807146s
Using arm's strlen ...
time: 0.716098s
Using mystrlen ...
time: 0.627786s
The numbers can be improved a little bit by setting UNROLL_LEVEL to 1, in which case the timings are:
# /strlen_test
Using strlen ...
time: 0.808105s
Using arm's strlen ...
time: 0.715434s
Using mystrlen ...
time: 0.546416s
It can be improved even more by setting UNROLL_LEVEL to higher numbers, but the I coudn't get below 0.50s. Setting UNROLL_LEVEL to values higher than 2 or 3 does not improve much tests and increases code size.
NOTE: although I added code for big endian, I did not test it, so it might not work!
Writting this function was a lot of fun, and I'm glad it's faster than both the functions I compared it to. However, the one found in newlib, written by ARM, deals with much more options (ARM arch V6,v7,v5, Thumb, Thumb2, ...) and also manages platform indianness; mine does not (actually it should but I did not test it). AFAIK (I do not have any other platform to test), it is faster only for ARMV4T / Little endian.
You can use this code as long as you respect its license, which is GPL V2. If you do so, let me know :)
If you want to contact me, you'll find my email here
The files used or included in this pages can be downloaded : arm_asm.h,Makefile,strlen_arm.c,strlen.s,strlen_test.c.