Back to ARM page
Optimized strlen function for ARM CPU.
For the fun of it, I tried to write in assembly an optimized strlen function for ARM CPU.
I used for this my development board based on ARM920T (ARM v4T @200MHz) running GNU/Linux.
The compiler used is CodeSourcery's GCC. Its version is not very important since the functions are written in ASM.
My goal was to get better results than gnu builtin strlen function. I once found this function being :
.global gcc_strlen
.func  gcc_strlen
gcc_strlen:
bic     r1, r0, $3              @ addr of word containing first byte
ldr     r2, [r1], $4            @ get the first word
ands    r3, r0, $3              @ how many bytes are duff?
rsb     r0, r3, $0              @ get - that number into counter.
beq     Laligned                @ skip into main check routine if no @ more
orr     r2, r2, $0x000000ff     @ set this byte to non-zero
subs    r3, r3, $1              @ any more to do?
orrgt   r2, r2, $0x0000ff00     @ if so, set this byte
subs    r3, r3, $1              @ more?
orrgt   r2, r2, $0x00ff0000     @ then set.
Laligned:                       @ here, we have a word in r2.  Does it
tst     r2, $0x000000ff         @ contain any zeroes?
tstne   r2, $0x0000ff00         @
tstne   r2, $0x00ff0000         @
tstne   r2, $0xff000000         @
addne   r0, r0, $4              @ if not, the string is 4 bytes longer
ldrne   r2, [r1], $4            @ and we continue to the next word
bne     Laligned                @
Llastword:                      @ drop through to here once we find a
tst     r2, $0x000000ff         @
addne   r0, r0, $1              @
tstne   r2, $0x0000ff00         @ and add up to 3 bytes on to it
addne   r0, r0, $1              @
tstne   r2, $0x00ff0000         @ (if first three all non-zero, 4th
addne   r0, r0, $1              @  must be zero)
bx      lr
.endfunc
After many tries, I finally wrote the following function:
;@ ARM optimized strlen function
;@ -----------------------------
;@ prototype:
;@ unsigned int mystrlen(char *s)
;@ (C) Yann Poupet
;@ LICENSE: GPL V2
.globl mystrlen
.code  32
.func  mystrlen

.equ L_ENDIAN,0
.equ B_ENDIAN,1
.equ ENDIAN,L_ENDIAN

	;@ if((((data-0x01010101)^data)&0x80808080)==0) data does not contain zero byte
	;@ TODO: check R0 != NULL ?

mystrlen:
	BIC     R3,R0,#0x03         ;@ R3=R0&~3
	LDR     R1,[R3]             ;@ R1=*R3
	ANDS    R2,R0,#0x03         ;@ R2=R0&3
	MOV     R12,#32             ;@ R12=0x20
	                            ;@ if (R2 != 0)
	SUBNE   R2,R12,R2,LSL #3    ;@   R2=32-(8*R2)
	ORR     R12,R12,R12,LSL #8  ;@ R12=0x00002020
	ORR     R12,R12,R12,LSL #16 ;@ R12=0x20202020
	                            ;@ if (R2 != 0)
.if ENDIAN == L_ENDIAN
	ORRNE   R1,R1,R12,LSR R2    ;@   R1 |= (R12>>R2)
.else
	ORRNE   R1,R1,R12,LSL R2    ;@   R1 |= (R12<<R2)
.endif

LOOPSTART:

	SUB     R2,R1,R12,LSR #5    ;@ R2=R1-0x0101010101
	EOR     R2,R2,R1            ;@ R2=R2^R1
	ANDS    R2,R2,R12,LSL #2    ;@ R2=R2&0x80808080
	LDREQ   R1,[R3,#4]!

.equ  UNROLL_LEVEL,0
.rept UNROLL_LEVEL              ;@ Do loop unrolling
	SUBEQ   R2,R1,R12,LSR #5    ;@ R2=R1-0x0101010101
	EOREQ   R2,R2,R1            ;@ R2=R2^R1
	ANDEQS  R2,R2,R12,LSL #2    ;@ R2=R2&0x80808080
	LDREQ   R1,[R3,#4]!
.endr

	BEQ     LOOPSTART

	SUB     R0,R3,R0

.if ENDIAN == L_ENDIAN
	TST     R1,#0x000000FF
	ADDNE   R0,R0,#1
	TSTNE   R1,#0x0000FF00
	ADDNE   R0,R0,#1
	TSTNE   R1,#0x00FF0000
	ADDNE   R0,R0,#1
.else
	ANDS    R2,R1,#0xFF000000
	ADDNE   R0,R0,#1
	ANDNES  R2,R1,#0x00FF0000
	ADDNE   R0,R0,#1
	ANDNES  R2,R1,#0x0000FF00
	ADDNE   R0,R0,#1
.endif

	BX      LR

.endfunc

.end
I wrote an application to test the performances:
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/time.h>
extern int mystrlen(char *s);
extern int _strlen(char *s);
#define LOOPS 512
#define BUFLEN 512

void print_time(struct timeval *a,struct timeval *b)
{
	unsigned int sec,usec;
	sec = (unsigned int)(a->tv_sec-b->tv_sec)-((a->tv_usec<b->tv_usec)?1:0);
	usec= (unsigned int)(a->tv_usec+((a->tv_usec<b->tv_usec)?1000000:0)-b->tv_usec);
	printf("time: %u.%06us\n",sec,usec);
}

int main(int argc,char *argv[])
{
	int i,j;
	char string[BUFLEN];
	struct timeval b,a;
	unsigned l,k=0;

	memset(string,'x',BUFLEN);// should fill dcache
	string[BUFLEN-1] = 0;

	printf("Using strlen ...\n");

	// measure time to process with gcc's builtin strlen
	gettimeofday(&b,NULL);
	for (i=0;i<LOOPS;i++)
	{
		for (j=0;j<BUFLEN;j++)
		{
			l = __builtin_strlen(&string[BUFLEN - 1 - j]);
		}
	}
	gettimeofday(&a,NULL);

	print_time(&a,&b);
	printf("len=%u\n",l+k);

	printf("Using arm's strlen ...\n");
	gettimeofday(&b,NULL);
	for (i=0;i<LOOPS;i++)
	{
		for (j=0;j<BUFLEN;j++)
		{
			l = _strlen(&string[BUFLEN - 1 - j]);
		}
	}
	gettimeofday(&a,NULL);

	print_time(&a,&b);
	printf("len=%u\n",l+k);

	printf("Using mystrlen ...\n");

	// measure time to process with new strlen
	gettimeofday(&b,NULL);
	for (i=0;i<LOOPS;i++)
	{
		for (j=0;j<BUFLEN;j++)
		{
			l = mystrlen(&string[BUFLEN - 1 - j]);
		}
	}
	gettimeofday(&a,NULL);

	print_time(&a,&b);
	printf("len=%u\n",l+k);

	// check mystrlen accuracy
	for (i=0;i<BUFLEN;i++)
	{
		if ((l=mystrlen(&string[i])) != BUFLEN - i - 1)
		{
			printf("ERROR1:mystrlen(&string[%d])=%u\n",i,l);
			break;
		}
	}

	for (i=0;i<BUFLEN;i++)
	{
		// set byte before start of string to zero in order to check
		// it does not disturb the function
		if (i > 1) string[i-1] = 0;
		if ((l=mystrlen(&string[i])) != BUFLEN - i - 1)
		{
			printf("ERROR2:mystrlen(&string[%d])=%u\n",i,l);
			break;
		}
	}

	exit (0);
}
_strlen is an optimized function I found in newlib, written by ARM's Richard Earnshaw, cf here
His function is quite similar to mine; I however finished writting my function before I found this one written for Newlib.

The results on my board are the following:
# /strlen_test 
Using strlen ...
time: 0.807146s
Using arm's strlen ...
time: 0.716098s
Using mystrlen ...
time: 0.627786s

The numbers can be improved a little bit by setting UNROLL_LEVEL to 1, in which case the timings are:
# /strlen_test 
Using strlen ...
time: 0.808105s
Using arm's strlen ...
time: 0.715434s
Using mystrlen ...
time: 0.546416s
It can be improved even more by setting UNROLL_LEVEL to higher numbers, but the I coudn't get below 0.50s. Setting UNROLL_LEVEL to values higher than 2 or 3 does not improve much tests and increases code size.

NOTE: although I added code for big endian, I did not test it, so it might not work!

Writting this function was a lot of fun, and I'm glad it's faster than both the functions I compared it to. However, the one found in newlib, written by ARM, deals with much more options (ARM arch V6,v7,v5, Thumb, Thumb2, ...) and also manages platform indianness; mine does not (actually it should but I did not test it). AFAIK (I do not have any other platform to test), it is faster only for ARMV4T / Little endian.

You can use this code as long as you respect its license, which is GPL V2. If you do so, let me know :)

If you want to contact me, you'll find my email here

The files used or included in this pages can be downloaded : arm_asm.h,Makefile,strlen_arm.c,strlen.s,strlen_test.c.