Rockbox.org home
release
dev builds
extras
themes manual
wiki
device status forums
mailing lists
IRC bugs
patches
dev guide



Rockbox mail archive

Subject: ARM asm memcpy, code review requested

ARM asm memcpy, code review requested

From: <postmaster_at_diffenbach.org>
Date: Mon, 03 Jul 2006 13:24:56 -0400

I've written a version of memcpy in ARM assembly for the ipod builds.

It's NOT based on the version in the linux kernel source (but after
writing it, I found it's similar to the kernel's memset). It uses
load/store multiple, but not preload or bursting or anything cool like
that.

As this is my first foray into ARM assembly, I'd like some fresh eyes
to give it a code review. Once it's been reviewed, cleaned up, and
possibly improved, I'll release it under the GPL.

Attached is the source file and a set of timings (in MS excel format).

In general, it runs in about one microsecond more than half the time
the C memcpy takes to run, for word-aligned dst and src.

For non-word aligned dst and srcs, the C version falls back on
byte-wise copying. The asm memcpy can do fast copy for unaligned dst
and src, so long as dst and src both have the SAME (mis-)alignment.
For these cases, the asm memcpy takes about a tenth of the time the
C memcpy takes, with the ratio improving as more bytes are copied. For
differently aligned dst and src, the asm also falls back on byte-wise
copying.

(Of course, callers really shouldn't be doing big non-word aligned
copies.)

For certain cases, the asm memcpy takes ~ one microsecond longer than
the C version, in particular for word-aligned copies of lengths 1, 2,
5, 16, 17, 20, 26, 30, and 40 bytes.

--Tom


/***************************************************************************
 *
 * Copyright (C) 2006 by TP Diffenbach
 *
 * This software is NOT yet released under any license
 *
 ****************************************************************************/
#include "config.h"

/* Place the code in the .icode section (allocatable, executable).
   GNU as on ARM treats '@' as the start of a comment, so the section type
   has to be spelled %progbits there; other targets use @progbits. */
#ifdef CPU_ARM
    .section .icode,"ax",%progbits
#else
    .section .icode,"ax",@progbits
#endif

    .align 2
    
    
different_aligns:
    /*
     * Byte-wise fallback for memcpy2.  Reached only by a branch from memcpy2
     * (len <= 8, or dst and src differ in their low two address bits), so it
     * returns straight to the caller of memcpy2.
     *
     * In:  r0 = dst, r1 = src, r2 = len (never 0: memcpy2 returns early for
     *      a zero length), lr = return address.
     * Out: r0 = original dst.
     */
    stmfd   r13!, {r0, lr}          @ save original dst (the return value)
byte_loop:
    ldrb    r3, [r1], #1            @ r3 = *src++
    strb    r3, [r0], #1            @ *dst++ = r3
    subs    r2, r2, #1
    bne     byte_loop               @ until len reaches 0
    ldmfd   r13!, {r0, pc}          @ r0 = original dst; return


        .global memcpy2
        .type memcpy2, %function
memcpy2:
    /*
     * void* memcpy2( void* dst, const void* src, size_t len )
     *
     * Copies len bytes from src to dst and returns the original dst.
     *
     * Strategy:
     *   - len == 0: return immediately.
     *   - len <= 8, or dst and src have different alignment within a word:
     *     copy byte by byte (different_aligns).
     *   - otherwise: copy 1-3 bytes to reach a word boundary, then copy
     *     64-byte blocks with load/store multiple, then the left-over
     *     16-byte groups, words and bytes.
     *
     * Register usage:
     *   r0: dst
     *   r1: src
     *   r2: len (bytes still to copy once the pointers are word aligned)
     *   r3: various bitmasks; data byte in the different_aligns loop
     *   r4: alignment data byte, then counter for the 64-byte loop
     *   r5-r8: data registers for load/store (saved and restored here)
     */

    cmp     r2, #0
    moveq   pc, lr                  @ nothing to copy: return, r0 is still dst

    cmp     r2, #8
    bls     different_aligns        @ short copies are done byte-wise

    /* Do dst and src share the same alignment within a word? */
    eor     r3, r0, r1              @ r3 = dst ^ src
    tst     r3, #3                  @ low two bits differ?
    bne     different_aligns        @ align( r1 ) != align( r0 ): byte-wise

    /* Same alignment: the word-wise path needs r4-r8. */
    stmfd   r13!, {r0, r4-r8, lr}   @ save original dst, work regs, return address
    ands    r3, r0, #3              @ r3 = that common alignment (0-3)
    beq     multi                   @ already word aligned

    /*
     * Copy up to three bytes to reach a word boundary:
     *   align 1 -> 3 bytes, align 2 -> 2 bytes, align 3 -> 1 byte.
     * After the compare with 2, NE selects align 1 or 3 and LS selects
     * align 1 or 2; none of the loads/stores change the flags.
     * (The original source also carried a disabled variant that used one
     * conditional halfword load/store in place of the two LS byte copies.)
     */
    cmp     r3, #2
    ldrneb  r4, [r1], #1            @ align 1 or 3: one byte
    strneb  r4, [r0], #1
    ldrlsb  r4, [r1], #1            @ align 1 or 2: two more bytes
    strlsb  r4, [r0], #1
    ldrlsb  r4, [r1], #1
    strlsb  r4, [r0], #1

    sub     r3, r3, #4              @ r3 = align - 4 = -(bytes just copied)
    add     r2, r2, r3              @ len -= bytes just copied

multi:
    /*
     * dst and src are word aligned here; r2 = bytes still to copy.
     * The bits of r2 select the remaining work:
     *   bits 6 and up : full iterations of the 64-byte loop
     *   bits 5-4      : extra 16-byte ldm/stm pairs (0-3)
     *   bits 3-2      : extra words (0-3)
     *   bits 1-0      : trailing bytes (0-3)
     */
    ands    r3, r2, #48             @ r3 = extra 16-byte pairs << 4
    mov     r4, r2, LSR #6          @ r4 = full iterations (flags untouched)

    /* Like Duff's device, enter the unrolled loop part-way down so the
       extra 16-byte pairs are copied first. */
    /* TODO (from the original author): replace with direct adjustment of PC */
    beq     loop_test               @ no extra pairs
    cmp     r3, #32                 @ r3 is 16, 32 or 48: 1, 2 or 3 pairs
    bhi     loop3
    beq     loop2
    blo     loop1

loop:
    ldmia   r1!, { r5-r8 }          @ 16 bytes per ldm/stm pair
    stmia   r0!, { r5-r8 }
loop3:
    ldmia   r1!, { r5-r8 }
    stmia   r0!, { r5-r8 }
loop2:
    ldmia   r1!, { r5-r8 }
    stmia   r0!, { r5-r8 }
loop1:
    ldmia   r1!, { r5-r8 }
    stmia   r0!, { r5-r8 }

loop_test:
    cmp     r4, #0
    subne   r4, r4, #1              @ one full iteration consumed
    bne     loop

    /* Extra words: r3 = 4, 8 or 12 for one, two or three words. */
    ands    r3, r2, #12
    beq     extra_bytes
    cmp     r3, #8
    ldrne   r5, [r1], #4            @ r3 is 4 or 12: copy one word
    strne   r5, [r0], #4
    ldmhsia r1!, {r5-r6}            @ r3 is 8 or 12: copy two words
    stmhsia r0!, {r5-r6}

    /* Trailing bytes.  The count must be taken from the low two bits of
       r2; r3 still holds the word mask at this point and must be reloaded
       before it is compared. */
extra_bytes:
    ands    r3, r2, #3              @ r3 = trailing byte count (0-3)
    beq     clean_up
    cmp     r3, #2
    ldrneb  r5, [r1], #1            @ 1 or 3: copy one byte
    strneb  r5, [r0], #1
    ldrhsb  r5, [r1], #1            @ 2 or 3: copy two bytes
    strhsb  r5, [r0], #1
    ldrhsb  r5, [r1], #1
    strhsb  r5, [r0], #1

clean_up:
    ldmfd   r13!, {r0, r4-r8, pc}   @ r0 = original dst (the return value)

end:
    /* Record the size of the exported function: from its entry label to
       the current location. */
    .size memcpy2, .-memcpy2
        .align 2
    
@ Local Variables:
@ asm-comment-char: ?@
@ comment-start: "@ "
@ block-comment-start: "/*"
@ block-comment-end: "*/"
@ indent-tabs-mode: t
@ End:


    
Received on 2006-07-03

Page template was last modified "Tue Sep 7 00:00:02 2021" The Rockbox Crew -- Privacy Policy