Index: apps/codecs/libmad/layer3.c =================================================================== --- apps/codecs/libmad/layer3.c (revision 13459) +++ apps/codecs/libmad/layer3.c (working copy) @@ -922,8 +922,19 @@ } /* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */ +# if defined(CPU_ARM) +# define MASK(cache, sz, bits) \ + ({ unsigned long res; \ + asm ("mov %0, #1\n\t" \ + "rsb %0, %0, %0, lsl %3\n\t" /* res = (1 << bits) - 1 */ \ + "and %0, %0, %1, lsr %2" /* res &= cache >> (sz - bits) */ \ + : "=&r" (res) : "r" (cache), "r" ((sz) - (bits)), "r" (bits)); \ + res; \ + }) +#else # define MASK(cache, sz, bits) \ (((cache) >> ((sz) - (bits))) & ((1 << (bits)) - 1)) +#endif # define MASK1BIT(cache, sz) \ ((cache) & (1 << ((sz) - 1))) @@ -1546,6 +1557,9 @@ return MAD_ERROR_NONE; } +#if defined(CPU_ARM) /* NOTE(review): SOURCES builds the ARM .S files only when SIMULATOR is undefined; confirm CPU_ARM is never defined in simulator builds */ +void III_aliasreduce(mad_fixed_t xr[576], int lines); +#else /* * NAME: III_aliasreduce() * DESCRIPTION: perform frequency line alias reduction @@ -1600,6 +1614,7 @@ } } } +#endif # if defined(ASO_IMDCT) void III_imdct_l(mad_fixed_t const [18], mad_fixed_t [36], unsigned int); @@ -2894,6 +2909,11 @@ #endif +#ifdef CPU_ARM +void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18], + mad_fixed_t sample[18][32], unsigned int sb); +#else + /* * NAME: III_overlap() * DESCRIPTION: perform overlap-add of windowed IMDCT outputs @@ -2941,6 +2961,7 @@ } # endif } +#endif /* * NAME: III_overlap_z() @@ -3142,10 +3163,21 @@ /* (nonzero) subbands 2-31 */ +/* i = 576; while (i > 36 && xr[ch][i - 1] == 0) --i; +*/ + { + /* saves ~600k cycles (figure from the original note, not re-measured): a nonzero sentinel is stored in xr[ch][35] so the backward scan needs no index check and always ends with i >= 36, like the commented-out loop above; the saved word is restored right after the scan */ + mad_fixed_t *p = &xr[ch][576]; + mad_fixed_t tmp = xr[ch][35]; + xr[ch][35] = 1; + while (!*--p); + xr[ch][35] = tmp; + i = p - &xr[ch][0] + 1; + } sblimit = 32 - (576 - i) / 18; if (channel->block_type != 2) { Index: apps/codecs/libmad/dct32_arm.S =================================================================== --- apps/codecs/libmad/dct32_arm.S (revision 0) +++ apps/codecs/libmad/dct32_arm.S (revision 0) @@ -0,0 +1,324 @@ 
+/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * $Id: $ + * + * Copyright (C) 2007 by Tomasz Malesinski + * + * All files in this archive are subject to the GNU General Public License. + * See the file COPYING in the source tree root for full license agreement. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. + * + ****************************************************************************/ + + .global dct32 + .section .text,"ax",%progbits +/* .text */ + +dct32: + stmdb r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + sub r13, r13, #144 /* frame: the four incoming argument registers are saved at r13+0..r13+12, followed by a 128-byte work buffer at r13+16 */ + str r0, [r13, #12] + str r1, [r13, #8] + str r2, [r13, #4] + str r3, [r13] + add r0, r13, #16 + add r1, r0, #128 + ldr r2, =bitrev +.shuffle: + ldr r5, [r13, #12] + ldr r3, [r2], #4 + sub r4, r5, r3, lsl #4 + add r3, r5, r3, lsl #4 + ldr r6, [r3] + ldr r8, [r4, #124] + add r6, r6, r8 + sub r8, r6, r8, lsl #1 + ldr r7, [r3, #8] + ldr lr, [r4, #116] + add r7, r7, lr + sub lr, r7, lr, lsl #1 + ldr r10, [r3, #64] + ldr r9, [r4, #60] + add r10, r10, r9 + sub r9, r10, r9, lsl #1 + ldr r11, [r3, #72] + ldr r12, [r4, #52] + add r11, r11, r12 + sub r12, r11, r12, lsl #1 + add r6, r6, r10 + sub r10, r6, r10, lsl #1 + add r7, r7, r11 + sub r11, r7, r11, lsl #1 + add r8, r8, r12 + sub r12, r8, r12, lsl #1 + add lr, lr, r9 + sub r9, lr, r9, lsl #1 + stmia r0!, {r6, r7, r8, r9, r10, r11, r12, lr} + cmp r0, r1 + bne .shuffle /* four passes, one per bitrev entry; each pass stores 8 words into the work buffer */ + ldr r0, =189812531 /* = 0x0b504f33, the same magnitude as the 0xb504f33 entries of the sincos table */ + add r1, r13, #16 + add r3, r1, #128 +.l2: + add r2, r1, #32 + ldmia r2, {r4, r5, r8, r9} + ldmia r1, {r6, r7, r10, r11} + add r6, r6, r4 + sub r4, r6, r4, lsl #1 + add r7, r7, r5 + sub r5, r7, r5, lsl #1 + stmia r2!, {r4, r5} + stmia r1!, {r6, 
r7} + add r9, r9, r8 + sub r8, r9, r8, lsl #1 + smull r4, r6, r9, r0 + movs r4, r4, lsr #28 + adc r4, r4, r6, lsl #4 /* r4 = 64-bit product r6:r4 shifted right by 28, rounded with the carry left by movs; the same three-instruction pattern recurs throughout this routine */ + smull r5, r6, r8, r0 + movs r5, r5, lsr #28 + adc r5, r5, r6, lsl #4 + add r10, r10, r4 + sub r4, r10, r4, lsl #1 + add r11, r11, r5 + sub r5, r11, r5, lsl #1 + stmia r2!, {r4, r5} + stmia r1!, {r10, r11} + ldmia r2, {r5, r6, r8, r11} + ldmia r1, {r4, r7, r9, r10} + add r4, r4, r6 + sub r6, r4, r6, lsl #1 + add r7, r7, r5 + sub r5, r7, r5, lsl #1 + stmia r2!, {r6, r7} + stmia r1!, {r4, r5} + add r11, r11, r8 + sub r8, r11, r8, lsl #1 + smull r5, r4, r8, r0 + movs r5, r5, lsr #28 + adc r5, r5, r4, lsl #4 + smull r6, r4, r11, r0 + movs r6, r6, lsr #28 + adc r6, r6, r4, lsl #4 + add r9, r9, r5 + sub r5, r9, r5, lsl #1 + sub r10, r10, r6 + add r6, r10, r6, lsl #1 + stmia r2!, {r5, r6} + stmia r1!, {r9, r10} + add r1, r1, #32 + cmp r1, r3 + bne .l2 /* two passes: r1 advances 64 bytes per pass across the 128-byte work buffer */ + add r2, r13, #16 + add r3, r2, #64 + ldr r0, =sincos + add r1, r0, #128 +.lbut8: + ldmia r3, {r7, r8} + ldmia r0, {r9, r10} + add r0, r0, #16 + smull r6, r5, r7, r9 + smlal r6, r5, r10, r8 + movs r6, r6, lsr #28 + adc r6, r6, r5, lsl #4 + smull r10, r5, r7, r10 + rsb r9, r9, #0 + smlal r10, r5, r8, r9 + movs r10, r10, lsr #28 + adc r5, r10, r5, lsl #4 + ldmia r2, {r7, r8} + add r7, r7, r5 + sub r5, r7, r5, lsl #1 + add r8, r8, r6 + sub r6, r8, r6, lsl #1 + stmia r3!, {r5, r6} + stmia r2!, {r7, r8} + cmp r0, r1 + bne .lbut8 /* eight passes: r0 steps through the first 128 bytes of sincos 16 bytes at a time, reading one pair of words per pass */ + add r1, r13, #16 + ldr r2, =sincos + ldr r3, =sincos2 + ldr r0, [r13, #8] + mov r0, r0, lsl #2 + ldr r4, [r13, #4] + add r4, r4, r0 + ldr r5, [r13] + add r5, r5, #480 + add r5, r5, r0 + mov r0, #0 +.l4: + rsb r12, r0, #16 + and r12, r12, #15 + add lr, r13, #16 + add r12, lr, r12, lsl #3 + ldmia r1!, {r10, r11} + ldmia r12, {r6, r7} + add r6, r6, r10 + sub r10, r6, r10, lsl #1 + add r11, r11, r7 + sub r7, r11, r7, lsl #1 + ldmia r2!, {r12, lr} + smull r9, r8, r11, r12 + smlal r9, r8, lr, r10 + movs r9, r9, lsr #28 + adc r9, r9, r8, lsl #4 + smull lr, r8, r11, lr + rsb r12, r12, #0 + 
smlal lr, r8, r10, r12 + movs lr, lr, lsr #28 + adc r8, lr, r8, lsl #4 + add r6, r6, r8 + sub r8, r6, r8, lsl #1 + add r7, r7, r9 + sub r9, r7, r9, lsl #1 + add lr, r3, #128 + ldmia lr, {r10, r11} + smull lr, r12, r8, r11 + smlal lr, r12, r9, r10 + movs lr, lr, lsr #28 + adc r12, lr, r12, lsl #4 + str r12, [r4], #32 + cmp r0, #0 + cmpne r0, #8 + beq .skip1 /* the extra store below is skipped when r0 is 0 or 8 */ + smull lr, r12, r8, r10 + rsb r9, r9, #0 + smlal lr, r12, r9, r11 + movs lr, lr, lsr #28 + adc r12, lr, r12, lsl #4 + add lr, r5, r0, lsl #6 + str r12, [lr, #-512] +.skip1: + ldmia r3!, {r10, r11} + smull lr, r12, r7, r10 + smlal lr, r12, r6, r11 + movs lr, lr, lsr #28 + adc r12, lr, r12, lsl #4 + str r12, [r5], #-32 + cmp r0, #0 + cmpne r0, #8 + beq .skip2 /* same condition as .skip1: no extra store when r0 is 0 or 8 */ + smull lr, r12, r6, r10 + rsb r7, r7, #0 + smlal lr, r12, r7, r11 + movs lr, lr, lsr #28 + adc r12, lr, r12, lsl #4 + sub lr, r4, r0, lsl #6 + str r12, [lr, #480] +.skip2: + add r0, r0, #1 + cmp r0, #9 + bne .l4 /* nine passes: r0 = 0..8 */ + add r13, r13, #144 /* release the frame reserved at entry */ + ldmia r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} +bitrev: + .word 0x0 + .word 0x2 + .word 0x1 + .word 0x3 + +sincos: + .word 0x0 + .word 0x10000000 + .word -0x31f1708 + .word 0xfb14be8 + .word -0x61f78aa + .word 0xec835e8 + .word -0x8e39d9d + .word 0xd4db315 + .word -0xb504f33 + .word 0xb504f33 + .word -0xd4db315 + .word 0x8e39d9d + .word -0xec835e8 + .word 0x61f78aa + .word -0xfb14be8 + .word 0x31f1708 + .word -0x10000000 + .word 0x0 + .word -0xfb14be8 + .word -0x31f1708 + .word -0xec835e8 + .word -0x61f78aa + .word -0xd4db315 + .word -0x8e39d9d + .word -0xb504f33 + .word -0xb504f33 + .word -0x8e39d9d + .word -0xd4db315 + .word -0x61f78aa + .word -0xec835e8 + .word -0x31f1708 + .word -0xfb14be8 + +sincos2: + .word 0x0 + .word 0x8000000 + .word 0x647d98 + .word 0x7fd8879 + .word 0xc8bd36 + .word 0x7f62369 + .word 0x12c8107 + .word 0x7e9d560 + .word 0x18f8b84 + .word 0x7d8a5f4 + .word 0x1f19f98 + .word 0x7c29fbf + .word 0x25280c6 + .word 0x7a7d056 + .word 0x2b1f34f + .word 0x7884841 + .word 0x30fbc55 + .word 0x7641af4 + 
.word 0x36ba201 + .word 0x73b5ebd + .word 0x3c56ba7 + .word 0x70e2cbc + .word 0x41ce1e6 + .word 0x6dca0d1 + .word 0x471cece + .word 0x6a6d98a + .word 0x4c3fdff + .word 0x66cf812 + .word 0x5133cc9 + .word 0x62f201b + .word 0x55f5a4d + .word 0x5ed77c9 + .word 0x5a8279a + .word 0x5a8279a + .word 0x5ed77c9 + .word 0x55f5a4d + .word 0x62f201b + .word 0x5133cc9 + .word 0x66cf812 + .word 0x4c3fdff + .word 0x6a6d98a + .word 0x471cece + .word 0x6dca0d1 + .word 0x41ce1e6 + .word 0x70e2cbc + .word 0x3c56ba7 + .word 0x73b5ebd + .word 0x36ba201 + .word 0x7641af4 + .word 0x30fbc55 + .word 0x7884841 + .word 0x2b1f34f + .word 0x7a7d056 + .word 0x25280c6 + .word 0x7c29fbf + .word 0x1f19f98 + .word 0x7d8a5f4 + .word 0x18f8b84 + .word 0x7e9d560 + .word 0x12c8107 + .word 0x7f62369 + .word 0xc8bd36 + .word 0x7fd8879 + .word 0x647d98 Index: apps/codecs/libmad/SOURCES =================================================================== --- apps/codecs/libmad/SOURCES (revision 13459) +++ apps/codecs/libmad/SOURCES (working copy) @@ -14,4 +14,6 @@ #endif #if defined(CPU_ARM) && !defined(SIMULATOR) imdct_l_arm.S +dct32_arm.S +synth_full_arm.S #endif Index: apps/codecs/libmad/synth.c =================================================================== --- apps/codecs/libmad/synth.c (revision 13459) +++ apps/codecs/libmad/synth.c (working copy) @@ -67,6 +67,13 @@ } } +#ifdef FPM_ARM + +void dct32(mad_fixed_t const in[32], unsigned int slot, + mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]); /* defined in dct32_arm.S; NOTE(review): SOURCES adds that file under CPU_ARM and not SIMULATOR, while this guard tests FPM_ARM - confirm the two conditions always agree */ + +#else + /* * An optional optimization called here the Subband Synthesis Optimization * (SSO) improves the performance of subband synthesis at the expense of @@ -533,6 +540,8 @@ # undef MUL # undef SHIFT +#endif + /* third SSO shift and/or D[] optimization preshift */ # if defined(OPT_SSO) @@ -816,10 +825,230 @@ } } -#else +#elif defined(FPM_ARM) +#define PROD_ODD_0(hi, lo, f, ptr) /* hi:lo = sum over k = 0..7 of f[k] times the word at ptr index 1, 15, 13, 11, 9, 7, 5, 3 (byte offsets 4, 60, 52, ...); clobbers r0-r4 */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #4]\n\t" \ + "smull %0, %1, r0, 
r4\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #52]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #44]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #36]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #28]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #20]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #12]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_ODD_A(hi, lo, f, ptr) /* hi:lo += f[k] times the word at ptr index 1, 15, 13, 11, 9, 7, 5, 3 for k = 0..7; hi and lo must already hold the running sum */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #4]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #52]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #44]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #36]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #28]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #20]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #12]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_EVEN_0(hi, lo, f, ptr) /* hi:lo = sum over k = 0..7 of f[k] times the word at ptr index 0, 14, 12, 10, 8, 6, 4, 2 */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #0]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #56]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #48]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #40]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #32]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #24]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #16]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #8]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", 
"r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_EVEN_A(hi, lo, f, ptr) /* hi:lo += f[k] times the word at ptr index 0, 14, 12, 10, 8, 6, 4, 2 for k = 0..7 */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #0]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #56]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #48]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #40]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #32]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #24]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #16]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #8]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_EVENBACK_0(hi, lo, f, ptr) /* hi:lo = sum over k = 0..7 of f[k] times the word at ptr index 15, 17, 19, 21, 23, 25, 27, 29; not invoked in the visible part of synth_full */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #68]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #76]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #84]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #92]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #100]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #108]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #116]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_EVENBACK_A(hi, lo, f, ptr) /* hi:lo += f[k] times the word at ptr index 15, 17, 19, 21, 23, 25, 27, 29 for k = 0..7 */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #68]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #76]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #84]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #92]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, 
#100]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #108]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #116]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_ODDBACK_0(hi, lo, f, ptr) /* hi:lo = sum over k = 0..7 of f[k] times the word at ptr index 30, 16, 18, 20, 22, 24, 26, 28 */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #120]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #64]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #72]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #80]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #88]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #96]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #104]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #112]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +#define PROD_ODDBACK_A(hi, lo, f, ptr) /* hi:lo += f[k] times the word at ptr index 30, 16, 18, 20, 22, 24, 26, 28 for k = 0..7 */ \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #120]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #64]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #72]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #80]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #88]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #96]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #104]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #112]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4"); \ + } while (0) + +void synth_full1(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8], + mad_fixed_t const (*D0ptr)[32], + mad_fixed_t const (*D1ptr)[32]); /* synth_full1 and synth_full2 are defined in synth_full_arm.S */ +void synth_full2(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8], + mad_fixed_t 
const (*D0ptr)[32], + mad_fixed_t const (*D1ptr)[32]); + +#ifndef CPU_PP static void synth_full(struct mad_synth *synth, struct mad_frame const *frame, + unsigned int nch, unsigned int ns) ICODE_ATTR; /* this ICODE_ATTR prototype is compiled out when CPU_PP is defined */ +#endif +static +void synth_full(struct mad_synth *synth, struct mad_frame const *frame, unsigned int nch, unsigned int ns) { int p; @@ -855,6 +1084,7 @@ if(s & 1) { ptr = *D0ptr; +/* ML0(hi, lo, (*fx)[0], ptr[ 1]); MLA(hi, lo, (*fx)[1], ptr[15]); MLA(hi, lo, (*fx)[2], ptr[13]); @@ -863,7 +1093,10 @@ MLA(hi, lo, (*fx)[5], ptr[ 7]); MLA(hi, lo, (*fx)[6], ptr[ 5]); MLA(hi, lo, (*fx)[7], ptr[ 3]); +*/ + PROD_ODD_0(hi, lo, *fx, ptr); /* inline-asm replacement for the commented-out ML0/MLA sequence above */ MLN(hi, lo); +/* MLA(hi, lo, (*fe)[0], ptr[ 0]); MLA(hi, lo, (*fe)[1], ptr[14]); MLA(hi, lo, (*fe)[2], ptr[12]); @@ -872,9 +1105,146 @@ MLA(hi, lo, (*fe)[5], ptr[ 6]); MLA(hi, lo, (*fe)[6], ptr[ 4]); MLA(hi, lo, (*fe)[7], ptr[ 2]); +*/ + PROD_EVEN_A(hi, lo, *fe, ptr); /* accumulates onto the negated odd sum, replacing the commented-out MLA sequence above */ pcm[0] = SHIFT(MLZ(hi, lo)); pcm += 16; + synth_full1(pcm, fo, fe, D0ptr, D1ptr); /* assembly loop stores 15 pairs pcm[-sb] and pcm[sb] for sb = 15..1; the pointers are passed by value, hence the explicit += 15 updates that follow */ + D0ptr += 15; + D1ptr += 15; + fo += 15; + fe += 15; + + ptr = *(D0ptr + 1); + PROD_ODD_0(hi, lo, *fo, ptr); +/* + ML0(hi, lo, (*fo)[0], ptr[ 1]); + MLA(hi, lo, (*fo)[1], ptr[15]); + MLA(hi, lo, (*fo)[2], ptr[13]); + MLA(hi, lo, (*fo)[3], ptr[11]); + MLA(hi, lo, (*fo)[4], ptr[ 9]); + MLA(hi, lo, (*fo)[5], ptr[ 7]); + MLA(hi, lo, (*fo)[6], ptr[ 5]); + MLA(hi, lo, (*fo)[7], ptr[ 3]); +*/ + pcm[0] = SHIFT(-MLZ(hi, lo)); + } + else + { + ptr = *D0ptr; +/* + ML0(hi, lo, (*fx)[0], ptr[ 0]); + MLA(hi, lo, (*fx)[1], ptr[14]); + MLA(hi, lo, (*fx)[2], ptr[12]); + MLA(hi, lo, (*fx)[3], ptr[10]); + MLA(hi, lo, (*fx)[4], ptr[ 8]); + MLA(hi, lo, (*fx)[5], ptr[ 6]); + MLA(hi, lo, (*fx)[6], ptr[ 4]); + MLA(hi, lo, (*fx)[7], ptr[ 2]); +*/ + PROD_EVEN_0(hi, lo, *fx, ptr); + MLN(hi, lo); +/* + MLA(hi, lo, (*fe)[0], ptr[ 1]); + MLA(hi, lo, (*fe)[1], ptr[15]); + MLA(hi, lo, (*fe)[2], ptr[13]); + MLA(hi, lo, (*fe)[3], ptr[11]); + MLA(hi, lo, (*fe)[4], ptr[ 9]); + MLA(hi, lo, (*fe)[5], ptr[ 7]); + MLA(hi, lo, (*fe)[6], 
ptr[ 5]); + MLA(hi, lo, (*fe)[7], ptr[ 3]); +*/ + PROD_ODD_A(hi, lo, *fe, ptr); + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + + synth_full2(pcm, fo, fe, D0ptr, D1ptr); /* counterpart of synth_full1 for even s, with the even and odd coefficient offsets swapped; pointers are again advanced by hand below */ + D0ptr += 15; + D1ptr += 15; + fo += 15; + fe += 15; + + ptr = *(D0ptr + 1); +/* + ML0(hi, lo, (*fo)[0], ptr[ 0]); + MLA(hi, lo, (*fo)[1], ptr[14]); + MLA(hi, lo, (*fo)[2], ptr[12]); + MLA(hi, lo, (*fo)[3], ptr[10]); + MLA(hi, lo, (*fo)[4], ptr[ 8]); + MLA(hi, lo, (*fo)[5], ptr[ 6]); + MLA(hi, lo, (*fo)[6], ptr[ 4]); + MLA(hi, lo, (*fo)[7], ptr[ 2]); +*/ + PROD_EVEN_0(hi, lo, *fo, ptr); + pcm[0] = SHIFT(-MLZ(hi, lo)); + } + + pcm += 16; + phase = (phase + 1) % 16; /* phase cycles through 0..15 */ + } + } +} + +# else /* FPM_ARM not defined: synth_full written with the ML0/MLA/MLN/MLZ macros only */ + +static +void synth_full(struct mad_synth *synth, struct mad_frame const *frame, + unsigned int nch, unsigned int ns) +{ + int p; + unsigned int phase, ch, s, sb; + mad_fixed_t *pcm, (*filter)[2][2][16][8]; + mad_fixed_t const (*sbsample)[36][32]; + mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; + mad_fixed_t const (*D0ptr)[32], *ptr; + mad_fixed_t const (*D1ptr)[32]; + mad_fixed64hi_t hi; + mad_fixed64lo_t lo; + + for (ch = 0; ch < nch; ++ch) { + sbsample = &frame->sbsample[ch]; + filter = &synth->filter[ch]; + phase = synth->phase; + pcm = synth->pcm.samples[ch]; + + for (s = 0; s < ns; ++s) { + dct32((*sbsample)[s], phase >> 1, + (*filter)[0][phase & 1], (*filter)[1][phase & 1]); + + p = (phase - 1) & 0xf; + + /* calculate 32 samples */ + fe = &(*filter)[0][ phase & 1][0]; + fx = &(*filter)[0][~phase & 1][0]; + fo = &(*filter)[1][~phase & 1][0]; + + D0ptr = (void*)&D[0][ p]; + D1ptr = (void*)&D[0][-p]; + + if(s & 1) + { + ptr = *D0ptr; + ML0(hi, lo, (*fx)[0], ptr[ 1]); + MLA(hi, lo, (*fx)[1], ptr[15]); + MLA(hi, lo, (*fx)[2], ptr[13]); + MLA(hi, lo, (*fx)[3], ptr[11]); + MLA(hi, lo, (*fx)[4], ptr[ 9]); + MLA(hi, lo, (*fx)[5], ptr[ 7]); + MLA(hi, lo, (*fx)[6], ptr[ 5]); + MLA(hi, lo, (*fx)[7], ptr[ 3]); + MLN(hi, lo); + MLA(hi, lo, (*fe)[0], ptr[ 0]); + MLA(hi, lo, (*fe)[1], ptr[14]); + MLA(hi, lo, (*fe)[2], ptr[12]); + 
MLA(hi, lo, (*fe)[3], ptr[10]); + MLA(hi, lo, (*fe)[4], ptr[ 8]); + MLA(hi, lo, (*fe)[5], ptr[ 6]); + MLA(hi, lo, (*fe)[6], ptr[ 4]); + MLA(hi, lo, (*fe)[7], ptr[ 2]); + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + for (sb = 15; sb; sb--, fo++) { ++fe; @@ -1020,6 +1390,7 @@ } } } + # endif # endif Index: apps/codecs/libmad/synth_full_arm.S =================================================================== --- apps/codecs/libmad/synth_full_arm.S (revision 0) +++ apps/codecs/libmad/synth_full_arm.S (revision 0) @@ -0,0 +1,316 @@ + .section .text,"ax",%progbits + + .global synth_full1 + .global synth_full2 + + ;; r0 = pcm + ;; r1 = fo + ;; r2 = fe + ;; r3 = D0ptr + ;; r4 = D1ptr +synth_full1: + stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + ldr r4, [sp, #40] /* fifth argument (D1ptr per the register notes above) lies just above the 10 words pushed by stmdb */ + ldr r5, =synth_full_sp + str sp, [r5] /* park sp in synth_full_sp because the loop loads data into sp; NOTE(review): the routine is therefore non-reentrant and has no valid stack inside the loop - confirm nothing on the target needs one in this mode */ + mov r5, #15 /* loop counter and pcm index: 15 down to 1 */ + add r2, r2, #32 +.l: + add r3, r3, #128 + add r4, r4, #128 + ldmia r1!, {r10, r11, r12, lr} + ldr r7, [r3, #4] + smull r6, r7, r10, r7 + ldr r9, [r4, #120] + smull r8, r9, r10, r9 + + ldr r10, [r3, #60] + smlal r6, r7, r11, r10 + ldr r10, [r3, #52] + smlal r6, r7, r12, r10 + ldr r10, [r3, #44] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #64] + smlal r8, r9, r11, r10 + ldr r10, [r4, #72] + smlal r8, r9, r12, r10 + ldr r10, [r4, #80] + smlal r8, r9, lr, r10 + + ldmia r1!, {r11, r12, sp, lr} + ldr r10, [r3, #36] + smlal r6, r7, r11, r10 + ldr r10, [r3, #28] + smlal r6, r7, r12, r10 + ldr r10, [r3, #20] + smlal r6, r7, sp, r10 + ldr r10, [r3, #12] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #88] + smlal r8, r9, r11, r10 + ldr r10, [r4, #96] + smlal r8, r9, r12, r10 + ldr r10, [r4, #104] + smlal r8, r9, sp, r10 + ldr r10, [r4, #112] + smlal r8, r9, lr, r10 + + rsbs r6, r6, #0 + rsc r7, r7, #0 /* together with the rsbs above: negate the 64-bit accumulator r7:r6 */ + + ldmia r2!, {r11, r12, sp, lr} + + ldr r10, [r3, #0] + smlal r6, r7, r11, r10 + ldr r10, [r3, #56] + smlal r6, r7, r12, r10 + ldr r10, [r3, #48] + smlal r6, r7, sp, r10 + ldr r10, [r3, #40] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #60] + smlal r8, r9, r11, r10 + ldr 
r10, [r4, #68] + smlal r8, r9, r12, r10 + ldr r10, [r4, #76] + smlal r8, r9, sp, r10 + ldr r10, [r4, #84] + smlal r8, r9, lr, r10 + + ldmia r2!, {r11, r12, sp, lr} + ldr r10, [r3, #32] + smlal r6, r7, r11, r10 + ldr r10, [r3, #24] + smlal r6, r7, r12, r10 + ldr r10, [r3, #16] + smlal r6, r7, sp, r10 + ldr r10, [r3, #8] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #92] + smlal r8, r9, r11, r10 + ldr r10, [r4, #100] + smlal r8, r9, r12, r10 + ldr r10, [r4, #108] + smlal r8, r9, sp, r10 + ldr r10, [r4, #116] + smlal r8, r9, lr, r10 + + movs r6, r6, lsr #16 + adc r6, r6, r7, lsl #16 /* r6 = 64-bit sum r7:r6 shifted right by 16, rounded with the carry left by movs */ + str r6, [r0, -r5, lsl #2] /* store at word index -r5 relative to r0 */ + + movs r8, r8, lsr #16 + adc r8, r8, r9, lsl #16 + str r8, [r0, r5, lsl #2] /* store at word index +r5 relative to r0 */ + + subs r5, r5, #1 + bne .l + + ldr r5, =synth_full_sp + ldr sp, [r5] /* restore the stack pointer parked at entry */ + ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + +synth_full2: + stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + ldr r4, [sp, #40] /* fifth argument, read from above the 10 pushed words */ + ldr r5, =synth_full_sp + str sp, [r5] /* sp is parked here for the same reason as in synth_full1 */ + mov r5, #15 + add r2, r2, #32 +.l2: + add r3, r3, #128 + add r4, r4, #128 + ldmia r1!, {r10, r11, r12, lr} + ldr r7, [r3, #0] + smull r6, r7, r10, r7 + ldr r9, [r4, #60] + smull r8, r9, r10, r9 + + ldr r10, [r3, #56] + smlal r6, r7, r11, r10 + ldr r10, [r3, #48] + smlal r6, r7, r12, r10 + ldr r10, [r3, #40] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #68] + smlal r8, r9, r11, r10 + ldr r10, [r4, #76] + smlal r8, r9, r12, r10 + ldr r10, [r4, #84] + smlal r8, r9, lr, r10 + + ldmia r1!, {r11, r12, sp, lr} + ldr r10, [r3, #32] + smlal r6, r7, r11, r10 + ldr r10, [r3, #24] + smlal r6, r7, r12, r10 + ldr r10, [r3, #16] + smlal r6, r7, sp, r10 + ldr r10, [r3, #8] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #92] + smlal r8, r9, r11, r10 + ldr r10, [r4, #100] + smlal r8, r9, r12, r10 + ldr r10, [r4, #108] + smlal r8, r9, sp, r10 + ldr r10, [r4, #116] + smlal r8, r9, lr, r10 + + rsbs r6, r6, #0 + rsc r7, r7, #0 + + ldmia r2!, {r11, r12, sp, lr} + + ldr r10, [r3, #4] + smlal r6, r7, r11, r10 + ldr r10, [r3, #60] + smlal r6, r7, r12, r10 + ldr 
r10, [r3, #52] + smlal r6, r7, sp, r10 + ldr r10, [r3, #44] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #120] + smlal r8, r9, r11, r10 + ldr r10, [r4, #64] + smlal r8, r9, r12, r10 + ldr r10, [r4, #72] + smlal r8, r9, sp, r10 + ldr r10, [r4, #80] + smlal r8, r9, lr, r10 + + ldmia r2!, {r11, r12, sp, lr} + ldr r10, [r3, #36] + smlal r6, r7, r11, r10 + ldr r10, [r3, #28] + smlal r6, r7, r12, r10 + ldr r10, [r3, #20] + smlal r6, r7, sp, r10 + ldr r10, [r3, #12] + smlal r6, r7, lr, r10 + + ldr r10, [r4, #88] + smlal r8, r9, r11, r10 + ldr r10, [r4, #96] + smlal r8, r9, r12, r10 + ldr r10, [r4, #104] + smlal r8, r9, sp, r10 + ldr r10, [r4, #112] + smlal r8, r9, lr, r10 + + movs r6, r6, lsr #16 + adc r6, r6, r7, lsl #16 + str r6, [r0, -r5, lsl #2] + + movs r8, r8, lsr #16 + adc r8, r8, r9, lsl #16 + str r8, [r0, r5, lsl #2] + + subs r5, r5, #1 + bne .l2 + + ldr r5, =synth_full_sp + ldr sp, [r5] /* restore the stack pointer parked at entry */ + ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + + .global III_aliasreduce + +III_aliasreduce: + stmdb sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr} + add r1, r0, r1, lsl #2 /* r1 = r0 + 4 * r1: end address of the region to process */ + add r0, r0, #72 /* skip the first 18 words */ +.arl1: + mov r2, #8 + mov r3, r0 @ a + mov r4, r0 @ b + ldr r5, =csa @ cs/ca +.arl2: + ldmdb r3, {r6, r12} + ldmia r4, {r7, lr} + + ldmia r5!, {r8, r9} + smull r10, r11, r7, r8 + smlal r10, r11, r12, r9 + movs r10, r10, lsr #28 + adc r10, r10, r11, lsl #4 + + rsb r7, r7, #0 + smull r11, r8, r12, r8 + smlal r11, r8, r7, r9 + movs r11, r11, lsr #28 + adc r11, r11, r8, lsl #4 + + ldmia r5!, {r8, r9} + smull r12, r7, lr, r8 + smlal r12, r7, r6, r9 + movs r12, r12, lsr #28 + adc r12, r12, r7, lsl #4 + stmia r4!, {r10, r12} + + rsb lr, lr, #0 + smull r7, r10, r6, r8 + smlal r7, r10, lr, r9 + movs r7, r7, lsr #28 + adc r7, r7, r10, lsl #4 + stmdb r3!, {r7, r11} + + subs r2, r2, #2 + bne .arl2 /* four passes, each handling two coefficient pairs from csa (8 pairs per 18-word boundary) */ + add r0, r0, #72 + cmp r0, r1 + blo .arl1 /* repeat for every 18-word boundary below the end address */ + ldmia sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc} + +csa: + .word +0x0db84a81 + .word -0x083b5fe7 + .word +0x0e1b9d7f + .word -0x078c36d2 + .word 
+0x0f31adcf + .word -0x05039814 + .word +0x0fbba815 + .word -0x02e91dd1 + .word +0x0feda417 + .word -0x0183603a + .word +0x0ffc8fc8 + .word -0x00a7cb87 + .word +0x0fff964c + .word -0x003a2847 + .word +0x0ffff8d3 + .word -0x000f27b4 + + .global III_overlap +III_overlap: + stmdb sp!, {r4, r5, r6, r7, r8, lr} + add r2, r2, r3, lsl #2 /* r2 += 4 * r3: start at word r3 of the first 128-byte row */ + mov r3, #6 /* six passes of three words = 18 sums */ +.ol: + ldmia r0!, {r4, r5, r6} + ldmia r1!, {r7, r8, lr} + add r4, r4, r7 + add r5, r5, r8 + add r6, r6, lr + str r4, [r2], #128 /* each store post-increments r2 by 128 bytes, i.e. one row of 32 words */ + str r5, [r2], #128 + str r6, [r2], #128 + subs r3, r3, #1 + bne .ol + sub r1, r1, #72 /* rewind r1 by the 18 words consumed in the loop */ + ldmia r0!, {r4, r5, r6, r7, r8, lr} /* copy the next 18 words from r0 to r1, six at a time */ + stmia r1!, {r4, r5, r6, r7, r8, lr} + ldmia r0!, {r4, r5, r6, r7, r8, lr} + stmia r1!, {r4, r5, r6, r7, r8, lr} + ldmia r0!, {r4, r5, r6, r7, r8, lr} + stmia r1!, {r4, r5, r6, r7, r8, lr} + ldmia sp!, {r4, r5, r6, r7, r8, pc} + + .section .ibss,"aw",%nobits +synth_full_sp: + .space 4 Index: apps/codecs/libmad/bit.c =================================================================== --- apps/codecs/libmad/bit.c (revision 13459) +++ apps/codecs/libmad/bit.c (working copy) @@ -128,6 +128,8 @@ * NAME: bit->read() * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value */ + +#if 0 /* table is not referenced by the mad_bit_read() variant enabled further down */ unsigned long bmask[] ICONST_ATTR = { 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, @@ -135,7 +137,10 @@ 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff }; +# endif + unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) ICODE_ATTR; +# if 0 unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) { unsigned long *curr = &bitptr->ptr[bitptr->readbit>>5]; @@ -159,7 +164,26 @@ return 0; } +#else +unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) +{ + unsigned long *curr = 
&bitptr->ptr[bitptr->readbit>>5]; + if(len) + { + unsigned long r = betoh32(curr[0]) << (bitptr->readbit & 31); /* drop the bits of the current word that precede the read position; NOTE(review): the arithmetic assumes unsigned long is 32 bits wide and len <= 32 - confirm for every build target */ + + if((bitptr->readbit & 31) + len > 32) + r += betoh32(curr[1]) >> (-bitptr->readbit & 31); /* field straddles two words: add the leading bits of the next word below those already in r */ + + bitptr->readbit += len; + return r >> (32 - len); /* keep only the top len bits */ + } + + return 0; +} +#endif + # if 0 /* * NAME: bit->write()