Index: apps/codecs/libmad/layer3.c =================================================================== --- apps/codecs/libmad/layer3.c (revision 13459) +++ apps/codecs/libmad/layer3.c (working copy) @@ -922,8 +922,19 @@ } /* we must take care that sz >= bits and sz < sizeof(long) lest bits == 0 */ +# if defined(CPU_ARM) /* ARM variant: mov/rsb build a mask with the low `bits` bits set, which is then ANDed with cache shifted right by (sz minus bits); mirrors the C macro in the #else branch */ +# define MASK(cache, sz, bits) \ + ({ unsigned long res; \ + asm ("mov %0, #1\n\t" \ + "rsb %0, %0, %0, lsl %3\n\t" \ + "and %0, %0, %1, lsr %2" \ + : "=&r" (res) : "r" (cache), "r" ((sz) - (bits)), "r" (bits)); \ + res; \ + }) +#else # define MASK(cache, sz, bits) \ (((cache) >> ((sz) - (bits))) & ((1 << (bits)) - 1)) +#endif # define MASK1BIT(cache, sz) \ ((cache) & (1 << ((sz) - 1))) @@ -1546,6 +1557,9 @@ return MAD_ERROR_NONE; } +#if defined(CPU_ARM) /* ARM builds keep only this prototype; the C body below is compiled out (definition presumably lives in an ARM assembly file; confirm) */ +void III_aliasreduce(mad_fixed_t xr[576], int lines); +#else /* * NAME: III_aliasreduce() * DESCRIPTION: perform frequency line alias reduction @@ -1600,6 +1614,7 @@ } } } +#endif # if defined(ASO_IMDCT) void III_imdct_l(mad_fixed_t const [18], mad_fixed_t [36], unsigned int); @@ -2894,6 +2909,11 @@ #endif +#ifdef CPU_ARM /* as for III_aliasreduce: prototype only on ARM, C body below compiled out */ +void III_overlap(mad_fixed_t const output[36], mad_fixed_t overlap[18], + mad_fixed_t sample[18][32], unsigned int sb); +#else + /* * NAME: III_overlap() * DESCRIPTION: perform overlap-add of windowed IMDCT outputs @@ -2941,6 +2961,7 @@ } # endif } +#endif /* * NAME: III_overlap_z() @@ -3142,10 +3163,21 @@ /* (nonzero) subbands 2-31 */ +/* i = 576; while (i > 36 && xr[ch][i - 1] == 0) --i; +*/ + { + /* saves ~600k cycles */ + mad_fixed_t *p = &xr[ch][576]; + mad_fixed_t tmp = xr[ch][35]; + xr[ch][35] = 1; /* temporary non-zero sentinel so the backward scan stops at index 35 at the latest, giving i >= 36 like the old loop */ + while (!*--p); + xr[ch][35] = tmp; /* restore the saved value */ + i = p - &xr[ch][0] + 1; + } sblimit = 32 - (576 - i) / 18; if (channel->block_type != 2) { Index: apps/codecs/libmad/SOURCES =================================================================== --- apps/codecs/libmad/SOURCES (revision 13459) +++ apps/codecs/libmad/SOURCES (working copy) @@ -14,4 +14,6 @@ #endif #if defined(CPU_ARM) && 
!defined(SIMULATOR) imdct_l_arm.S +dct32_arm.S +synth_full_arm.S #endif Index: apps/codecs/libmad/synth.c =================================================================== --- apps/codecs/libmad/synth.c (revision 13459) +++ apps/codecs/libmad/synth.c (working copy) @@ -67,6 +67,13 @@ } } +#ifdef FPM_ARM /* FPM_ARM: dct32 is only declared here and the C implementation in the #else branch is skipped. NOTE(review): SOURCES gates the .S files on CPU_ARM && !SIMULATOR, not FPM_ARM; confirm the two conditions always agree */ + +void dct32(mad_fixed_t const in[32], unsigned int slot, + mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]); + +#else + /* * An optional optimization called here the Subband Synthesis Optimization * (SSO) improves the performance of subband synthesis at the expense of @@ -533,6 +540,8 @@ # undef MUL # undef SHIFT +#endif + /* third SSO shift and/or D[] optimization preshift */ # if defined(OPT_SSO) @@ -816,10 +825,228 @@ } } -#else +#elif defined(FPM_ARM) /* PROD_*_0 start a 64-bit sum in hi:lo (smull first); PROD_*_A add to an existing hi:lo (smlal throughout). Each one multiplies f[0..7] by eight words of ptr at the byte offsets given in the ldr instructions. The asm reads memory through f and ptr, which are passed only as register operands, so "memory" is listed as a clobber. */ +#define PROD_ODD_0(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #4]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #52]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #44]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #36]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #28]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #20]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #12]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_ODD_A(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #4]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #52]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #44]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #36]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #28]\n\t" \ + "smlal %0, %1, r1, 
r4\n\t" \ + "ldr r4, [%3, #20]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #12]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_EVEN_0(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #0]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #56]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #48]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #40]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #32]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #24]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #16]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #8]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_EVEN_A(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #0]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #56]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #48]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #40]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #32]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #24]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #16]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #8]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_EVENBACK_0(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #68]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #76]\n\t" \ + "smlal %0, %1, r2, 
r4\n\t" \ + "ldr r4, [%3, #84]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #92]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #100]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #108]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #116]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_EVENBACK_A(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #60]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #68]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #76]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #84]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #92]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #100]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #108]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #116]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_ODDBACK_0(hi, lo, f, ptr) \ + do { \ + mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #120]\n\t" \ + "smull %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #64]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #72]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #80]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #88]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #96]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #104]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #112]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "=&r" (lo), "=&r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +#define PROD_ODDBACK_A(hi, lo, f, ptr) \ + do { \ + 
mad_fixed_t *__p = (f); \ + asm("ldmia %2!, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #120]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #64]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #72]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #80]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + "ldmia %2, {r0, r1, r2, r3}\n\t" \ + "ldr r4, [%3, #88]\n\t" \ + "smlal %0, %1, r0, r4\n\t" \ + "ldr r4, [%3, #96]\n\t" \ + "smlal %0, %1, r1, r4\n\t" \ + "ldr r4, [%3, #104]\n\t" \ + "smlal %0, %1, r2, r4\n\t" \ + "ldr r4, [%3, #112]\n\t" \ + "smlal %0, %1, r3, r4\n\t" \ + : "+r" (lo), "+r" (hi), "+r" (__p) \ + : "r" (ptr) \ + : "r0", "r1", "r2", "r3", "r4", "memory"); \ + } while (0) + +void synth_full1(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8], + mad_fixed_t const (*D0ptr)[32], + mad_fixed_t const (*D1ptr)[32]); +void synth_full2(mad_fixed_t *pcm, mad_fixed_t (*fo)[8], mad_fixed_t (*fe)[8], + mad_fixed_t const (*D0ptr)[32], + mad_fixed_t const (*D1ptr)[32]); /* synth_full1 and synth_full2 are prototypes only (presumably implemented in synth_full_arm.S; confirm) */ + static void synth_full(struct mad_synth *synth, struct mad_frame const *frame, + unsigned int nch, unsigned int ns) ICODE_ATTR; +static +void synth_full(struct mad_synth *synth, struct mad_frame const *frame, unsigned int nch, unsigned int ns) { int p; @@ -855,6 +1082,7 @@ if(s & 1) { ptr = *D0ptr; +/* ML0(hi, lo, (*fx)[0], ptr[ 1]); MLA(hi, lo, (*fx)[1], ptr[15]); MLA(hi, lo, (*fx)[2], ptr[13]); @@ -863,7 +1091,10 @@ MLA(hi, lo, (*fx)[5], ptr[ 7]); MLA(hi, lo, (*fx)[6], ptr[ 5]); MLA(hi, lo, (*fx)[7], ptr[ 3]); +*/ + PROD_ODD_0(hi, lo, *fx, ptr); MLN(hi, lo); +/* MLA(hi, lo, (*fe)[0], ptr[ 0]); MLA(hi, lo, (*fe)[1], ptr[14]); MLA(hi, lo, (*fe)[2], ptr[12]); @@ -872,9 +1103,146 @@ MLA(hi, lo, (*fe)[5], ptr[ 6]); MLA(hi, lo, (*fe)[6], ptr[ 4]); MLA(hi, lo, (*fe)[7], ptr[ 2]); +*/ + PROD_EVEN_A(hi, lo, *fe, ptr); pcm[0] = SHIFT(MLZ(hi, lo)); pcm += 16; + synth_full1(pcm, fo, fe, D0ptr, D1ptr); + D0ptr += 15; + D1ptr += 15; + fo += 15; + fe += 15; + + ptr = *(D0ptr + 1); + PROD_ODD_0(hi, lo, *fo, 
ptr); +/* + ML0(hi, lo, (*fo)[0], ptr[ 1]); + MLA(hi, lo, (*fo)[1], ptr[15]); + MLA(hi, lo, (*fo)[2], ptr[13]); + MLA(hi, lo, (*fo)[3], ptr[11]); + MLA(hi, lo, (*fo)[4], ptr[ 9]); + MLA(hi, lo, (*fo)[5], ptr[ 7]); + MLA(hi, lo, (*fo)[6], ptr[ 5]); + MLA(hi, lo, (*fo)[7], ptr[ 3]); +*/ + pcm[0] = SHIFT(-MLZ(hi, lo)); + } + else + { + ptr = *D0ptr; +/* + ML0(hi, lo, (*fx)[0], ptr[ 0]); + MLA(hi, lo, (*fx)[1], ptr[14]); + MLA(hi, lo, (*fx)[2], ptr[12]); + MLA(hi, lo, (*fx)[3], ptr[10]); + MLA(hi, lo, (*fx)[4], ptr[ 8]); + MLA(hi, lo, (*fx)[5], ptr[ 6]); + MLA(hi, lo, (*fx)[6], ptr[ 4]); + MLA(hi, lo, (*fx)[7], ptr[ 2]); +*/ + PROD_EVEN_0(hi, lo, *fx, ptr); + MLN(hi, lo); +/* + MLA(hi, lo, (*fe)[0], ptr[ 1]); + MLA(hi, lo, (*fe)[1], ptr[15]); + MLA(hi, lo, (*fe)[2], ptr[13]); + MLA(hi, lo, (*fe)[3], ptr[11]); + MLA(hi, lo, (*fe)[4], ptr[ 9]); + MLA(hi, lo, (*fe)[5], ptr[ 7]); + MLA(hi, lo, (*fe)[6], ptr[ 5]); + MLA(hi, lo, (*fe)[7], ptr[ 3]); +*/ + PROD_ODD_A(hi, lo, *fe, ptr); + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + + synth_full2(pcm, fo, fe, D0ptr, D1ptr); + D0ptr += 15; + D1ptr += 15; + fo += 15; + fe += 15; + + ptr = *(D0ptr + 1); +/* + ML0(hi, lo, (*fo)[0], ptr[ 0]); + MLA(hi, lo, (*fo)[1], ptr[14]); + MLA(hi, lo, (*fo)[2], ptr[12]); + MLA(hi, lo, (*fo)[3], ptr[10]); + MLA(hi, lo, (*fo)[4], ptr[ 8]); + MLA(hi, lo, (*fo)[5], ptr[ 6]); + MLA(hi, lo, (*fo)[6], ptr[ 4]); + MLA(hi, lo, (*fo)[7], ptr[ 2]); +*/ + PROD_EVEN_0(hi, lo, *fo, ptr); + pcm[0] = SHIFT(-MLZ(hi, lo)); + } + + pcm += 16; + phase = (phase + 1) % 16; + } + } +} + +# else + +static +void synth_full(struct mad_synth *synth, struct mad_frame const *frame, + unsigned int nch, unsigned int ns) +{ + int p; + unsigned int phase, ch, s, sb; + mad_fixed_t *pcm, (*filter)[2][2][16][8]; + mad_fixed_t const (*sbsample)[36][32]; + mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; + mad_fixed_t const (*D0ptr)[32], *ptr; + mad_fixed_t const (*D1ptr)[32]; + mad_fixed64hi_t hi; + mad_fixed64lo_t lo; + + for (ch = 0; ch 
< nch; ++ch) { + sbsample = &frame->sbsample[ch]; + filter = &synth->filter[ch]; + phase = synth->phase; + pcm = synth->pcm.samples[ch]; + + for (s = 0; s < ns; ++s) { + dct32((*sbsample)[s], phase >> 1, + (*filter)[0][phase & 1], (*filter)[1][phase & 1]); + + p = (phase - 1) & 0xf; + + /* calculate 32 samples */ + fe = &(*filter)[0][ phase & 1][0]; + fx = &(*filter)[0][~phase & 1][0]; + fo = &(*filter)[1][~phase & 1][0]; + + D0ptr = (void*)&D[0][ p]; + D1ptr = (void*)&D[0][-p]; + + if(s & 1) + { + ptr = *D0ptr; + ML0(hi, lo, (*fx)[0], ptr[ 1]); + MLA(hi, lo, (*fx)[1], ptr[15]); + MLA(hi, lo, (*fx)[2], ptr[13]); + MLA(hi, lo, (*fx)[3], ptr[11]); + MLA(hi, lo, (*fx)[4], ptr[ 9]); + MLA(hi, lo, (*fx)[5], ptr[ 7]); + MLA(hi, lo, (*fx)[6], ptr[ 5]); + MLA(hi, lo, (*fx)[7], ptr[ 3]); + MLN(hi, lo); + MLA(hi, lo, (*fe)[0], ptr[ 0]); + MLA(hi, lo, (*fe)[1], ptr[14]); + MLA(hi, lo, (*fe)[2], ptr[12]); + MLA(hi, lo, (*fe)[3], ptr[10]); + MLA(hi, lo, (*fe)[4], ptr[ 8]); + MLA(hi, lo, (*fe)[5], ptr[ 6]); + MLA(hi, lo, (*fe)[6], ptr[ 4]); + MLA(hi, lo, (*fe)[7], ptr[ 2]); + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + for (sb = 15; sb; sb--, fo++) { ++fe; @@ -1020,6 +1388,7 @@ } } } + # endif # endif Index: apps/codecs/libmad/bit.c =================================================================== --- apps/codecs/libmad/bit.c (revision 13459) +++ apps/codecs/libmad/bit.c (working copy) @@ -128,6 +128,8 @@ * NAME: bit->read() * DESCRIPTION: read an arbitrary number of bits and return their UIMSBF value */ + +#if 0 unsigned long bmask[] ICONST_ATTR = { 0x00000000, 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, @@ -135,7 +137,10 @@ 0x0003ffff, 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff }; +# endif + unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) 
 ICODE_ATTR; +# if 0 unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) { unsigned long *curr = &bitptr->ptr[bitptr->readbit>>5]; @@ -159,7 +164,26 @@ return 0; } +#else /* replacement mad_bit_read (the bmask table above is now compiled out): left-align the wanted bits from the current 32-bit word, add the spill-over from the next word when the field crosses a word boundary, then shift down to len bits; returns 0 when len is 0 */ +unsigned long mad_bit_read(struct mad_bitptr *bitptr, unsigned int len) +{ + unsigned long *curr = &bitptr->ptr[bitptr->readbit>>5]; + if(len) + { + unsigned long r = betoh32(curr[0]) << (bitptr->readbit & 31); + + if((bitptr->readbit & 31) + len > 32) + r += betoh32(curr[1]) >> (-bitptr->readbit & 31); + + bitptr->readbit += len; + return r >> (32 - len); + } + + return 0; +} +#endif + # if 0 /* * NAME: bit->write() Index: firmware/target/arm/iriver/h10/button-target.h =================================================================== --- firmware/target/arm/iriver/h10/button-target.h (revision 13459) +++ firmware/target/arm/iriver/h10/button-target.h (working copy) @@ -40,6 +40,7 @@ #define BUTTON_LEFT 0x00000002 #define BUTTON_RIGHT 0x00000004 +#define BUTTON_SELECT BUTTON_RIGHT /* NOTE(review): aliases SELECT to RIGHT on the H10; looks unrelated to the libmad changes, confirm it is intended */ #define BUTTON_REW 0x00000008 #define BUTTON_PLAY 0x00000010