Index: apps/codecs/libmusepack/decoder.h =================================================================== --- apps/codecs/libmusepack/decoder.h (revision 28360) +++ apps/codecs/libmusepack/decoder.h (working copy) @@ -51,7 +51,7 @@ #define SLOW_SEEKING_WINDOW 0x80000000 enum { - MPC_V_MEM = 2304, + MPC_V_SIZE = (2*32*36 + 2*32*15), MPC_DECODER_MEMSIZE = 16384, // overall buffer size }; @@ -87,8 +87,8 @@ mpc_uint8_t SCF_shift[256]; #endif - MPC_SAMPLE_FORMAT V_L[MPC_V_MEM + 960]; - MPC_SAMPLE_FORMAT V_R[MPC_V_MEM + 960]; + MPC_SAMPLE_FORMAT V_L[MPC_V_SIZE]; + MPC_SAMPLE_FORMAT V_R[MPC_V_SIZE]; MPC_SAMPLE_FORMAT *Y_L; MPC_SAMPLE_FORMAT *Y_R; MPC_SAMPLE_FORMAT SCF[256]; ///< holds adapted scalefactors (for clipping prevention) Index: apps/codecs/libmusepack/synth_filter.c =================================================================== --- apps/codecs/libmusepack/synth_filter.c (revision 28360) +++ apps/codecs/libmusepack/synth_filter.c (working copy) @@ -120,19 +120,20 @@ * * mpc_dct32 is a dct32 with in[32]->dct[32] that contains the mirroring from * dct[32] to the expected out[64]. The symmetry is - * out[16] = 0, - * out[ 0..15] = dct[ 0..15], - * out[32..17] = -dct[ 0..15], - * out[33..48] = -dct[16..31], - * out[63..48] = -dct[16..31]. + * out0[16] = 0, + * out0[ 0..15] = dct[ 0..15], + * out0[31..17] = -dct[ 1..15], + * out1[ 0] = -dct[ 0], + * out1[ 1..16] = -dct[16..31], + * out1[31..16] = -dct[16..31]. * The cos-tab has the format s0.31. 
*****************************************************************************/ void -mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v) +mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v0, MPC_SAMPLE_FORMAT *v1) ICODE_ATTR_MPC_LARGE_IRAM; void -mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v) +mpc_dct32(const MPC_SAMPLE_FORMAT *in, MPC_SAMPLE_FORMAT *v0, MPC_SAMPLE_FORMAT *v1) { MPC_SAMPLE_FORMAT t0, t1, t2, t3, t4, t5, t6, t7; MPC_SAMPLE_FORMAT t8, t9, t10, t11, t12, t13, t14, t15; @@ -191,6 +192,9 @@ #define costab30 (0x0c8bd35e) /* 0.098017140 */ #define costab31 (0x0647d97c) /* 0.049067674 */ +#define VMUL (36+15) +#define VDIF (32*VMUL) + t0 = in[ 0] + in[31]; t16 = MPC_DCT32_MUL(in[ 0] - in[31], costab01); t1 = in[15] + in[16]; t17 = MPC_DCT32_MUL(in[15] - in[16], costab31); @@ -277,22 +281,22 @@ t113 = t69 + t70; t114 = t71 + t72; - /* 0 */ v[48] = -MPC_DCT32_SHIFT(t113 + t114); - /* 16 */ v[32] = -(v[ 0] = MPC_DCT32_SHIFT(MPC_DCT32_MUL(t113 - t114, costab16))); + /* 0 */ v1[VMUL*48-VDIF] = -MPC_DCT32_SHIFT(t113 + t114); + /* 16 */ v1[VMUL*32-VDIF] = -(v0[VMUL* 0] = MPC_DCT32_SHIFT(MPC_DCT32_MUL(t113 - t114, costab16))); t115 = t73 + t74; t116 = t75 + t76; t32 = t115 + t116; - /* 1 */ v[49] = v[47] = -MPC_DCT32_SHIFT(t32); + /* 1 */ v1[VMUL*49-VDIF] = v1[VMUL*47-VDIF] = -MPC_DCT32_SHIFT(t32); t118 = t78 + t79; t119 = t80 + t81; t58 = t118 + t119; - /* 2 */ v[50] = v[46] = -MPC_DCT32_SHIFT(t58); + /* 2 */ v1[VMUL*50-VDIF] = v1[VMUL*46-VDIF] = -MPC_DCT32_SHIFT(t58); t121 = t83 + t84; t122 = t85 + t86; @@ -301,14 +305,14 @@ t49 = (t67 * 2) - t32; - /* 3 */ v[51] = v[45] = -MPC_DCT32_SHIFT(t49); + /* 3 */ v1[VMUL*51-VDIF] = v1[VMUL*45-VDIF] = -MPC_DCT32_SHIFT(t49); t125 = t89 + t90; t126 = t91 + t92; t93 = t125 + t126; - /* 4 */ v[52] = v[44] = -MPC_DCT32_SHIFT(t93); + /* 4 */ v1[VMUL*52-VDIF] = v1[VMUL*44-VDIF] = -MPC_DCT32_SHIFT(t93); t128 = t94 + t95; t129 = t96 + t97; @@ -317,7 +321,7 @@ t68 = (t98 * 2) - t49; - /* 5 */ v[53] 
= v[43] = -MPC_DCT32_SHIFT(t68); + /* 5 */ v1[VMUL*53-VDIF] = v1[VMUL*43-VDIF] = -MPC_DCT32_SHIFT(t68); t132 = t100 + t101; t133 = t102 + t103; @@ -326,7 +330,7 @@ t82 = (t104 * 2) - t58; - /* 6 */ v[54] = v[42] = -MPC_DCT32_SHIFT(t82); + /* 6 */ v1[VMUL*54-VDIF] = v1[VMUL*42-VDIF] = -MPC_DCT32_SHIFT(t82); t136 = t106 + t107; t137 = t108 + t109; @@ -337,14 +341,14 @@ t77 = (t87 * 2) - t68; - /* 7 */ v[55] = v[41] = -MPC_DCT32_SHIFT(t77); + /* 7 */ v1[VMUL*55-VDIF] = v1[VMUL*41-VDIF] = -MPC_DCT32_SHIFT(t77); t141 = MPC_DCT32_MUL(t69 - t70, costab08); t142 = MPC_DCT32_MUL(t71 - t72, costab24); t143 = t141 + t142; - /* 8 */ v[56] = v[40] = -MPC_DCT32_SHIFT(t143); - /* 24 */ v[24] = -(v[ 8] = MPC_DCT32_SHIFT((MPC_DCT32_MUL(t141 - t142, costab16) * 2) - t143)); + /* 8 */ v1[VMUL*56-VDIF] = v1[VMUL*40-VDIF] = -MPC_DCT32_SHIFT(t143); + /* 24 */ v0[VMUL*24] = -(v0[VMUL* 8] = MPC_DCT32_SHIFT((MPC_DCT32_MUL(t141 - t142, costab16) * 2) - t143)); t144 = MPC_DCT32_MUL(t73 - t74, costab08); t145 = MPC_DCT32_MUL(t75 - t76, costab24); @@ -352,7 +356,7 @@ t88 = (t146 * 2) - t77; - /* 9 */ v[57] = v[39] = -MPC_DCT32_SHIFT(t88); + /* 9 */ v1[VMUL*57-VDIF] = v1[VMUL*39-VDIF] = -MPC_DCT32_SHIFT(t88); t148 = MPC_DCT32_MUL(t78 - t79, costab08); t149 = MPC_DCT32_MUL(t80 - t81, costab24); @@ -360,7 +364,7 @@ t105 = (t150 * 2) - t82; - /* 10 */ v[58] = v[38] = -MPC_DCT32_SHIFT(t105); + /* 10 */ v1[VMUL*58-VDIF] = v1[VMUL*38-VDIF] = -MPC_DCT32_SHIFT(t105); t152 = MPC_DCT32_MUL(t83 - t84, costab08); t153 = MPC_DCT32_MUL(t85 - t86, costab24); @@ -370,7 +374,7 @@ t99 = (t111 * 2) - t88; - /* 11 */ v[59] = v[37] = -MPC_DCT32_SHIFT(t99); + /* 11 */ v1[VMUL*59-VDIF] = v1[VMUL*37-VDIF] = -MPC_DCT32_SHIFT(t99); t157 = MPC_DCT32_MUL(t89 - t90, costab08); t158 = MPC_DCT32_MUL(t91 - t92, costab24); @@ -378,12 +382,12 @@ t127 = (t159 * 2) - t93; - /* 12 */ v[60] = v[36] = -MPC_DCT32_SHIFT(t127); + /* 12 */ v1[VMUL*60-VDIF] = v1[VMUL*36-VDIF] = -MPC_DCT32_SHIFT(t127); t160 = (MPC_DCT32_MUL(t125 - t126, 
costab16) * 2) - t127; - /* 20 */ v[28] = -(v[ 4] = MPC_DCT32_SHIFT(t160)); - /* 28 */ v[20] = -(v[12] = MPC_DCT32_SHIFT((((MPC_DCT32_MUL(t157 - t158, costab16) * 2) - t159) * 2) - t160)); + /* 20 */ v0[VMUL*28] = -(v0[VMUL* 4] = MPC_DCT32_SHIFT(t160)); + /* 28 */ v0[VMUL*20] = -(v0[VMUL*12] = MPC_DCT32_SHIFT((((MPC_DCT32_MUL(t157 - t158, costab16) * 2) - t159) * 2) - t160)); t161 = MPC_DCT32_MUL(t94 - t95, costab08); t162 = MPC_DCT32_MUL(t96 - t97, costab24); @@ -393,7 +397,7 @@ t112 = (t130 * 2) - t99; - /* 13 */ v[61] = v[35] = -MPC_DCT32_SHIFT(t112); + /* 13 */ v1[VMUL*61-VDIF] = v1[VMUL*35-VDIF] = -MPC_DCT32_SHIFT(t112); t164 = (MPC_DCT32_MUL(t128 - t129, costab16) * 2) - t130; @@ -405,22 +409,22 @@ t120 = (t134 * 2) - t105; - /* 14 */ v[62] = v[34] = -MPC_DCT32_SHIFT(t120); + /* 14 */ v1[VMUL*62-VDIF] = v1[VMUL*34-VDIF] = -MPC_DCT32_SHIFT(t120); t135 = (MPC_DCT32_MUL(t118 - t119, costab16) * 2) - t120; - /* 18 */ v[30] = -(v[ 2] = MPC_DCT32_SHIFT(t135)); + /* 18 */ v0[VMUL*30] = -(v0[VMUL* 2] = MPC_DCT32_SHIFT(t135)); t169 = (MPC_DCT32_MUL(t132 - t133, costab16) * 2) - t134; t151 = (t169 * 2) - t135; - /* 22 */ v[26] = -(v[ 6] = MPC_DCT32_SHIFT(t151)); + /* 22 */ v0[VMUL*26] = -(v0[VMUL* 6] = MPC_DCT32_SHIFT(t151)); t170 = (((MPC_DCT32_MUL(t148 - t149, costab16) * 2) - t150) * 2) - t151; - /* 26 */ v[22] = -(v[10] = MPC_DCT32_SHIFT(t170)); - /* 30 */ v[18] = -(v[14] = MPC_DCT32_SHIFT((((((MPC_DCT32_MUL(t166 - t167, costab16) * 2) - t168) * 2) - t169) * 2) - t170)); + /* 26 */ v0[VMUL*22] = -(v0[VMUL*10] = MPC_DCT32_SHIFT(t170)); + /* 30 */ v0[VMUL*18] = -(v0[VMUL*14] = MPC_DCT32_SHIFT((((((MPC_DCT32_MUL(t166 - t167, costab16) * 2) - t168) * 2) - t169) * 2) - t170)); t171 = MPC_DCT32_MUL(t106 - t107, costab08); t172 = MPC_DCT32_MUL(t108 - t109, costab24); @@ -434,19 +438,19 @@ t117 = (t123 * 2) - t112; - /* 15 */ v[63] = v[33] =-MPC_DCT32_SHIFT(t117); + /* 15 */ v1[VMUL*63-VDIF] = v1[VMUL*33-VDIF] =-MPC_DCT32_SHIFT(t117); t124 = (MPC_DCT32_MUL(t115 - t116, 
costab16) * 2) - t117; - /* 17 */ v[31] = -(v[ 1] = MPC_DCT32_SHIFT(t124)); + /* 17 */ v0[VMUL*31] = -(v0[VMUL* 1] = MPC_DCT32_SHIFT(t124)); t131 = (t139 * 2) - t124; - /* 19 */ v[29] = -(v[ 3] = MPC_DCT32_SHIFT(t131)); + /* 19 */ v0[VMUL*29] = -(v0[VMUL* 3] = MPC_DCT32_SHIFT(t131)); t140 = (t164 * 2) - t131; - /* 21 */ v[27] = -(v[ 5] = MPC_DCT32_SHIFT(t140)); + /* 21 */ v0[VMUL*27] = -(v0[VMUL* 5] = MPC_DCT32_SHIFT(t140)); t174 = (MPC_DCT32_MUL(t136 - t137, costab16) * 2) - t138; @@ -454,22 +458,24 @@ t147 = (t155 * 2) - t140; - /* 23 */ v[25] = -(v[ 7] = MPC_DCT32_SHIFT(t147)); + /* 23 */ v0[VMUL*25] = -(v0[VMUL* 7] = MPC_DCT32_SHIFT(t147)); t156 = (((MPC_DCT32_MUL(t144 - t145, costab16) * 2) - t146) * 2) - t147; - /* 25 */ v[23] = -(v[ 9] = MPC_DCT32_SHIFT(t156)); + /* 25 */ v0[VMUL*23] = -(v0[VMUL* 9] = MPC_DCT32_SHIFT(t156)); t175 = (((MPC_DCT32_MUL(t152 - t153, costab16) * 2) - t154) * 2) - t155; t165 = (t175 * 2) - t156; - /* 27 */ v[21] = -(v[11] = MPC_DCT32_SHIFT(t165)); + /* 27 */ v0[VMUL*21] = -(v0[VMUL*11] = MPC_DCT32_SHIFT(t165)); t176 = (((((MPC_DCT32_MUL(t161 - t162, costab16) * 2) - t163) * 2) - t164) * 2) - t165; - /* 29 */ v[19] = -(v[13] = MPC_DCT32_SHIFT(t176)); - /* 31 */ v[17] = -(v[15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176)); + /* 29 */ v0[VMUL*19] = -(v0[VMUL*13] = MPC_DCT32_SHIFT(t176)); + /* 31 */ v0[VMUL*17] = -(v0[VMUL*15] = MPC_DCT32_SHIFT((((((((MPC_DCT32_MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176)); + + /* out0[16] = 0 */ v0[VMUL*16] = 0; } #if defined(CPU_ARM) || defined(CPU_COLDFIRE) @@ -486,16 +492,16 @@ mpc_int32_t k; // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C - for ( k = 0; k < 32; k++, D += 16, V++ ) + for ( k = 0; k < 32; k++, D += 16, V += (36+15) ) { - *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 96],D[ 1],30) - + MPC_MULTIPLY_EX(V[128],D[ 2],30) + MPC_MULTIPLY_EX(V[224],D[ 
3],30) - + MPC_MULTIPLY_EX(V[256],D[ 4],30) + MPC_MULTIPLY_EX(V[352],D[ 5],30) - + MPC_MULTIPLY_EX(V[384],D[ 6],30) + MPC_MULTIPLY_EX(V[480],D[ 7],30) - + MPC_MULTIPLY_EX(V[512],D[ 8],30) + MPC_MULTIPLY_EX(V[608],D[ 9],30) - + MPC_MULTIPLY_EX(V[640],D[10],30) + MPC_MULTIPLY_EX(V[736],D[11],30) - + MPC_MULTIPLY_EX(V[768],D[12],30) + MPC_MULTIPLY_EX(V[864],D[13],30) - + MPC_MULTIPLY_EX(V[896],D[14],30) + MPC_MULTIPLY_EX(V[992],D[15],30); + *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],30) + MPC_MULTIPLY_EX(V[ 1],D[ 1],30) + + MPC_MULTIPLY_EX(V[ 2],D[ 2],30) + MPC_MULTIPLY_EX(V[ 3],D[ 3],30) + + MPC_MULTIPLY_EX(V[ 4],D[ 4],30) + MPC_MULTIPLY_EX(V[ 5],D[ 5],30) + + MPC_MULTIPLY_EX(V[ 6],D[ 6],30) + MPC_MULTIPLY_EX(V[ 7],D[ 7],30) + + MPC_MULTIPLY_EX(V[ 8],D[ 8],30) + MPC_MULTIPLY_EX(V[ 9],D[ 9],30) + + MPC_MULTIPLY_EX(V[10],D[10],30) + MPC_MULTIPLY_EX(V[11],D[11],30) + + MPC_MULTIPLY_EX(V[12],D[12],30) + MPC_MULTIPLY_EX(V[13],D[13],30) + + MPC_MULTIPLY_EX(V[14],D[14],30) + MPC_MULTIPLY_EX(V[15],D[15],30); Data += 1; // total: 16 muls, 15 adds, 16 shifts } @@ -506,14 +512,18 @@ mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y) { mpc_uint32_t n; + mpc_uint32_t offset0; + mpc_uint32_t offset1; if (NULL != OutData) { for ( n = 0; n < 36; n++, Y += 32, OutData += 32 ) { - V -= 64; - mpc_dct32(Y, V); - mpc_decoder_windowing_D( OutData, V, Di_opt ); + V -= 1; + offset0 = (n&1) ? 
MPC_V_SIZE/2 : 0; + offset1 = MPC_V_SIZE/2 - offset0; + mpc_dct32(Y, V + offset0, V + offset1); + mpc_decoder_windowing_D( OutData, V + offset0, Di_opt ); } } } @@ -523,17 +533,26 @@ int num_channels) { + int i, offset; (void)num_channels; /********* left channel ********/ - memmove(d->V_L + MPC_V_MEM, d->V_L, 960 * sizeof(MPC_SAMPLE_FORMAT) ); + /* V_L is 64 rows of 36+15 values: move the first 15 values of each row to offset 36 so that slots 0..35 are free for the 36 mpc_dct32() passes run by mpc_full_synthesis_filter() */ + for (i=0, offset=0; i<64; ++i, offset += 51) + { + memmove(d->V_L + offset + 36, d->V_L + offset, (15) * sizeof(MPC_SAMPLE_FORMAT) ); + } mpc_full_synthesis_filter(OutData, - (MPC_SAMPLE_FORMAT *)(d->V_L + MPC_V_MEM), + (MPC_SAMPLE_FORMAT *)(d->V_L + 36), (MPC_SAMPLE_FORMAT *)(d->Y_L)); /******** right channel ********/ - memmove(d->V_R + MPC_V_MEM, d->V_R, 960 * sizeof(MPC_SAMPLE_FORMAT) ); + /* same per-row move for V_R: first 15 values of each of the 64 rows go to offset 36 */ + for (i=0, offset=0; i<64; ++i, offset += 51) + { + memmove(d->V_R + offset + 36, d->V_R + offset, (15) * sizeof(MPC_SAMPLE_FORMAT) ); + } mpc_full_synthesis_filter((OutData == NULL ? NULL : OutData + MPC_FRAME_LENGTH), - (MPC_SAMPLE_FORMAT *)(d->V_R + MPC_V_MEM), + (MPC_SAMPLE_FORMAT *)(d->V_R + 36), (MPC_SAMPLE_FORMAT *)(d->Y_R)); } Index: apps/codecs/libmusepack/synth_filter_arm.S =================================================================== --- apps/codecs/libmusepack/synth_filter_arm.S (revision 28360) +++ apps/codecs/libmusepack/synth_filter_arm.S (working copy) @@ -34,7 +34,7 @@ .align 2 .global mpc_decoder_windowing_D .type mpc_decoder_windowing_D, %function -#if 0 +#if 1 mpc_decoder_windowing_D: /* r0 = Data[] */ /* r1 = V[] */ @@ -43,55 +43,43 @@ /************************************************************************ * Reference implementation. 
***********************************************************************/ - stmfd sp!, {r4-r8, lr} + stmfd sp!, {r4-r11, lr} mov lr, #32 .loop32: - ldmia r2!, { r3-r6 } /* load D[00..03] */ - ldr r7, [r1] /* 0 */ - smull r8, r12, r7, r3 - ldr r7, [r1, #96*4] /* 1 */ - smlal r8, r12, r7, r4 - ldr r7, [r1, #128*4] /* 2 */ - smlal r8, r12, r7, r5 - ldr r7, [r1, #224*4] /* 3 */ - smlal r8, r12, r7, r6 - ldmia r2!, { r3-r6 } /* load D[04..07] */ - ldr r7, [r1, #256*4] /* 4 */ - smlal r8, r12, r7, r3 - ldr r7, [r1, #352*4] /* 5 */ - smlal r8, r12, r7, r4 - ldr r7, [r1, #384*4] /* 6 */ - smlal r8, r12, r7, r5 - ldr r7, [r1, #480*4] /* 7 */ - smlal r8, r12, r7, r6 - ldmia r2!, { r3-r6 } /* load D[08..11] */ - ldr r7, [r1, #512*4] /* 8 */ - smlal r8, r12, r7, r3 - ldr r7, [r1, #608*4] /* 9 */ - smlal r8, r12, r7, r4 - ldr r7, [r1, #640*4] /* 10 */ - smlal r8, r12, r7, r5 - ldr r7, [r1, #736*4] /* 11 */ - smlal r8, r12, r7, r6 - ldmia r2!, { r3-r6 } /* load D[12..15] */ - ldr r7, [r1, #768*4] /* 12 */ - smlal r8, r12, r7, r3 - ldr r7, [r1, #864*4] /* 13 */ - smlal r8, r12, r7, r4 - ldr r7, [r1, #896*4] /* 14 */ - smlal r8, r12, r7, r5 - ldr r7, [r1, #992*4] /* 15 */ - smlal r8, r12, r7, r6 - mov r8, r8, lsr #16 - orr r8, r8, r12, lsl #16 /* (lo>>16) || (hi<<16) */ - str r8, [r0], #4 /* store Data */ - add r1, r1, #4 /* V++ */ + ldmia r2!, { r3-r6 } /* load D[00..03] */ + ldmia r1!, { r7-r10} /* load V[00..03] */ + smull r11, r12, r7, r3 /* 0 */ + smlal r11, r12, r8, r4 /* 1 */ + smlal r11, r12, r9, r5 /* 2 */ + smlal r11, r12, r10, r6 /* 3 */ + ldmia r2!, { r3-r6 } /* load D[04..07] */ + ldmia r1!, { r7-r10} /* load V[04..07] */ + smlal r11, r12, r7, r3 /* 4 */ + smlal r11, r12, r8, r4 /* 5 */ + smlal r11, r12, r9, r5 /* 6 */ + smlal r11, r12, r10, r6 /* 7 */ + ldmia r2!, { r3-r6 } /* load D[08..11] */ + ldmia r1!, { r7-r10} /* load V[08..11] */ + smlal r11, r12, r7, r3 /* 8 */ + smlal r11, r12, r8, r4 /* 9 */ + smlal r11, r12, r9, r5 /* 10 */ + smlal r11, r12, r10, r6 /* 11 
*/ + ldmia r2!, { r3-r6 } /* load D[12..15] */ + ldmia r1!, { r7-r10} /* load V[12..15] */ + smlal r11, r12, r7, r3 /* 12 */ + smlal r11, r12, r8, r4 /* 13 */ + smlal r11, r12, r9, r5 /* 14 */ + smlal r11, r12, r10, r6 /* 15 */ + mov r11, r11, lsr #16 + orr r11, r11, r12, lsl #16 /* (lo>>16) || (hi<<16) */ + str r11, [r0], #4 /* store Data */ + add r1, r1, #35*4 /* V+=(51-16) */ subs lr, lr, #1 bgt .loop32 - ldmpc regs=r4-r8 + ldmpc regs=r4-r11 #else mpc_decoder_windowing_D: /* r0 = Data[] */ @@ -114,120 +102,121 @@ *****************************************/ add r2, r2, #4 /* D+=1, r2 = D[01] as D[00] = 0 */ ldmia r2!, { r3-r6 } /* load D[01..04] */ - ldr r7 , [r1, #96*4] /* 1 */ - ldr r10, [r1, #992*4] /* 15 */ + ldr r7 , [r1, #1*4] /* 1 */ + ldr r10, [r1, #15*4] /* 15 */ rsb r10, r10, r7 /* V[01] - V[15] */ smull r8, r9, r10, r3 - ldr r7 , [r1, #128*4] /* 2 */ - ldr r10, [r1, #896*4] /* 14 */ + ldr r7 , [r1, #2*4] /* 2 */ + ldr r10, [r1, #14*4] /* 14 */ add r10, r10, r7 /* V[02] + V[14] */ smlal r8, r9, r10, r4 - ldr r7 , [r1, #224*4] /* 3 */ - ldr r10, [r1, #864*4] /* 13 */ + ldr r7 , [r1, #3*4] /* 3 */ + ldr r10, [r1, #13*4] /* 13 */ rsb r10, r10, r7 /* V[03] - V[13] */ smlal r8, r9, r10, r5 - ldr r7 , [r1, #256*4] /* 4 */ - ldr r10, [r1, #768*4] /* 12 */ + ldr r7 , [r1, #4*4] /* 4 */ + ldr r10, [r1, #12*4] /* 12 */ add r10, r10, r7 /* V[04] + V[12] */ smlal r8, r9, r10, r6 ldmia r2!, { r3-r6 } /* load D[05..08] */ - ldr r7 , [r1, #352*4] /* 5 */ - ldr r10, [r1, #736*4] /* 11 */ + ldr r7 , [r1, #5*4] /* 5 */ + ldr r10, [r1, #11*4] /* 11 */ rsb r10, r10, r7 /* V[05] - V[11] */ smlal r8, r9, r10, r3 - ldr r7 , [r1, #384*4] /* 6 */ - ldr r10, [r1, #640*4] /* 10 */ + ldr r7 , [r1, #6*4] /* 6 */ + ldr r10, [r1, #10*4] /* 10 */ add r10, r10, r7 /* V[06] + V[10] */ smlal r8, r9, r10, r4 - ldr r7 , [r1, #480*4] /* 7 */ - ldr r10, [r1, #608*4] /* 9 */ + ldr r7 , [r1, #7*4] /* 7 */ + ldr r10, [r1, #9*4] /* 9 */ rsb r10, r10, r7 /* V[07] - V[09] */ smlal r8, r9, r10, r5 - 
ldr r10, [r1, #512*4] /* 8 */ + ldr r10, [r1, #8*4] /* 8 */ smlal r8, r9, r10, r6 mov r8, r8, lsr #16 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ str r8, [r0], #4 /* store Data */ - add r1, r1, #4 /* V+=1, r1 = V[01] */ + add r1, r1, #51*4 /* V+=51, r1 = sample idx 1 */ add r2, r2, #7*4 /* D+=7, r2 = D[16] */ /****************************************** - * rows 01..15 are symmetrc to rows 31..17 + * rows 01..15 are symmetric to rows 31..17 * r8 = lo, r9 = hi of 01..15 * r1 = V[01..15] * r10 = lo, r11 = hi of 31..17 * r12 = V[31..16] *****************************************/ mov lr, #15 - add r12, r1, #30*4 /* r12 = V[31] */ + add r12, r1, #1536*4 + sub r12, r12, #6*4 /* V+(51*30), r12 = sample idx 31 */ .loop15: ldmia r2!, { r3-r6 } /* load D[00..03] */ - ldr r7, [r12, #768*4] /* 12 */ + ldr r7, [r12, #12*4] /* 12 */ smull r10, r11, r7, r6 - ldr r7, [r12, #864*4] /* 13 */ + ldr r7, [r12, #13*4] /* 13 */ smlal r10, r11, r7, r5 - ldr r7, [r12, #896*4] /* 14 */ + ldr r7, [r12, #14*4] /* 14 */ smlal r10, r11, r7, r4 - ldr r7, [r12, #992*4] /* 15 */ + ldr r7, [r12, #15*4] /* 15 */ smlal r10, r11, r7, r3 ldr r7, [r1] /* 0 */ smull r8, r9, r7, r3 - ldr r7, [r1, #96*4] /* 1 */ + ldr r7, [r1, #1*4] /* 1 */ smlal r8, r9, r7, r4 - ldr r7, [r1, #128*4] /* 2 */ + ldr r7, [r1, #2*4] /* 2 */ smlal r8, r9, r7, r5 - ldr r7, [r1, #224*4] /* 3 */ + ldr r7, [r1, #3*4] /* 3 */ smlal r8, r9, r7, r6 ldmia r2!, { r3-r6 } /* load D[04..07] */ - ldr r7, [r1, #256*4] /* 4 */ + ldr r7, [r1, #4*4] /* 4 */ smlal r8, r9, r7, r3 - ldr r7, [r1, #352*4] /* 5 */ + ldr r7, [r1, #5*4] /* 5 */ smlal r8, r9, r7, r4 - ldr r7, [r1, #384*4] /* 6 */ + ldr r7, [r1, #6*4] /* 6 */ smlal r8, r9, r7, r5 - ldr r7, [r1, #480*4] /* 7 */ + ldr r7, [r1, #7*4] /* 7 */ smlal r8, r9, r7, r6 - ldr r7, [r12, #512*4] /* 8 */ + ldr r7, [r12, #8*4] /* 8 */ smlal r10, r11, r7, r6 - ldr r7, [r12, #608*4] /* 9 */ + ldr r7, [r12, #9*4] /* 9 */ smlal r10, r11, r7, r5 - ldr r7, [r12, #640*4] /* 10 */ + ldr r7, [r12, #10*4] 
/* 10 */ smlal r10, r11, r7, r4 - ldr r7, [r12, #736*4] /* 11 */ + ldr r7, [r12, #11*4] /* 11 */ smlal r10, r11, r7, r3 ldmia r2!, { r3-r6 } /* load D[08..11] */ - ldr r7, [r12, #256*4] /* 4 */ + ldr r7, [r12, #4*4] /* 4 */ smlal r10, r11, r7, r6 - ldr r7, [r12, #352*4] /* 5 */ + ldr r7, [r12, #5*4] /* 5 */ smlal r10, r11, r7, r5 - ldr r7, [r12, #384*4] /* 6 */ + ldr r7, [r12, #6*4] /* 6 */ smlal r10, r11, r7, r4 - ldr r7, [r12, #480*4] /* 7 */ + ldr r7, [r12, #7*4] /* 7 */ smlal r10, r11, r7, r3 - ldr r7, [r1, #512*4] /* 8 */ + ldr r7, [r1, #8*4] /* 8 */ smlal r8, r9, r7, r3 - ldr r7, [r1, #608*4] /* 9 */ + ldr r7, [r1, #9*4] /* 9 */ smlal r8, r9, r7, r4 - ldr r7, [r1, #640*4] /* 10 */ + ldr r7, [r1, #10*4] /* 10 */ smlal r8, r9, r7, r5 - ldr r7, [r1, #736*4] /* 11 */ + ldr r7, [r1, #11*4] /* 11 */ smlal r8, r9, r7, r6 ldmia r2!, { r3-r6 } /* load D[12..15] */ - ldr r7, [r1, #768*4] /* 12 */ + ldr r7, [r1, #12*4] /* 12 */ smlal r8, r9, r7, r3 - ldr r7, [r1, #864*4] /* 13 */ + ldr r7, [r1, #13*4] /* 13 */ smlal r8, r9, r7, r4 - ldr r7, [r1, #896*4] /* 14 */ + ldr r7, [r1, #14*4] /* 14 */ smlal r8, r9, r7, r5 - ldr r7, [r1, #992*4] /* 15 */ + ldr r7, [r1, #15*4] /* 15 */ smlal r8, r9, r7, r6 ldr r7, [r12] /* 0 */ smlal r10, r11, r7, r6 - ldr r7, [r12, #96*4] /* 1 */ + ldr r7, [r12, #1*4] /* 1 */ smlal r10, r11, r7, r5 - ldr r7, [r12, #128*4] /* 2 */ + ldr r7, [r12, #2*4] /* 2 */ smlal r10, r11, r7, r4 - ldr r7, [r12, #224*4] /* 3 */ + ldr r7, [r12, #3*4] /* 3 */ smlal r10, r11, r7, r3 /* store Data[01..15] */ mov r8, r8, lsr #16 @@ -241,8 +230,8 @@ str r10, [r0], #4 /* store Data */ sub r0, r0, lr, asl #3 /* r0 = r0 - 2*lr [words] */ /* correct adresses for next loop */ - sub r12, r12, #4 /* r12 = V-- */ - add r1, r1, #4 /* r1 = V++ */ + sub r12, r12, #51*4 /* r12 = V-=51 */ + add r1, r1, #51*4 /* r1 = V+=51 */ /* next loop */ subs lr, lr, #1 bgt .loop15 @@ -252,42 +241,42 @@ *****************************************/ ldmia r2!, { r3-r6 } /* load D[00..03] */ ldr r7 
, [r1] /* 0 */ - ldr r10, [r1, #992*4] /* 15 */ + ldr r10, [r1, #15*4] /* 15 */ rsb r10, r10, r7 /* V[00] - V[15] */ smull r8, r9, r10, r3 - ldr r7 , [r1, #96*4] /* 1 */ - ldr r10, [r1, #896*4] /* 14 */ + ldr r7 , [r1, #1*4] /* 1 */ + ldr r10, [r1, #14*4] /* 14 */ rsb r10, r10, r7 /* V[01] - V[14] */ smlal r8, r9, r10, r4 - ldr r7 , [r1, #128*4] /* 2 */ - ldr r10, [r1, #864*4] /* 13 */ + ldr r7 , [r1, #2*4] /* 2 */ + ldr r10, [r1, #13*4] /* 13 */ rsb r10, r10, r7 /* V[02] - V[13] */ smlal r8, r9, r10, r5 - ldr r7 , [r1, #224*4] /* 3 */ - ldr r10, [r1, #768*4] /* 12 */ + ldr r7 , [r1, #3*4] /* 3 */ + ldr r10, [r1, #12*4] /* 12 */ rsb r10, r10, r7 /* V[03] - V[12] */ smlal r8, r9, r10, r6 ldmia r2!, { r3-r6 } /* load D[04..07] */ - ldr r7 , [r1, #256*4] /* 4 */ - ldr r10, [r1, #736*4] /* 11 */ + ldr r7 , [r1, #4*4] /* 4 */ + ldr r10, [r1, #11*4] /* 11 */ rsb r10, r10, r7 /* V[04] - V[11] */ smlal r8, r9, r10, r3 - ldr r7 , [r1, #352*4] /* 5 */ - ldr r10, [r1, #640*4] /* 10 */ + ldr r7 , [r1, #5*4] /* 5 */ + ldr r10, [r1, #10*4] /* 10 */ rsb r10, r10, r7 /* V[05] - V[10] */ smlal r8, r9, r10, r4 - ldr r7 , [r1, #384*4] /* 6 */ - ldr r10, [r1, #608*4] /* 9 */ + ldr r7 , [r1, #6*4] /* 6 */ + ldr r10, [r1, #9*4] /* 9 */ rsb r10, r10, r7 /* V[06] - V[09] */ smlal r8, r9, r10, r5 - ldr r7 , [r1, #480*4] /* 7 */ - ldr r10, [r1, #512*4] /* 8 */ + ldr r7 , [r1, #7*4] /* 7 */ + ldr r10, [r1, #8*4] /* 8 */ rsb r10, r10, r7 /* V[07] - V[08] */ smlal r8, r9, r10, r6 mov r8, r8, lsr #16 orr r8, r8, r9, lsl #16 /* (lo>>16) || (hi<<16) */ str r8, [r0], #4 /* store Data */ - add r1, r1, #4 /* V++ */ + add r1, r1, #51*4 /* V+=51 */ ldmpc regs=r4-r11 #endif