Index: apps/codecs/libmusepack/math.h =================================================================== --- apps/codecs/libmusepack/math.h (revision 14284) +++ apps/codecs/libmusepack/math.h (working copy) @@ -44,176 +44,194 @@ #ifdef MPC_FIXED_POINT + #ifdef _WIN32_WCE + #include + #define MPC_HAVE_MULHIGH + #endif + + #define MPC_FIXED_POINT_SCALE_SHIFT (MPC_FIXED_POINT_SHIFT + MPC_FIXED_POINT_FRACTPART) + #define MPC_FIXED_POINT_SCALE (1 << (MPC_FIXED_POINT_SCALE_SHIFT - 1)) + //in fixedpoint mode, results in decode output buffer are in -MPC_FIXED_POINT_SCALE ... MPC_FIXED_POINT_SCALE range + + #define MPC_FIXED_POINT_FRACTPART 14 + typedef mpc_int32_t MPC_SAMPLE_FORMAT; + typedef mpc_int64_t MPC_SAMPLE_FORMAT_MULTIPLY; + + #define MAKE_MPC_SAMPLE(X) (MPC_SAMPLE_FORMAT)((double)(X) * (double)(((mpc_int64_t)1)<>Y) -#ifdef _WIN32_WCE + #if defined(CPU_COLDFIRE) -#include + #define MPC_MULTIPLY(X,Y) mpc_multiply((X), (Y)) + #define MPC_MULTIPLY_EX(X,Y,Z) mpc_multiply_ex((X), (Y), (Z)) + + static inline MPC_SAMPLE_FORMAT mpc_multiply(MPC_SAMPLE_FORMAT x, + MPC_SAMPLE_FORMAT y) + { + MPC_SAMPLE_FORMAT t1, t2; + asm volatile ( + "mac.l %[x],%[y],%%acc0\n" /* multiply */ + "mulu.l %[y],%[x] \n" /* get lower half, avoid emac stall */ + "movclr.l %%acc0,%[t1] \n" /* get higher half */ + "moveq.l #17,%[t2] \n" + "asl.l %[t2],%[t1] \n" /* hi <<= 17, plus one free */ + "moveq.l #14,%[t2] \n" + "lsr.l %[t2],%[x] \n" /* (unsigned)lo >>= 14 */ + "or.l %[x],%[t1] \n" /* combine result */ + : /* outputs */ + [t1]"=&d"(t1), + [t2]"=&d"(t2), + [x] "+d" (x) + : /* inputs */ + [y] "d" (y) + ); + return t1; + } -#define MPC_HAVE_MULHIGH + static inline MPC_SAMPLE_FORMAT mpc_multiply_ex(MPC_SAMPLE_FORMAT x, + MPC_SAMPLE_FORMAT y, + unsigned shift) + { + MPC_SAMPLE_FORMAT t1, t2; + asm volatile ( + "mac.l %[x],%[y],%%acc0\n" /* multiply */ + "mulu.l %[y],%[x] \n" /* get lower half, avoid emac stall */ + "movclr.l %%acc0,%[t1] \n" /* get higher half */ + "moveq.l #31,%[t2] \n" + 
"sub.l %[sh],%[t2] \n" /* t2 = 31 - shift */ + "ble.s 1f \n" + "asl.l %[t2],%[t1] \n" /* hi <<= 31 - shift */ + "lsr.l %[sh],%[x] \n" /* (unsigned)lo >>= shift */ + "or.l %[x],%[t1] \n" /* combine result */ + "bra.s 2f \n" + "1: \n" + "neg.l %[t2] \n" /* t2 = shift - 31 */ + "asr.l %[t2],%[t1] \n" /* hi >>= t2 */ + "2: \n" + : /* outputs */ + [t1]"=&d"(t1), + [t2]"=&d"(t2), + [x] "+d" (x) + : /* inputs */ + [y] "d" (y), + [sh]"d" (shift) + ); + return t1; + } + #elif defined(CPU_ARM) + // borrowed and adapted from libMAD + #define MPC_MULTIPLY(X,Y) \ + ({ \ + MPC_SAMPLE_FORMAT low; \ + MPC_SAMPLE_FORMAT high; \ + asm volatile ( /* will calculate: result = (X*Y)>>14 */ \ + "smull %0,%1,%2,%3 \n\t" /* multiply with result %0 [0..31], %1 [32..63] */ \ + "mov %0, %0, lsr #14 \n\t" /* %0 = %0 >> 14 */ \ + "orr %0, %0, %1, lsl #18 \n\t"/* result = %0 OR (%1 << 18) */ \ + : "=&r"(low), "=&r" (high) \ + : "r"(X),"r"(Y)); \ + low; \ + }) + + // borrowed and adapted from libMAD + #define MPC_MULTIPLY_EX(X,Y,Z) \ + ({ \ + MPC_SAMPLE_FORMAT low; \ + MPC_SAMPLE_FORMAT high; \ + asm volatile ( /* will calculate: result = (X*Y)>>Z */ \ + "smull %0,%1,%2,%3 \n\t" /* multiply with result %0 [0..31], %1 [32..63] */ \ + "mov %0, %0, lsr %4 \n\t" /* %0 = %0 >> Z */ \ + "orr %0, %0, %1, lsl %5 \n\t" /* result = %0 OR (%1 << (32-Z)) */ \ + : "=&r"(low), "=&r" (high) \ + : "r"(X),"r"(Y),"r"(Z),"r"(32-Z)); \ + low; \ + }) + #else /* libmusepack standard */ -#endif + #define MPC_MULTIPLY_NOTRUNCATE(X,Y) \ + (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> MPC_FIXED_POINT_FRACTPART) + #define MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z) \ + (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> (Z)) -#define MPC_FIXED_POINT_SCALE_SHIFT (MPC_FIXED_POINT_SHIFT + MPC_FIXED_POINT_FRACTPART) -#define MPC_FIXED_POINT_SCALE (1 << (MPC_FIXED_POINT_SCALE_SHIFT - 1)) + #ifdef _DEBUG + static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT 
item2) + { + MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_NOTRUNCATE(item1,item2); + assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); + return (MPC_SAMPLE_FORMAT)temp; + } + + static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY_EX(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2,unsigned shift) + { + MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_EX_NOTRUNCATE(item1,item2,shift); + assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); + return (MPC_SAMPLE_FORMAT)temp; + } + #else + #define MPC_MULTIPLY(X,Y) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_NOTRUNCATE(X,Y)) + #define MPC_MULTIPLY_EX(X,Y,Z) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z)) + #endif + #endif -//in fixedpoint mode, results in decode output buffer are in -MPC_FIXED_POINT_SCALE ... MPC_FIXED_POINT_SCALE range + #ifdef MPC_HAVE_MULHIGH + #define MPC_MULTIPLY_FRACT(X,Y) _MulHigh(X,Y) + #else + #if defined(CPU_COLDFIRE) + /* loses one bit of accuracy. The rest of the macros won't be as easy as this... 
*/ + #define MPC_MULTIPLY_FRACT(X,Y) \ + ({ \ + MPC_SAMPLE_FORMAT t; \ + asm volatile ( \ + "mac.l %[A], %[B], %%acc0\n\t" \ + "movclr.l %%acc0, %[t]\n\t" \ + "asr.l #1, %[t]\n\t" \ + : [t] "=d" (t) \ + : [A] "r" ((X)), [B] "r" ((Y))); \ + t; \ + }) + #elif defined(CPU_ARM) + // borrowed and adapted from libMAD + #define MPC_MULTIPLY_FRACT(X,Y) \ + ({ \ + MPC_SAMPLE_FORMAT low; \ + MPC_SAMPLE_FORMAT high; \ + asm volatile ( /* will calculate: result = (X*Y)>>32 */ \ + "smull %0,%1,%2,%3 \n\t" /* multiply with result %0 [0..31], %1 [32..63] */ \ + : "=&r"(low), "=&r" (high) /* result = %1 [32..63], saves the >>32 */ \ + : "r"(X),"r"(Y)); \ + high; \ + }) + #else + #define MPC_MULTIPLY_FRACT(X,Y) MPC_MULTIPLY_EX(X,Y,32) + #endif + #endif -#define MPC_FIXED_POINT_FRACTPART 14 -typedef mpc_int32_t MPC_SAMPLE_FORMAT; + #define MPC_MAKE_FRACT_CONST(X) (MPC_SAMPLE_FORMAT)((X) * (double)(((mpc_int64_t)1)<<32) ) + + #define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) -typedef mpc_int64_t MPC_SAMPLE_FORMAT_MULTIPLY; - -#define MAKE_MPC_SAMPLE(X) (MPC_SAMPLE_FORMAT)((double)(X) * (double)(((mpc_int64_t)1)<>= 14 */ - "or.l %[x],%[t1] \n" /* combine result */ - : /* outputs */ - [t1]"=&d"(t1), - [t2]"=&d"(t2), - [x] "+d" (x) - : /* inputs */ - [y] "d" (y) - ); - return t1; -} - -static inline MPC_SAMPLE_FORMAT mpc_multiply_ex(MPC_SAMPLE_FORMAT x, - MPC_SAMPLE_FORMAT y, - unsigned shift) -{ - MPC_SAMPLE_FORMAT t1, t2; - asm volatile ( - "mac.l %[x],%[y],%%acc0\n" /* multiply */ - "mulu.l %[y],%[x] \n" /* get lower half, avoid emac stall */ - "movclr.l %%acc0,%[t1] \n" /* get higher half */ - "moveq.l #31,%[t2] \n" - "sub.l %[sh],%[t2] \n" /* t2 = 31 - shift */ - "ble.s 1f \n" - "asl.l %[t2],%[t1] \n" /* hi <<= 31 - shift */ - "lsr.l %[sh],%[x] \n" /* (unsigned)lo >>= shift */ - "or.l %[x],%[t1] \n" /* combine result */ - "bra.s 2f \n" - "1: \n" - "neg.l %[t2] \n" /* t2 = shift - 31 */ - "asr.l %[t2],%[t1] \n" /* hi >>= t2 */ - "2: \n" - : /* outputs */ - [t1]"=&d"(t1), - 
[t2]"=&d"(t2), - [x] "+d" (x) - : /* inputs */ - [y] "d" (y), - [sh]"d" (shift) - ); - return t1; -} -#else /* libmusepack standard */ - -#define MPC_MULTIPLY_NOTRUNCATE(X,Y) \ - (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> MPC_FIXED_POINT_FRACTPART) - -#define MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z) \ - (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> (Z)) - -#ifdef _DEBUG -static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2) -{ - MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_NOTRUNCATE(item1,item2); - assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); - return (MPC_SAMPLE_FORMAT)temp; -} - -static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY_EX(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2,unsigned shift) -{ - MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_EX_NOTRUNCATE(item1,item2,shift); - assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); - return (MPC_SAMPLE_FORMAT)temp; -} #else -#define MPC_MULTIPLY(X,Y) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_NOTRUNCATE(X,Y)) -#define MPC_MULTIPLY_EX(X,Y,Z) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z)) -#endif + //in floating-point mode, decoded samples are in -1...1 range -#endif + typedef float MPC_SAMPLE_FORMAT; + + #define MAKE_MPC_SAMPLE(X) ((MPC_SAMPLE_FORMAT)(X)) + #define MAKE_MPC_SAMPLE_EX(X,Y) ((MPC_SAMPLE_FORMAT)(X)) + + #define MPC_MULTIPLY_FRACT(X,Y) ((X)*(Y)) + #define MPC_MAKE_FRACT_CONST(X) (X) + + #define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) + #define MPC_MULTIPLY(X,Y) ((X)*(Y)) + #define MPC_MULTIPLY_EX(X,Y,Z) ((X)*(Y)) + + #define MPC_SHR_RND(X, Y) (X) -#ifdef MPC_HAVE_MULHIGH -#define MPC_MULTIPLY_FRACT(X,Y) _MulHigh(X,Y) -#else -#if defined(CPU_COLDFIRE) -/* loses one bit of accuracy. - the rest of the macros won't be as easy as this... 
*/ -#define MPC_MULTIPLY_FRACT(X,Y) \ - ({ \ - MPC_SAMPLE_FORMAT t; \ - asm volatile ( \ - "mac.l %[A], %[B], %%acc0\n\t" \ - "movclr.l %%acc0, %[t]\n\t" \ - "asr.l #1, %[t]\n\t" \ - : [t] "=d" (t) \ - : [A] "r" ((X)), [B] "r" ((Y))); \ - t; \ - }) -#else -#define MPC_MULTIPLY_FRACT(X,Y) MPC_MULTIPLY_EX(X,Y,32) #endif -#endif -#define MPC_MAKE_FRACT_CONST(X) (MPC_SAMPLE_FORMAT)((X) * (double)(((mpc_int64_t)1)<<32) ) -#define MPC_MULTIPLY_FRACT_CONST(X,Y) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST(Y)) -#define MPC_MULTIPLY_FRACT_CONST_FIX(X,Y,Z) ( MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y / (1<<(Z)) )) << (Z) ) -#define MPC_MULTIPLY_FRACT_CONST_SHR(X,Y,Z) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y / (1<<(Z)) )) - -#define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) -#define MPC_SCALE_CONST(X,Y,Z) MPC_MULTIPLY_EX(X,MAKE_MPC_SAMPLE_EX(Y,Z),(Z)) -#define MPC_SCALE_CONST_SHL(X,Y,Z,S) MPC_MULTIPLY_EX(X,MAKE_MPC_SAMPLE_EX(Y,Z),(Z)-(S)) -#define MPC_SCALE_CONST_SHR(X,Y,Z,S) MPC_MULTIPLY_EX(X,MAKE_MPC_SAMPLE_EX(Y,Z),(Z)+(S)) -#define MPC_SHR(X,Y) ((X)>>(Y)) -#define MPC_SHL(X,Y) ((X)<<(Y)) - -#else - -//in floating-point mode, decoded samples are in -1...1 range - -typedef float MPC_SAMPLE_FORMAT; - -#define MAKE_MPC_SAMPLE(X) ((MPC_SAMPLE_FORMAT)(X)) -#define MAKE_MPC_SAMPLE_EX(X,Y) ((MPC_SAMPLE_FORMAT)(X)) - -#define MPC_MULTIPLY_FRACT(X,Y) ((X)*(Y)) -#define MPC_MAKE_FRACT_CONST(X) (X) -#define MPC_MULTIPLY_FRACT_CONST(X,Y) MPC_MULTPLY_FRACT(X,MPC_MAKE_FRACT_CONST(Y)) -#define MPC_MULTIPLY_FRACT_CONST_SHR(X,Y,Z) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y )) -#define MPC_MULTIPLY_FRACT_CONST_FIX(X,Y,Z) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y )) - -#define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) -#define MPC_MULTIPLY(X,Y) ((X)*(Y)) -#define MPC_MULTIPLY_EX(X,Y,Z) ((X)*(Y)) -#define MPC_SCALE_CONST(X,Y,Z) ((X)*(Y)) -#define MPC_SCALE_CONST_SHL(X,Y,Z,S) ((X)*(Y)) -#define MPC_SCALE_CONST_SHR(X,Y,Z,S) ((X)*(Y)) -#define MPC_SHR(X,Y) (X) -#define MPC_SHL(X,Y) (X) - -#endif 
- #endif // _mpcdec_math_h_ Index: apps/codecs/libmusepack/mpc_decoder.c =================================================================== --- apps/codecs/libmusepack/mpc_decoder.c (revision 14284) +++ apps/codecs/libmusepack/mpc_decoder.c (working copy) @@ -417,7 +417,7 @@ mpc_uint32_t mpc_decoder_decode_frame(mpc_decoder *d, mpc_uint32_t *in_buffer, - mpc_uint32_t in_len, MPC_SAMPLE_FORMAT *out_buffer) + mpc_uint32_t in_len, MPC_OUTPUT_FORMAT *out_buffer) { mpc_decoder_reset_bitstream_decode(d); if (in_len > sizeof(Speicher)) in_len = sizeof(Speicher); @@ -445,7 +445,7 @@ } static mpc_uint32_t -mpc_decoder_decode_internal(mpc_decoder *d, MPC_SAMPLE_FORMAT *buffer) +mpc_decoder_decode_internal(mpc_decoder *d, MPC_OUTPUT_FORMAT *buffer) { mpc_uint32_t output_frame_length = MPC_FRAME_LENGTH; @@ -566,7 +566,7 @@ mpc_uint32_t mpc_decoder_decode( mpc_decoder *d, - MPC_SAMPLE_FORMAT *buffer, + MPC_OUTPUT_FORMAT *buffer, mpc_uint32_t *vbr_update_acc, mpc_uint32_t *vbr_update_bits) { Index: apps/codecs/libmusepack/decoder.h =================================================================== --- apps/codecs/libmusepack/decoder.h (revision 14284) +++ apps/codecs/libmusepack/decoder.h (working copy) @@ -51,6 +51,8 @@ MPC_DECODER_MEMSIZE = 16384, // overall buffer size }; +typedef mpc_int32_t MPC_OUTPUT_FORMAT; + typedef struct { mpc_int16_t L [36]; mpc_int16_t R [36]; Index: apps/codecs/libmusepack/synth_filter.c =================================================================== --- apps/codecs/libmusepack/synth_filter.c (revision 14284) +++ apps/codecs/libmusepack/synth_filter.c (working copy) @@ -39,19 +39,51 @@ #include "musepack.h" #include "internal.h" +/* S E T T I N G S */ +// choose speed vs. 
accuracy for MPC_FIXED_POINT +// speed-setting will increase decoding speed on ARM only (+20%), loss of accuracy equals about 5 dB SNR (15bit output precision) +// to not use the speed-optimization -> comment OPTIMIZE_FOR_SPEED +#if defined(MPC_FIXED_POINT) + #if defined(CPU_COLDFIRE) + // do nothing + #elif defined(CPU_ARM) + #define OPTIMIZE_FOR_SPEED + #else + #define OPTIMIZE_FOR_SPEED + #endif +#else + // do nothing +#endif + /* C O N S T A N T S */ #undef _ -#define MPC_FIXED_POINT_SYNTH_FIX 2 - -#ifdef MPC_FIXED_POINT -#define _(value) MPC_MAKE_FRACT_CONST((double)value/(double)(0x40000)) +#if defined(MPC_FIXED_POINT) + #if defined(OPTIMIZE_FOR_SPEED) + // round to +/- 2^14 as pre-shift before 32=32x32-multiply + #define _(value) (MPC_SHR_RND(value, 3)) + + // round to +/- 2^17 as pre-shift before 32=32x32-multiply + #define MPC_V_PRESHIFT(X) MPC_SHR_RND(X, 14) + #else + // saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17; written as a multiplication because left-shifting a negative constant is undefined in C + #define _(value) ((value) * (1 << 14)) + + // do not perform pre-shift + #define MPC_V_PRESHIFT(X) (X) + #endif #else -#define _(value) MAKE_MPC_SAMPLE((double)value/(double)(0x10000)) + // IMPORTANT: internal scaling is somehow strange for floating point, therefore we scale the coefficients Di_opt + // by the correct amount to have proper scaled output + #define _(value) MAKE_MPC_SAMPLE((double)value*(double)(0x1000)) + + // do not perform pre-shift + #define MPC_V_PRESHIFT(X) (X) #endif - - + +// Di_opt coefficients are +/- 2^17 static const MPC_SAMPLE_FORMAT Di_opt [32] [16] ICONST_ATTR = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ { _( 0), _( -29), _( 213), _( -459), _( 2037), _(-5153), _( 6574), _(-37489), _(75038), _(37489), _(6574), _( 5153), _(2037), _( 459), _(213), _(29) }, { _( -1), _( -31), _( 218), _( -519), _( 2000), _(-5517), _( 5959), _(-39336), _(74992), _(35640), _(7134), _( 4788), _(2063), _( 401), _(208), _(26) }, { _( -1), _( -35), _( 222), _( -581), _( 1952), _(-5879), _( 5288), _(-41176), _(74856), 
_(33791), _(7640), _( 4425), _(2080), _( 347), _(202), _(24) }, @@ -88,363 +120,513 @@ #undef _ -static void Calculate_New_V ( const MPC_SAMPLE_FORMAT * Sample, MPC_SAMPLE_FORMAT * V ) +// V-coefficients were expanded (<<) by V_COEFFICIENT_EXPAND +#define V_COEFFICIENT_EXPAND 27 + +#if defined(MPC_FIXED_POINT) + #if defined(OPTIMIZE_FOR_SPEED) + // define 32=32x32-multiplication for DCT-coefficients with samples, vcoef will be pre-shifted on creation + // samples are rounded to +/- 2^19 as pre-shift before 32=32x32-multiply + #define MPC_MULTIPLY_V(sample, vcoef) ( MPC_SHR_RND(sample, 12) * vcoef ) + + // round to +/- 2^16 as pre-shift before 32=32x32-multiply + #define MPC_MAKE_INVCOS(value) (MPC_SHR_RND(value, 15)) + #else + // define 64=32x32-multiplication for DCT-coefficients with samples. Via usage of MPC_FRACT highly optimized assembler might be used + // MULTIPLY_FRACT will do >>32 after multiplication, as V-coef were expanded by V_COEFFICIENT_EXPAND we'll correct this on the result. 
+ // Will lose 5 bits of accuracy in the fract part of the result, without effect on final audio result + #define MPC_MULTIPLY_V(sample, vcoef) ( (MPC_MULTIPLY_FRACT(sample, vcoef)) << (32-V_COEFFICIENT_EXPAND) ) + + // directly use accurate 32bit-coefficients + #define MPC_MAKE_INVCOS(value) (value) + #endif +#else + // for floating point use the standard multiplication macro + #define MPC_MULTIPLY_V(sample, vcoef) ( MPC_MULTIPLY(sample, vcoef) ) + + // downscale the accurate 32bit-coefficients and convert to float + #define MPC_MAKE_INVCOS(value) MAKE_MPC_SAMPLE((double)value/(double)(1<=2 - A11 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[11] - Sample[20]) , 1.1694399118f , MPC_FIXED_POINT_SYNTH_FIX); - A12 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[12] - Sample[19]) , 1.4841645956f , MPC_FIXED_POINT_SYNTH_FIX); -#else - A11 = MPC_SCALE_CONST_SHR ((Sample[11] - Sample[20]) , 1.1694399118f , 30, MPC_FIXED_POINT_SYNTH_FIX); - A12 = MPC_SCALE_CONST_SHR ((Sample[12] - Sample[19]) , 1.4841645956f , 30, MPC_FIXED_POINT_SYNTH_FIX); -#endif - A13 = MPC_SCALE_CONST_SHR ((Sample[13] - Sample[18]) , 2.0577809811f , 29, MPC_FIXED_POINT_SYNTH_FIX); - A14 = MPC_SCALE_CONST_SHR ((Sample[14] - Sample[17]) , 3.4076085091f , 29, MPC_FIXED_POINT_SYNTH_FIX); - A15 = MPC_SCALE_CONST_SHR ((Sample[15] - Sample[16]) , 10.1900081635f, 27 ,MPC_FIXED_POINT_SYNTH_FIX); + A[ 0] = MPC_MULTIPLY_V((Sample[ 0] - Sample[31]), INVCOS01); + A[ 1] = MPC_MULTIPLY_V((Sample[ 1] - Sample[30]), INVCOS03); + A[ 2] = MPC_MULTIPLY_V((Sample[ 2] - Sample[29]), INVCOS05); + A[ 3] = MPC_MULTIPLY_V((Sample[ 3] - Sample[28]), INVCOS07); + A[ 4] = MPC_MULTIPLY_V((Sample[ 4] - Sample[27]), INVCOS09); + A[ 5] = MPC_MULTIPLY_V((Sample[ 5] - Sample[26]), INVCOS11); + A[ 6] = MPC_MULTIPLY_V((Sample[ 6] - Sample[25]), INVCOS13); + A[ 7] = MPC_MULTIPLY_V((Sample[ 7] - Sample[24]), INVCOS15); + A[ 8] = MPC_MULTIPLY_V((Sample[ 8] - Sample[23]), INVCOS17); + A[ 9] = MPC_MULTIPLY_V((Sample[ 9] - Sample[22]), INVCOS19); + A[10] = 
MPC_MULTIPLY_V((Sample[10] - Sample[21]), INVCOS21); + A[11] = MPC_MULTIPLY_V((Sample[11] - Sample[20]), INVCOS23); + A[12] = MPC_MULTIPLY_V((Sample[12] - Sample[19]), INVCOS25); + A[13] = MPC_MULTIPLY_V((Sample[13] - Sample[18]), INVCOS27); + A[14] = MPC_MULTIPLY_V((Sample[14] - Sample[17]), INVCOS29); + A[15] = MPC_MULTIPLY_V((Sample[15] - Sample[16]), INVCOS31); + // 16 subs, 16 muls, 16 shifts - B00 = A00 + A15; - B01 = A01 + A14; - B02 = A02 + A13; - B03 = A03 + A12; - B04 = A04 + A11; - B05 = A05 + A10; - B06 = A06 + A09; - B07 = A07 + A08; - B08 = MPC_SCALE_CONST((A00 - A15) , 0.5024192929f , 31); - B09 = MPC_SCALE_CONST((A01 - A14) , 0.5224986076f , 31); - B10 = MPC_SCALE_CONST((A02 - A13) , 0.5669440627f , 31); - B11 = MPC_SCALE_CONST((A03 - A12) , 0.6468217969f , 31); - B12 = MPC_SCALE_CONST((A04 - A11) , 0.7881546021f , 31); - B13 = MPC_SCALE_CONST((A05 - A10) , 1.0606776476f , 30); - B14 = MPC_SCALE_CONST((A06 - A09) , 1.7224471569f , 30); - B15 = MPC_SCALE_CONST((A07 - A08) , 5.1011486053f , 28); + B[ 0] = A[ 0] + A[15]; + B[ 1] = A[ 1] + A[14]; + B[ 2] = A[ 2] + A[13]; + B[ 3] = A[ 3] + A[12]; + B[ 4] = A[ 4] + A[11]; + B[ 5] = A[ 5] + A[10]; + B[ 6] = A[ 6] + A[ 9]; + B[ 7] = A[ 7] + A[ 8]; + B[ 8] = MPC_MULTIPLY_V((A[ 0] - A[15]), INVCOS02); + B[ 9] = MPC_MULTIPLY_V((A[ 1] - A[14]), INVCOS06); + B[10] = MPC_MULTIPLY_V((A[ 2] - A[13]), INVCOS10); + B[11] = MPC_MULTIPLY_V((A[ 3] - A[12]), INVCOS14); + B[12] = MPC_MULTIPLY_V((A[ 4] - A[11]), INVCOS18); + B[13] = MPC_MULTIPLY_V((A[ 5] - A[10]), INVCOS22); + B[14] = MPC_MULTIPLY_V((A[ 6] - A[ 9]), INVCOS26); + B[15] = MPC_MULTIPLY_V((A[ 7] - A[ 8]), INVCOS30); + // 8 adds, 8 subs, 8 muls, 8 shift - A00 = B00 + B07; - A01 = B01 + B06; - A02 = B02 + B05; - A03 = B03 + B04; - A04 = MPC_SCALE_CONST((B00 - B07) , 0.5097956061f , 31); - A05 = MPC_SCALE_CONST((B01 - B06) , 0.6013448834f , 31); - A06 = MPC_SCALE_CONST((B02 - B05) , 0.8999761939f , 31); - A07 = MPC_SCALE_CONST((B03 - B04) , 2.5629155636f , 29); - 
A08 = B08 + B15; - A09 = B09 + B14; - A10 = B10 + B13; - A11 = B11 + B12; - A12 = MPC_SCALE_CONST((B08 - B15) , 0.5097956061f , 31); - A13 = MPC_SCALE_CONST((B09 - B14) , 0.6013448834f , 31); - A14 = MPC_SCALE_CONST((B10 - B13) , 0.8999761939f , 31); - A15 = MPC_SCALE_CONST((B11 - B12) , 2.5629155636f , 29); + A[ 0] = B[ 0] + B[ 7]; + A[ 1] = B[ 1] + B[ 6]; + A[ 2] = B[ 2] + B[ 5]; + A[ 3] = B[ 3] + B[ 4]; + A[ 4] = MPC_MULTIPLY_V((B[ 0] - B[ 7]), INVCOS04); + A[ 5] = MPC_MULTIPLY_V((B[ 1] - B[ 6]), INVCOS12); + A[ 6] = MPC_MULTIPLY_V((B[ 2] - B[ 5]), INVCOS20); + A[ 7] = MPC_MULTIPLY_V((B[ 3] - B[ 4]), INVCOS28); + A[ 8] = B[ 8] + B[15]; + A[ 9] = B[ 9] + B[14]; + A[10] = B[10] + B[13]; + A[11] = B[11] + B[12]; + A[12] = MPC_MULTIPLY_V((B[ 8] - B[15]), INVCOS04); + A[13] = MPC_MULTIPLY_V((B[ 9] - B[14]), INVCOS12); + A[14] = MPC_MULTIPLY_V((B[10] - B[13]), INVCOS20); + A[15] = MPC_MULTIPLY_V((B[11] - B[12]), INVCOS28); + // 8 adds, 8 subs, 8 muls, 8 shift - B00 = A00 + A03; - B01 = A01 + A02; - B02 = MPC_SCALE_CONST((A00 - A03) , 0.5411961079f , 31); - B03 = MPC_SCALE_CONST((A01 - A02) , 1.3065630198f , 30); - B04 = A04 + A07; - B05 = A05 + A06; - B06 = MPC_SCALE_CONST((A04 - A07) , 0.5411961079f , 31); - B07 = MPC_SCALE_CONST((A05 - A06) , 1.3065630198f , 30); - B08 = A08 + A11; - B09 = A09 + A10; - B10 = MPC_SCALE_CONST((A08 - A11) , 0.5411961079f , 31); - B11 = MPC_SCALE_CONST((A09 - A10) , 1.3065630198f , 30); - B12 = A12 + A15; - B13 = A13 + A14; - B14 = MPC_SCALE_CONST((A12 - A15) , 0.5411961079f , 31); - B15 = MPC_SCALE_CONST((A13 - A14) , 1.3065630198f , 30); + B[ 0] = A[ 0] + A[ 3]; + B[ 1] = A[ 1] + A[ 2]; + B[ 2] = MPC_MULTIPLY_V((A[ 0] - A[ 3]), INVCOS08); + B[ 3] = MPC_MULTIPLY_V((A[ 1] - A[ 2]), INVCOS24); + B[ 4] = A[ 4] + A[ 7]; + B[ 5] = A[ 5] + A[ 6]; + B[ 6] = MPC_MULTIPLY_V((A[ 4] - A[ 7]), INVCOS08); + B[ 7] = MPC_MULTIPLY_V((A[ 5] - A[ 6]), INVCOS24); + B[ 8] = A[ 8] + A[11]; + B[ 9] = A[ 9] + A[10]; + B[10] = MPC_MULTIPLY_V((A[ 8] - A[11]), 
INVCOS08); + B[11] = MPC_MULTIPLY_V((A[ 9] - A[10]), INVCOS24); + B[12] = A[12] + A[15]; + B[13] = A[13] + A[14]; + B[14] = MPC_MULTIPLY_V((A[12] - A[15]), INVCOS08); + B[15] = MPC_MULTIPLY_V((A[13] - A[14]), INVCOS24); + // 8 adds, 8 subs, 8 muls, 8 shift - A00 = MPC_SHL(B00 + B01, MPC_FIXED_POINT_SYNTH_FIX); - A01 = MPC_SCALE_CONST_SHL((B00 - B01) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A02 = MPC_SHL(B02 + B03, MPC_FIXED_POINT_SYNTH_FIX); - A03 = MPC_SCALE_CONST_SHL((B02 - B03) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A04 = MPC_SHL(B04 + B05, MPC_FIXED_POINT_SYNTH_FIX); - A05 = MPC_SCALE_CONST_SHL((B04 - B05) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A06 = MPC_SHL(B06 + B07, MPC_FIXED_POINT_SYNTH_FIX); - A07 = MPC_SCALE_CONST_SHL((B06 - B07) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A08 = MPC_SHL(B08 + B09, MPC_FIXED_POINT_SYNTH_FIX); - A09 = MPC_SCALE_CONST_SHL((B08 - B09) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A10 = MPC_SHL(B10 + B11, MPC_FIXED_POINT_SYNTH_FIX); - A11 = MPC_SCALE_CONST_SHL((B10 - B11) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A12 = MPC_SHL(B12 + B13, MPC_FIXED_POINT_SYNTH_FIX); - A13 = MPC_SCALE_CONST_SHL((B12 - B13) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A14 = MPC_SHL(B14 + B15, MPC_FIXED_POINT_SYNTH_FIX); - A15 = MPC_SCALE_CONST_SHL((B14 - B15) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); + A[ 0] = B[ 0] + B[ 1]; + A[ 1] = MPC_MULTIPLY_V((B[ 0] - B[ 1]), INVCOS16); + A[ 2] = B[ 2] + B[ 3]; + A[ 3] = MPC_MULTIPLY_V((B[ 2] - B[ 3]), INVCOS16); + A[ 4] = B[ 4] + B[ 5]; + A[ 5] = MPC_MULTIPLY_V((B[ 4] - B[ 5]), INVCOS16); + A[ 6] = B[ 6] + B[ 7]; + A[ 7] = MPC_MULTIPLY_V((B[ 6] - B[ 7]), INVCOS16); + A[ 8] = B[ 8] + B[ 9]; + A[ 9] = MPC_MULTIPLY_V((B[ 8] - B[ 9]), INVCOS16); + A[10] = B[10] + B[11]; + A[11] = MPC_MULTIPLY_V((B[10] - B[11]), INVCOS16); + A[12] = B[12] + B[13]; + A[13] = MPC_MULTIPLY_V((B[12] - B[13]), INVCOS16); + A[14] = B[14] + B[15]; + A[15] = 
MPC_MULTIPLY_V((B[14] - B[15]), INVCOS16); + // 8 adds, 8 subs, 8 muls, 8 shift - // mehrfach verwendete Ausdrücke: A04+A06+A07, A09+A13+A15 - V[ 5] = (V[11] = (V[13] = A07 + (V[15] = A15)) + A11) + A05 + A13; - V[ 7] = (V[ 9] = A03 + A11 + A15) + A13; - V[33] = -(V[ 1] = A01 + A09 + A13 + A15) - A14; - V[35] = -(V[ 3] = A05 + A07 + A09 + A13 + A15) - A06 - A14; - V[37] = (tmp = -(A10 + A11 + A13 + A14 + A15)) - A05 - A06 - A07; - V[39] = tmp - A02 - A03; // abhängig vom Befehl drüber - V[41] = (tmp += A13 - A12) - A02 - A03; // abhängig vom Befehl 2 drüber - V[43] = tmp - A04 - A06 - A07; // abhängig von Befehlen 1 und 3 drüber - V[47] = (tmp = -(A08 + A12 + A14 + A15)) - A00; - V[45] = tmp - A04 - A06 - A07; // abhängig vom Befehl drüber + // multiple used expressions: A[ 4]+A[ 6]+A[ 7], A[ 9]+A[13]+A[15] + V[ 5] = (V[11] = (V[13] = A[ 7] + (V[15] = A[15])) + A[11]) + A[ 5] + A[13]; + V[ 7] = (V[ 9] = A[ 3] + A[11] + A[15]) + A[13]; + V[33] = -(V[ 1] = A[ 1] + A[ 9] + A[13] + A[15]) - A[14]; + V[35] = -(V[ 3] = A[ 5] + A[ 7] + A[ 9] + A[13] + A[15]) - A[ 6] - A[14]; + V[37] = (tmp = -(A[10] + A[11] + A[13] + A[14] + A[15])) - A[ 5] - A[ 6] - A[ 7]; + V[39] = tmp - A[ 2] - A[ 3]; + V[41] = (tmp += A[13] - A[12]) - A[ 2] - A[ 3]; + V[43] = tmp - A[ 4] - A[ 6] - A[ 7]; + V[47] = (tmp = -(A[ 8] + A[12] + A[14] + A[15])) - A[ 0]; + V[45] = tmp - A[ 4] - A[ 6] - A[ 7]; + // 22 adds, 18 subs - V[32] = -V[ 0]; - V[31] = -V[ 1]; - V[30] = -V[ 2]; - V[29] = -V[ 3]; - V[28] = -V[ 4]; - V[27] = -V[ 5]; - V[26] = -V[ 6]; - V[25] = -V[ 7]; - V[24] = -V[ 8]; - V[23] = -V[ 9]; - V[22] = -V[10]; - V[21] = -V[11]; - V[20] = -V[12]; - V[19] = -V[13]; - V[18] = -V[14]; - V[17] = -V[15]; + V[32] = -(V[ 0] = MPC_V_PRESHIFT(V[ 0])); + V[31] = -(V[ 1] = MPC_V_PRESHIFT(V[ 1])); + V[30] = -(V[ 2] = MPC_V_PRESHIFT(V[ 2])); + V[29] = -(V[ 3] = MPC_V_PRESHIFT(V[ 3])); + V[28] = -(V[ 4] = MPC_V_PRESHIFT(V[ 4])); + V[27] = -(V[ 5] = MPC_V_PRESHIFT(V[ 5])); + V[26] = -(V[ 6] = MPC_V_PRESHIFT(V[ 
6])); + V[25] = -(V[ 7] = MPC_V_PRESHIFT(V[ 7])); + V[24] = -(V[ 8] = MPC_V_PRESHIFT(V[ 8])); + V[23] = -(V[ 9] = MPC_V_PRESHIFT(V[ 9])); + V[22] = -(V[10] = MPC_V_PRESHIFT(V[10])); + V[21] = -(V[11] = MPC_V_PRESHIFT(V[11])); + V[20] = -(V[12] = MPC_V_PRESHIFT(V[12])); + V[19] = -(V[13] = MPC_V_PRESHIFT(V[13])); + V[18] = -(V[14] = MPC_V_PRESHIFT(V[14])); + V[17] = -(V[15] = MPC_V_PRESHIFT(V[15])); + // 16 adds, 16 shifts (OPTIMIZE_FOR_SPEED only) - V[63] = V[33]; - V[62] = V[34]; - V[61] = V[35]; - V[60] = V[36]; - V[59] = V[37]; - V[58] = V[38]; - V[57] = V[39]; - V[56] = V[40]; - V[55] = V[41]; - V[54] = V[42]; - V[53] = V[43]; - V[52] = V[44]; - V[51] = V[45]; - V[50] = V[46]; - V[49] = V[47]; + V[63] = (V[33] = MPC_V_PRESHIFT(V[33])); + V[62] = (V[34] = MPC_V_PRESHIFT(V[34])); + V[61] = (V[35] = MPC_V_PRESHIFT(V[35])); + V[60] = (V[36] = MPC_V_PRESHIFT(V[36])); + V[59] = (V[37] = MPC_V_PRESHIFT(V[37])); + V[58] = (V[38] = MPC_V_PRESHIFT(V[38])); + V[57] = (V[39] = MPC_V_PRESHIFT(V[39])); + V[56] = (V[40] = MPC_V_PRESHIFT(V[40])); + V[55] = (V[41] = MPC_V_PRESHIFT(V[41])); + V[54] = (V[42] = MPC_V_PRESHIFT(V[42])); + V[53] = (V[43] = MPC_V_PRESHIFT(V[43])); + V[52] = (V[44] = MPC_V_PRESHIFT(V[44])); + V[51] = (V[45] = MPC_V_PRESHIFT(V[45])); + V[50] = (V[46] = MPC_V_PRESHIFT(V[46])); + V[49] = (V[47] = MPC_V_PRESHIFT(V[47])); + V[48] = MPC_V_PRESHIFT(V[48]); /* V[48] has no mirror element, so it is pre-shifted in place with one assignment */ + // 16 adds, 16 shifts (OPTIMIZE_FOR_SPEED only) + + // OPTIMIZE_FOR_SPEED total: 143 adds, 107 subs, 80 muls, 112 shifts + // total: 111 adds, 107 subs, 80 muls, 80 shifts } -static void Synthese_Filter_float_internal(MPC_SAMPLE_FORMAT * OutData,MPC_SAMPLE_FORMAT * V,const MPC_SAMPLE_FORMAT * Y) +static inline void +mpc_decoder_windowing_D(MPC_OUTPUT_FORMAT * Data, const MPC_SAMPLE_FORMAT * V) { + const MPC_SAMPLE_FORMAT *D = (const MPC_SAMPLE_FORMAT *) &Di_opt; + mpc_int32_t k; + + #if defined(OPTIMIZE_FOR_SPEED) + #if defined(CPU_ARM) + // 32=32x32-multiply assembler for ARM + for ( k = 0; k < 
32; k++, V++ ) + { + asm volatile ( + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V]] \n\t" + "mul r5, r0, r4 \n\t" + "ldr r4, [%[V], #96*4] \n\t" + "mla r5, r1, r4, r5 \n\t" + "ldr r4, [%[V], #128*4] \n\t" + "mla r5, r2, r4, r5 \n\t" + "ldr r4, [%[V], #224*4] \n\t" + "mla r5, r3, r4, r5 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #256*4] \n\t" + "mla r5, r0, r4, r5 \n\t" + "ldr r4, [%[V], #352*4] \n\t" + "mla r5, r1, r4, r5 \n\t" + "ldr r4, [%[V], #384*4] \n\t" + "mla r5, r2, r4, r5 \n\t" + "ldr r4, [%[V], #480*4] \n\t" + "mla r5, r3, r4, r5 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #512*4] \n\t" + "mla r5, r0, r4, r5 \n\t" + "ldr r4, [%[V], #608*4] \n\t" + "mla r5, r1, r4, r5 \n\t" + "ldr r4, [%[V], #640*4] \n\t" + "mla r5, r2, r4, r5 \n\t" + "ldr r4, [%[V], #736*4] \n\t" + "mla r5, r3, r4, r5 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #768*4] \n\t" + "mla r5, r0, r4, r5 \n\t" + "ldr r4, [%[V], #864*4] \n\t" + "mla r5, r1, r4, r5 \n\t" + "ldr r4, [%[V], #896*4] \n\t" + "mla r5, r2, r4, r5 \n\t" + "ldr r4, [%[V], #992*4] \n\t" + "mla r5, r3, r4, r5 \n\t" + "str r5, [%[Data]], #4 \n" + : [Data] "+r" (Data), [D] "+r" (D) + : [V] "r" (V) + : "r0", "r1", "r2", "r3", "r4", "r5", "memory"); /* "memory": the asm loads through D and V and stores through Data */ + } + #else + // 32=32x32-multiply (FIXED_POINT) + for ( k = 0; k < 32; k++, D += 16, V++ ) + { + *Data = V[ 0]*D[ 0] + V[ 96]*D[ 1] + V[128]*D[ 2] + V[224]*D[ 3] + + V[256]*D[ 4] + V[352]*D[ 5] + V[384]*D[ 6] + V[480]*D[ 7] + + V[512]*D[ 8] + V[608]*D[ 9] + V[640]*D[10] + V[736]*D[11] + + V[768]*D[12] + V[864]*D[13] + V[896]*D[14] + V[992]*D[15]; + Data += 1; + // total: 16 muls, 15 adds + } + #endif + #else + #if defined(CPU_COLDFIRE) + // 64=32x32-multiply assembler for Coldfire + for ( k = 0; k < 32; k++, D += 16, V++ ) + { + asm volatile ( + "movem.l (%[D]), %%d0-%%d3 \n\t" + "move.l (%[V]), %%a5 \n\t" + "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t" + "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (224*4, %[V]), 
%%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t" + "movem.l (4*4, %[D]), %%d0-%%d3 \n\t" + "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t" + "movem.l (8*4, %[D]), %%d0-%%d3 \n\t" + "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t" + "movem.l (12*4, %[D]), %%d0-%%d3 \n\t" + "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, %%acc0 \n\t" + "movclr.l %%acc0, %%d0 \n\t" + "move.l %%d0, (%[Data])+ \n" + : [Data] "+a" (Data) + : [V] "a" (V), [D] "a" (D) + : "d0", "d1", "d2", "d3", "a5", "memory"); /* "memory": the asm loads through D and V and stores through Data */ + } + #elif defined(CPU_ARM) + // 64=32x32-multiply assembler for ARM + for ( k = 0; k < 32; k++, V++ ) + { + asm volatile ( + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V]] \n\t" + "smull r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #96*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #128*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #224*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #256*4] \n\t" + "smlal r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #352*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #384*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #480*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #512*4] \n\t" + "smlal r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #608*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #640*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #736*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #768*4] \n\t" + 
"smlal r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #864*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #896*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #992*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + "mov r4, r6, lsl #1 \n\t" + "orr r4, r4, r5, lsr #31\n\t" + "str r4, [%[Data]], #4 \n" + : [Data] "+r" (Data), [D] "+r" (D) + : [V] "r" (V) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "memory"); /* "memory": the asm loads through D and V and stores through Data */ + } + #else + // 64=64x64-multiply (FIXED_POINT) or float=float*float (!FIXED_POINT) in C + for ( k = 0; k < 32; k++, D += 16, V++ ) + { + *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],31) + MPC_MULTIPLY_EX(V[ 96],D[ 1],31) + MPC_MULTIPLY_EX(V[128],D[ 2],31) + MPC_MULTIPLY_EX(V[224],D[ 3],31) + + MPC_MULTIPLY_EX(V[256],D[ 4],31) + MPC_MULTIPLY_EX(V[352],D[ 5],31) + MPC_MULTIPLY_EX(V[384],D[ 6],31) + MPC_MULTIPLY_EX(V[480],D[ 7],31) + + MPC_MULTIPLY_EX(V[512],D[ 8],31) + MPC_MULTIPLY_EX(V[608],D[ 9],31) + MPC_MULTIPLY_EX(V[640],D[10],31) + MPC_MULTIPLY_EX(V[736],D[11],31) + + MPC_MULTIPLY_EX(V[768],D[12],31) + MPC_MULTIPLY_EX(V[864],D[13],31) + MPC_MULTIPLY_EX(V[896],D[14],31) + MPC_MULTIPLY_EX(V[992],D[15],31); + Data += 1; + // total: 16 muls, 15 adds, 16 shifts + } + #endif + #endif +} + +static void +mpc_full_synthesis_filter(MPC_OUTPUT_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y) +{ mpc_uint32_t n; - for ( n = 0; n < 36; n++, Y += 32 ) { - V -= 64; - Calculate_New_V ( Y, V ); - if (OutData != NULL) + + if (NULL != OutData) + { + for ( n = 0; n < 36; n++, Y += 32, OutData += 32 ) { - MPC_SAMPLE_FORMAT * Data = OutData; - const MPC_SAMPLE_FORMAT * D = (const MPC_SAMPLE_FORMAT *) &Di_opt; - mpc_int32_t k; - //mpc_int32_t tmp; - - - - #if defined(CPU_COLDFIRE) - for ( k = 0; k < 32; k++, D += 16, V++ ) { - asm volatile ( - "movem.l (%[D]), %%d0-%%d3 \n\t" - "move.l (%[V]), %%a5 \n\t" - "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t" - "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, 
%%a5, (256*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (4*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (8*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (12*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, %%acc0 \n\t" - "movclr.l %%acc0, %%d0 \n\t" - "move.l %%d0, (%[Data])+ \n" - : [Data] "+a" (Data) - : [V] "a" (V), [D] "a" (D) - : "d0", "d1", "d2", "d3", "a5"); - #elif defined(CPU_ARM) - for ( k = 0; k < 32; k++, V++ ) { - asm volatile ( - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V]] \n\t" - "smull r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #96*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #128*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #224*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V], #256*4] \n\t" - "smlal r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #352*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #384*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #480*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V], #512*4] \n\t" - "smlal r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #608*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #640*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #736*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V], #768*4] \n\t" - "smlal r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #864*4] \n\t" - "smlal r5, r6, r1, 
r4 \n\t" - "ldr r4, [%[V], #896*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #992*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - "mov r4, r6, lsl #1 \n\t" - "orr r4, r4, r5, lsr #31\n\t" - "str r4, [%[Data]], #4 \n" - : [Data] "+r" (Data), [D] "+r" (D) - : [V] "r" (V) - : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); - #else - for ( k = 0; k < 32; k++, D += 16, V++ ) { - *Data = MPC_SHL( - MPC_MULTIPLY_FRACT(V[ 0],D[ 0]) + MPC_MULTIPLY_FRACT(V[ 96],D[ 1]) + MPC_MULTIPLY_FRACT(V[128],D[ 2]) + MPC_MULTIPLY_FRACT(V[224],D[ 3]) - + MPC_MULTIPLY_FRACT(V[256],D[ 4]) + MPC_MULTIPLY_FRACT(V[352],D[ 5]) + MPC_MULTIPLY_FRACT(V[384],D[ 6]) + MPC_MULTIPLY_FRACT(V[480],D[ 7]) - + MPC_MULTIPLY_FRACT(V[512],D[ 8]) + MPC_MULTIPLY_FRACT(V[608],D[ 9]) + MPC_MULTIPLY_FRACT(V[640],D[10]) + MPC_MULTIPLY_FRACT(V[736],D[11]) - + MPC_MULTIPLY_FRACT(V[768],D[12]) + MPC_MULTIPLY_FRACT(V[864],D[13]) + MPC_MULTIPLY_FRACT(V[896],D[14]) + MPC_MULTIPLY_FRACT(V[992],D[15]) - , 1); - - Data += 1; - #endif - } - V -= 32;//bleh - OutData+=32; + V -= 64; + mpc_calculate_new_V ( Y, V ); + mpc_decoder_windowing_D( OutData, V); } - } + } } void -mpc_decoder_synthese_filter_float(mpc_decoder *d, MPC_SAMPLE_FORMAT* OutData) +mpc_decoder_synthese_filter_float(mpc_decoder *d, MPC_OUTPUT_FORMAT *OutData) { /********* left channel ********/ memmove(d->V_L + MPC_V_MEM, d->V_L, 960 * sizeof(MPC_SAMPLE_FORMAT) ); - Synthese_Filter_float_internal( + mpc_full_synthesis_filter( OutData, (MPC_SAMPLE_FORMAT *)(d->V_L + MPC_V_MEM), (MPC_SAMPLE_FORMAT *)(d->Y_L [0])); @@ -452,7 +634,7 @@ /******** right channel ********/ memmove(d->V_R + MPC_V_MEM, d->V_R, 960 * sizeof(MPC_SAMPLE_FORMAT) ); - Synthese_Filter_float_internal( + mpc_full_synthesis_filter( (OutData == NULL ? 
NULL : OutData + MPC_FRAME_LENGTH), (MPC_SAMPLE_FORMAT *)(d->V_R + MPC_V_MEM), (MPC_SAMPLE_FORMAT *)(d->Y_R [0])); Index: apps/codecs/libmusepack/musepack.h =================================================================== --- apps/codecs/libmusepack/musepack.h (revision 14284) +++ apps/codecs/libmusepack/musepack.h (working copy) @@ -120,7 +120,7 @@ /// \return > 0 to indicate the number of bytes that were actually read from the stream. mpc_uint32_t mpc_decoder_decode( mpc_decoder *d, - MPC_SAMPLE_FORMAT *buffer, + MPC_OUTPUT_FORMAT *buffer, mpc_uint32_t *vbr_update_acc, mpc_uint32_t *vbr_update_bits); @@ -128,7 +128,7 @@ mpc_decoder *d, mpc_uint32_t *in_buffer, mpc_uint32_t in_len, - MPC_SAMPLE_FORMAT *out_buffer); + MPC_OUTPUT_FORMAT *out_buffer); /// Seeks to the specified sample in the source stream. mpc_bool_t mpc_decoder_seek_sample(mpc_decoder *d, mpc_int64_t destsample); Index: apps/codecs/libmusepack/requant.c =================================================================== --- apps/codecs/libmusepack/requant.c (revision 14284) +++ apps/codecs/libmusepack/requant.c (working copy) @@ -53,8 +53,8 @@ const MPC_SAMPLE_FORMAT __Cc [1 + 18] = { _(111.285962475327f), // 32768/2/255*sqrt(3) _(65536.000000000000f), _(21845.333333333332f), _(13107.200000000001f), _(9362.285714285713f), - _(7281.777777777777f), _(4369.066666666666f), _(2114.064516129032f), _(1040.253968253968f), - _(516.031496062992f), _(257.003921568627f), _(128.250489236790f), _(64.062561094819f), + _(7281.777777777777f), _(4369.066666666666f), _(2114.064516129032f), _(1040.253968253968f), + _(516.031496062992f), _(257.003921568627f), _(128.250489236790f), _(64.062561094819f), _(32.015632633121f), _(16.003907203907f), _(8.000976681723f), _(4.000244155527f), _(2.000061037018f), _(1.000015259021f) }; Index: apps/codecs/libmusepack/internal.h =================================================================== --- apps/codecs/libmusepack/internal.h (revision 14284) +++ 
apps/codecs/libmusepack/internal.h (working copy) @@ -67,7 +67,7 @@ /// helper functions used by multiple files mpc_uint32_t mpc_random_int(mpc_decoder *d); // in synth_filter.c void mpc_decoder_initialisiere_quantisierungstabellen(mpc_decoder *d, double scale_factor); -void mpc_decoder_synthese_filter_float(mpc_decoder *d, MPC_SAMPLE_FORMAT* OutData); +void mpc_decoder_synthese_filter_float(mpc_decoder *d, MPC_OUTPUT_FORMAT* OutData); #endif // _mpcdec_internal_h Index: apps/codecs/libmusepack/Makefile =================================================================== --- apps/codecs/libmusepack/Makefile (revision 14284) +++ apps/codecs/libmusepack/Makefile (working copy) @@ -14,7 +14,13 @@ INCLUDES += $(patsubst %,-I$(APPSDIR)/%,$(subst :, ,$(APPEXTRA))) endif -MUSEPACKOPTS = -O2 +# libmusepack is faster on ARM-targets with -O1 instead of -O2 +ifeq ($(CPU),arm) + MUSEPACKOPTS += -O1 +else + MUSEPACKOPTS += -O2 +endif + CFLAGS = $(INCLUDES) $(GCCOPTS) $(TARGET_INC) $(MUSEPACKOPTS) $(TARGET) \ $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE} $(PROFILE_OPTS) Index: apps/codecs/mpc.c =================================================================== --- apps/codecs/mpc.c (revision 14284) +++ apps/codecs/mpc.c (working copy) @@ -63,8 +63,8 @@ return true; } -MPC_SAMPLE_FORMAT sample_buffer[MPC_DECODER_BUFFER_LENGTH] -IBSS_ATTR_MPC_SAMPLE_BUF; +//-- MPC_SAMPLE_FORMAT sample_buffer[MPC_DECODER_BUFFER_LENGTH] IBSS_ATTR_MPC_SAMPLE_BUF; +MPC_OUTPUT_FORMAT sample_buffer[MPC_DECODER_BUFFER_LENGTH] IBSS_ATTR_MPC_SAMPLE_BUF; mpc_uint32_t seek_table[10000]; /* this is the codec entry point */