Index: apps/codecs/libmusepack/math.h =================================================================== --- apps/codecs/libmusepack/math.h (revision 13591) +++ apps/codecs/libmusepack/math.h (working copy) @@ -44,176 +44,190 @@ #ifdef MPC_FIXED_POINT + #ifdef _WIN32_WCE + #include + #define MPC_HAVE_MULHIGH + #endif -#ifdef _WIN32_WCE + #define MPC_FIXED_POINT_SCALE_SHIFT (MPC_FIXED_POINT_SHIFT + MPC_FIXED_POINT_FRACTPART) + #define MPC_FIXED_POINT_SCALE (1 << (MPC_FIXED_POINT_SCALE_SHIFT - 1)) + //in fixedpoint mode, results in decode output buffer are in -MPC_FIXED_POINT_SCALE ... MPC_FIXED_POINT_SCALE range + + #define MPC_FIXED_POINT_FRACTPART 14 + typedef mpc_int32_t MPC_SAMPLE_FORMAT; + typedef mpc_int64_t MPC_SAMPLE_FORMAT_MULTIPLY; + + #define MAKE_MPC_SAMPLE(X) (MPC_SAMPLE_FORMAT)((double)(X) * (double)(((mpc_int64_t)1)< + #if defined(CPU_COLDFIRE) -#define MPC_HAVE_MULHIGH + #define MPC_MULTIPLY(X,Y) mpc_multiply((X), (Y)) + #define MPC_MULTIPLY_EX(X,Y,Z) mpc_multiply_ex((X), (Y), (Z)) + + static inline MPC_SAMPLE_FORMAT mpc_multiply(MPC_SAMPLE_FORMAT x, + MPC_SAMPLE_FORMAT y) + { + MPC_SAMPLE_FORMAT t1, t2; + asm volatile ( + "mac.l %[x],%[y],%%acc0\n" /* multiply */ + "mulu.l %[y],%[x] \n" /* get lower half, avoid emac stall */ + "movclr.l %%acc0,%[t1] \n" /* get higher half */ + "moveq.l #17,%[t2] \n" + "asl.l %[t2],%[t1] \n" /* hi <<= 17, plus one free */ + "moveq.l #14,%[t2] \n" + "lsr.l %[t2],%[x] \n" /* (unsigned)lo >>= 14 */ + "or.l %[x],%[t1] \n" /* combine result */ + : /* outputs */ + [t1]"=&d"(t1), + [t2]"=&d"(t2), + [x] "+d" (x) + : /* inputs */ + [y] "d" (y) + ); + return t1; + } + + static inline MPC_SAMPLE_FORMAT mpc_multiply_ex(MPC_SAMPLE_FORMAT x, + MPC_SAMPLE_FORMAT y, + unsigned shift) + { + MPC_SAMPLE_FORMAT t1, t2; + asm volatile ( + "mac.l %[x],%[y],%%acc0\n" /* multiply */ + "mulu.l %[y],%[x] \n" /* get lower half, avoid emac stall */ + "movclr.l %%acc0,%[t1] \n" /* get higher half */ + "moveq.l #31,%[t2] \n" + "sub.l %[sh],%[t2] 
\n" /* t2 = 31 - shift */ + "ble.s 1f \n" + "asl.l %[t2],%[t1] \n" /* hi <<= 31 - shift */ + "lsr.l %[sh],%[x] \n" /* (unsigned)lo >>= shift */ + "or.l %[x],%[t1] \n" /* combine result */ + "bra.s 2f \n" + "1: \n" + "neg.l %[t2] \n" /* t2 = shift - 31 */ + "asr.l %[t2],%[t1] \n" /* hi >>= t2 */ + "2: \n" + : /* outputs */ + [t1]"=&d"(t1), + [t2]"=&d"(t2), + [x] "+d" (x) + : /* inputs */ + [y] "d" (y), + [sh]"d" (shift) + ); + return t1; + } + #elif defined(CPU_ARM) + // borrowed and adapted from libMAD + #define MPC_MULTIPLY(X,Y) \ + ({ \ + MPC_SAMPLE_FORMAT low; \ + MPC_SAMPLE_FORMAT high; \ + asm volatile ( \ + "smull %0,%1,%2,%3 \n\t" \ + "mov %0, %0, lsr #14 \n\t" \ + "orr %0, %0, %1, lsl #18 \n\t" \ + : "=&r"(low), "=&r" (high) \ + : "r"(X),"r"(Y)); \ + low; \ + }) + + // borrowed and adapted from libMAD + #define MPC_MULTIPLY_EX(X,Y,Z) \ + ({ \ + MPC_SAMPLE_FORMAT low; \ + MPC_SAMPLE_FORMAT high; \ + asm volatile ( \ + "smull %0,%1,%2,%3 \n\t" \ + "mov %0, %0, lsr %4 \n\t" \ + "orr %0, %0, %1, lsl %5 \n\t" \ + : "=&r"(low), "=&r" (high) \ + : "r"(X),"r"(Y),"r"(Z),"r"(32-Z)); \ + low; \ + }) + #else /* libmusepack standard */ -#endif + #define MPC_MULTIPLY_NOTRUNCATE(X,Y) \ + (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> MPC_FIXED_POINT_FRACTPART) + + #define MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z) \ + (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> (Z)) + #ifdef _DEBUG + static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2) + { + MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_NOTRUNCATE(item1,item2); + assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); + return (MPC_SAMPLE_FORMAT)temp; + } + + static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY_EX(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2,unsigned shift) + { + MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_EX_NOTRUNCATE(item1,item2,shift); + assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); 
+ return (MPC_SAMPLE_FORMAT)temp; + } + #else + #define MPC_MULTIPLY(X,Y) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_NOTRUNCATE(X,Y)) + #define MPC_MULTIPLY_EX(X,Y,Z) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z)) + #endif -#define MPC_FIXED_POINT_SCALE_SHIFT (MPC_FIXED_POINT_SHIFT + MPC_FIXED_POINT_FRACTPART) -#define MPC_FIXED_POINT_SCALE (1 << (MPC_FIXED_POINT_SCALE_SHIFT - 1)) + #endif + #ifdef MPC_HAVE_MULHIGH + #define MPC_MULTIPLY_FRACT(X,Y) _MulHigh(X,Y) + #else + #if defined(CPU_COLDFIRE) + /* loses one bit of accuracy. The rest of the macros won't be as easy as this... */ + #define MPC_MULTIPLY_FRACT(X,Y) \ + ({ \ + MPC_SAMPLE_FORMAT t; \ + asm volatile ( \ + "mac.l %[A], %[B], %%acc0\n\t" \ + "movclr.l %%acc0, %[t]\n\t" \ + "asr.l #1, %[t]\n\t" \ + : [t] "=d" (t) \ + : [A] "r" ((X)), [B] "r" ((Y))); \ + t; \ + }) + #elif defined(CPU_ARM) + // borrowed and adapted from libMAD + #define MPC_MULTIPLY_FRACT(X,Y) \ + ({ \ + MPC_SAMPLE_FORMAT low; \ + MPC_SAMPLE_FORMAT high; \ + asm volatile ( \ + "smull %0,%1,%2,%3 \n\t" \ + : "=&r"(low), "=&r" (high) \ + : "r"(X),"r"(Y)); \ + high; \ + }) + #else + #define MPC_MULTIPLY_FRACT(X,Y) MPC_MULTIPLY_EX(X,Y,32) + #endif + #endif -//in fixedpoint mode, results in decode output buffer are in -MPC_FIXED_POINT_SCALE ... 
MPC_FIXED_POINT_SCALE range + #define MPC_MAKE_FRACT_CONST(X) (MPC_SAMPLE_FORMAT)((X) * (double)(((mpc_int64_t)1)<<32) ) + + #define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) -#define MPC_FIXED_POINT_FRACTPART 14 -typedef mpc_int32_t MPC_SAMPLE_FORMAT; - -typedef mpc_int64_t MPC_SAMPLE_FORMAT_MULTIPLY; - -#define MAKE_MPC_SAMPLE(X) (MPC_SAMPLE_FORMAT)((double)(X) * (double)(((mpc_int64_t)1)<>= 14 */ - "or.l %[x],%[t1] \n" /* combine result */ - : /* outputs */ - [t1]"=&d"(t1), - [t2]"=&d"(t2), - [x] "+d" (x) - : /* inputs */ - [y] "d" (y) - ); - return t1; -} - -static inline MPC_SAMPLE_FORMAT mpc_multiply_ex(MPC_SAMPLE_FORMAT x, - MPC_SAMPLE_FORMAT y, - unsigned shift) -{ - MPC_SAMPLE_FORMAT t1, t2; - asm volatile ( - "mac.l %[x],%[y],%%acc0\n" /* multiply */ - "mulu.l %[y],%[x] \n" /* get lower half, avoid emac stall */ - "movclr.l %%acc0,%[t1] \n" /* get higher half */ - "moveq.l #31,%[t2] \n" - "sub.l %[sh],%[t2] \n" /* t2 = 31 - shift */ - "ble.s 1f \n" - "asl.l %[t2],%[t1] \n" /* hi <<= 31 - shift */ - "lsr.l %[sh],%[x] \n" /* (unsigned)lo >>= shift */ - "or.l %[x],%[t1] \n" /* combine result */ - "bra.s 2f \n" - "1: \n" - "neg.l %[t2] \n" /* t2 = shift - 31 */ - "asr.l %[t2],%[t1] \n" /* hi >>= t2 */ - "2: \n" - : /* outputs */ - [t1]"=&d"(t1), - [t2]"=&d"(t2), - [x] "+d" (x) - : /* inputs */ - [y] "d" (y), - [sh]"d" (shift) - ); - return t1; -} -#else /* libmusepack standard */ - -#define MPC_MULTIPLY_NOTRUNCATE(X,Y) \ - (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> MPC_FIXED_POINT_FRACTPART) - -#define MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z) \ - (((MPC_SAMPLE_FORMAT_MULTIPLY)(X) * (MPC_SAMPLE_FORMAT_MULTIPLY)(Y)) >> (Z)) - -#ifdef _DEBUG -static inline MPC_SAMPLE_FORMAT MPC_MULTIPLY(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2) -{ - MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_NOTRUNCATE(item1,item2); - assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); - return (MPC_SAMPLE_FORMAT)temp; -} - -static inline 
MPC_SAMPLE_FORMAT MPC_MULTIPLY_EX(MPC_SAMPLE_FORMAT item1,MPC_SAMPLE_FORMAT item2,unsigned shift) -{ - MPC_SAMPLE_FORMAT_MULTIPLY temp = MPC_MULTIPLY_EX_NOTRUNCATE(item1,item2,shift); - assert(temp == (MPC_SAMPLE_FORMAT_MULTIPLY)(MPC_SAMPLE_FORMAT)temp); - return (MPC_SAMPLE_FORMAT)temp; -} #else -#define MPC_MULTIPLY(X,Y) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_NOTRUNCATE(X,Y)) -#define MPC_MULTIPLY_EX(X,Y,Z) ((MPC_SAMPLE_FORMAT)MPC_MULTIPLY_EX_NOTRUNCATE(X,Y,Z)) -#endif + //in floating-point mode, decoded samples are in -1...1 range + + typedef float MPC_SAMPLE_FORMAT; + + #define MAKE_MPC_SAMPLE(X) ((MPC_SAMPLE_FORMAT)(X)) + #define MAKE_MPC_SAMPLE_EX(X,Y) ((MPC_SAMPLE_FORMAT)(X)) + + #define MPC_MULTIPLY_FRACT(X,Y) ((X)*(Y)) + #define MPC_MAKE_FRACT_CONST(X) (X) + + #define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) + #define MPC_MULTIPLY(X,Y) ((X)*(Y)) + #define MPC_MULTIPLY_EX(X,Y,Z) ((X)*(Y)) #endif -#ifdef MPC_HAVE_MULHIGH -#define MPC_MULTIPLY_FRACT(X,Y) _MulHigh(X,Y) -#else -#if defined(CPU_COLDFIRE) -/* loses one bit of accuracy. - the rest of the macros won't be as easy as this... 
*/ -#define MPC_MULTIPLY_FRACT(X,Y) \ - ({ \ - MPC_SAMPLE_FORMAT t; \ - asm volatile ( \ - "mac.l %[A], %[B], %%acc0\n\t" \ - "movclr.l %%acc0, %[t]\n\t" \ - "asr.l #1, %[t]\n\t" \ - : [t] "=d" (t) \ - : [A] "r" ((X)), [B] "r" ((Y))); \ - t; \ - }) -#else -#define MPC_MULTIPLY_FRACT(X,Y) MPC_MULTIPLY_EX(X,Y,32) -#endif -#endif - -#define MPC_MAKE_FRACT_CONST(X) (MPC_SAMPLE_FORMAT)((X) * (double)(((mpc_int64_t)1)<<32) ) -#define MPC_MULTIPLY_FRACT_CONST(X,Y) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST(Y)) -#define MPC_MULTIPLY_FRACT_CONST_FIX(X,Y,Z) ( MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y / (1<<(Z)) )) << (Z) ) -#define MPC_MULTIPLY_FRACT_CONST_SHR(X,Y,Z) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y / (1<<(Z)) )) - -#define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) -#define MPC_SCALE_CONST(X,Y,Z) MPC_MULTIPLY_EX(X,MAKE_MPC_SAMPLE_EX(Y,Z),(Z)) -#define MPC_SCALE_CONST_SHL(X,Y,Z,S) MPC_MULTIPLY_EX(X,MAKE_MPC_SAMPLE_EX(Y,Z),(Z)-(S)) -#define MPC_SCALE_CONST_SHR(X,Y,Z,S) MPC_MULTIPLY_EX(X,MAKE_MPC_SAMPLE_EX(Y,Z),(Z)+(S)) -#define MPC_SHR(X,Y) ((X)>>(Y)) -#define MPC_SHL(X,Y) ((X)<<(Y)) - -#else - -//in floating-point mode, decoded samples are in -1...1 range - -typedef float MPC_SAMPLE_FORMAT; - -#define MAKE_MPC_SAMPLE(X) ((MPC_SAMPLE_FORMAT)(X)) -#define MAKE_MPC_SAMPLE_EX(X,Y) ((MPC_SAMPLE_FORMAT)(X)) - -#define MPC_MULTIPLY_FRACT(X,Y) ((X)*(Y)) -#define MPC_MAKE_FRACT_CONST(X) (X) -#define MPC_MULTIPLY_FRACT_CONST(X,Y) MPC_MULTPLY_FRACT(X,MPC_MAKE_FRACT_CONST(Y)) -#define MPC_MULTIPLY_FRACT_CONST_SHR(X,Y,Z) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y )) -#define MPC_MULTIPLY_FRACT_CONST_FIX(X,Y,Z) MPC_MULTIPLY_FRACT(X,MPC_MAKE_FRACT_CONST( Y )) - -#define MPC_MULTIPLY_FLOAT_INT(X,Y) ((X)*(Y)) -#define MPC_MULTIPLY(X,Y) ((X)*(Y)) -#define MPC_MULTIPLY_EX(X,Y,Z) ((X)*(Y)) -#define MPC_SCALE_CONST(X,Y,Z) ((X)*(Y)) -#define MPC_SCALE_CONST_SHL(X,Y,Z,S) ((X)*(Y)) -#define MPC_SCALE_CONST_SHR(X,Y,Z,S) ((X)*(Y)) -#define MPC_SHR(X,Y) (X) -#define MPC_SHL(X,Y) (X) - 
-#endif - #endif // _mpcdec_math_h_ Index: apps/codecs/libmusepack/synth_filter.c =================================================================== --- apps/codecs/libmusepack/synth_filter.c (revision 13591) +++ apps/codecs/libmusepack/synth_filter.c (working copy) @@ -42,16 +42,12 @@ /* C O N S T A N T S */ #undef _ -#define MPC_FIXED_POINT_SYNTH_FIX 2 +// saturate to +/- 2^31 (= value << (31-17)), D-values are +/- 2^17 +#define _(value) (value << (14)) -#ifdef MPC_FIXED_POINT -#define _(value) MPC_MAKE_FRACT_CONST((double)value/(double)(0x40000)) -#else -#define _(value) MAKE_MPC_SAMPLE((double)value/(double)(0x10000)) -#endif - - +// Di_opt coefficients were scaled by <<17 static const MPC_SAMPLE_FORMAT Di_opt [32] [16] ICONST_ATTR = { + /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 */ { _( 0), _( -29), _( 213), _( -459), _( 2037), _(-5153), _( 6574), _(-37489), _(75038), _(37489), _(6574), _( 5153), _(2037), _( 459), _(213), _(29) }, { _( -1), _( -31), _( 218), _( -519), _( 2000), _(-5517), _( 5959), _(-39336), _(74992), _(35640), _(7134), _( 4788), _(2063), _( 401), _(208), _(26) }, { _( -1), _( -35), _( 222), _( -581), _( 1952), _(-5879), _( 5288), _(-41176), _(74856), _(33791), _(7640), _( 4425), _(2080), _( 347), _(202), _(24) }, @@ -88,13 +84,56 @@ #undef _ -static void Calculate_New_V ( const MPC_SAMPLE_FORMAT * Sample, MPC_SAMPLE_FORMAT * V ) +// V-coefficients were expanded (<<) by V_COEFFICIENT_EXPAND +#define V_COEFFICIENT_EXPAND 27 + +// define 64=32x32-multiplication for DCT-coefficients with samples; by going through MPC_MULTIPLY_FRACT, highly optimized assembler may be used +// MPC_MULTIPLY_FRACT does >>32 after the multiplication; as the V-coefficients were expanded by V_COEFFICIENT_EXPAND, we correct for this on the result +// this loses 5 bits of accuracy in the fractional part of the result, without effect on the final audio result +#define MPC_MULTIPLY_V(sample, vcoef) ( (MPC_MULTIPLY_FRACT(sample, vcoef)) << (32-V_COEFFICIENT_EXPAND) ) + +// define constants for DCT-synthesis +// INVCOSxx = (0.5 /
cos(xx*PI/64)) << 27, <<27 to saturate to +/- 2^31 +#define INVCOS01 ( 67189797) +#define INVCOS02 ( 67433575) +#define INVCOS03 ( 67843164) +#define INVCOS04 ( 68423604) +#define INVCOS05 ( 69182167) +#define INVCOS06 ( 70128577) +#define INVCOS07 ( 71275330) +#define INVCOS08 ( 72638111) +#define INVCOS09 ( 74236348) +#define INVCOS10 ( 76093940) +#define INVCOS11 ( 78240207) +#define INVCOS12 ( 80711144) +#define INVCOS13 ( 83551089) +#define INVCOS14 ( 86814950) +#define INVCOS15 ( 90571242) +#define INVCOS16 ( 94906266) +#define INVCOS17 ( 99929967) +#define INVCOS18 ( 105784321) +#define INVCOS19 ( 112655602) +#define INVCOS20 ( 120792764) +#define INVCOS21 ( 130535899) +#define INVCOS22 ( 142361749) +#define INVCOS23 ( 156959571) +#define INVCOS24 ( 175363913) +#define INVCOS25 ( 199201203) +#define INVCOS26 ( 231182936) +#define INVCOS27 ( 276190692) +#define INVCOS28 ( 343988688) +#define INVCOS29 ( 457361460) +#define INVCOS30 ( 684664578) +#define INVCOS31 (1367679739) + +static inline void +mpc_calculate_new_V ( const MPC_SAMPLE_FORMAT * Sample, MPC_SAMPLE_FORMAT * V ) { // Calculating new V-buffer values for left channel // calculate new V-values (ISO-11172-3, p. 
39) // based upon fast-MDCT algorithm by Byeong Gi Lee - /*static*/ MPC_SAMPLE_FORMAT A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A10, A11, A12, A13, A14, A15; - /*static*/ MPC_SAMPLE_FORMAT B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15; + MPC_SAMPLE_FORMAT A00, A01, A02, A03, A04, A05, A06, A07, A08, A09, A10, A11, A12, A13, A14, A15; + MPC_SAMPLE_FORMAT B00, B01, B02, B03, B04, B05, B06, B07, B08, B09, B10, B11, B12, B13, B14, B15; MPC_SAMPLE_FORMAT tmp; A00 = Sample[ 0] + Sample[31]; @@ -113,6 +152,7 @@ A13 = Sample[13] + Sample[18]; A14 = Sample[14] + Sample[17]; A15 = Sample[15] + Sample[16]; + // 16 adds B00 = A00 + A15; B01 = A01 + A14; @@ -122,66 +162,71 @@ B05 = A05 + A10; B06 = A06 + A09; B07 = A07 + A08;; - B08 = MPC_SCALE_CONST((A00 - A15) , 0.5024192929f , 31); - B09 = MPC_SCALE_CONST((A01 - A14) , 0.5224986076f , 31); - B10 = MPC_SCALE_CONST((A02 - A13) , 0.5669440627f , 31); - B11 = MPC_SCALE_CONST((A03 - A12) , 0.6468217969f , 31); - B12 = MPC_SCALE_CONST((A04 - A11) , 0.7881546021f , 31); - B13 = MPC_SCALE_CONST((A05 - A10) , 1.0606776476f , 30); - B14 = MPC_SCALE_CONST((A06 - A09) , 1.7224471569f , 30); - B15 = MPC_SCALE_CONST((A07 - A08) , 5.1011486053f , 28); + B08 = MPC_MULTIPLY_V((A00 - A15), INVCOS02); + B09 = MPC_MULTIPLY_V((A01 - A14), INVCOS06); + B10 = MPC_MULTIPLY_V((A02 - A13), INVCOS10); + B11 = MPC_MULTIPLY_V((A03 - A12), INVCOS14); + B12 = MPC_MULTIPLY_V((A04 - A11), INVCOS18); + B13 = MPC_MULTIPLY_V((A05 - A10), INVCOS22); + B14 = MPC_MULTIPLY_V((A06 - A09), INVCOS26); + B15 = MPC_MULTIPLY_V((A07 - A08), INVCOS30); + // 8 adds, 8 subs, 8 muls, 8 shifts A00 = B00 + B07; A01 = B01 + B06; A02 = B02 + B05; A03 = B03 + B04; - A04 = MPC_SCALE_CONST((B00 - B07) , 0.5097956061f , 31); - A05 = MPC_SCALE_CONST((B01 - B06) , 0.6013448834f , 31); - A06 = MPC_SCALE_CONST((B02 - B05) , 0.8999761939f , 31); - A07 = MPC_SCALE_CONST((B03 - B04) , 2.5629155636f , 29); + A04 = MPC_MULTIPLY_V((B00 - B07), 
INVCOS04); + A05 = MPC_MULTIPLY_V((B01 - B06), INVCOS12); + A06 = MPC_MULTIPLY_V((B02 - B05), INVCOS20); + A07 = MPC_MULTIPLY_V((B03 - B04), INVCOS28); A08 = B08 + B15; A09 = B09 + B14; A10 = B10 + B13; A11 = B11 + B12; - A12 = MPC_SCALE_CONST((B08 - B15) , 0.5097956061f , 31); - A13 = MPC_SCALE_CONST((B09 - B14) , 0.6013448834f , 31); - A14 = MPC_SCALE_CONST((B10 - B13) , 0.8999761939f , 31); - A15 = MPC_SCALE_CONST((B11 - B12) , 2.5629155636f , 29); + A12 = MPC_MULTIPLY_V((B08 - B15), INVCOS04); + A13 = MPC_MULTIPLY_V((B09 - B14), INVCOS12); + A14 = MPC_MULTIPLY_V((B10 - B13), INVCOS20); + A15 = MPC_MULTIPLY_V((B11 - B12), INVCOS28); + // 8 adds, 8 subs, 8 muls, 8 shifts B00 = A00 + A03; B01 = A01 + A02; - B02 = MPC_MULTIPLY_FRACT_CONST_FIX((A00 - A03) , 0.5411961079f , 1); - B03 = MPC_MULTIPLY_FRACT_CONST_FIX((A01 - A02) , 1.3065630198f , 2); + B02 = MPC_MULTIPLY_V((A00 - A03), INVCOS08); + B03 = MPC_MULTIPLY_V((A01 - A02), INVCOS24); B04 = A04 + A07; B05 = A05 + A06; - B06 = MPC_MULTIPLY_FRACT_CONST_FIX((A04 - A07) , 0.5411961079f , 1); - B07 = MPC_MULTIPLY_FRACT_CONST_FIX((A05 - A06) , 1.3065630198f , 2); + B06 = MPC_MULTIPLY_V((A04 - A07), INVCOS08); + B07 = MPC_MULTIPLY_V((A05 - A06), INVCOS24); B08 = A08 + A11; B09 = A09 + A10; - B10 = MPC_MULTIPLY_FRACT_CONST_FIX((A08 - A11) , 0.5411961079f , 1); - B11 = MPC_MULTIPLY_FRACT_CONST_FIX((A09 - A10) , 1.3065630198f , 2); + B10 = MPC_MULTIPLY_V((A08 - A11), INVCOS08); + B11 = MPC_MULTIPLY_V((A09 - A10), INVCOS24); B12 = A12 + A15; B13 = A13 + A14; - B14 = MPC_MULTIPLY_FRACT_CONST_FIX((A12 - A15) , 0.5411961079f , 1); - B15 = MPC_MULTIPLY_FRACT_CONST_FIX((A13 - A14) , 1.3065630198f , 2); + B14 = MPC_MULTIPLY_V((A12 - A15), INVCOS08); + B15 = MPC_MULTIPLY_V((A13 - A14), INVCOS24); + // 8 adds, 8 subs, 8 muls, 8 shifts A00 = B00 + B01; - A01 = MPC_MULTIPLY_FRACT_CONST_FIX((B00 - B01) , 0.7071067691f , 1); + A01 = MPC_MULTIPLY_V((B00 - B01), INVCOS16); A02 = B02 + B03; - A03 = MPC_MULTIPLY_FRACT_CONST_FIX((B02 - 
B03) , 0.7071067691f , 1); + A03 = MPC_MULTIPLY_V((B02 - B03), INVCOS16); A04 = B04 + B05; - A05 = MPC_MULTIPLY_FRACT_CONST_FIX((B04 - B05) , 0.7071067691f , 1); + A05 = MPC_MULTIPLY_V((B04 - B05), INVCOS16); A06 = B06 + B07; - A07 = MPC_MULTIPLY_FRACT_CONST_FIX((B06 - B07) , 0.7071067691f , 1); + A07 = MPC_MULTIPLY_V((B06 - B07), INVCOS16); A08 = B08 + B09; - A09 = MPC_MULTIPLY_FRACT_CONST_FIX((B08 - B09) , 0.7071067691f , 1); + A09 = MPC_MULTIPLY_V((B08 - B09), INVCOS16); A10 = B10 + B11; - A11 = MPC_MULTIPLY_FRACT_CONST_FIX((B10 - B11) , 0.7071067691f , 1); + A11 = MPC_MULTIPLY_V((B10 - B11), INVCOS16); A12 = B12 + B13; - A13 = MPC_MULTIPLY_FRACT_CONST_FIX((B12 - B13) , 0.7071067691f , 1); + A13 = MPC_MULTIPLY_V((B12 - B13), INVCOS16); A14 = B14 + B15; - A15 = MPC_MULTIPLY_FRACT_CONST_FIX((B14 - B15) , 0.7071067691f , 1); + A15 = MPC_MULTIPLY_V((B14 - B15), INVCOS16); + // 8 adds, 8 subs, 8 muls, 8 shifts + // multiple used expressions: -(A12 + A14 + A15) V[48] = -A00; V[ 0] = A01; V[40] = -A02 - (V[ 8] = A03); @@ -192,28 +237,24 @@ V[46] = (tmp = -(A12 + A14 + A15)) - A08; V[42] = tmp - A10 - A11; - A00 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 0] - Sample[31]) , 0.5006030202f , MPC_FIXED_POINT_SYNTH_FIX); - A01 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 1] - Sample[30]) , 0.5054709315f , MPC_FIXED_POINT_SYNTH_FIX); - A02 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 2] - Sample[29]) , 0.5154473186f , MPC_FIXED_POINT_SYNTH_FIX); - A03 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 3] - Sample[28]) , 0.5310425758f , MPC_FIXED_POINT_SYNTH_FIX); - A04 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 4] - Sample[27]) , 0.5531039238f , MPC_FIXED_POINT_SYNTH_FIX); - A05 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 5] - Sample[26]) , 0.5829349756f , MPC_FIXED_POINT_SYNTH_FIX); - A06 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 6] - Sample[25]) , 0.6225041151f , MPC_FIXED_POINT_SYNTH_FIX); - A07 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 7] - Sample[24]) , 0.6748083234f , MPC_FIXED_POINT_SYNTH_FIX); - A08 = 
MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 8] - Sample[23]) , 0.7445362806f , MPC_FIXED_POINT_SYNTH_FIX); - A09 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[ 9] - Sample[22]) , 0.8393496275f , MPC_FIXED_POINT_SYNTH_FIX); - A10 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[10] - Sample[21]) , 0.9725682139f , MPC_FIXED_POINT_SYNTH_FIX); -#if MPC_FIXED_POINT_SYNTH_FIX>=2 - A11 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[11] - Sample[20]) , 1.1694399118f , MPC_FIXED_POINT_SYNTH_FIX); - A12 = MPC_MULTIPLY_FRACT_CONST_SHR((Sample[12] - Sample[19]) , 1.4841645956f , MPC_FIXED_POINT_SYNTH_FIX); -#else - A11 = MPC_SCALE_CONST_SHR ((Sample[11] - Sample[20]) , 1.1694399118f , 30, MPC_FIXED_POINT_SYNTH_FIX); - A12 = MPC_SCALE_CONST_SHR ((Sample[12] - Sample[19]) , 1.4841645956f , 30, MPC_FIXED_POINT_SYNTH_FIX); -#endif - A13 = MPC_SCALE_CONST_SHR ((Sample[13] - Sample[18]) , 2.0577809811f , 29, MPC_FIXED_POINT_SYNTH_FIX); - A14 = MPC_SCALE_CONST_SHR ((Sample[14] - Sample[17]) , 3.4076085091f , 29, MPC_FIXED_POINT_SYNTH_FIX); - A15 = MPC_SCALE_CONST_SHR ((Sample[15] - Sample[16]) , 10.1900081635f, 27 ,MPC_FIXED_POINT_SYNTH_FIX); - + A00 = MPC_MULTIPLY_V((Sample[ 0] - Sample[31]), INVCOS01); + A01 = MPC_MULTIPLY_V((Sample[ 1] - Sample[30]), INVCOS03); + A02 = MPC_MULTIPLY_V((Sample[ 2] - Sample[29]), INVCOS05); + A03 = MPC_MULTIPLY_V((Sample[ 3] - Sample[28]), INVCOS07); + A04 = MPC_MULTIPLY_V((Sample[ 4] - Sample[27]), INVCOS09); + A05 = MPC_MULTIPLY_V((Sample[ 5] - Sample[26]), INVCOS11); + A06 = MPC_MULTIPLY_V((Sample[ 6] - Sample[25]), INVCOS13); + A07 = MPC_MULTIPLY_V((Sample[ 7] - Sample[24]), INVCOS15); + A08 = MPC_MULTIPLY_V((Sample[ 8] - Sample[23]), INVCOS17); + A09 = MPC_MULTIPLY_V((Sample[ 9] - Sample[22]), INVCOS19); + A10 = MPC_MULTIPLY_V((Sample[10] - Sample[21]), INVCOS21); + A11 = MPC_MULTIPLY_V((Sample[11] - Sample[20]), INVCOS23); + A12 = MPC_MULTIPLY_V((Sample[12] - Sample[19]), INVCOS25); + A13 = MPC_MULTIPLY_V((Sample[13] - Sample[18]), INVCOS27); + A14 = 
MPC_MULTIPLY_V((Sample[14] - Sample[17]), INVCOS29); + A15 = MPC_MULTIPLY_V((Sample[15] - Sample[16]), INVCOS31); + // 16 subs, 16 muls, 16 shifts + B00 = A00 + A15; B01 = A01 + A14; B02 = A02 + A13; @@ -222,77 +263,81 @@ B05 = A05 + A10; B06 = A06 + A09; B07 = A07 + A08; - B08 = MPC_SCALE_CONST((A00 - A15) , 0.5024192929f , 31); - B09 = MPC_SCALE_CONST((A01 - A14) , 0.5224986076f , 31); - B10 = MPC_SCALE_CONST((A02 - A13) , 0.5669440627f , 31); - B11 = MPC_SCALE_CONST((A03 - A12) , 0.6468217969f , 31); - B12 = MPC_SCALE_CONST((A04 - A11) , 0.7881546021f , 31); - B13 = MPC_SCALE_CONST((A05 - A10) , 1.0606776476f , 30); - B14 = MPC_SCALE_CONST((A06 - A09) , 1.7224471569f , 30); - B15 = MPC_SCALE_CONST((A07 - A08) , 5.1011486053f , 28); + B08 = MPC_MULTIPLY_V((A00 - A15), INVCOS02); + B09 = MPC_MULTIPLY_V((A01 - A14), INVCOS06); + B10 = MPC_MULTIPLY_V((A02 - A13), INVCOS10); + B11 = MPC_MULTIPLY_V((A03 - A12), INVCOS14); + B12 = MPC_MULTIPLY_V((A04 - A11), INVCOS18); + B13 = MPC_MULTIPLY_V((A05 - A10), INVCOS22); + B14 = MPC_MULTIPLY_V((A06 - A09), INVCOS26); + B15 = MPC_MULTIPLY_V((A07 - A08), INVCOS30); + // 8 adds, 8 subs, 8 muls, 8 shift A00 = B00 + B07; A01 = B01 + B06; A02 = B02 + B05; A03 = B03 + B04; - A04 = MPC_SCALE_CONST((B00 - B07) , 0.5097956061f , 31); - A05 = MPC_SCALE_CONST((B01 - B06) , 0.6013448834f , 31); - A06 = MPC_SCALE_CONST((B02 - B05) , 0.8999761939f , 31); - A07 = MPC_SCALE_CONST((B03 - B04) , 2.5629155636f , 29); + A04 = MPC_MULTIPLY_V((B00 - B07), INVCOS04); + A05 = MPC_MULTIPLY_V((B01 - B06), INVCOS12); + A06 = MPC_MULTIPLY_V((B02 - B05), INVCOS20); + A07 = MPC_MULTIPLY_V((B03 - B04), INVCOS28); A08 = B08 + B15; A09 = B09 + B14; A10 = B10 + B13; A11 = B11 + B12; - A12 = MPC_SCALE_CONST((B08 - B15) , 0.5097956061f , 31); - A13 = MPC_SCALE_CONST((B09 - B14) , 0.6013448834f , 31); - A14 = MPC_SCALE_CONST((B10 - B13) , 0.8999761939f , 31); - A15 = MPC_SCALE_CONST((B11 - B12) , 2.5629155636f , 29); + A12 = MPC_MULTIPLY_V((B08 - B15), 
INVCOS04); + A13 = MPC_MULTIPLY_V((B09 - B14), INVCOS12); + A14 = MPC_MULTIPLY_V((B10 - B13), INVCOS20); + A15 = MPC_MULTIPLY_V((B11 - B12), INVCOS28); + // 8 adds, 8 subs, 8 muls, 8 shift B00 = A00 + A03; B01 = A01 + A02; - B02 = MPC_SCALE_CONST((A00 - A03) , 0.5411961079f , 31); - B03 = MPC_SCALE_CONST((A01 - A02) , 1.3065630198f , 30); + B02 = MPC_MULTIPLY_V((A00 - A03), INVCOS08); + B03 = MPC_MULTIPLY_V((A01 - A02), INVCOS24); B04 = A04 + A07; B05 = A05 + A06; - B06 = MPC_SCALE_CONST((A04 - A07) , 0.5411961079f , 31); - B07 = MPC_SCALE_CONST((A05 - A06) , 1.3065630198f , 30); + B06 = MPC_MULTIPLY_V((A04 - A07), INVCOS08); + B07 = MPC_MULTIPLY_V((A05 - A06), INVCOS24); B08 = A08 + A11; B09 = A09 + A10; - B10 = MPC_SCALE_CONST((A08 - A11) , 0.5411961079f , 31); - B11 = MPC_SCALE_CONST((A09 - A10) , 1.3065630198f , 30); + B10 = MPC_MULTIPLY_V((A08 - A11), INVCOS08); + B11 = MPC_MULTIPLY_V((A09 - A10), INVCOS24); B12 = A12 + A15; B13 = A13 + A14; - B14 = MPC_SCALE_CONST((A12 - A15) , 0.5411961079f , 31); - B15 = MPC_SCALE_CONST((A13 - A14) , 1.3065630198f , 30); + B14 = MPC_MULTIPLY_V((A12 - A15), INVCOS08); + B15 = MPC_MULTIPLY_V((A13 - A14), INVCOS24); + // 8 adds, 8 subs, 8 muls, 8 shift - A00 = MPC_SHL(B00 + B01, MPC_FIXED_POINT_SYNTH_FIX); - A01 = MPC_SCALE_CONST_SHL((B00 - B01) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A02 = MPC_SHL(B02 + B03, MPC_FIXED_POINT_SYNTH_FIX); - A03 = MPC_SCALE_CONST_SHL((B02 - B03) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A04 = MPC_SHL(B04 + B05, MPC_FIXED_POINT_SYNTH_FIX); - A05 = MPC_SCALE_CONST_SHL((B04 - B05) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A06 = MPC_SHL(B06 + B07, MPC_FIXED_POINT_SYNTH_FIX); - A07 = MPC_SCALE_CONST_SHL((B06 - B07) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A08 = MPC_SHL(B08 + B09, MPC_FIXED_POINT_SYNTH_FIX); - A09 = MPC_SCALE_CONST_SHL((B08 - B09) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A10 = MPC_SHL(B10 + B11, MPC_FIXED_POINT_SYNTH_FIX); - A11 = 
MPC_SCALE_CONST_SHL((B10 - B11) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A12 = MPC_SHL(B12 + B13, MPC_FIXED_POINT_SYNTH_FIX); - A13 = MPC_SCALE_CONST_SHL((B12 - B13) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); - A14 = MPC_SHL(B14 + B15, MPC_FIXED_POINT_SYNTH_FIX); - A15 = MPC_SCALE_CONST_SHL((B14 - B15) , 0.7071067691f , 31, MPC_FIXED_POINT_SYNTH_FIX); + A00 = B00 + B01; + A01 = MPC_MULTIPLY_V((B00 - B01), INVCOS16); + A02 = B02 + B03; + A03 = MPC_MULTIPLY_V((B02 - B03), INVCOS16); + A04 = B04 + B05; + A05 = MPC_MULTIPLY_V((B04 - B05), INVCOS16); + A06 = B06 + B07; + A07 = MPC_MULTIPLY_V((B06 - B07), INVCOS16); + A08 = B08 + B09; + A09 = MPC_MULTIPLY_V((B08 - B09), INVCOS16); + A10 = B10 + B11; + A11 = MPC_MULTIPLY_V((B10 - B11), INVCOS16); + A12 = B12 + B13; + A13 = MPC_MULTIPLY_V((B12 - B13), INVCOS16); + A14 = B14 + B15; + A15 = MPC_MULTIPLY_V((B14 - B15), INVCOS16); + // 8 adds, 8 subs, 8 muls, 8 shift - // mehrfach verwendete Ausdrücke: A04+A06+A07, A09+A13+A15 + // multiple used expressions: A04+A06+A07, A09+A13+A15 V[ 5] = (V[11] = (V[13] = A07 + (V[15] = A15)) + A11) + A05 + A13; V[ 7] = (V[ 9] = A03 + A11 + A15) + A13; V[33] = -(V[ 1] = A01 + A09 + A13 + A15) - A14; V[35] = -(V[ 3] = A05 + A07 + A09 + A13 + A15) - A06 - A14; V[37] = (tmp = -(A10 + A11 + A13 + A14 + A15)) - A05 - A06 - A07; - V[39] = tmp - A02 - A03; // abhängig vom Befehl drüber - V[41] = (tmp += A13 - A12) - A02 - A03; // abhängig vom Befehl 2 drüber - V[43] = tmp - A04 - A06 - A07; // abhängig von Befehlen 1 und 3 drüber + V[39] = tmp - A02 - A03; + V[41] = (tmp += A13 - A12) - A02 - A03; + V[43] = tmp - A04 - A06 - A07; V[47] = (tmp = -(A08 + A12 + A14 + A15)) - A00; - V[45] = tmp - A04 - A06 - A07; // abhängig vom Befehl drüber + V[45] = tmp - A04 - A06 - A07; V[32] = -V[ 0]; V[31] = -V[ 1]; @@ -328,123 +373,131 @@ V[49] = V[47]; } -static void Synthese_Filter_float_internal(MPC_SAMPLE_FORMAT * OutData,MPC_SAMPLE_FORMAT * V,const MPC_SAMPLE_FORMAT * Y) +static inline 
void +mpc_decoder_windowing_D(MPC_SAMPLE_FORMAT * Data, const MPC_SAMPLE_FORMAT * V) { + const MPC_SAMPLE_FORMAT *D = (const MPC_SAMPLE_FORMAT *) &Di_opt; + mpc_int32_t k; + + #if defined(CPU_COLDFIRE) + // 64=32x32-multiply assembler for Coldfire + for ( k = 0; k < 32; k++, D += 16, V++ ) + { + asm volatile ( + "movem.l (%[D]), %%d0-%%d3 \n\t" + "move.l (%[V]), %%a5 \n\t" + "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t" + "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t" + "movem.l (4*4, %[D]), %%d0-%%d3 \n\t" + "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t" + "movem.l (8*4, %[D]), %%d0-%%d3 \n\t" + "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t" + "movem.l (12*4, %[D]), %%d0-%%d3 \n\t" + "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t" + "mac.l %%d3, %%a5, %%acc0 \n\t" + "movclr.l %%acc0, %%d0 \n\t" + "move.l %%d0, (%[Data])+ \n" + : [Data] "+a" (Data) + : [V] "a" (V), [D] "a" (D) + : "d0", "d1", "d2", "d3", "a5"); + } + #elif defined(CPU_ARM) + // 64=32x32-multiply assembler for ARM + for ( k = 0; k < 32; k++, V++ ) + { + asm volatile ( + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V]] \n\t" + "smull r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #96*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #128*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #224*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #256*4] \n\t" + "smlal r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], 
#352*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #384*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #480*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #512*4] \n\t" + "smlal r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #608*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #640*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #736*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + + "ldmia %[D]!, { r0-r3 } \n\t" + "ldr r4, [%[V], #768*4] \n\t" + "smlal r5, r6, r0, r4 \n\t" + "ldr r4, [%[V], #864*4] \n\t" + "smlal r5, r6, r1, r4 \n\t" + "ldr r4, [%[V], #896*4] \n\t" + "smlal r5, r6, r2, r4 \n\t" + "ldr r4, [%[V], #992*4] \n\t" + "smlal r5, r6, r3, r4 \n\t" + "mov r4, r6, lsl #1 \n\t" + "orr r4, r4, r5, lsr #31\n\t" + "str r4, [%[Data]], #4 \n" + : [Data] "+r" (Data), [D] "+r" (D) + : [V] "r" (V) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); + } + #else + // 64=64x64-multiply C + for ( k = 0; k < 32; k++, D += 16, V++ ) + { + *Data = MPC_MULTIPLY_EX(V[ 0],D[ 0],31) + MPC_MULTIPLY_EX(V[ 96],D[ 1],31) + MPC_MULTIPLY_EX(V[128],D[ 2],31) + MPC_MULTIPLY_EX(V[224],D[ 3],31) + + MPC_MULTIPLY_EX(V[256],D[ 4],31) + MPC_MULTIPLY_EX(V[352],D[ 5],31) + MPC_MULTIPLY_EX(V[384],D[ 6],31) + MPC_MULTIPLY_EX(V[480],D[ 7],31) + + MPC_MULTIPLY_EX(V[512],D[ 8],31) + MPC_MULTIPLY_EX(V[608],D[ 9],31) + MPC_MULTIPLY_EX(V[640],D[10],31) + MPC_MULTIPLY_EX(V[736],D[11],31) + + MPC_MULTIPLY_EX(V[768],D[12],31) + MPC_MULTIPLY_EX(V[864],D[13],31) + MPC_MULTIPLY_EX(V[896],D[14],31) + MPC_MULTIPLY_EX(V[992],D[15],31); + Data += 1; + } + #endif +} + +static void +mpc_full_synthesis_filter(MPC_SAMPLE_FORMAT *OutData, MPC_SAMPLE_FORMAT *V, const MPC_SAMPLE_FORMAT *Y) +{ mpc_uint32_t n; - for ( n = 0; n < 36; n++, Y += 32 ) { - V -= 64; - Calculate_New_V ( Y, V ); - if (OutData != NULL) - { - MPC_SAMPLE_FORMAT * Data = OutData; - const MPC_SAMPLE_FORMAT * D = (const MPC_SAMPLE_FORMAT *) &Di_opt; - mpc_int32_t k; - //mpc_int32_t tmp; - 
- - - #if defined(CPU_COLDFIRE) - for ( k = 0; k < 32; k++, D += 16, V++ ) { - asm volatile ( - "movem.l (%[D]), %%d0-%%d3 \n\t" - "move.l (%[V]), %%a5 \n\t" - "mac.l %%d0, %%a5, (96*4, %[V]), %%a5, %%acc0 \n\t" - "mac.l %%d1, %%a5, (128*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (224*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (256*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (4*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (352*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (384*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (480*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (512*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (8*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (608*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (640*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (736*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, (768*4, %[V]), %%a5, %%acc0\n\t" - "movem.l (12*4, %[D]), %%d0-%%d3 \n\t" - "mac.l %%d0, %%a5, (864*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d1, %%a5, (896*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d2, %%a5, (992*4, %[V]), %%a5, %%acc0\n\t" - "mac.l %%d3, %%a5, %%acc0 \n\t" - "movclr.l %%acc0, %%d0 \n\t" - "move.l %%d0, (%[Data])+ \n" - : [Data] "+a" (Data) - : [V] "a" (V), [D] "a" (D) - : "d0", "d1", "d2", "d3", "a5"); - #elif defined(CPU_ARM) - for ( k = 0; k < 32; k++, V++ ) { - asm volatile ( - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V]] \n\t" - "smull r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #96*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #128*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #224*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V], #256*4] \n\t" - "smlal r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #352*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #384*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #480*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V], #512*4] \n\t" - "smlal r5, r6, r0, r4 \n\t" 
- "ldr r4, [%[V], #608*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #640*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #736*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - - "ldmia %[D]!, { r0-r3 } \n\t" - "ldr r4, [%[V], #768*4] \n\t" - "smlal r5, r6, r0, r4 \n\t" - "ldr r4, [%[V], #864*4] \n\t" - "smlal r5, r6, r1, r4 \n\t" - "ldr r4, [%[V], #896*4] \n\t" - "smlal r5, r6, r2, r4 \n\t" - "ldr r4, [%[V], #992*4] \n\t" - "smlal r5, r6, r3, r4 \n\t" - "mov r4, r6, lsl #1 \n\t" - "orr r4, r4, r5, lsr #31\n\t" - "str r4, [%[Data]], #4 \n" - : [Data] "+r" (Data), [D] "+r" (D) - : [V] "r" (V) - : "r0", "r1", "r2", "r3", "r4", "r5", "r6"); - #else - for ( k = 0; k < 32; k++, D += 16, V++ ) { - *Data = MPC_SHL( - MPC_MULTIPLY_FRACT(V[ 0],D[ 0]) + MPC_MULTIPLY_FRACT(V[ 96],D[ 1]) + MPC_MULTIPLY_FRACT(V[128],D[ 2]) + MPC_MULTIPLY_FRACT(V[224],D[ 3]) - + MPC_MULTIPLY_FRACT(V[256],D[ 4]) + MPC_MULTIPLY_FRACT(V[352],D[ 5]) + MPC_MULTIPLY_FRACT(V[384],D[ 6]) + MPC_MULTIPLY_FRACT(V[480],D[ 7]) - + MPC_MULTIPLY_FRACT(V[512],D[ 8]) + MPC_MULTIPLY_FRACT(V[608],D[ 9]) + MPC_MULTIPLY_FRACT(V[640],D[10]) + MPC_MULTIPLY_FRACT(V[736],D[11]) - + MPC_MULTIPLY_FRACT(V[768],D[12]) + MPC_MULTIPLY_FRACT(V[864],D[13]) + MPC_MULTIPLY_FRACT(V[896],D[14]) + MPC_MULTIPLY_FRACT(V[992],D[15]) - , 1); - - Data += 1; - #endif - } - V -= 32;//bleh - OutData+=32; - } - } + + if (NULL != OutData) + { + for ( n = 0; n < 36; n++, Y += 32, OutData += 32 ) + { + V -= 64; + mpc_calculate_new_V ( Y, V ); + mpc_decoder_windowing_D( OutData, V); + } + } } void -mpc_decoder_synthese_filter_float(mpc_decoder *d, MPC_SAMPLE_FORMAT* OutData) +mpc_decoder_synthese_filter_float(mpc_decoder *d, MPC_SAMPLE_FORMAT *OutData) { /********* left channel ********/ memmove(d->V_L + MPC_V_MEM, d->V_L, 960 * sizeof(MPC_SAMPLE_FORMAT) ); - Synthese_Filter_float_internal( + mpc_full_synthesis_filter( OutData, (MPC_SAMPLE_FORMAT *)(d->V_L + MPC_V_MEM), (MPC_SAMPLE_FORMAT *)(d->Y_L [0])); @@ -452,7 +505,7 @@ /******** 
right channel ********/ memmove(d->V_R + MPC_V_MEM, d->V_R, 960 * sizeof(MPC_SAMPLE_FORMAT) ); - Synthese_Filter_float_internal( + mpc_full_synthesis_filter( (OutData == NULL ? NULL : OutData + MPC_FRAME_LENGTH), (MPC_SAMPLE_FORMAT *)(d->V_R + MPC_V_MEM), (MPC_SAMPLE_FORMAT *)(d->Y_R [0])); Index: apps/codecs/libmusepack/requant.c =================================================================== --- apps/codecs/libmusepack/requant.c (revision 13591) +++ apps/codecs/libmusepack/requant.c (working copy) @@ -53,8 +53,8 @@ const MPC_SAMPLE_FORMAT __Cc [1 + 18] = { _(111.285962475327f), // 32768/2/255*sqrt(3) _(65536.000000000000f), _(21845.333333333332f), _(13107.200000000001f), _(9362.285714285713f), - _(7281.777777777777f), _(4369.066666666666f), _(2114.064516129032f), _(1040.253968253968f), - _(516.031496062992f), _(257.003921568627f), _(128.250489236790f), _(64.062561094819f), + _(7281.777777777777f), _(4369.066666666666f), _(2114.064516129032f), _(1040.253968253968f), + _(516.031496062992f), _(257.003921568627f), _(128.250489236790f), _(64.062561094819f), _(32.015632633121f), _(16.003907203907f), _(8.000976681723f), _(4.000244155527f), _(2.000061037018f), _(1.000015259021f) }; Index: apps/codecs/libmusepack/Makefile =================================================================== --- apps/codecs/libmusepack/Makefile (revision 13591) +++ apps/codecs/libmusepack/Makefile (working copy) @@ -14,7 +14,13 @@ INCLUDES += $(patsubst %,-I$(APPSDIR)/%,$(subst :, ,$(APPEXTRA))) endif -MUSEPACKOPTS = -O2 +# libmusepack is faster on ipods with -O1 instead of -O2 +ifeq ($(findstring IPOD,$(TARGET)),IPOD) + MUSEPACKOPTS += -O1 +else + MUSEPACKOPTS += -O2 +endif + CFLAGS = $(INCLUDES) $(GCCOPTS) $(TARGET_INC) $(MUSEPACKOPTS) $(TARGET) \ $(EXTRA_DEFINES) -DMEM=${MEMORYSIZE} $(PROFILE_OPTS)