Index: apps/codecs/libfaad/sbr_qmf.c =================================================================== --- apps/codecs/libfaad/sbr_qmf.c (revision 27338) +++ apps/codecs/libfaad/sbr_qmf.c (working copy) @@ -38,6 +38,12 @@ #include "sbr_qmf_c.h" #include "sbr_syntax.h" +#ifdef FIXED_POINT + #define FAAD_SCALE(X) ((X)>>1) +#else + #define FAAD_SCALE(X) ((X)*scale) +#endif + qmfa_info *qmfa_init(uint8_t channels) { qmfa_info *qmfa = (qmfa_info*)faad_malloc(sizeof(qmfa_info)); @@ -66,42 +72,53 @@ void sbr_qmf_analysis_32(sbr_info *sbr, qmfa_info *qmfa, const real_t *input, qmf_t X[MAX_NTSRHFG][64], uint8_t offset, uint8_t kx) { - ALIGN real_t u[64]; + static ALIGN real_t u[64] IBSS_ATTR_FAAD_LARGE_IRAM; #ifndef SBR_LOW_POWER - static ALIGN real_t in_real[32], in_imag[32], out_real[32], out_imag[32]; + static ALIGN real_t in_real[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t in_imag[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t out_real[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t out_imag[32] IBSS_ATTR_FAAD_LARGE_IRAM; #else ALIGN real_t y[32]; #endif - uint16_t in = 0; - uint8_t l; + uint32_t in = 0; + uint32_t l, idx0, idx1; /* qmf subsample l */ for (l = 0; l < sbr->numTimeSlotsRate; l++) { - int16_t n; + int32_t n; /* shift input buffer x */ /* input buffer is not shifted anymore, x is implemented as double ringbuffer */ //memmove(qmfa->x + 32, qmfa->x, (320-32)*sizeof(real_t)); /* add new samples to input buffer x */ - for (n = 32 - 1; n >= 0; n--) + idx0 = qmfa->x_index + 31; idx1 = idx0 + 320; + for (n = 32 - 1; n >= 0; n-=4) { #ifdef FIXED_POINT - qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = (input[in++]) >> 4; + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]) >> 4; + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]) >> 4; + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]) >> 4; + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]) >> 4; #else - qmfa->x[qmfa->x_index + n] = qmfa->x[qmfa->x_index + n + 320] = 
input[in++]; + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); + qmfa->x[idx0--] = qmfa->x[idx1--] = (input[in++]); #endif } /* window and summation to create array u */ for (n = 0; n < 64; n++) { - u[n] = MUL_F(qmfa->x[qmfa->x_index + n], qmf_c[2*n]) + - MUL_F(qmfa->x[qmfa->x_index + n + 64], qmf_c[2*(n + 64)]) + - MUL_F(qmfa->x[qmfa->x_index + n + 128], qmf_c[2*(n + 128)]) + - MUL_F(qmfa->x[qmfa->x_index + n + 192], qmf_c[2*(n + 192)]) + - MUL_F(qmfa->x[qmfa->x_index + n + 256], qmf_c[2*(n + 256)]); + idx0 = qmfa->x_index + n; idx1 = n * 2; + u[n] = MUL_F(qmfa->x[idx0 ], qmf_c[idx1]) + + MUL_F(qmfa->x[idx0 + 64], qmf_c[idx1 + 2 * 64]) + + MUL_F(qmfa->x[idx0 + 128], qmf_c[idx1 + 2 * 128]) + + MUL_F(qmfa->x[idx0 + 192], qmf_c[idx1 + 2 * 192]) + + MUL_F(qmfa->x[idx0 + 256], qmf_c[idx1 + 2 * 256]); } /* update ringbuffer index */ @@ -132,18 +149,18 @@ QMF_RE(X[l + offset][n]) = 0; } } -#else +#else /* #ifdef SBR_LOW_POWER */ // Reordering of data moved from DCT_IV to here - in_imag[31] = u[1]; - in_real[0] = u[0]; - for (n = 1; n < 31; n++) + idx0 = 30; idx1 = 63; + in_imag[31] = u[ 1]; in_real[ 0] = u[ 0]; + for (n = 1; n < 31; n+=3) { - in_imag[31 - n] = u[n+1]; - in_real[n] = -u[64-n]; + in_imag[idx0--] = u[n+1]; in_real[n ] = -u[idx1--]; + in_imag[idx0--] = u[n+2]; in_real[n+1] = -u[idx1--]; + in_imag[idx0--] = u[n+3]; in_real[n+2] = -u[idx1--]; } - in_imag[0] = u[32]; - in_real[31] = -u[33]; + in_imag[ 0] = u[32]; in_real[31] = -u[33]; // dct4_kernel is DCT_IV without reordering which is done before and after FFT dct4_kernel(in_real, in_imag, out_real, out_imag); @@ -180,7 +198,7 @@ QMF_IM(X[l + offset][2*n+1]) = 0; } } -#endif +#endif /* #ifdef SBR_LOW_POWER */ } } @@ -249,8 +267,8 @@ void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], real_t *output) { - ALIGN real_t 
x[16]; - ALIGN real_t y[16]; + static ALIGN real_t x[16]; + static ALIGN real_t y[16]; int16_t n, k, out = 0; uint8_t l; @@ -384,18 +402,27 @@ qmfs->v_index = (1280-128); } } -#else +#else /* #ifdef SBR_LOW_POWER */ + +#define FAAD_CMPLX_PRETWIDDLE_SUB(k) \ + FAAD_SCALE((MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - \ + MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k])))) \ + +#define FAAD_CMPLX_PRETWIDDLE_ADD(k) \ + FAAD_SCALE((MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + \ + MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k])))) \ + void sbr_qmf_synthesis_32(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], real_t *output) { - ALIGN real_t x1[32], x2[32]; + static ALIGN real_t x1[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t x2[32] IBSS_ATTR_FAAD_LARGE_IRAM; #ifndef FIXED_POINT real_t scale = 1.f/64.f; #endif - int16_t n, k, out = 0; - uint8_t l; + int32_t n, k, idx0, idx1, out = 0; + uint32_t l; - /* qmf subsample l */ for (l = 0; l < sbr->numTimeSlotsRate; l++) { @@ -405,43 +432,42 @@ /* calculate 64 samples */ /* complex pre-twiddle */ - for (k = 0; k < 32; k++) + for (k = 0; k < 32;) { - x1[k] = MUL_F(QMF_RE(X[l][k]), RE(qmf32_pre_twiddle[k])) - MUL_F(QMF_IM(X[l][k]), IM(qmf32_pre_twiddle[k])); - x2[k] = MUL_F(QMF_IM(X[l][k]), RE(qmf32_pre_twiddle[k])) + MUL_F(QMF_RE(X[l][k]), IM(qmf32_pre_twiddle[k])); - -#ifndef FIXED_POINT - x1[k] *= scale; - x2[k] *= scale; -#else - x1[k] >>= 1; - x2[k] >>= 1; -#endif + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; + x1[k] = FAAD_CMPLX_PRETWIDDLE_SUB(k); x2[k] = FAAD_CMPLX_PRETWIDDLE_ADD(k); k++; } /* transform */ DCT4_32(x1, x1); DST4_32(x2, x2); - for (n = 0; n < 32; n++) + idx0 = qmfs->v_index; + idx1 = qmfs->v_index + 63; + for (n = 0; n < 32; n+=2) { - qmfs->v[qmfs->v_index + n] = 
qmfs->v[qmfs->v_index + 640 + n] = -x1[n] + x2[n]; - qmfs->v[qmfs->v_index + 63 - n] = qmfs->v[qmfs->v_index + 640 + 63 - n] = x1[n] + x2[n]; + qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n ] + x2[n ]; idx0++; + qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n ] + x2[n ]; idx1--; + qmfs->v[idx0] = qmfs->v[idx0 + 640] = -x1[n+1] + x2[n+1]; idx0++; + qmfs->v[idx1] = qmfs->v[idx1 + 640] = x1[n+1] + x2[n+1]; idx1--; } /* calculate 32 output samples and window */ for (k = 0; k < 32; k++) { - output[out++] = MUL_F(qmfs->v[qmfs->v_index + k], qmf_c[2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 96 + k], qmf_c[64 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 128 + k], qmf_c[128 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 224 + k], qmf_c[192 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 256 + k], qmf_c[256 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 352 + k], qmf_c[320 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 384 + k], qmf_c[384 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 480 + k], qmf_c[448 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 512 + k], qmf_c[512 + 2*k]) + - MUL_F(qmfs->v[qmfs->v_index + 608 + k], qmf_c[576 + 2*k]); + idx0 = qmfs->v_index + k; idx1 = 2*k; + output[out++] = MUL_F(qmfs->v[idx0 ], qmf_c[idx1 ]) + + MUL_F(qmfs->v[idx0 + 96], qmf_c[idx1 + 64]) + + MUL_F(qmfs->v[idx0 + 128], qmf_c[idx1 + 128]) + + MUL_F(qmfs->v[idx0 + 224], qmf_c[idx1 + 192]) + + MUL_F(qmfs->v[idx0 + 256], qmf_c[idx1 + 256]) + + MUL_F(qmfs->v[idx0 + 352], qmf_c[idx1 + 320]) + + MUL_F(qmfs->v[idx0 + 384], qmf_c[idx1 + 384]) + + MUL_F(qmfs->v[idx0 + 480], qmf_c[idx1 + 448]) + + MUL_F(qmfs->v[idx0 + 512], qmf_c[idx1 + 512]) + + MUL_F(qmfs->v[idx0 + 608], qmf_c[idx1 + 576]); } /* update ringbuffer index */ @@ -454,31 +480,24 @@ void sbr_qmf_synthesis_64(sbr_info *sbr, qmfs_info *qmfs, qmf_t X[MAX_NTSRHFG][64], real_t *output) { -// ALIGN real_t x1[64], x2[64]; #ifndef SBR_LOW_POWER - static ALIGN real_t in_real1[32], in_imag1[32], out_real1[32], out_imag1[32]; - static ALIGN real_t in_real2[32], in_imag2[32], 
out_real2[32], out_imag2[32]; + static ALIGN real_t in_real1[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t in_imag1[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t out_real1[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t out_imag1[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t in_real2[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t in_imag2[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t out_real2[32] IBSS_ATTR_FAAD_LARGE_IRAM; + static ALIGN real_t out_imag2[32] IBSS_ATTR_FAAD_LARGE_IRAM; #endif qmf_t * pX; - real_t * pring_buffer_1, * pring_buffer_3; -// real_t * ptemp_1, * ptemp_2; -#ifdef PREFER_POINTERS - // These pointers are used if target platform has autoinc address generators - real_t * pring_buffer_2, * pring_buffer_4; - real_t * pring_buffer_5, * pring_buffer_6; - real_t * pring_buffer_7, * pring_buffer_8; - real_t * pring_buffer_9, * pring_buffer_10; - const real_t * pqmf_c_1, * pqmf_c_2, * pqmf_c_3, * pqmf_c_4; - const real_t * pqmf_c_5, * pqmf_c_6, * pqmf_c_7, * pqmf_c_8; - const real_t * pqmf_c_9, * pqmf_c_10; -#endif // #ifdef PREFER_POINTERS + real_t * p_buf_1, * p_buf_3; #ifndef FIXED_POINT real_t scale = 1.f/64.f; #endif - int16_t n, k, out = 0; - uint8_t l; - - + int32_t n, k, idx0, idx1, out = 0; + uint32_t l; + /* qmf subsample l */ for (l = 0; l < sbr->numTimeSlotsRate; l++) { @@ -487,139 +506,56 @@ //memmove(qmfs->v + 128, qmfs->v, (1280-128)*sizeof(real_t)); /* calculate 128 samples */ -#ifndef FIXED_POINT - pX = X[l]; - in_imag1[31] = scale*QMF_RE(pX[1]); - in_real1[0] = scale*QMF_RE(pX[0]); - in_imag2[31] = scale*QMF_IM(pX[63-1]); - in_real2[0] = scale*QMF_IM(pX[63-0]); + pX = X[l]; + in_imag1[31] = FAAD_SCALE(QMF_RE(pX[ 1])); + in_real1[ 0] = FAAD_SCALE(QMF_RE(pX[ 0])); + in_imag2[31] = FAAD_SCALE(QMF_IM(pX[62])); + in_real2[ 0] = FAAD_SCALE(QMF_IM(pX[63])); for (k = 1; k < 31; k++) { - in_imag1[31 - k] = scale*QMF_RE(pX[2*k + 1]); - in_real1[ k] = scale*QMF_RE(pX[2*k ]); - in_imag2[31 - k] = scale*QMF_IM(pX[63 - 
(2*k + 1)]); - in_real2[ k] = scale*QMF_IM(pX[63 - (2*k )]); + in_imag1[31 - k] = FAAD_SCALE(QMF_RE(pX[ (2*k + 1)])); + in_real1[ k] = FAAD_SCALE(QMF_RE(pX[ (2*k )])); + in_imag2[31 - k] = FAAD_SCALE(QMF_IM(pX[63 - (2*k + 1)])); + in_real2[ k] = FAAD_SCALE(QMF_IM(pX[63 - (2*k )])); } - in_imag1[0] = scale*QMF_RE(pX[63]); - in_real1[31] = scale*QMF_RE(pX[62]); - in_imag2[0] = scale*QMF_IM(pX[63-63]); - in_real2[31] = scale*QMF_IM(pX[63-62]); + in_imag1[ 0] = FAAD_SCALE(QMF_RE(pX[63])); + in_real1[31] = FAAD_SCALE(QMF_RE(pX[62])); + in_imag2[ 0] = FAAD_SCALE(QMF_IM(pX[ 0])); + in_real2[31] = FAAD_SCALE(QMF_IM(pX[ 1])); -#else - - pX = X[l]; - - in_imag1[31] = QMF_RE(pX[1]) >> 1; - in_real1[0] = QMF_RE(pX[0]) >> 1; - in_imag2[31] = QMF_IM(pX[62]) >> 1; - in_real2[0] = QMF_IM(pX[63]) >> 1; - for (k = 1; k < 31; k++) - { - in_imag1[31 - k] = QMF_RE(pX[2*k + 1]) >> 1; - in_real1[ k] = QMF_RE(pX[2*k ]) >> 1; - in_imag2[31 - k] = QMF_IM(pX[63 - (2*k + 1)]) >> 1; - in_real2[ k] = QMF_IM(pX[63 - (2*k )]) >> 1; - } - in_imag1[0] = QMF_RE(pX[63]) >> 1; - in_real1[31] = QMF_RE(pX[62]) >> 1; - in_imag2[0] = QMF_IM(pX[0]) >> 1; - in_real2[31] = QMF_IM(pX[1]) >> 1; - -#endif - - // dct4_kernel is DCT_IV without reordering which is done before and after FFT dct4_kernel(in_real1, in_imag1, out_real1, out_imag1); dct4_kernel(in_real2, in_imag2, out_real2, out_imag2); + p_buf_1 = qmfs->v + qmfs->v_index; + p_buf_3 = p_buf_1 + 1280; - pring_buffer_1 = qmfs->v + qmfs->v_index; - pring_buffer_3 = pring_buffer_1 + 1280; -#ifdef PREFER_POINTERS - pring_buffer_2 = pring_buffer_1 + 127; - pring_buffer_4 = pring_buffer_1 + (1280 + 127); -#endif // #ifdef PREFER_POINTERS -// ptemp_1 = x1; -// ptemp_2 = x2; -#ifdef PREFER_POINTERS - for (n = 0; n < 32; n ++) - { - //real_t x1 = *ptemp_1++; - //real_t x2 = *ptemp_2++; - // pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer - *pring_buffer_1++ = *pring_buffer_3++ = out_real2[n] - out_real1[n]; - *pring_buffer_2-- = 
*pring_buffer_4-- = out_real2[n] + out_real1[n]; - //x1 = *ptemp_1++; - //x2 = *ptemp_2++; - *pring_buffer_1++ = *pring_buffer_3++ = out_imag2[31-n] + out_imag1[31-n]; - *pring_buffer_2-- = *pring_buffer_4-- = out_imag2[31-n] - out_imag1[31-n]; - } -#else // #ifdef PREFER_POINTERS - + idx0 = 0; idx1 = 127; for (n = 0; n < 32; n++) { - // pring_buffer_3 and pring_buffer_4 are needed only for double ring buffer - pring_buffer_1[2*n] = pring_buffer_3[2*n] = out_real2[n] - out_real1[n]; - pring_buffer_1[127-2*n] = pring_buffer_3[127-2*n] = out_real2[n] + out_real1[n]; - pring_buffer_1[2*n+1] = pring_buffer_3[2*n+1] = out_imag2[31-n] + out_imag1[31-n]; - pring_buffer_1[127-(2*n+1)] = pring_buffer_3[127-(2*n+1)] = out_imag2[31-n] - out_imag1[31-n]; + p_buf_1[idx0] = p_buf_3[idx0] = out_real2[ n] - out_real1[ n]; idx0++; + p_buf_1[idx1] = p_buf_3[idx1] = out_real2[ n] + out_real1[ n]; idx1--; + p_buf_1[idx0] = p_buf_3[idx0] = out_imag2[31-n] + out_imag1[31-n]; idx0++; + p_buf_1[idx1] = p_buf_3[idx1] = out_imag2[31-n] - out_imag1[31-n]; idx1--; } -#endif // #ifdef PREFER_POINTERS + p_buf_1 = qmfs->v + qmfs->v_index; - pring_buffer_1 = qmfs->v + qmfs->v_index; -#ifdef PREFER_POINTERS - pring_buffer_2 = pring_buffer_1 + 192; - pring_buffer_3 = pring_buffer_1 + 256; - pring_buffer_4 = pring_buffer_1 + (256 + 192); - pring_buffer_5 = pring_buffer_1 + 512; - pring_buffer_6 = pring_buffer_1 + (512 + 192); - pring_buffer_7 = pring_buffer_1 + 768; - pring_buffer_8 = pring_buffer_1 + (768 + 192); - pring_buffer_9 = pring_buffer_1 + 1024; - pring_buffer_10 = pring_buffer_1 + (1024 + 192); - pqmf_c_1 = qmf_c; - pqmf_c_2 = qmf_c + 64; - pqmf_c_3 = qmf_c + 128; - pqmf_c_4 = qmf_c + 192; - pqmf_c_5 = qmf_c + 256; - pqmf_c_6 = qmf_c + 320; - pqmf_c_7 = qmf_c + 384; - pqmf_c_8 = qmf_c + 448; - pqmf_c_9 = qmf_c + 512; - pqmf_c_10 = qmf_c + 576; -#endif // #ifdef PREFER_POINTERS - /* calculate 64 output samples and window */ for (k = 0; k < 64; k++) { -#ifdef PREFER_POINTERS - output[out++] 
= - MUL_F(*pring_buffer_1++, *pqmf_c_1++) + - MUL_F(*pring_buffer_2++, *pqmf_c_2++) + - MUL_F(*pring_buffer_3++, *pqmf_c_3++) + - MUL_F(*pring_buffer_4++, *pqmf_c_4++) + - MUL_F(*pring_buffer_5++, *pqmf_c_5++) + - MUL_F(*pring_buffer_6++, *pqmf_c_6++) + - MUL_F(*pring_buffer_7++, *pqmf_c_7++) + - MUL_F(*pring_buffer_8++, *pqmf_c_8++) + - MUL_F(*pring_buffer_9++, *pqmf_c_9++) + - MUL_F(*pring_buffer_10++, *pqmf_c_10++); -#else // #ifdef PREFER_POINTERS - output[out++] = - MUL_F(pring_buffer_1[k+0], qmf_c[k+0]) + - MUL_F(pring_buffer_1[k+192], qmf_c[k+64]) + - MUL_F(pring_buffer_1[k+256], qmf_c[k+128]) + - MUL_F(pring_buffer_1[k+(256+192)], qmf_c[k+192]) + - MUL_F(pring_buffer_1[k+512], qmf_c[k+256]) + - MUL_F(pring_buffer_1[k+(512+192)], qmf_c[k+320]) + - MUL_F(pring_buffer_1[k+768], qmf_c[k+384]) + - MUL_F(pring_buffer_1[k+(768+192)], qmf_c[k+448]) + - MUL_F(pring_buffer_1[k+1024], qmf_c[k+512]) + - MUL_F(pring_buffer_1[k+(1024+192)], qmf_c[k+576]); -#endif // #ifdef PREFER_POINTERS + output[out++] = MUL_F(p_buf_1[k ], qmf_c[k ]) + + MUL_F(p_buf_1[k+ 192 ], qmf_c[k+ 64]) + + MUL_F(p_buf_1[k+ 256 ], qmf_c[k+128]) + + MUL_F(p_buf_1[k+ 256+192], qmf_c[k+192]) + + MUL_F(p_buf_1[k+ 512 ], qmf_c[k+256]) + + MUL_F(p_buf_1[k+ 512+192], qmf_c[k+320]) + + MUL_F(p_buf_1[k+ 768 ], qmf_c[k+384]) + + MUL_F(p_buf_1[k+ 768+192], qmf_c[k+448]) + + MUL_F(p_buf_1[k+1024 ], qmf_c[k+512]) + + MUL_F(p_buf_1[k+1024+192], qmf_c[k+576]); } /* update ringbuffer index */ Index: apps/codecs/libfaad/sbr_qmf_c.h =================================================================== --- apps/codecs/libfaad/sbr_qmf_c.h (revision 27338) +++ apps/codecs/libfaad/sbr_qmf_c.h (working copy) @@ -38,7 +38,7 @@ #pragma warning(disable:4244) #endif -ALIGN static const real_t qmf_c[640] = { +ALIGN static const real_t qmf_c[640] ICONST_ATTR_FAAD_LARGE_IRAM = { FRAC_CONST(0), FRAC_CONST(-0.00055252865047), FRAC_CONST(-0.00056176925738), FRAC_CONST(-0.00049475180896), FRAC_CONST(-0.00048752279712), 
FRAC_CONST(-0.00048937912498), Index: apps/codecs/libfaad/sbr_dct.c =================================================================== --- apps/codecs/libfaad/sbr_dct.c (revision 27338) +++ apps/codecs/libfaad/sbr_dct.c (working copy) @@ -26,7 +26,10 @@ **/ #include "common.h" +#include "../lib/fft.h" +#include "../lib/mdct_lookup.h" + #ifdef SBR_DEC #ifdef _MSC_VER @@ -1449,265 +1452,7 @@ #else - -#define n 32 -#define log2n 5 - -// w_array_real[i] = cos(2*M_PI*i/32) -static const real_t w_array_real[] = { - FRAC_CONST(1.000000000000000), FRAC_CONST(0.980785279337272), - FRAC_CONST(0.923879528329380), FRAC_CONST(0.831469603195765), - FRAC_CONST(0.707106765732237), FRAC_CONST(0.555570210304169), - FRAC_CONST(0.382683402077046), FRAC_CONST(0.195090284503576), - FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090370246552), - FRAC_CONST(-0.382683482845162), FRAC_CONST(-0.555570282993553), - FRAC_CONST(-0.707106827549476), FRAC_CONST(-0.831469651765257), - FRAC_CONST(-0.923879561784627), FRAC_CONST(-0.980785296392607) -}; - -// w_array_imag[i] = sin(-2*M_PI*i/32) -static const real_t w_array_imag[] = { - FRAC_CONST(0.000000000000000), FRAC_CONST(-0.195090327375064), - FRAC_CONST(-0.382683442461104), FRAC_CONST(-0.555570246648862), - FRAC_CONST(-0.707106796640858), FRAC_CONST(-0.831469627480512), - FRAC_CONST(-0.923879545057005), FRAC_CONST(-0.980785287864940), - FRAC_CONST(-1.000000000000000), FRAC_CONST(-0.980785270809601), - FRAC_CONST(-0.923879511601754), FRAC_CONST(-0.831469578911016), - FRAC_CONST(-0.707106734823616), FRAC_CONST(-0.555570173959476), - FRAC_CONST(-0.382683361692986), FRAC_CONST(-0.195090241632088) -}; - -// FFT decimation in frequency -// 4*16*2+16=128+16=144 multiplications -// 6*16*2+10*8+4*16*2=192+80+128=400 additions -static void fft_dif(real_t * Real, real_t * Imag) -{ - real_t w_real, w_imag; // For faster access - real_t point1_real, point1_imag, point2_real, point2_imag; // For faster access - uint32_t j, i, i2, w_index; // Counters - - 
// First 2 stages of 32 point FFT decimation in frequency - // 4*16*2=64*2=128 multiplications - // 6*16*2=96*2=192 additions - // Stage 1 of 32 point FFT decimation in frequency - for (i = 0; i < 16; i++) - { - point1_real = Real[i]; - point1_imag = Imag[i]; - i2 = i+16; - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - w_real = w_array_real[i]; - w_imag = w_array_imag[i]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag)); - Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real)); - } - // Stage 2 of 32 point FFT decimation in frequency - for (j = 0, w_index = 0; j < 8; j++, w_index += 2) - { - w_real = w_array_real[w_index]; - w_imag = w_array_imag[w_index]; - - i = j; - point1_real = Real[i]; - point1_imag = Imag[i]; - i2 = i+8; - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag)); - Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real)); - - i = j+16; - point1_real = Real[i]; - point1_imag = Imag[i]; - i2 = i+8; - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = (MUL_F(point1_real,w_real) - MUL_F(point1_imag,w_imag)); - Imag[i2] = (MUL_F(point1_real,w_imag) + MUL_F(point1_imag,w_real)); - } - - // Stage 3 of 32 point FFT decimation in frequency - // 2*4*2=16 multiplications - // 4*4*2+6*4*2=10*8=80 additions - for (i = 0; i < n; i += 8) - 
{ - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // out[i1] = point1 + point2 - Real[i] += point2_real; - Imag[i] += point2_imag; - - // out[i2] = point1 - point2 - Real[i2] = point1_real - point2_real; - Imag[i2] = point1_imag - point2_imag; - } - w_real = w_array_real[4]; // = sqrt(2)/2 - // w_imag = -w_real; // = w_array_imag[4]; // = -sqrt(2)/2 - for (i = 1; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = MUL_F(point1_real+point1_imag, w_real); - Imag[i2] = MUL_F(point1_imag-point1_real, w_real); - } - for (i = 2; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // x[i] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * (-i) - Real[i2] = point1_imag - point2_imag; - Imag[i2] = point2_real - point1_real; - } - w_real = w_array_real[12]; // = -sqrt(2)/2 - // w_imag = w_real; // = w_array_imag[12]; // = -sqrt(2)/2 - for (i = 3; i < n; i += 8) - { - i2 = i+4; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // temp1 = x[i] - x[i2] - point1_real -= point2_real; - point1_imag -= point2_imag; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * w - Real[i2] = MUL_F(point1_real-point1_imag, w_real); - Imag[i2] = MUL_F(point1_real+point1_imag, w_real); - } - - - // Stage 4 of 32 point FFT decimation in frequency (no multiplications) - // 16*4=64 additions - for (i = 0; i < n; i += 4) - { - i2 = i+2; - point1_real = Real[i]; - point1_imag = Imag[i]; - 
- point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // x[i1] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = x[i] - x[i2] - Real[i2] = point1_real - point2_real; - Imag[i2] = point1_imag - point2_imag; - } - for (i = 1; i < n; i += 4) - { - i2 = i+2; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // x[i] = x[i] + x[i2] - Real[i] += point2_real; - Imag[i] += point2_imag; - - // x[i2] = (x[i] - x[i2]) * (-i) - Real[i2] = point1_imag - point2_imag; - Imag[i2] = point2_real - point1_real; - } - - // Stage 5 of 32 point FFT decimation in frequency (no multiplications) - // 16*4=64 additions - for (i = 0; i < n; i += 2) - { - i2 = i+1; - point1_real = Real[i]; - point1_imag = Imag[i]; - - point2_real = Real[i2]; - point2_imag = Imag[i2]; - - // out[i1] = point1 + point2 - Real[i] += point2_real; - Imag[i] += point2_imag; - - // out[i2] = point1 - point2 - Real[i2] = point1_real - point2_real; - Imag[i2] = point1_imag - point2_imag; - } - -#ifdef REORDER_IN_FFT - FFTReorder(Real, Imag); -#endif // #ifdef REORDER_IN_FFT -} -#undef n -#undef log2n - -static const real_t dct4_64_tab[] = { +static const real_t dct4_64_tab[] ICONST_ATTR_FAAD_LARGE_IRAM = { COEF_CONST(0.999924719333649), COEF_CONST(0.998118102550507), COEF_CONST(0.993906974792480), COEF_CONST(0.987301409244537), COEF_CONST(0.978317379951477), COEF_CONST(0.966976463794708), @@ -1806,57 +1551,68 @@ COEF_CONST(0.897167563438416), COEF_CONST(0.949727773666382) }; +/* +// Tables for resorting results of codeclib's fft +static const uint32_t bit_rev_tab2[32] ICONST_ATTR_FAAD_LARGE_IRAM = { + 0,31,30,29,28,27,26,25,24,23,22,21,20,19,18, + 17,16,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1 +}; +*/ + /* size 64 only! 
*/ void dct4_kernel(real_t * in_real, real_t * in_imag, real_t * out_real, real_t * out_imag) { - // Tables with bit reverse values for 5 bits, bit reverse of i at i-th position - const uint8_t bit_rev_tab[32] = { 0,16,8,24,4,20,12,28,2,18,10,26,6,22,14,30,1,17,9,25,5,21,13,29,3,19,11,27,7,23,15,31 }; - uint16_t i, i_rev; + uint32_t i, idx; + real_t x_re, x_im, tmp; + FFTComplex xc[32]; /* used for calling coeclib's fft implementation */ - /* Step 2: modulate */ + /* Step 2: modulate and pre-rotate for codeclib's fft implementation */ // 3*32=96 multiplications // 3*32=96 additions for (i = 0; i < 32; i++) { - real_t x_re, x_im, tmp; + idx = revtab[i]>>(12-5); x_re = in_real[i]; x_im = in_imag[i]; - tmp = MUL_C(x_re + x_im, dct4_64_tab[i]); - in_real[i] = MUL_C(x_im, dct4_64_tab[i + 64]) + tmp; - in_imag[i] = MUL_C(x_re, dct4_64_tab[i + 32]) + tmp; + tmp = MUL_C(x_re + x_im, dct4_64_tab[i ]); + xc[idx].re = MUL_C(x_im , dct4_64_tab[i + 64]) + tmp; + xc[idx].im = MUL_C(x_re , dct4_64_tab[i + 32]) + tmp; } - /* Step 3: FFT, but with output in bit reverse order */ - fft_dif(in_real, in_imag); + /* Step 3: FFT (codeclib's implementation) */ + ff_fft_calc_c(5, xc); /* Step 4: modulate + bitreverse reordering */ // 3*31+2=95 multiplications // 3*31+2=95 additions - for (i = 0; i < 16; i++) + x_re = xc[0].re; + x_im = xc[0].im; + tmp = MUL_C(x_re + x_im, dct4_64_tab[0 + 3*32]); + out_real[0] = MUL_C(x_im , dct4_64_tab[0 + 5*32]) + tmp; + out_imag[0] = MUL_C(x_re , dct4_64_tab[0 + 4*32]) + tmp; + for (i = 1; i < 16; i++) { - real_t x_re, x_im, tmp; - i_rev = bit_rev_tab[i]; - x_re = in_real[i_rev]; - x_im = in_imag[i_rev]; - - tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); - out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp; - out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp; + idx = 32-i; + x_re = xc[idx].re; + x_im = xc[idx].im; + tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); + out_real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp; + out_imag[i] = 
MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp; } - // i = 16, i_rev = 1 = rev(16); - out_imag[16] = MUL_C(in_imag[1] - in_real[1], dct4_64_tab[16 + 3*32]); - out_real[16] = MUL_C(in_real[1] + in_imag[1], dct4_64_tab[16 + 3*32]); + // i = 16, i_rev = 16 = rev(16); + x_re = xc[16].re; + x_im = xc[16].im; + out_imag[16] = MUL_C(x_im - x_re, dct4_64_tab[16 + 3*32]); + out_real[16] = MUL_C(x_re + x_im, dct4_64_tab[16 + 3*32]); for (i = 17; i < 32; i++) { - real_t x_re, x_im, tmp; - i_rev = bit_rev_tab[i]; - x_re = in_real[i_rev]; - x_im = in_imag[i_rev]; - tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); - out_real[i] = MUL_C(x_im, dct4_64_tab[i + 5*32]) + tmp; - out_imag[i] = MUL_C(x_re, dct4_64_tab[i + 4*32]) + tmp; + idx = 32-i; + x_re = xc[idx].re; + x_im = xc[idx].im; + tmp = MUL_C(x_re + x_im, dct4_64_tab[i + 3*32]); + out_real[i] = MUL_C(x_im , dct4_64_tab[i + 5*32]) + tmp; + out_imag[i] = MUL_C(x_re , dct4_64_tab[i + 4*32]) + tmp; } - } void DST4_32(real_t *y, real_t *x)