Index: apps/codecs/libmad/D_odd.dat =================================================================== --- apps/codecs/libmad/D_odd.dat (revision 0) +++ apps/codecs/libmad/D_odd.dat (revision 0) @@ -0,0 +1,307 @@ +{ + -PRESHIFT(0x0001d000) /* -0.000442505 */, + -PRESHIFT(0x001cb000) /* -0.007003784 */, + -PRESHIFT(0x01421000) /* -0.078628540 */, + -PRESHIFT(0x09271000) /* -0.572036743 */, + PRESHIFT(0x09271000) /* 0.572036743 */, + PRESHIFT(0x01421000) /* 0.078628540 */, + PRESHIFT(0x001cb000) /* 0.007003784 */, + PRESHIFT(0x0001d000) /* 0.000442505 */, + -PRESHIFT(0x0001d000) /* -0.000442505 */, + -PRESHIFT(0x001cb000) /* -0.007003784 */, + -PRESHIFT(0x01421000) /* -0.078628540 */, + -PRESHIFT(0x09271000) /* -0.572036743 */, + PRESHIFT(0x09271000) /* 0.572036743 */, + PRESHIFT(0x01421000) /* 0.078628540 */, + PRESHIFT(0x001cb000) /* 0.007003784 */, + PRESHIFT(0x0001d000) /* 0.000442505 */ , +}, + { + -PRESHIFT(0x0001f000) /* -0.000473022 */, + -PRESHIFT(0x00207000) /* -0.007919312 */, + -PRESHIFT(0x0158d000) /* -0.084182739 */, + -PRESHIFT(0x099a8000) /* -0.600219727 */, + PRESHIFT(0x08b38000) /* 0.543823242 */, + PRESHIFT(0x012b4000) /* 0.073059082 */, + PRESHIFT(0x00191000) /* 0.006118774 */, + PRESHIFT(0x0001a000) /* 0.000396729 */, + -PRESHIFT(0x0001f000) /* -0.000473022 */, + -PRESHIFT(0x00207000) /* -0.007919312 */, + -PRESHIFT(0x0158d000) /* -0.084182739 */, + -PRESHIFT(0x099a8000) /* -0.600219727 */, + PRESHIFT(0x08b38000) /* 0.543823242 */, + PRESHIFT(0x012b4000) /* 0.073059082 */, + PRESHIFT(0x00191000) /* 0.006118774 */, + PRESHIFT(0x0001a000) /* 0.000396729 */ , +}, + { + -PRESHIFT(0x00023000) /* -0.000534058 */, + -PRESHIFT(0x00245000) /* -0.008865356 */, + -PRESHIFT(0x016f7000) /* -0.089706421 */, + -PRESHIFT(0x0a0d8000) /* -0.628295898 */, + PRESHIFT(0x083ff000) /* 0.515609741 */, + PRESHIFT(0x01149000) /* 0.067520142 */, + PRESHIFT(0x0015b000) /* 0.005294800 */, + PRESHIFT(0x00018000) /* 0.000366211 */, + -PRESHIFT(0x00023000) /* -0.000534058 */, 
+ -PRESHIFT(0x00245000) /* -0.008865356 */, + -PRESHIFT(0x016f7000) /* -0.089706421 */, + -PRESHIFT(0x0a0d8000) /* -0.628295898 */, + PRESHIFT(0x083ff000) /* 0.515609741 */, + PRESHIFT(0x01149000) /* 0.067520142 */, + PRESHIFT(0x0015b000) /* 0.005294800 */, + PRESHIFT(0x00018000) /* 0.000366211 */ , +}, + { + -PRESHIFT(0x00026000) /* -0.000579834 */, + -PRESHIFT(0x00285000) /* -0.009841919 */, + -PRESHIFT(0x0185d000) /* -0.095169067 */, + -PRESHIFT(0x0a7fe000) /* -0.656219482 */, + PRESHIFT(0x07ccb000) /* 0.487472534 */, + PRESHIFT(0x00fdf000) /* 0.061996460 */, + PRESHIFT(0x00126000) /* 0.004486084 */, + PRESHIFT(0x00015000) /* 0.000320435 */, + -PRESHIFT(0x00026000) /* -0.000579834 */, + -PRESHIFT(0x00285000) /* -0.009841919 */, + -PRESHIFT(0x0185d000) /* -0.095169067 */, + -PRESHIFT(0x0a7fe000) /* -0.656219482 */, + PRESHIFT(0x07ccb000) /* 0.487472534 */, + PRESHIFT(0x00fdf000) /* 0.061996460 */, + PRESHIFT(0x00126000) /* 0.004486084 */, + PRESHIFT(0x00015000) /* 0.000320435 */ , +}, + { + -PRESHIFT(0x00029000) /* -0.000625610 */, + -PRESHIFT(0x002c7000) /* -0.010848999 */, + -PRESHIFT(0x019bd000) /* -0.100540161 */, + -PRESHIFT(0x0af15000) /* -0.683914185 */, + PRESHIFT(0x075a0000) /* 0.459472656 */, + PRESHIFT(0x00e79000) /* 0.056533813 */, + PRESHIFT(0x000f4000) /* 0.003723145 */, + PRESHIFT(0x00013000) /* 0.000289917 */, + -PRESHIFT(0x00029000) /* -0.000625610 */, + -PRESHIFT(0x002c7000) /* -0.010848999 */, + -PRESHIFT(0x019bd000) /* -0.100540161 */, + -PRESHIFT(0x0af15000) /* -0.683914185 */, + PRESHIFT(0x075a0000) /* 0.459472656 */, + PRESHIFT(0x00e79000) /* 0.056533813 */, + PRESHIFT(0x000f4000) /* 0.003723145 */, + PRESHIFT(0x00013000) /* 0.000289917 */ , +}, + { + -PRESHIFT(0x0002d000) /* -0.000686646 */, + -PRESHIFT(0x0030b000) /* -0.011886597 */, + -PRESHIFT(0x01b17000) /* -0.105819702 */, + -PRESHIFT(0x0b619000) /* -0.711318970 */, + PRESHIFT(0x06e81000) /* 0.431655884 */, + PRESHIFT(0x00d17000) /* 0.051132202 */, + PRESHIFT(0x000c5000) /* 
0.003005981 */, + PRESHIFT(0x00011000) /* 0.000259399 */, + -PRESHIFT(0x0002d000) /* -0.000686646 */, + -PRESHIFT(0x0030b000) /* -0.011886597 */, + -PRESHIFT(0x01b17000) /* -0.105819702 */, + -PRESHIFT(0x0b619000) /* -0.711318970 */, + PRESHIFT(0x06e81000) /* 0.431655884 */, + PRESHIFT(0x00d17000) /* 0.051132202 */, + PRESHIFT(0x000c5000) /* 0.003005981 */, + PRESHIFT(0x00011000) /* 0.000259399 */ , +}, + { + -PRESHIFT(0x00031000) /* -0.000747681 */, + -PRESHIFT(0x00350000) /* -0.012939453 */, + -PRESHIFT(0x01c67000) /* -0.110946655 */, + -PRESHIFT(0x0bd06000) /* -0.738372803 */, + PRESHIFT(0x06772000) /* 0.404083252 */, + PRESHIFT(0x00bbc000) /* 0.045837402 */, + PRESHIFT(0x00099000) /* 0.002334595 */, + PRESHIFT(0x00010000) /* 0.000244141 */, + -PRESHIFT(0x00031000) /* -0.000747681 */, + -PRESHIFT(0x00350000) /* -0.012939453 */, + -PRESHIFT(0x01c67000) /* -0.110946655 */, + -PRESHIFT(0x0bd06000) /* -0.738372803 */, + PRESHIFT(0x06772000) /* 0.404083252 */, + PRESHIFT(0x00bbc000) /* 0.045837402 */, + PRESHIFT(0x00099000) /* 0.002334595 */, + PRESHIFT(0x00010000) /* 0.000244141 */ , +}, + { + -PRESHIFT(0x00035000) /* -0.000808716 */, + -PRESHIFT(0x00397000) /* -0.014022827 */, + -PRESHIFT(0x01dad000) /* -0.115921021 */, + -PRESHIFT(0x0c3d9000) /* -0.765029907 */, + PRESHIFT(0x06076000) /* 0.376800537 */, + PRESHIFT(0x00a67000) /* 0.040634155 */, + PRESHIFT(0x0006f000) /* 0.001693726 */, + PRESHIFT(0x0000e000) /* 0.000213623 */, + -PRESHIFT(0x00035000) /* -0.000808716 */, + -PRESHIFT(0x00397000) /* -0.014022827 */, + -PRESHIFT(0x01dad000) /* -0.115921021 */, + -PRESHIFT(0x0c3d9000) /* -0.765029907 */, + PRESHIFT(0x06076000) /* 0.376800537 */, + PRESHIFT(0x00a67000) /* 0.040634155 */, + PRESHIFT(0x0006f000) /* 0.001693726 */, + PRESHIFT(0x0000e000) /* 0.000213623 */ , +}, + { + -PRESHIFT(0x0003a000) /* -0.000885010 */, + -PRESHIFT(0x003df000) /* -0.015121460 */, + -PRESHIFT(0x01ee6000) /* -0.120697021 */, + -PRESHIFT(0x0ca8d000) /* -0.791213989 */, + 
PRESHIFT(0x05991000) /* 0.349868774 */, + PRESHIFT(0x0091a000) /* 0.035552979 */, + PRESHIFT(0x00048000) /* 0.001098633 */, + PRESHIFT(0x0000d000) /* 0.000198364 */, + -PRESHIFT(0x0003a000) /* -0.000885010 */, + -PRESHIFT(0x003df000) /* -0.015121460 */, + -PRESHIFT(0x01ee6000) /* -0.120697021 */, + -PRESHIFT(0x0ca8d000) /* -0.791213989 */, + PRESHIFT(0x05991000) /* 0.349868774 */, + PRESHIFT(0x0091a000) /* 0.035552979 */, + PRESHIFT(0x00048000) /* 0.001098633 */, + PRESHIFT(0x0000d000) /* 0.000198364 */ , +}, + { + -PRESHIFT(0x0003f000) /* -0.000961304 */, + -PRESHIFT(0x00428000) /* -0.016235352 */, + -PRESHIFT(0x02011000) /* -0.125259399 */, + -PRESHIFT(0x0d11e000) /* -0.816864014 */, + PRESHIFT(0x052c5000) /* 0.323318481 */, + PRESHIFT(0x007d6000) /* 0.030609131 */, + PRESHIFT(0x00024000) /* 0.000549316 */, + PRESHIFT(0x0000b000) /* 0.000167847 */, + -PRESHIFT(0x0003f000) /* -0.000961304 */, + -PRESHIFT(0x00428000) /* -0.016235352 */, + -PRESHIFT(0x02011000) /* -0.125259399 */, + -PRESHIFT(0x0d11e000) /* -0.816864014 */, + PRESHIFT(0x052c5000) /* 0.323318481 */, + PRESHIFT(0x007d6000) /* 0.030609131 */, + PRESHIFT(0x00024000) /* 0.000549316 */, + PRESHIFT(0x0000b000) /* 0.000167847 */ , +}, + { + -PRESHIFT(0x00044000) /* -0.001037598 */, + -PRESHIFT(0x00471000) /* -0.017349243 */, + -PRESHIFT(0x0212b000) /* -0.129562378 */, + -PRESHIFT(0x0d78a000) /* -0.841949463 */, + PRESHIFT(0x04c16000) /* 0.297210693 */, + PRESHIFT(0x0069c000) /* 0.025817871 */, + PRESHIFT(0x00002000) /* 0.000030518 */, + PRESHIFT(0x0000a000) /* 0.000152588 */, + -PRESHIFT(0x00044000) /* -0.001037598 */, + -PRESHIFT(0x00471000) /* -0.017349243 */, + -PRESHIFT(0x0212b000) /* -0.129562378 */, + -PRESHIFT(0x0d78a000) /* -0.841949463 */, + PRESHIFT(0x04c16000) /* 0.297210693 */, + PRESHIFT(0x0069c000) /* 0.025817871 */, + PRESHIFT(0x00002000) /* 0.000030518 */, + PRESHIFT(0x0000a000) /* 0.000152588 */ , +}, + { + -PRESHIFT(0x00049000) /* -0.001113892 */, + -PRESHIFT(0x004ba000) /* -0.018463135 
*/, + -PRESHIFT(0x02233000) /* -0.133590698 */, + -PRESHIFT(0x0ddca000) /* -0.866363525 */, + PRESHIFT(0x04587000) /* 0.271591187 */, + PRESHIFT(0x0056c000) /* 0.021179199 */, + -PRESHIFT(0x0001d000) /* -0.000442505 */, + PRESHIFT(0x00009000) /* 0.000137329 */, + -PRESHIFT(0x00049000) /* -0.001113892 */, + -PRESHIFT(0x004ba000) /* -0.018463135 */, + -PRESHIFT(0x02233000) /* -0.133590698 */, + -PRESHIFT(0x0ddca000) /* -0.866363525 */, + PRESHIFT(0x04587000) /* 0.271591187 */, + PRESHIFT(0x0056c000) /* 0.021179199 */, + -PRESHIFT(0x0001d000) /* -0.000442505 */, + PRESHIFT(0x00009000) /* 0.000137329 */ , +}, + { + -PRESHIFT(0x0004f000) /* -0.001205444 */, + -PRESHIFT(0x00503000) /* -0.019577026 */, + -PRESHIFT(0x02326000) /* -0.137298584 */, + -PRESHIFT(0x0e3dd000) /* -0.890090942 */, + PRESHIFT(0x03f1b000) /* 0.246505737 */, + PRESHIFT(0x00447000) /* 0.016708374 */, + -PRESHIFT(0x00039000) /* -0.000869751 */, + PRESHIFT(0x00008000) /* 0.000122070 */, + -PRESHIFT(0x0004f000) /* -0.001205444 */, + -PRESHIFT(0x00503000) /* -0.019577026 */, + -PRESHIFT(0x02326000) /* -0.137298584 */, + -PRESHIFT(0x0e3dd000) /* -0.890090942 */, + PRESHIFT(0x03f1b000) /* 0.246505737 */, + PRESHIFT(0x00447000) /* 0.016708374 */, + -PRESHIFT(0x00039000) /* -0.000869751 */, + PRESHIFT(0x00008000) /* 0.000122070 */ , +}, + { + -PRESHIFT(0x00055000) /* -0.001296997 */, + -PRESHIFT(0x0054c000) /* -0.020690918 */, + -PRESHIFT(0x02403000) /* -0.140670776 */, + -PRESHIFT(0x0e9be000) /* -0.913055420 */, + PRESHIFT(0x038d4000) /* 0.221984863 */, + PRESHIFT(0x0032e000) /* 0.012420654 */, + -PRESHIFT(0x00053000) /* -0.001266479 */, + PRESHIFT(0x00007000) /* 0.000106812 */, + -PRESHIFT(0x00055000) /* -0.001296997 */, + -PRESHIFT(0x0054c000) /* -0.020690918 */, + -PRESHIFT(0x02403000) /* -0.140670776 */, + -PRESHIFT(0x0e9be000) /* -0.913055420 */, + PRESHIFT(0x038d4000) /* 0.221984863 */, + PRESHIFT(0x0032e000) /* 0.012420654 */, + -PRESHIFT(0x00053000) /* -0.001266479 */, + PRESHIFT(0x00007000) /* 
0.000106812 */ , +}, + { + -PRESHIFT(0x0005b000) /* -0.001388550 */, + -PRESHIFT(0x00594000) /* -0.021789551 */, + -PRESHIFT(0x024c8000) /* -0.143676758 */, + -PRESHIFT(0x0ef69000) /* -0.935195923 */, + PRESHIFT(0x032b4000) /* 0.198059082 */, + PRESHIFT(0x00221000) /* 0.008316040 */, + -PRESHIFT(0x0006a000) /* -0.001617432 */, + PRESHIFT(0x00007000) /* 0.000106812 */, + -PRESHIFT(0x0005b000) /* -0.001388550 */, + -PRESHIFT(0x00594000) /* -0.021789551 */, + -PRESHIFT(0x024c8000) /* -0.143676758 */, + -PRESHIFT(0x0ef69000) /* -0.935195923 */, + PRESHIFT(0x032b4000) /* 0.198059082 */, + PRESHIFT(0x00221000) /* 0.008316040 */, + -PRESHIFT(0x0006a000) /* -0.001617432 */, + PRESHIFT(0x00007000) /* 0.000106812 */ , +}, + { + -PRESHIFT(0x00061000) /* -0.001480103 */, + -PRESHIFT(0x005da000) /* -0.022857666 */, + -PRESHIFT(0x02571000) /* -0.146255493 */, + -PRESHIFT(0x0f4dc000) /* -0.956481934 */, + PRESHIFT(0x02cbf000) /* 0.174789429 */, + PRESHIFT(0x00120000) /* 0.004394531 */, + -PRESHIFT(0x0007f000) /* -0.001937866 */, + PRESHIFT(0x00006000) /* 0.000091553 */, + -PRESHIFT(0x00061000) /* -0.001480103 */, + -PRESHIFT(0x005da000) /* -0.022857666 */, + -PRESHIFT(0x02571000) /* -0.146255493 */, + -PRESHIFT(0x0f4dc000) /* -0.956481934 */, + PRESHIFT(0x02cbf000) /* 0.174789429 */, + PRESHIFT(0x00120000) /* 0.004394531 */, + -PRESHIFT(0x0007f000) /* -0.001937866 */, + PRESHIFT(0x00006000) /* 0.000091553 */ , +}, + { + -PRESHIFT(0x00068000) /* -0.001586914 */, + -PRESHIFT(0x0061f000) /* -0.023910522 */, + -PRESHIFT(0x025ff000) /* -0.148422241 */, + -PRESHIFT(0x0fa13000) /* -0.976852417 */, + PRESHIFT(0x026f7000) /* 0.152206421 */, + PRESHIFT(0x0002d000) /* 0.000686646 */, + -PRESHIFT(0x00092000) /* -0.002227783 */, + PRESHIFT(0x00005000) /* 0.000076294 */, + -PRESHIFT(0x00068000) /* -0.001586914 */, + -PRESHIFT(0x0061f000) /* -0.023910522 */, + -PRESHIFT(0x025ff000) /* -0.148422241 */, + -PRESHIFT(0x0fa13000) /* -0.976852417 */, + PRESHIFT(0x026f7000) /* 0.152206421 */, + 
PRESHIFT(0x0002d000) /* 0.000686646 */, + -PRESHIFT(0x00092000) /* -0.002227783 */, + PRESHIFT(0x00005000) /* 0.000076294 */ +}, + \ No newline at end of file Index: apps/codecs/libmad/synth_full_arm_v5e.S =================================================================== --- apps/codecs/libmad/synth_full_arm_v5e.S (revision 0) +++ apps/codecs/libmad/synth_full_arm_v5e.S (revision 0) @@ -0,0 +1,140 @@ +/*************************************************************************** + * __________ __ ___. + * Open \______ \ ____ ____ | | _\_ |__ _______ ___ + * Source | _// _ \_/ ___\| |/ /| __ \ / _ \ \/ / + * Jukebox | | ( <_> ) \___| < | \_\ ( <_> > < < + * Firmware |____|_ /\____/ \___ >__|_ \|___ /\____/__/\_ \ + * \/ \/ \/ \/ \/ + * + * + * Copyright (C) 2010 by Michael Giacomelli + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This software is distributed on an "AS IS" basis, WITHOUT WARRANTY OF ANY + * KIND, either express or implied. 
+ * + ****************************************************************************/ + +#include "config.h" +#include "mad_iram.h" + + .section ICODE_SECTION_MPA_ARM,"ax",%progbits + + .global synth_full_mod_zero + //.global synth_full_even_sbsample + + /* r0 = pcm */ + /* r1 = fe, fo = fe+1536 */ + /* r2 = D0eptr, r2+544 = D0optr */ + /* r3 = D1eptr, r3+544 = D1optr */ + /* r4 = accum2 */ + /* r5 = accum */ + /* r6-r14 scratch */ + +fo .req r1 +D0ptr .req r2 +D1ptr .req r3 +accum .req r5 +accum2 .req r4 + /* synth_full_mod_zero (unfinished): forms two 8-term sums of 32-bit f[] words (read through r1) times packed 16-bit window coefficients; accum uses D0o[0..7], accum2 uses D1e[7..14]. */ +synth_full_mod_zero: + /*stack all registers, then store the stack pointer as well*/ + stmdb sp!, {r4-r11, lr} + ldr r4, [sp, #36] /* 9 registers (36 bytes) were just pushed, so this reads the first word above them - presumably the first stack-passed argument, confirm against the C prototype */ + ldr r5, =synth_full_sp + str sp, [r5] + mov r5, #15 /* r5 is also accum (see .req above) and is overwritten by the first smulwt below */ + //add r1, r1, #32 +.l: + +//just screw around and see what the register pressure is like, we have options for saving some if needed + + /* C reference for the two sums computed below; kept as comments so the file can assemble */ + + //PROD_O_ODD_ODD(hi, lo, fo, ptr, 0) + //fo:0, optr:2 (bottom of packed variable, not 64 bit aligned) + //optr = *D0optr; + //eptr = *D1eptr; + //ML0(hi, accum, (*fo)[0], optr[0]); /*0 */ + //ML0(hi2, accum2, (*fo)[0], eptr[7]); /*15*/ + //MLA(hi, accum, (*fo)[1], optr[7]); /*14*/ + //MLA(hi2, accum2, (*fo)[1], eptr[8]); /*17*/ + //MLA(hi, accum, (*fo)[2], optr[6]); /*12*/ + //MLA(hi2, accum2, (*fo)[2], eptr[9]); /*19*/ + //MLA(hi, accum, (*fo)[3], optr[5]); /*10*/ + //MLA(hi2, accum2, (*fo)[3], eptr[10]); /*21*/ + //MLA(hi, accum, (*fo)[4], optr[4]); /*8 */ + //MLA(hi2, accum2, (*fo)[4], eptr[11]); /*23*/ + //MLA(hi, accum, (*fo)[5], optr[3]); /*6 */ + //MLA(hi2, accum2, (*fo)[5], eptr[12]); /*25*/ + //MLA(hi, accum, (*fo)[6], optr[2]); /*4 */ + //MLA(hi2, accum2, (*fo)[6], eptr[13]); /*27*/ + //MLA(hi, accum, (*fo)[7], optr[1]); /*2 */ + //MLA(hi2, accum2, (*fo)[7], eptr[14]); /*29*/ + + + + /*f is always 64 bit aligned*/ + ldmia r1!, {r10-r14} /*load f[0..4]*/ + + /*compute D0optr*/ + add D0ptr, D0ptr, #544 + /* D0o is 32 bit aligned*/ + ldrd r6,r7, [D0ptr,#8] /*load (packed) D0o[4..7]*/ + + /*D1e is 16 or 48 bit aligned so 14 byte offset is safe*/ + /*ARM11: 1 cycle stall on 50% of D loads that 
are not 64 bit aligned*/ + + ldrd r8, r9, [D1ptr,#14] /*load (packed) D1e[7..10]*/ + + /*ARM11: 1 cycle stall waiting on r8*/ + + smulwt accum, r11,r7 /*ML0(accum, (*f)[1], optr[7]); */ + smulwb accum2, r10,r8 /*ML0(accum2, (*f)[0], eptr[7]); */ + + smlawb accum, r12, r7, accum /*MLA(accum, (*f)[2], optr[6]); */ + smlawt accum2, r11, r8, accum2 /*MLA(accum2, (*f)[1], eptr[8]); */ + + //free: r11,r7,r8 + ldrd r7,r8, [D0ptr,#0] /*load (packed) D0o[0..3]*/ + + smlawt accum, r13, r6, accum /*MLA(accum, (*f)[3], optr[5]); */ + smlawb accum2, r12, r9, accum2 /*MLA(accum2, (*f)[2], eptr[9]); */ + + //free: r11,r12 + ldrd r11, r12, [D1ptr,#22] /*load (packed) D1e[11..14]*/ + + smlawb accum, r14, r6, accum /*MLA(accum, (*f)[4], optr[4]); */ + smlawt accum2, r13, r9, accum2 /*MLA(accum2, (*f)[3], eptr[10]); */ + + //free: r13, r6,r9 + ldmia r1!, {r6,r9,r13} /*load f[5..7]*/ + + smlawb accum, r10, r7, accum /*MLA(accum, (*f)[0], optr[0]); */ + smlawb accum2, r14, r11, accum2 /*MLA(accum2, (*f)[4], eptr[11]); */ + + smlawt accum, r6, r8, accum /*MLA(accum, (*f)[5], optr[3]); */ + smlawt accum2, r6, r11, accum2 /*MLA(accum2, (*f)[5], eptr[12]); */ + + smlawb accum, r9, r8, accum /*MLA(accum, (*f)[6], optr[2]); */ + smlawb accum2, r9, r12, accum2 /*MLA(accum2, (*f)[6], eptr[13]); */ + + smlawt accum, r13, r7, accum /*MLA(accum, (*f)[7], optr[1]); optr[1] is the top half of r7 (r8 holds optr[2..3]) */ + smlawt accum2, r13, r12, accum2 /*MLA(accum2, (*f)[7], eptr[14]); */ + + + /*ARM9E: 16x mul (16*1 clk) + 2 ldm (8 clk) + 4 ldrd (2*4 clk) = 32 clks*/ + /*ARM11: 16x mul (16*1 clk) + 2 ldm (4 clk) + 4 ldrd (1.5*4 clk) = 26 clks*/ + + /*NOTE(review): "ldrd r7,r8" and "ldrd r11,r12" start on an odd register; ARM-state LDRD expects an even first register - confirm and re-allocate registers*/ + /*NOTE(review): accum/accum2 alias r5/r4, which hold the loop counter (mov r5, #15) and the word loaded from [sp, #36]; both are overwritten by the smulw instructions*/ + /*NOTE(review): nothing here stores a result through r0, branches back to .l, reloads sp from synth_full_sp, or pops {r4-r11, lr}; execution runs off the end of the section*/ + + + + .section IBSS_SECTION_MPA_ARM,"aw",%nobits +synth_full_sp: + .space 4 Index: apps/codecs/libmad/D_even.dat =================================================================== --- apps/codecs/libmad/D_even.dat (revision 0) +++ apps/codecs/libmad/D_even.dat (revision 0) @@ -0,0 +1,307 @@ +{ + PRESHIFT(0x00000000) /* 0.000000000 */, /* 0 */ + PRESHIFT(0x000d5000) /* 0.003250122 */, + 
PRESHIFT(0x007f5000) /* 0.031082153 */, + PRESHIFT(0x019ae000) /* 0.100311279 */, + PRESHIFT(0x1251e000) /* 1.144989014 */, + PRESHIFT(0x019ae000) /* 0.100311279 */, + PRESHIFT(0x007f5000) /* 0.031082153 */, + PRESHIFT(0x000d5000) /* 0.003250122 */, + PRESHIFT(0x00000000) /* 0.000000000 */, + PRESHIFT(0x000d5000) /* 0.003250122 */, + PRESHIFT(0x007f5000) /* 0.031082153 */, + PRESHIFT(0x019ae000) /* 0.100311279 */, + PRESHIFT(0x1251e000) /* 1.144989014 */, + PRESHIFT(0x019ae000) /* 0.100311279 */, + PRESHIFT(0x007f5000) /* 0.031082153 */, + PRESHIFT(0x000d5000) /* 0.003250122 */, +}, + { + -PRESHIFT(0x00001000) /* -0.000015259 */, /* 1 */ + PRESHIFT(0x000da000) /* 0.003326416 */, + PRESHIFT(0x007d0000) /* 0.030517578 */, + PRESHIFT(0x01747000) /* 0.090927124 */, + PRESHIFT(0x124f0000) /* 1.144287109 */, + PRESHIFT(0x01bde000) /* 0.108856201 */, + PRESHIFT(0x0080f000) /* 0.031478882 */, + PRESHIFT(0x000d0000) /* 0.003173828 */, + -PRESHIFT(0x00001000) /* -0.000015259 */, + PRESHIFT(0x000da000) /* 0.003326416 */, + PRESHIFT(0x007d0000) /* 0.030517578 */, + PRESHIFT(0x01747000) /* 0.090927124 */, + PRESHIFT(0x124f0000) /* 1.144287109 */, + PRESHIFT(0x01bde000) /* 0.108856201 */, + PRESHIFT(0x0080f000) /* 0.031478882 */, + PRESHIFT(0x000d0000) /* 0.003173828 */, +}, + { + -PRESHIFT(0x00001000) /* -0.000015259 */, /* 2 */ + PRESHIFT(0x000de000) /* 0.003387451 */, + PRESHIFT(0x007a0000) /* 0.029785156 */, + PRESHIFT(0x014a8000) /* 0.080688477 */, + PRESHIFT(0x12468000) /* 1.142211914 */, + PRESHIFT(0x01dd8000) /* 0.116577148 */, + PRESHIFT(0x00820000) /* 0.031738281 */, + PRESHIFT(0x000ca000) /* 0.003082275 */, + -PRESHIFT(0x00001000) /* -0.000015259 */, + PRESHIFT(0x000de000) /* 0.003387451 */, + PRESHIFT(0x007a0000) /* 0.029785156 */, + PRESHIFT(0x014a8000) /* 0.080688477 */, + PRESHIFT(0x12468000) /* 1.142211914 */, + PRESHIFT(0x01dd8000) /* 0.116577148 */, + PRESHIFT(0x00820000) /* 0.031738281 */, + PRESHIFT(0x000ca000) /* 0.003082275 */, +}, + { + 
-PRESHIFT(0x00001000) /* -0.000015259 */, /* 3 */ + PRESHIFT(0x000e1000) /* 0.003433228 */, + PRESHIFT(0x00765000) /* 0.028884888 */, + PRESHIFT(0x011d1000) /* 0.069595337 */, + PRESHIFT(0x12386000) /* 1.138763428 */, + PRESHIFT(0x01f9c000) /* 0.123474121 */, + PRESHIFT(0x00827000) /* 0.031845093 */, + PRESHIFT(0x000c4000) /* 0.002990723 */, + -PRESHIFT(0x00001000) /* -0.000015259 */, + PRESHIFT(0x000e1000) /* 0.003433228 */, + PRESHIFT(0x00765000) /* 0.028884888 */, + PRESHIFT(0x011d1000) /* 0.069595337 */, + PRESHIFT(0x12386000) /* 1.138763428 */, + PRESHIFT(0x01f9c000) /* 0.123474121 */, + PRESHIFT(0x00827000) /* 0.031845093 */, + PRESHIFT(0x000c4000) /* 0.002990723 */, +}, + { + -PRESHIFT(0x00001000) /* -0.000015259 */, /* 4 */ + PRESHIFT(0x000e3000) /* 0.003463745 */, + PRESHIFT(0x0071e000) /* 0.027801514 */, + PRESHIFT(0x00ec0000) /* 0.057617187 */, + PRESHIFT(0x12249000) /* 1.133926392 */, + PRESHIFT(0x0212c000) /* 0.129577637 */, + PRESHIFT(0x00825000) /* 0.031814575 */, + PRESHIFT(0x000be000) /* 0.002899170 */, + -PRESHIFT(0x00001000) /* -0.000015259 */, + PRESHIFT(0x000e3000) /* 0.003463745 */, + PRESHIFT(0x0071e000) /* 0.027801514 */, + PRESHIFT(0x00ec0000) /* 0.057617187 */, + PRESHIFT(0x12249000) /* 1.133926392 */, + PRESHIFT(0x0212c000) /* 0.129577637 */, + PRESHIFT(0x00825000) /* 0.031814575 */, + PRESHIFT(0x000be000) /* 0.002899170 */, +}, + { + -PRESHIFT(0x00001000) /* -0.000015259 */, /* 5 */ + PRESHIFT(0x000e4000) /* 0.003479004 */, + PRESHIFT(0x006cb000) /* 0.026535034 */, + PRESHIFT(0x00b77000) /* 0.044784546 */, + PRESHIFT(0x120b4000) /* 1.127746582 */, + PRESHIFT(0x02288000) /* 0.134887695 */, + PRESHIFT(0x0081b000) /* 0.031661987 */, + PRESHIFT(0x000b7000) /* 0.002792358 */, + -PRESHIFT(0x00001000) /* -0.000015259 */, + PRESHIFT(0x000e4000) /* 0.003479004 */, + PRESHIFT(0x006cb000) /* 0.026535034 */, + PRESHIFT(0x00b77000) /* 0.044784546 */, + PRESHIFT(0x120b4000) /* 1.127746582 */, + PRESHIFT(0x02288000) /* 0.134887695 */, + 
PRESHIFT(0x0081b000) /* 0.031661987 */, + PRESHIFT(0x000b7000) /* 0.002792358 */, +}, + { + -PRESHIFT(0x00001000) /* -0.000015259 */, /* 6 */ + PRESHIFT(0x000e4000) /* 0.003479004 */, + PRESHIFT(0x0066c000) /* 0.025085449 */, + PRESHIFT(0x007f5000) /* 0.031082153 */, + PRESHIFT(0x11ec7000) /* 1.120223999 */, + PRESHIFT(0x023b3000) /* 0.139450073 */, + PRESHIFT(0x00809000) /* 0.031387329 */, + PRESHIFT(0x000b0000) /* 0.002685547 */, + -PRESHIFT(0x00001000) /* -0.000015259 */, + PRESHIFT(0x000e4000) /* 0.003479004 */, + PRESHIFT(0x0066c000) /* 0.025085449 */, + PRESHIFT(0x007f5000) /* 0.031082153 */, + PRESHIFT(0x11ec7000) /* 1.120223999 */, + PRESHIFT(0x023b3000) /* 0.139450073 */, + PRESHIFT(0x00809000) /* 0.031387329 */, + PRESHIFT(0x000b0000) /* 0.002685547 */, +}, + { + -PRESHIFT(0x00002000) /* -0.000030518 */, /* 7 */ + PRESHIFT(0x000e3000) /* 0.003463745 */, + PRESHIFT(0x005ff000) /* 0.023422241 */, + PRESHIFT(0x0043a000) /* 0.016510010 */, + PRESHIFT(0x11c83000) /* 1.111373901 */, + PRESHIFT(0x024ad000) /* 0.143264771 */, + PRESHIFT(0x007f0000) /* 0.031005859 */, + PRESHIFT(0x000a9000) /* 0.002578735 */, + -PRESHIFT(0x00002000) /* -0.000030518 */, + PRESHIFT(0x000e3000) /* 0.003463745 */, + PRESHIFT(0x005ff000) /* 0.023422241 */, + PRESHIFT(0x0043a000) /* 0.016510010 */, + PRESHIFT(0x11c83000) /* 1.111373901 */, + PRESHIFT(0x024ad000) /* 0.143264771 */, + PRESHIFT(0x007f0000) /* 0.031005859 */, + PRESHIFT(0x000a9000) /* 0.002578735 */, +}, + { + -PRESHIFT(0x00002000) /* -0.000030518 */, /* 8 */ + PRESHIFT(0x000e0000) /* 0.003417969 */, + PRESHIFT(0x00586000) /* 0.021575928 */, + PRESHIFT(0x00046000) /* 0.001068115 */, + PRESHIFT(0x119e9000) /* 1.101211548 */, + PRESHIFT(0x02578000) /* 0.146362305 */, + PRESHIFT(0x007d1000) /* 0.030532837 */, + PRESHIFT(0x000a1000) /* 0.002456665 */, + -PRESHIFT(0x00002000) /* -0.000030518 */, + PRESHIFT(0x000e0000) /* 0.003417969 */, + PRESHIFT(0x00586000) /* 0.021575928 */, + PRESHIFT(0x00046000) /* 0.001068115 */, + 
PRESHIFT(0x119e9000) /* 1.101211548 */, + PRESHIFT(0x02578000) /* 0.146362305 */, + PRESHIFT(0x007d1000) /* 0.030532837 */, + PRESHIFT(0x000a1000) /* 0.002456665 */, +}, + { + -PRESHIFT(0x00002000) /* -0.000030518 */, /* 9 */ + PRESHIFT(0x000dd000) /* 0.003372192 */, + PRESHIFT(0x00500000) /* 0.019531250 */, + -PRESHIFT(0x003e6000) /* -0.015228271 */, + PRESHIFT(0x116fc000) /* 1.089782715 */, + PRESHIFT(0x02616000) /* 0.148773193 */, + PRESHIFT(0x007aa000) /* 0.029937744 */, + PRESHIFT(0x0009a000) /* 0.002349854 */, + -PRESHIFT(0x00002000) /* -0.000030518 */, + PRESHIFT(0x000dd000) /* 0.003372192 */, + PRESHIFT(0x00500000) /* 0.019531250 */, + -PRESHIFT(0x003e6000) /* -0.015228271 */, + PRESHIFT(0x116fc000) /* 1.089782715 */, + PRESHIFT(0x02616000) /* 0.148773193 */, + PRESHIFT(0x007aa000) /* 0.029937744 */, + PRESHIFT(0x0009a000) /* 0.002349854 */, +}, + { + -PRESHIFT(0x00002000) /* -0.000030518 */, /* 10 */ + PRESHIFT(0x000d7000) /* 0.003280640 */, + PRESHIFT(0x0046b000) /* 0.017257690 */, + -PRESHIFT(0x0084a000) /* -0.032379150 */, + PRESHIFT(0x113be000) /* 1.077117920 */, + PRESHIFT(0x02687000) /* 0.150497437 */, + PRESHIFT(0x0077f000) /* 0.029281616 */, + PRESHIFT(0x00093000) /* 0.002243042 */, + -PRESHIFT(0x00002000) /* -0.000030518 */, + PRESHIFT(0x000d7000) /* 0.003280640 */, + PRESHIFT(0x0046b000) /* 0.017257690 */, + -PRESHIFT(0x0084a000) /* -0.032379150 */, + PRESHIFT(0x113be000) /* 1.077117920 */, + PRESHIFT(0x02687000) /* 0.150497437 */, + PRESHIFT(0x0077f000) /* 0.029281616 */, + PRESHIFT(0x00093000) /* 0.002243042 */, +}, + { + -PRESHIFT(0x00003000) /* -0.000045776 */, /* 11 */ + PRESHIFT(0x000d0000) /* 0.003173828 */, + PRESHIFT(0x003ca000) /* 0.014801025 */, + -PRESHIFT(0x00ce4000) /* -0.050354004 */, + PRESHIFT(0x1102f000) /* 1.063217163 */, + PRESHIFT(0x026cf000) /* 0.151596069 */, + PRESHIFT(0x0074e000) /* 0.028533936 */, + PRESHIFT(0x0008b000) /* 0.002120972 */, + -PRESHIFT(0x00003000) /* -0.000045776 */, + PRESHIFT(0x000d0000) /* 0.003173828 
*/, + PRESHIFT(0x003ca000) /* 0.014801025 */, + -PRESHIFT(0x00ce4000) /* -0.050354004 */, + PRESHIFT(0x1102f000) /* 1.063217163 */, + PRESHIFT(0x026cf000) /* 0.151596069 */, + PRESHIFT(0x0074e000) /* 0.028533936 */, + PRESHIFT(0x0008b000) /* 0.002120972 */, +}, + { + -PRESHIFT(0x00003000) /* -0.000045776 */, /* 12 */ + PRESHIFT(0x000c8000) /* 0.003051758 */, + PRESHIFT(0x0031a000) /* 0.012115479 */, + -PRESHIFT(0x011b5000) /* -0.069168091 */, + PRESHIFT(0x10c54000) /* 1.048156738 */, + PRESHIFT(0x026ee000) /* 0.152069092 */, + PRESHIFT(0x00719000) /* 0.027725220 */, + PRESHIFT(0x00084000) /* 0.002014160 */, + -PRESHIFT(0x00003000) /* -0.000045776 */, + PRESHIFT(0x000c8000) /* 0.003051758 */, + PRESHIFT(0x0031a000) /* 0.012115479 */, + -PRESHIFT(0x011b5000) /* -0.069168091 */, + PRESHIFT(0x10c54000) /* 1.048156738 */, + PRESHIFT(0x026ee000) /* 0.152069092 */, + PRESHIFT(0x00719000) /* 0.027725220 */, + PRESHIFT(0x00084000) /* 0.002014160 */, +}, + { + -PRESHIFT(0x00004000) /* -0.000061035 */, /* 13 */ + PRESHIFT(0x000bd000) /* 0.002883911 */, + PRESHIFT(0x0025d000) /* 0.009231567 */, + -PRESHIFT(0x016ba000) /* -0.088775635 */, + PRESHIFT(0x1082d000) /* 1.031936646 */, + PRESHIFT(0x026e7000) /* 0.151962280 */, + PRESHIFT(0x006df000) /* 0.026840210 */, + PRESHIFT(0x0007d000) /* 0.001907349 */, + -PRESHIFT(0x00004000) /* -0.000061035 */, + PRESHIFT(0x000bd000) /* 0.002883911 */, + PRESHIFT(0x0025d000) /* 0.009231567 */, + -PRESHIFT(0x016ba000) /* -0.088775635 */, + PRESHIFT(0x1082d000) /* 1.031936646 */, + PRESHIFT(0x026e7000) /* 0.151962280 */, + PRESHIFT(0x006df000) /* 0.026840210 */, + PRESHIFT(0x0007d000) /* 0.001907349 */, +}, + { + -PRESHIFT(0x00004000) /* -0.000061035 */, /* 14 */ + PRESHIFT(0x000b1000) /* 0.002700806 */, + PRESHIFT(0x00192000) /* 0.006134033 */, + -PRESHIFT(0x01bf2000) /* -0.109161377 */, + PRESHIFT(0x103be000) /* 1.014617920 */, + PRESHIFT(0x026bc000) /* 0.151306152 */, + PRESHIFT(0x006a2000) /* 0.025909424 */, + PRESHIFT(0x00075000) /* 
0.001785278 */, + -PRESHIFT(0x00004000) /* -0.000061035 */, + PRESHIFT(0x000b1000) /* 0.002700806 */, + PRESHIFT(0x00192000) /* 0.006134033 */, + -PRESHIFT(0x01bf2000) /* -0.109161377 */, + PRESHIFT(0x103be000) /* 1.014617920 */, + PRESHIFT(0x026bc000) /* 0.151306152 */, + PRESHIFT(0x006a2000) /* 0.025909424 */, + PRESHIFT(0x00075000) /* 0.001785278 */, +}, + { + -PRESHIFT(0x00005000) /* -0.000076294 */, /* 15 */ + PRESHIFT(0x000a3000) /* 0.002487183 */, + PRESHIFT(0x000b9000) /* 0.002822876 */, + -PRESHIFT(0x0215c000) /* -0.130310059 */, + PRESHIFT(0x0ff0a000) /* 0.996246338 */, + PRESHIFT(0x0266e000) /* 0.150115967 */, + PRESHIFT(0x00662000) /* 0.024932861 */, + PRESHIFT(0x0006f000) /* 0.001693726 */, + -PRESHIFT(0x00005000) /* -0.000076294 */, + PRESHIFT(0x000a3000) /* 0.002487183 */, + PRESHIFT(0x000b9000) /* 0.002822876 */, + -PRESHIFT(0x0215c000) /* -0.130310059 */, + PRESHIFT(0x0ff0a000) /* 0.996246338 */, + PRESHIFT(0x0266e000) /* 0.150115967 */, + PRESHIFT(0x00662000) /* 0.024932861 */, + PRESHIFT(0x0006f000) /* 0.001693726 */, +}, + { + -PRESHIFT(0x00005000) /* -0.000076294 */, /* 16 */ + PRESHIFT(0x00092000) /* 0.002227783 */, + -PRESHIFT(0x0002d000) /* -0.000686646 */, + -PRESHIFT(0x026f7000) /* -0.152206421 */, + PRESHIFT(0x0fa13000) /* 0.976852417 */, + PRESHIFT(0x025ff000) /* 0.148422241 */, + PRESHIFT(0x0061f000) /* 0.023910522 */, + PRESHIFT(0x00068000) /* 0.001586914 */, + -PRESHIFT(0x00005000) /* -0.000076294 */, + PRESHIFT(0x00092000) /* 0.002227783 */, + -PRESHIFT(0x0002d000) /* -0.000686646 */, + -PRESHIFT(0x026f7000) /* -0.152206421 */, + PRESHIFT(0x0fa13000) /* 0.976852417 */, + PRESHIFT(0x025ff000) /* 0.148422241 */, + PRESHIFT(0x0061f000) /* 0.023910522 */, + PRESHIFT(0x00068000) /* 0.001586914 */, +}, + \ No newline at end of file Index: apps/codecs/libmad/synth.c =================================================================== --- apps/codecs/libmad/synth.c (revision 28580) +++ apps/codecs/libmad/synth.c (working copy) @@ -58,14 +58,7 
@@ memset(synth->filter, 0, sizeof(synth->filter)); } -#if 0 /* dct32 asm implementation is slower on current arm systems */ -/* #ifdef FPM_ARM */ -void dct32(mad_fixed_t const in[32], unsigned int slot, - mad_fixed_t lo[16][8], mad_fixed_t hi[16][8]); - -#else - /* * An optional optimization called here the Subband Synthesis Optimization * (SSO) improves the performance of subband synthesis at the expense of @@ -97,7 +90,8 @@ /* second SSO shift, with rounding */ # if defined(OPT_SSO) -# define SHIFT(x) (((x) + (1L << 11)) >> 12) +# define SHIFT(x) (x) +//# define SHIFT(x) (((x) + (1L << 11)) >> 12) # else # define SHIFT(x) (x) # endif @@ -115,7 +109,7 @@ : [a] "r" ((x)), [b] "r" ((y))); \ hi; \ }) -# elif defined(FPM_ARM) +# elif defined(FPM_ARM) & 0 /* This is an ARM version of the OPT_SPEED optimisation below. This implementation will loose 1 bit of accuracy. */ # define MUL(x, y) \ @@ -130,16 +124,19 @@ hi; \ }) # elif defined(OPT_SPEED) && defined(MAD_F_MLX) -# define MUL(x, y) \ - ({ mad_fixed64hi_t hi; \ - mad_fixed64lo_t lo; \ - MAD_F_MLX(hi, lo, (x), (y)); \ - hi << (32 - MAD_F_SCALEBITS - 3); \ - }) + # else -# define MUL(x, y) mad_f_mul((x), (y>>3)) +//# define MUL(x, y) mad_f_mul((x), (y>>3)) +# define MUL(x, y) \ + (int32_t)(((int64_t)x) *((int64_t)y)>>31) + + # endif + + + + /* * NAME: dct32() * DESCRIPTION: perform fast in[32]->out[32] DCT @@ -491,7 +488,7 @@ /* 31 */ lo[15][slot] = SHIFT((((((((MUL(t171 - t172, costab16) * 2) - t173) * 2) - t174) * 2) - t175) * 2) - t176); - + //DEBUGF("lo[13][slot]: %d\n", lo[13][slot]); //lo[13][slot]: -33 /* * Totals: * 80 multiplies @@ -504,20 +501,21 @@ # undef MUL # undef SHIFT -#endif + /* third SSO shift and/or D[] optimization preshift */ -# if defined(OPT_SSO) +# if 1 # if MAD_F_FRACBITS != 28 # error "MAD_F_FRACBITS must be 28 to use OPT_SSO" # endif -# define ML0(hi, lo, x, y) ((lo) = (x) * (y)) -# define MLA(hi, lo, x, y) ((lo) += (x) * (y)) +//# define ML0(hi, lo, x, y) ((lo) = (x) * (y)) +//# define 
MLA(hi, lo, x, y) ((lo) += (x) * (y)) # define MLN(hi, lo) ((lo) = -(lo)) # define MLZ(hi, lo) ((void) (hi), (mad_fixed_t) (lo)) -# define SHIFT(x) ((x) >> 2) -# define PRESHIFT(x) ((MAD_F(x) + (1L << 13)) >> 14) +# define SHIFT(x) ((x) << 2) +# define PRESHIFT(x) (((x) + (1L << 15)) >> 16) +//# define PRESHIFT(x) ((x) >> 12) # else # define ML0(hi, lo, x, y) MAD_F_ML0((hi), (lo), (x), (y)) # define MLA(hi, lo, x, y) MAD_F_MLA((hi), (lo), (x), (y)) @@ -533,12 +531,48 @@ # endif # endif + + + +# define ML0(hi, lo, x, y) \ + lo =(int32_t) (( ((int64_t)x) * ((int64_t)y)) >>16); \ + hi=0; + +# define MLA(hi, lo, x, y) \ + lo +=(int32_t) (( ((int64_t)x) * ((int64_t)y)) >>16); \ + + + +/* static -mad_fixed_t const D[17][32] ICONST_ATTR = { -# include "D.dat" +int32_t const D[17][32] ICONST_ATTR = { +# include "D_sort.dat" }; +*/ +static +int16_t const D[34][16] ICONST_ATTR = +{ +# include "D_even.dat" +# include "D_odd.dat" +}; + + + /* +static +int16_t const De[17][16] ICONST_ATTR = { +# include "D_even.dat" +}; + +static +int16_t const Do[17][16] ICONST_ATTR = { +# include "D_odd.dat" +}; +*/ + + +/* * NAME: synth->full() * DESCRIPTION: perform full frequency PCM synthesis */ @@ -763,7 +797,7 @@ } } -#elif defined(FPM_ARM) +#elif defined(FPM_ARM) && 0 #define PROD_O(hi, lo, f, ptr) \ ({ \ @@ -913,58 +947,197 @@ } # else /* not FPM_COLDFIRE_EMAC and not FPM_ARM */ +/* -#define PROD_O(hi, lo, f, ptr, offset) \ - ML0(hi, lo, (*f)[0], ptr[ 0+offset]); \ - MLA(hi, lo, (*f)[1], ptr[14+offset]); \ - MLA(hi, lo, (*f)[2], ptr[12+offset]); \ - MLA(hi, lo, (*f)[3], ptr[10+offset]); \ - MLA(hi, lo, (*f)[4], ptr[ 8+offset]); \ - MLA(hi, lo, (*f)[5], ptr[ 6+offset]); \ - MLA(hi, lo, (*f)[6], ptr[ 4+offset]); \ - MLA(hi, lo, (*f)[7], ptr[ 2+offset]); - -#define PROD_A(hi, lo, f, ptr, offset) \ - MLA(hi, lo, (*f)[0], ptr[ 0+offset]); \ - MLA(hi, lo, (*f)[1], ptr[14+offset]); \ - MLA(hi, lo, (*f)[2], ptr[12+offset]); \ - MLA(hi, lo, (*f)[3], ptr[10+offset]); \ - MLA(hi, lo, (*f)[4], 
ptr[ 8+offset]); \ - MLA(hi, lo, (*f)[5], ptr[ 6+offset]); \ - MLA(hi, lo, (*f)[6], ptr[ 4+offset]); \ - MLA(hi, lo, (*f)[7], ptr[ 2+offset]); - -#define PROD_SB(hi, lo, ptr, offset, first_idx, last_idx) \ - ML0(hi, lo, (*fe)[0], ptr[first_idx]); \ - MLA(hi, lo, (*fe)[1], ptr[16+offset]); \ - MLA(hi, lo, (*fe)[2], ptr[18+offset]); \ - MLA(hi, lo, (*fe)[3], ptr[20+offset]); \ - MLA(hi, lo, (*fe)[4], ptr[22+offset]); \ - MLA(hi, lo, (*fe)[5], ptr[24+offset]); \ - MLA(hi, lo, (*fe)[6], ptr[26+offset]); \ - MLA(hi, lo, (*fe)[7], ptr[28+offset]); \ - MLA(hi, lo, (*fo)[7], ptr[29-offset]); \ - MLA(hi, lo, (*fo)[6], ptr[27-offset]); \ - MLA(hi, lo, (*fo)[5], ptr[25-offset]); \ - MLA(hi, lo, (*fo)[4], ptr[23-offset]); \ - MLA(hi, lo, (*fo)[3], ptr[21-offset]); \ - MLA(hi, lo, (*fo)[2], ptr[19-offset]); \ - MLA(hi, lo, (*fo)[1], ptr[17-offset]); \ - MLA(hi, lo, (*fo)[0], ptr[last_idx ]); +basic idea: +The PROD_ calls all step by two at a time, so pairing all the even values, and then all the odd values together would mean that +we could do a packed load every time. However, this is complicated by the fact that not all accesses will be aligned to either +the first or second even or odd digit. For instance, one call might load and need the second of two packed values but not the +first. We could try to detect this in the code by checking whether ptr%4 is 0, 1, 2 or 3 (which is what I think the even/odd does). 
+*/ + +/*functions that will not go into the .S file*/ + +//case where (ptr/4)%4 = 3 (middle odd) +#if 1 && defined(FPM_ARM) + //use 1 register for accum (lo), 1 for f ptr, 1 for D ptr, 4 for audio data, 4 for D values = 11 used +#define PROD_O_ODD_ODD(hi, lo, f, ptr) \ + ({ \ + mad_fixed_t *__p = *(f); \ + asm volatile ( \ + /*f is always 64 bit aligned, D is 16/48 bit aligned*/ \ + "ldmia %2!, {r4-r8} \n\t" /*load (packed) D[0..7]*/ \ + /*^ARM11: 1 cycle stall on 50% that are not 64 bit aligned*/ \ + "ldmia %1!, {r0-r3} \n\t" /*load f[0..3]*/ \ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smulwt %0, r0, r4 \n\t" /*ML0(hi, lo, (*f)[0], ptr[0]); */\ + "smlawb %0,r1,r8,%0 \n\t" /*MLA(hi, lo, (*f)[1], ptr[7]); */\ + "smlawt %0,r2,r7,%0 \n\t" /*MLA(hi, lo, (*f)[2], ptr[6]); */\ + "ldmia %1!,{r0-r2,r8} \n\t" /*load f[4..7]*/ \ + "smlawb %0,r3,r7,%0 \n\t" /*MLA(hi, lo, (*f)[3], ptr[5]); */\ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smlawt %0,r0,r6,%0 \n\t" /*MLA(hi, lo, (*f)[4], ptr[4]); */\ + "smlawb %0,r1,r6,%0 \n\t" /*MLA(hi, lo, (*f)[5], ptr[3]); */\ + "smlawt %0,r2,r5,%0 \n\t" /*MLA(hi, lo, (*f)[6], ptr[2]); */\ + "smlawb %0,r8,r5,%0 \n\t" /*MLA(hi, lo, (*f)[7], ptr[1]); */\ + /*ARM9E: 8x mul (8*1 clk) + 3 ldm (12 clk) + 1 ldr (1 clk) = 20 clks*/\ + /*ARM11: */\ + : "=&r" (lo),"+r" (__p), "+r" (ptr) \ + : \ + : "r0", "r1", "r2", "r3", "r4","r5","r6","r7", "r8", "memory"); \ + }); +#else +#define PROD_O_ODD_ODD(hi, lo, f, ptr) \ + ML0(hi, lo, (*f)[0], ptr[0]); /*0 */\ + MLA(hi, lo, (*f)[1], ptr[7]); /*14*/\ + MLA(hi, lo, (*f)[2], ptr[6]); /*12*/\ + MLA(hi, lo, (*f)[3], ptr[5]); /*10*/\ + MLA(hi, lo, (*f)[4], ptr[4]); /*8 */\ + MLA(hi, lo, (*f)[5], ptr[3]); /*6 */\ + MLA(hi, lo, (*f)[6], ptr[2]); /*4 */\ + MLA(hi, lo, (*f)[7], ptr[1]); /*2 */ +#endif + +//case where (ptr/4)%4 = 1 (simple odd) +#if 1 && defined(FPM_ARM) +#define PROD_O_ODD_EVEN(hi, lo, f, ptr) \ + ({ \ + mad_fixed_t *__p = *(f); \ + asm volatile ( \ + /*f is always 64 bit aligned, D is only 32 
bit aligned*/\ + "ldmia %2!, {r4-r7} \n\t" /*load (packed) D[0..7] */ \ + /*^ARM11: 1 cycle stall on 50% that are not 64 bit aligned*/\ + "ldmia %1!, {r0-r3} \n\t" /*load f[0..3]*/ \ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smulwb %0, r0, r4 \n\t" /*ML0(hi, lo, (*f)[0], ptr[0]); */\ + "smlawt %0,r1,r7,%0 \n\t" /*MLA(hi, lo, (*f)[1], ptr[7]); */\ + "smlawb %0,r2,r7,%0 \n\t" /*MLA(hi, lo, (*f)[2], ptr[6]); */\ + "ldmia %1!,{r0-r2,r7} \n\t" /*load f[4..7]*/ \ + "smlawt %0,r3,r6,%0 \n\t" /*MLA(hi, lo, (*f)[3], ptr[5]); */\ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smlawb %0,r0,r6,%0 \n\t" /*MLA(hi, lo, (*f)[4], ptr[4]); */\ + "smlawt %0,r1,r5,%0 \n\t" /*MLA(hi, lo, (*f)[5], ptr[3]); */\ + "smlawb %0,r2,r5,%0 \n\t" /*MLA(hi, lo, (*f)[6], ptr[2]); */\ + "smlawt %0,r7,r4,%0 \n\t" /*MLA(hi, lo, (*f)[7], ptr[1]); */\ + /*ARM9E: 8x mul (8*1 clk) + 3 ldm (12 clk) = 20 clks */\ + /*ARM11: lots of stalling due to %0 */\ + : "=&r" (lo),"+r" (__p), "+r" (ptr) \ + : \ + : "r0", "r1", "r2", "r3", "r4","r5","r6","r7", "memory"); \ + }); +#else +#define PROD_O_ODD_EVEN(hi, lo, f, ptr) \ + ML0(hi, lo, (*f)[0], ptr[0]); /*0 */\ + MLA(hi, lo, (*f)[1], ptr[7]); /*14*/\ + MLA(hi, lo, (*f)[2], ptr[6]); /*12*/\ + MLA(hi, lo, (*f)[3], ptr[5]); /*10*/\ + MLA(hi, lo, (*f)[4], ptr[4]); /*8 */\ + MLA(hi, lo, (*f)[5], ptr[3]); /*6 */\ + MLA(hi, lo, (*f)[6], ptr[2]); /*4 */\ + MLA(hi, lo, (*f)[7], ptr[1]); /*2 */ +#endif + +//case where ptr is evenly divided by 2 twice (e.g. 
(ptr/4) % 4 == 0) +#if 1 && defined(FPM_ARM) +#define PROD_A_EVEN_EVEN(hi, lo, f, ptr) \ + ({ \ + mad_fixed_t *__p = *(f); \ + asm volatile ( \ + /*f is always 64 bit aligned, D is only 32 bit aligned*/\ + "ldmia %2!, {r4-r7} \n\t" /*load (packed) D[0..7] */ \ + /*^ARM11: 1 cycle stall on 50% that are not 64 bit aligned*/\ + "ldmia %1!, {r0-r3} \n\t" /*load f[0..3]*/ \ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smlawb %0, r0, r4,%0 \n\t" /*ML0(hi, lo, (*f)[0], ptr[0]); */\ + "smlawt %0,r1,r7,%0 \n\t" /*MLA(hi, lo, (*f)[1], ptr[7]); */\ + "smlawb %0,r2,r7,%0 \n\t" /*MLA(hi, lo, (*f)[2], ptr[6]); */\ + "ldmia %1!,{r0-r2,r7} \n\t" /*load f[4..7]*/ \ + "smlawt %0,r3,r6,%0 \n\t" /*MLA(hi, lo, (*f)[3], ptr[5]); */\ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smlawb %0,r0,r6,%0 \n\t" /*MLA(hi, lo, (*f)[4], ptr[4]); */\ + "smlawt %0,r1,r5,%0 \n\t" /*MLA(hi, lo, (*f)[5], ptr[3]); */\ + "smlawb %0,r2,r5,%0 \n\t" /*MLA(hi, lo, (*f)[6], ptr[2]); */\ + "smlawt %0,r7,r4,%0 \n\t" /*MLA(hi, lo, (*f)[7], ptr[1]); */\ + /*ARM9E: 8x mul (8*1 clk) + 3 ldm (12 clk) = 20 clks */\ + /*ARM11: */\ + : "+&r" (lo),"+r" (__p), "+r" (ptr) \ + : \ + : "r0", "r1", "r2", "r3", "r4","r5","r6","r7", "memory"); \ + }); +#else +#define PROD_A_EVEN_EVEN(hi, lo, f, ptr) \ + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */\ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/\ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/\ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/\ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */\ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */\ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */\ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ +#endif + +//case where (ptr/4)%4 == 2 (middle even) +#if 1 && defined(FPM_ARM) + //use 1 register for accum (lo), 1 for f ptr, 1 for D ptr, 4 for audio data, 4 for D values = 11 used +#define PROD_A_EVEN_ODD(hi, lo, f, ptr) \ + ({ \ + mad_fixed_t *__p = *(f); \ + asm volatile ( \ + /*f is always 64 bit aligned, D is 16/48 bit aligned*/ \ + "ldmia %2!, {r4-r8} \n\t" /*load (packed) D[0..7]*/ \ + 
/*^ARM11: 1 cycle stall on 50% that are not 64 bit aligned*/ \ + "ldmia %1!, {r0-r3} \n\t" /*load f[0..3]*/ \ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smlawt %0, r0, r4,%0 \n\t" /*ML0(hi, lo, (*f)[0], ptr[0]); */\ + "smlawb %0,r1,r8,%0 \n\t" /*MLA(hi, lo, (*f)[1], ptr[7]); */\ + "smlawt %0,r2,r7,%0 \n\t" /*MLA(hi, lo, (*f)[2], ptr[6]); */\ + "ldmia %1!,{r0-r2,r8} \n\t" /*load f[4..7]*/ \ + "smlawb %0,r3,r7,%0 \n\t" /*MLA(hi, lo, (*f)[3], ptr[5]); */\ + /*ARM11: 1 cycle stall waiting for r0*/\ + "smlawt %0,r0,r6,%0 \n\t" /*MLA(hi, lo, (*f)[4], ptr[4]); */\ + "smlawb %0,r1,r6,%0 \n\t" /*MLA(hi, lo, (*f)[5], ptr[3]); */\ + "smlawt %0,r2,r5,%0 \n\t" /*MLA(hi, lo, (*f)[6], ptr[2]); */\ + "smlawb %0,r8,r5,%0 \n\t" /*MLA(hi, lo, (*f)[7], ptr[1]); */\ + /*ARM9E: 8x mul (8*1 clk) + 3 ldm (13 clk) = 21 clks*/\ + /*ARM11: */\ + : "+&r" (lo),"+r" (__p), "+r" (ptr) \ + : \ + : "r0", "r1", "r2", "r3", "r4","r5","r6","r7", "r8", "memory"); \ + }); + +#else + +#define PROD_A_EVEN_ODD(hi, lo, f, ptr) \ + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */\ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/\ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/\ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/\ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */\ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */\ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */\ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ +#endif + + + static void synth_full(struct mad_synth *synth, struct mad_frame const *frame, unsigned int nch, unsigned int ns) { - int p, sb; + int p,p2, sb; unsigned int phase, ch, s; mad_fixed_t *pcm, (*filter)[2][2][16][8]; mad_fixed_t (*sbsample)[36][32]; mad_fixed_t (*fe)[8], (*fx)[8], (*fo)[8]; - mad_fixed_t const (*D0ptr)[32], *ptr; - mad_fixed_t const (*D1ptr)[32]; - mad_fixed64hi_t hi; + //int32_t const (*D0ptr)[32]; + int16_t const (*D0eptr)[16],(*D0optr)[16],(*D1eptr)[16],(*D1optr)[16],*ptr2,*eptr,*optr; + int16_t const (*De)[16]=&(D[0][0]); + int16_t const (*Do)[16]=&(D[17][0]); // + //int32_t const (*D1ptr)[32]; + mad_fixed64hi_t hi, 
hi2; mad_fixed64lo_t lo; + mad_fixed_t accum2; for (ch = 0; ch < nch; ++ch) { sbsample = &(*frame->sbsample_prev)[ch]; @@ -973,86 +1146,565 @@ pcm = synth->pcm.samples[ch]; for (s = 0; s < ns; ++s) { + dct32((*sbsample)[s], phase >> 1, (*filter)[0][phase & 1], (*filter)[1][phase & 1]); - p = (phase - 1) & 0xf; - + p = (phase - 1) & 0xf; //steps 15,0,1,2,3..15, 0... + p2 = (phase - 1) & 0xf; //same value as p; p2>>1 steps 7,0,0,1,1..7 + //DEBUGF("p: %d s: %d\n", p,s); /* calculate 32 samples */ fe = &(*filter)[0][ phase & 1][0]; fx = &(*filter)[0][~phase & 1][0]; fo = &(*filter)[1][~phase & 1][0]; + /*if even, fo=fe+1536, if odd fo=fe+512*/ + //DEBUGF("Do offset: %ld \n", (intptr_t)Do-(intptr_t) D); - D0ptr = (void*)&D[0][ p]; - D1ptr = (void*)&D[0][-p]; + D0optr = (void*)&Do[0][ p2>>1]; //17x16 + D0eptr = (void*)&De[0][ p2>>1]; //17x16 + D1optr = (void*)&Do[0][-(p2>>1)]; //17x16 + D1eptr = (void*)&De[0][-(p2>>1)]; //17x16 + + + switch (s & 3) //p = (s-1)%16 + { + case 0: /*s%4 == 0 and p%4 == 3*/ + //DEBUGF("[0]Do offset: %ld \n", (intptr_t)fo-(intptr_t) fe); + //DEBUGF("D0odd offset: %ld \n", (intptr_t)D0optr-(intptr_t) D1optr); + //ptr = *D0ptr; //alignment is p%4, so ptr is 3 aligned + optr = *D0optr; + #if 1 + //fx:0, optr:2/6 + PROD_O_ODD_ODD(hi, lo, fx, optr) + #else + ML0(hi, lo, (*fx)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fx)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fx)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fx)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fx)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fx)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fx)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fx)[7], optr[1]); /*2 */ + #endif + + MLN(hi, lo); + //ptr=ptr+1; //alignment is p%4+1, so ptr is 0 aligned + eptr=*D0eptr; + eptr=eptr+1; + #if 1 + //fe:0,0 eptr:0 (bottom of packed variable, 64 bit aligned) + PROD_A_EVEN_EVEN(hi, lo, fe, eptr) //why can fe or fx be used here with identical results??? 
+ #else - if(s & 1) - { - ptr = *D0ptr; - PROD_O(hi, lo, fx, ptr, 1) - MLN(hi, lo); - PROD_A(hi, lo, fe, ptr, 0) - pcm[0] = SHIFT(MLZ(hi, lo)); - pcm += 16; + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + #endif + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; +#if 0 + synth_full_mod_zero(pcm, fe, D0eptr, D1eptr); +#else + for (sb = 15; sb; sb--, fo++) + { + ++fe; + ++D0eptr; + ++D0optr; + ++D1eptr; + ++D1optr; + + /* D[32 - sb][i] == -D[sb][31 - i] */ + //ptr = *D0ptr; //alignment is p%4, so ptr is 3 aligned + optr = *D0optr; + //PROD_O_ODD_ODD(hi, lo, fo, ptr, 0) + //fo:0, optr:2 (bottom of packed variable, not 64 bit aligned) + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ - for (sb = 15; sb; sb--, fo++) - { - ++fe; - ++D0ptr; - ++D1ptr; - /* D[32 - sb][i] == -D[sb][31 - i] */ - ptr = *D0ptr; - PROD_O(hi, lo, fo, ptr, 1) - MLN(hi, lo); - PROD_A(hi, lo, fe, ptr, 0) - pcm[-sb] = SHIFT(MLZ(hi, lo)); + MLN(hi, lo); //negate + // ptr2 = *D1ptr; //alignment is 4-p%4, so ptr is 1 aligned + + eptr = *D1eptr; + //DEBUGF("eptr: %ld %ld\n", (intptr_t)eptr-(intptr_t) De, ((intptr_t)eptr)%8); + //fo:0, eptr:2 (bottom of packed variable, not 64 bit aligned) + ML0(hi2, accum2, (*fo)[7], eptr[14]); /*29*/ + MLA(hi2, accum2, (*fo)[6], eptr[13]); /*27*/ + MLA(hi2, accum2, (*fo)[5], eptr[12]); /*25*/ + MLA(hi2, accum2, (*fo)[4], eptr[11]); /*23*/ + MLA(hi2, accum2, (*fo)[3], eptr[10]); /*21*/ + MLA(hi2, accum2, (*fo)[2], eptr[9]); 
/*19*/ + MLA(hi2, accum2, (*fo)[1], eptr[8]); /*17*/ + MLA(hi2, accum2, (*fo)[0], eptr[7]); /*15*/ + // ptr=ptr+1; //alignement is p%4+1, so ptr is 0 aligned + // PROD_A_EVEN_EVEN(hi, lo, fe, ptr, 1) - ptr = *D1ptr; - PROD_SB(hi, lo, ptr, 1, 15, 30) - pcm[sb] = SHIFT(MLZ(hi, lo)); - } + //ptr=ptr+1; + eptr=*D0eptr; + eptr=eptr+1; - ptr = *(D0ptr + 1); - PROD_O(hi, lo, fo, ptr, 1) - pcm[0] = SHIFT(-MLZ(hi, lo)); - } - else - { - ptr = *D0ptr; - PROD_O(hi, lo, fx, ptr, 0) - MLN(hi, lo); - PROD_A(hi, lo, fe, ptr, 1) - pcm[0] = SHIFT(MLZ(hi, lo)); - pcm += 16; + //fo:0, eptr:0 (bottom of packed variable, 64 bit aligned) + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ - for (sb = 15; sb; sb--, fo++) - { - ++fe; - ++D0ptr; - ++D1ptr; + pcm[-sb] = SHIFT(MLZ(hi, lo)); - /* D[32 - sb][i] == -D[sb][31 - i] */ - ptr = *D0ptr; - PROD_O(hi, lo, fo, ptr, 0) - MLN(hi, lo); - PROD_A(hi, lo, fe, ptr, 1) - pcm[-sb] = SHIFT(MLZ(hi, lo)); + //PROD_SB_EVEN_ODD_ONE(hi2, lo2, D1ptr, 0, 30, 15) + optr = *D1optr; + //fe:0, optr:2 (bottom of packed variable, not 64 bit aligned) + MLA(hi2, accum2, (*fe)[0], optr[14]); /*30*/ + MLA(hi2, accum2, (*fe)[1], optr[7]); /*16*/ + MLA(hi2, accum2, (*fe)[2], optr[8]); /*18*/ + MLA(hi2, accum2, (*fe)[3], optr[9]); /*20*/ + MLA(hi2, accum2, (*fe)[4], optr[10]); /*22*/ + MLA(hi2, accum2, (*fe)[5], optr[11]); /*24*/ + MLA(hi2, accum2, (*fe)[6], optr[12]); /*26*/ + MLA(hi2, accum2, (*fe)[7], optr[13]); /*28*/ - ptr = *D1ptr; - PROD_SB(hi, lo, ptr, 0, 30, 15) - pcm[sb] = SHIFT(MLZ(hi, lo)); - } + pcm[sb] = SHIFT(MLZ(hi2, accum2)); + } +#endif //asm loop - ptr = *(D0ptr + 1); - PROD_O(hi, lo, fo, ptr, 0) - pcm[0] = SHIFT(-MLZ(hi, lo)); - } + optr = *(D0optr+1); + #if 1 + //ptr = 
*(D0ptr + 1); //alignment is p%4, so ptr is 3 aligned + //fo:0, optr:2/6 + PROD_O_ODD_ODD(hi, lo, fo, optr) + #else + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + #endif + pcm[0] = SHIFT(-MLZ(hi, lo)); + + break; + + case 1: /*s%4 == 1 and p%4 == 0*/ + //ptr = *D0ptr; //alignment is p%4, so ptr is 0 aligned + //ptr=ptr+1; //alignment is p%4+1, so ptr is 1 aligned + + optr = *D0optr; //alignment is p%4, so ptr is 0 aligned + #if 1 + //fx:0, optr:0/4 + PROD_O_ODD_EVEN(hi, lo, fx, optr) + #else + ML0(hi, lo, (*fx)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fx)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fx)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fx)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fx)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fx)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fx)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fx)[7], optr[1]); /*2 */ + #endif + + MLN(hi, lo); + //ptr=ptr-1; //alignment is p%4, so ptr is 0 aligned + eptr = *D0eptr; + #if 1 + //fe:0, eptr:0/4 (bottom of packed variable, 64 bit aligned) + PROD_A_EVEN_EVEN(hi, lo, fe, eptr) + #else + + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + #endif + + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + + for (sb = 15; sb; sb--, fo++) + { + ++fe; + ++D0eptr; + ++D0optr; + ++D1eptr; + ++D1optr; + + + /* D[32 - sb][i] == -D[sb][31 - i] */ + //ptr = *D0ptr; //alignment is p%4, so ptr is 0 aligned + //ptr=ptr+1; //alignment is p%4+1, so ptr is 1 aligned + //ptr2 = *D1ptr; //alignment is 
4-p%4, so ptr is 0 aligned + + //PROD_O_ODD_EVEN(hi, lo, fo, ptr, 1) + + + optr = *D0optr; //alignment is p%4, so ptr is 0 aligned + optr=optr+0; //not sure why this ended up being +0 + + //fo:0, optr:0 (bottom of packed variable, 64 bit aligned) + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + + eptr = *D1eptr; + //fo:0, eptr:0 (bottom of packed variable, 64 bit aligned) + ML0(hi2, accum2, (*fo)[7], eptr[14]); /*28*/ + MLA(hi2, accum2, (*fo)[6], eptr[13]); /*26*/ + MLA(hi2, accum2, (*fo)[5], eptr[12]); /*24*/ + MLA(hi2, accum2, (*fo)[4], eptr[11]); /*22*/ + MLA(hi2, accum2, (*fo)[3], eptr[10]); /*20*/ + MLA(hi2, accum2, (*fo)[2], eptr[ 9]); /*18*/ + MLA(hi2, accum2, (*fo)[1], eptr[ 8]); /*16*/ + MLA(hi2, accum2, (*fo)[0], eptr[15]); /*30*/ + + MLN(hi, lo); + //ptr=ptr-1; //alignment is p%4, so ptr is 1 aligned + + //PROD_A_EVEN_EVEN(hi, lo, fe, ptr, 0) + + eptr = *D0eptr; + //DEBUGF("eptr: %ld %ld\n", (intptr_t)eptr-(intptr_t) De, ((intptr_t)eptr)%8); + //eptr: 0/4 + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + + pcm[-sb] = SHIFT(MLZ(hi, lo)); + + //PROD_SB_ODD_EVEN_ZERO(hi, lo, ptr, 1, 15, 30) + + optr = *D1optr; + //optr: 0/4 + MLA(hi2, accum2, (*fe)[0], optr[7]); /*15*/ + MLA(hi2, accum2, (*fe)[1], optr[8]); /*17*/ + MLA(hi2, accum2, (*fe)[2], optr[9]); /*19*/ + MLA(hi2, accum2, (*fe)[3], optr[10]); /*21*/ + MLA(hi2, accum2, (*fe)[4], optr[11]); /*23*/ + MLA(hi2, accum2, (*fe)[5], optr[12]); /*25*/ + 
MLA(hi2, accum2, (*fe)[6], optr[13]); /*27*/ + MLA(hi2, accum2, (*fe)[7], optr[14]); /*29*/ + + pcm[sb] = SHIFT(MLZ(hi2, accum2)); + } + + + optr = *(D0optr+1); + #if 1 + //ptr = *(D0ptr + 1)+1; //alignment is p%4+1, so ptr is 1 aligned + //fo:0, optr: 0/4 + PROD_O_ODD_EVEN(hi, lo, fo, optr) + #else + + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + #endif + pcm[0] = SHIFT(-MLZ(hi, lo)); + break; + + case 2: /*s%4 == 2 and p%4 == 1*/ + optr = *D0optr; + #if 1 + //ptr = *D0ptr; //alignment is p%4, so ptr is 1 aligned + //fx:0, optr: 0/4 + PROD_O_ODD_EVEN(hi, lo, fx, optr) + #else + + ML0(hi, lo, (*fx)[0], optr[0]); /*0 */ + MLA(hi, lo, (*fx)[1], optr[7]); /*14*/ + MLA(hi, lo, (*fx)[2], optr[6]); /*12*/ + MLA(hi, lo, (*fx)[3], optr[5]); /*10*/ + MLA(hi, lo, (*fx)[4], optr[4]); /*8 */ + MLA(hi, lo, (*fx)[5], optr[3]); /*6 */ + MLA(hi, lo, (*fx)[6], optr[2]); /*4 */ + MLA(hi, lo, (*fx)[7], optr[1]); /*2 */ + #endif + MLN(hi, lo); + eptr = *D0eptr; + eptr = eptr+1; + #if 1 + //ptr=ptr+1; //alignment is p%4+1, so ptr is 2 aligned + //DEBUGF("fe: %ld %ld eptr: %ld %ld\n", ((intptr_t)fe)-((intptr_t)filter),((intptr_t)fe)%8, ((intptr_t)eptr)-((intptr_t)De), ((intptr_t)eptr)%8); + //fe:0, eptr:2/6 + PROD_A_EVEN_ODD(hi, lo, fe, eptr) + #else + + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + #endif + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + + for (sb = 15; sb; sb--, fo++) + { + ++fe; + ++D0eptr; + ++D0optr; + ++D1eptr; + 
++D1optr; + + /* D[32 - sb][i] == -D[sb][31 - i] */ + //ptr = *D0ptr; //alignment is p%4, so ptr is 1 aligned + + //PROD_O_ODD_EVEN(hi, lo, fo, ptr, 0) + + optr = *D0optr; + + //fo: 64 bit aligned, optr: 0/4 aligned + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */\ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/\ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/\ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/\ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */\ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */\ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */\ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + + MLN(hi, lo); + + ptr2=*D1eptr; + ML0(hi2, accum2, (*fo)[7], ptr2[14]); /*29*/\ + MLA(hi2, accum2, (*fo)[6], ptr2[13]); /*27*/\ + MLA(hi2, accum2, (*fo)[5], ptr2[12]); /*25*/\ + MLA(hi2, accum2, (*fo)[4], ptr2[11]); /*23*/\ + MLA(hi2, accum2, (*fo)[3], ptr2[10]); /*21*/\ + MLA(hi2, accum2, (*fo)[2], ptr2[ 9]); /*19*/\ + MLA(hi2, accum2, (*fo)[1], ptr2[ 8]); /*17*/\ + MLA(hi2, accum2, (*fo)[0], ptr2[ 7]); /*15*/ + + + //ptr=ptr+1; //alignment is p%4+1, so ptr is 2 aligned + //PROD_A_EVEN_ODD(hi, lo, fe, ptr, 1) + + eptr = *D0eptr; + eptr = eptr+1; + + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */\ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/\ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/\ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/\ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */\ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */\ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */\ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + + + pcm[-sb] = SHIFT(MLZ(hi, lo)); + + //ptr = *D1ptr; //alignment is 4-p%4, so ptry is 3 aligned + + //PROD_SB_EVEN_ODD_THREE(hi, lo, ptr, 0, 30, 15) + + ptr2 = *D1optr; + ptr2=ptr2-1; + MLA(hi2, accum2, (*fe)[0], ptr2[15]); /*30*/\ + MLA(hi2, accum2, (*fe)[1], ptr2[8]); /*16*/\ + MLA(hi2, accum2, (*fe)[2], ptr2[9]); /*18*/\ + MLA(hi2, accum2, (*fe)[3], ptr2[10]); /*20*/\ + MLA(hi2, accum2, (*fe)[4], ptr2[11]); /*22*/\ + MLA(hi2, accum2, (*fe)[5], ptr2[12]); /*24*/\ + MLA(hi2, accum2, (*fe)[6], ptr2[13]); /*26*/\ + MLA(hi2, accum2, (*fe)[7], 
ptr2[14]); /*28*/\ + + pcm[sb] = SHIFT(MLZ(hi2, accum2)); + } + + //ptr = *(D0ptr + 1); //alignment is p%4, so ptr is 1 aligned + optr = *(D0optr+1); + #if 1 + PROD_O_ODD_EVEN(hi, lo, fo, optr) + #else + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */\ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/\ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/\ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/\ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */\ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */\ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */\ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + #endif + + pcm[0] = SHIFT(-MLZ(hi, lo)); + + break; + + case 3: + //ptr = *D0ptr+1; //alignment is p%4+1, so ptr is 3 aligned + optr = *D0optr; + #if 1 + //fx:0, optr: 2/6 + PROD_O_ODD_ODD(hi, lo, fx, optr) + #else + + ML0(hi, lo, (*fx)[0], optr[0]); /*0 */\ + MLA(hi, lo, (*fx)[1], optr[7]); /*14*/\ + MLA(hi, lo, (*fx)[2], optr[6]); /*12*/\ + MLA(hi, lo, (*fx)[3], optr[5]); /*10*/\ + MLA(hi, lo, (*fx)[4], optr[4]); /*8 */\ + MLA(hi, lo, (*fx)[5], optr[3]); /*6 */\ + MLA(hi, lo, (*fx)[6], optr[2]); /*4 */\ + MLA(hi, lo, (*fx)[7], optr[1]); /*2 */ + #endif + + MLN(hi, lo); + //ptr=ptr-1; //alignment is p%4, so ptr is 2 aligned + eptr= *D0eptr; + #if 1 + //fe: 0, eptr: 2/6 + PROD_A_EVEN_ODD(hi, lo, fe, eptr) + #else + + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + #endif + pcm[0] = SHIFT(MLZ(hi, lo)); + pcm += 16; + + for (sb = 15; sb; sb--, fo++) + { + ++fe; + ++D0eptr; + ++D0optr; + ++D1eptr; + ++D1optr; + + /* D[32 - sb][i] == -D[sb][31 - i] */ + //ptr = *D0ptr+1; //alignment is p%4+1, so ptr is 3 aligned + + //PROD_O_ODD_ODD(hi, lo, fo, ptr, 1) + + optr = *D0optr; + + //optr: 2/6 + ML0(hi, lo, (*fo)[0], optr[0]); /*0 */\ + MLA(hi, lo, (*fo)[1], optr[7]); 
/*14*/\ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/\ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/\ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */\ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */\ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */\ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + + MLN(hi, lo); + + ptr2=*D1eptr; + //ptr2:2/4 + ML0(hi2, accum2, (*fo)[7], ptr2[14]); /*28*/\ + MLA(hi2, accum2, (*fo)[6], ptr2[13]); /*26*/\ + MLA(hi2, accum2, (*fo)[5], ptr2[12]); /*24*/\ + MLA(hi2, accum2, (*fo)[4], ptr2[11]); /*22*/\ + MLA(hi2, accum2, (*fo)[3], ptr2[10]); /*20*/\ + MLA(hi2, accum2, (*fo)[2], ptr2[ 9]); /*18*/\ + MLA(hi2, accum2, (*fo)[1], ptr2[ 8]); /*16*/\ + MLA(hi2, accum2, (*fo)[0], ptr2[15]); /*30*/ + + //ptr=ptr-1; //alignment is p%4, so ptr is 2 aligned + + //PROD_A_EVEN_ODD(hi, lo, fe, ptr, 0) + + eptr= *D0eptr; + MLA(hi, lo, (*fe)[0], eptr[0]); /*0 */\ + MLA(hi, lo, (*fe)[1], eptr[7]); /*14*/\ + MLA(hi, lo, (*fe)[2], eptr[6]); /*12*/\ + MLA(hi, lo, (*fe)[3], eptr[5]); /*10*/\ + MLA(hi, lo, (*fe)[4], eptr[4]); /*8 */\ + MLA(hi, lo, (*fe)[5], eptr[3]); /*6 */\ + MLA(hi, lo, (*fe)[6], eptr[2]); /*4 */\ + MLA(hi, lo, (*fe)[7], eptr[1]); /*2 */ + + pcm[-sb] = SHIFT(MLZ(hi, lo)); + + ptr2 = *D1optr; + //DEBUGF("optr: %ld %ld\n", (intptr_t)*D1optr-(intptr_t) Do, ((intptr_t)*D1optr)%8); + //DEBUGF("eptr: %ld %ld\n", (intptr_t)*D1eptr-(intptr_t) De, ((intptr_t)*D1eptr)%8); + //ptr2:2/4 + MLA(hi2, accum2, (*fe)[0], ptr2[7]); /*15*/\ + MLA(hi2, accum2, (*fe)[1], ptr2[8]); /*17*/\ + MLA(hi2, accum2, (*fe)[2], ptr2[9]); /*19*/\ + MLA(hi2, accum2, (*fe)[3], ptr2[10]); /*21*/\ + MLA(hi2, accum2, (*fe)[4], ptr2[11]); /*23*/\ + MLA(hi2, accum2, (*fe)[5], ptr2[12]); /*25*/\ + MLA(hi2, accum2, (*fe)[6], ptr2[13]); /*27*/\ + MLA(hi2, accum2, (*fe)[7], ptr2[14]); /*29*/\ + + pcm[sb] = SHIFT(MLZ(hi2, accum2)); + } + + //ptr = *(D0ptr + 1)+1; //alignment is p%4+1, so ptr is 3 aligned + optr = *(D0optr+1); + #if 1 + //fo:0, optr: 2/6 + PROD_O_ODD_ODD(hi, lo, fo, optr) + #else + ML0(hi, lo, (*fo)[0], 
optr[0]); /*0 */\ + MLA(hi, lo, (*fo)[1], optr[7]); /*14*/\ + MLA(hi, lo, (*fo)[2], optr[6]); /*12*/\ + MLA(hi, lo, (*fo)[3], optr[5]); /*10*/\ + MLA(hi, lo, (*fo)[4], optr[4]); /*8 */\ + MLA(hi, lo, (*fo)[5], optr[3]); /*6 */\ + MLA(hi, lo, (*fo)[6], optr[2]); /*4 */\ + MLA(hi, lo, (*fo)[7], optr[1]); /*2 */ + #endif + pcm[0] = SHIFT(-MLZ(hi, lo)); + break; + } + + pcm += 16; phase = (phase + 1) % 16; + } + } + } # endif /* FPM_COLDFIRE_EMAC, FPM_ARM */ @@ -1095,6 +1747,7 @@ Dptr = &D[0]; ptr = *Dptr + po; + ML0(hi, lo, (*fx)[0], ptr[ 0]); MLA(hi, lo, (*fx)[1], ptr[14]); MLA(hi, lo, (*fx)[2], ptr[12]);