/*
 * idct.c
 * Copyright (C) 2000-2003 Michel Lespinasse <walken@zoy.org>
 * Copyright (C) 1999-2000 Aaron Holtzman <aholtzma@ess.engr.uvic.ca>
 *
 * This file is part of mpeg2dec, a free MPEG-2 video stream decoder.
 * See http://libmpeg2.sourceforge.net/ for updates.
 *
 * mpeg2dec is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * mpeg2dec is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "plugin.h"

#include "mpeg2dec_config.h"

#include "mpeg2.h"
#include "attributes.h"
#include "mpeg2_internal.h"
/*
 * Fixed-point IDCT coefficients used by the C row/column transforms:
 * Wk = round (2048 * sqrt (2) * cos (k * pi / 16)).
 * W0 and W4 are the same value (2048).
 */
#define W0 2048 /* 2048 * sqrt (2) * cos (4 * pi / 16) */
#define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
#define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
#define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
#define W4 2048 /* 2048 * sqrt (2) * cos (4 * pi / 16) */
#define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
#define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
#define W7 565  /* 2048 * sqrt (2) * cos (7 * pi / 16) */
 
/* Turn the macro argument into a string literal (preprocessor '#'). */
#define SC(x) #x

/*
 * Plain cosine/sine table scaled by 2048 (no sqrt (2) factor).
 * In this part of the file these names appear only in the matrix
 * comment that follows.
 */
#define WA 1448 /* 2048 * cos (4 * pi / 16), cos (pi / 4)      = 0.707106781 */
#define WB 2009 /* 2048 * cos (1 * pi / 16), cos (pi / 16)     = 0.98078528  */
#define WC 1892 /* 2048 * cos (2 * pi / 16), cos (pi / 8)      = 0.923879533 */
#define WD 1703 /* 2048 * cos (3 * pi / 16), cos (3 * pi / 16) = 0.831469612 */
#define WE 1448 /* 2048 * sin (4 * pi / 16), sin (pi / 4)      = 0.707106781 */
#define WF 1138 /* 2048 * sin (3 * pi / 16), sin (3 * pi / 16) = 0.555570233 */
#define WG 784 /* 2048 * sin (2 * pi / 16), sin (pi / 8)       = 0.382683432 */
#define WH 400 /* 2048 * sin (1 * pi / 16), sin (pi / 16)      = 0.195090322 */

/* theoretical matrices used in the asm 1-D IDCT */
/* 
{WA,  WC,  WE,  WG,
 WA,  WG, -WE, -WC,
 WA, -WG, -WE,  WC,
 WA, -WC,  WE, -WG};

{WB,  WD,  WF,  WH,
 WD, -WH, -WB, -WF,
 WF, -WB,  WH,  WD,
 WH, -WF,  WD, -WB};
*/

/*
 * IDCT main entry points.
 *
 * Both are function pointers; nothing in this part of the file assigns
 * them.
 * NOTE(review): presumably they are set to a platform-specific
 * implementation during IDCT initialisation - confirm against the init
 * code.
 *
 * mpeg2_idct_copy takes a coefficient block, a destination pixel pointer
 * and the destination stride; mpeg2_idct_add additionally takes 'last'.
 * NOTE(review): 'last' looks like the index of the last coded coefficient
 * - confirm against the callers.
 */
void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
void (* mpeg2_idct_add) (int last, int16_t * block,
                         uint8_t * dest, int stride);

/*
 * In legal streams, the IDCT output should be between -384 and +384.
 * In corrupted streams, it is possible to force the IDCT output to go
 * to +-3826 - this is the worst case for a column IDCT where the
 * column inputs are 16-bit values.
 */
#ifdef CPU_COLDFIRE
/*
 * Saturate 'value' to 0..255.
 *
 * Only the low byte of the returned value is meaningful: when the input is
 * out of range the low byte is forced to 0x00 (negative input) or 0xFF
 * (input above 255) and the upper bits are left as they were.
 *
 * The compare changes the condition codes, so "cc" is declared as
 * clobbered.
 */
static inline unsigned CLIP(int value)
{
    asm (  /* Note: Uses knowledge that only the low byte of the result is used */
        "cmp.l   #255,%[v]   \n"  /* overflow? (unsigned compare) */
        "bls.b   1f          \n"  /* no: return value */
        "spl.b   %[v]        \n"  /* yes: set low byte to appropriate boundary */
    "1	:                    \n"
        : /* outputs */
        [v]"+d"(value)
        : /* no inputs */
        : /* clobbers */
        "cc"
    );

    return value;
}
#elif defined CPU_ARM
/*
 * Saturate 'value' to 0..255.
 *
 * Only the low byte of the returned value is meaningful: if 'value' is
 * above 255 as an unsigned number (which includes every negative value) it
 * is replaced by ~(value >> 31), i.e. 0 for negative input and all-ones
 * (low byte 0xFF) for positive overflow.
 *
 * The cmp instruction updates the flags, so "cc" must be declared as
 * clobbered; otherwise the compiler may keep a comparison result live
 * across this asm statement.
 */
static inline unsigned CLIP(int value)
{
    asm ( /* Note: Uses knowledge that only the low byte of the result is used */
        "cmp     %[v], #255          \n"
        "mvnhi   %[v], %[v], asr #31 \n"
        : /* outputs */
        [v]"+r"(value)
        : /* no inputs */
        : /* clobbers */
        "cc"
    );
    return value;
}
#else
/*
 * Table-driven fallback: CLIP(i) indexes mpeg2_clip with an offset of 3840,
 * so indices from -3840 up to 3840 + 255 stay inside the array.
 * The table is not filled in this part of the file.
 * NOTE(review): presumably initialised by the IDCT init code - confirm.
 */
uint8_t mpeg2_clip[3840 * 2 + 256] IBSS_ATTR;
#define CLIP(i) ((mpeg2_clip + 3840)[i])
#endif

/*
 * 4x4 matrix times 4-vector product:
 *
 *     | p0 |   | m00 m01 m02 m03 |   | n0 |
 *     | p1 | = | m10 m11 m12 m13 | * | n1 |
 *     | p2 |   | m20 m21 m22 m23 |   | n2 |
 *     | p3 |   | m30 m31 m32 m33 |   | n3 |
 *
 * p0..p3 must be assignable lvalues.  Every argument is parenthesized in
 * the expansion so arbitrary expressions (e.g. "x + 1" or "-W7") can be
 * passed safely.  The n and m arguments are still evaluated more than once,
 * so they must not have side effects.
 */
#define MATRIX_MUL(m00,m01,m02,m03, \
                   m10,m11,m12,m13, \
                   m20,m21,m22,m23, \
                   m30,m31,m32,m33, \
                   n0,n1,n2,n3, \
                   p0,p1,p2,p3) \
do { \
    (p0) = (n0)*(m00) + (n1)*(m01) + (n2)*(m02) + (n3)*(m03); \
    (p1) = (n0)*(m10) + (n1)*(m11) + (n2)*(m12) + (n3)*(m13); \
    (p2) = (n0)*(m20) + (n1)*(m21) + (n2)*(m22) + (n3)*(m23); \
    (p3) = (n0)*(m30) + (n1)*(m31) + (n2)*(m32) + (n3)*(m33); \
} while (0)



/*
 * Rotation ("butterfly") used by the C IDCT:
 *
 *     t0 = c0 * d0 + c1 * d1
 *     t1 = c0 * d1 - c1 * d0
 *
 * It is computed with three multiplications instead of four by sharing the
 * product c0 * (d0 + d1):
 *
 *     t0 = c0 * (d0 + d1) + (c1 - c0) * d1
 *     t1 = c0 * (d0 + d1) - (c1 + c0) * d0
 *
 * t0 and t1 must be assignable lvalues.  All arguments are parenthesized in
 * the expansion, and the internal temporary has a name that is unlikely to
 * clash with a caller's variable.  c0, c1, d0 and d1 are evaluated more
 * than once and must not have side effects.
 */
#define BUTTERFLY(t0,t1,c0,c1,d0,d1)                                  \
do {                                                                  \
    int butterfly_shared_ = (c0) * ((d0) + (d1));                     \
    (t0) = butterfly_shared_ + ((c1) - (c0)) * (d1);                  \
    (t1) = butterfly_shared_ - ((c1) + (c0)) * (d0);                  \
} while (0)


#ifdef C_VERSION_OF_ASM
static inline void idct_row (int16_t * const block)
{
	int f0,f1,f2,f3,f4,f5,f6,f7;
	int a0,a1,a2,a3;
	int b0,b1,b2,b3;
	
	f0 = block[0]+1;
    f2 = block[1];
    f4 = block[2];
    f6 = block[3];
    f1 = block[4];
    f3 = block[5];
    f5 = block[6];
    f7 = block[7];
    
    MATRIX_MUL(W1,  W3,  W5,  W7,
			   W3, -W7, -W1, -W5,
			   W5, -W1,  W7,  W3,
			   W7, -W5,  W3, -W1,
			   f1,f3,f5,f7,
 			   b0,b1,b2,b3);
    
	MATRIX_MUL(W0,  W2,  W4,  W6,
 			   W0,  W6, -W4, -W2,
 			   W0, -W6, -W4,  W2,
 			   W0, -W2,  W4, -W6,
 			   f0,f2,f4,f6,
 			   a0,a1,a2,a3);

	block[0]= (a0+b0) >> 12;
	block[1]= (a1+b1) >> 12;
	block[2]= (a2+b2) >> 12;
	block[3]= (a3+b3) >> 12;
	block[4]= (a3-b3) >> 12;
	block[5]= (a2-b2) >> 12;
	block[6]= (a1-b1) >> 12;
	block[7]= (a0-b0) >> 12;
}
#else
static inline void idct_row (int16_t * const block)
{
    int d0, d1, d2, d3;
    int a0, a1, a2, a3, b0, b1, b2, b3;
    int t0, t1, t2, t3;

    /* shortcut */
    if (likely (!(block[1] | ((int32_t *)block)[1] | ((int32_t *)block)[2] |
                  ((int32_t *)block)[3]))) {
        uint32_t tmp = (uint16_t) (block[0] >> 1);
        tmp |= tmp << 16;
        ((int32_t *)block)[0] = tmp;
        ((int32_t *)block)[1] = tmp;
        ((int32_t *)block)[2] = tmp;
        ((int32_t *)block)[3] = tmp;
        return;
    }

    d0 = (block[0] << 11) + 2048;
    d1 = block[1];
    d2 = block[2] << 11;
    d3 = block[3];
    t0 = d0 + d2;
    t1 = d0 - d2;
    BUTTERFLY (t2, t3, W6, W2, d3, d1);
    a0 = t0 + t2;
    a1 = t1 + t3;
    a2 = t1 - t3;
    a3 = t0 - t2;

    d0 = block[4];
    d1 = block[5];
    d2 = block[6];
    d3 = block[7];
    BUTTERFLY (t0, t1, W7, W1, d3, d0);
    BUTTERFLY (t2, t3, W3, W5, d1, d2);
    b0 = t0 + t2;
    b3 = t1 + t3;
    t0 -= t2;
    t1 -= t3;
    b1 = ((t0 + t1) >> 8) * 181;
    b2 = ((t0 - t1) >> 8) * 181;

    block[0] = (a0 + b0) >> 12;
    block[1] = (a1 + b1) >> 12;
    block[2] = (a2 + b2) >> 12;
    block[3] = (a3 + b3) >> 12;
    block[4] = (a3 - b3) >> 12;
    block[5] = (a2 - b2) >> 12;
    block[6] = (a1 - b1) >> 12;
    block[7] = (a0 - b0) >> 12;
}
#endif

#ifdef C_VERSION_OF_ASM
static inline void idct_col (int16_t * const block)
{
	int f0,f1,f2,f3,f4,f5,f6,f7;
	int a0,a1,a2,a3;
	int b0,b1,b2,b3;
	
	f0 = block[0*8] + 32;
    f2 = block[1*8];
    f4 = block[2*8];
    f6 = block[3*8];
    f1 = block[4*8];
    f3 = block[5*8];
    f5 = block[6*8];
    f7 = block[7*8];
    
    MATRIX_MUL(W1,  W3,  W5,  W7,
			   W3, -W7, -W1, -W5,
			   W5, -W1,  W7,  W3,
			   W7, -W5,  W3, -W1,
			   f1,f3,f5,f7,
 			   b0,b1,b2,b3);
    
	MATRIX_MUL(W0,  W2,  W4,  W6,
 			   W0,  W6, -W4, -W2,
 			   W0, -W6, -W4,  W2,
 			   W0, -W2,  W4, -W6,
 			   f0,f2,f4,f6,
 			   a0,a1,a2,a3);

	block[0*8]= (a0+b0) >> 17;
	block[1*8]= (a1+b1) >> 17;
	block[2*8]= (a2+b2) >> 17;
	block[3*8]= (a3+b3) >> 17;
	block[4*8]= (a3-b3) >> 17;
	block[5*8]= (a2-b2) >> 17;
	block[6*8]= (a1-b1) >> 17;
	block[7*8]= (a0-b0) >> 17;

}
#else
static inline void idct_col (int16_t * const block)
{
    int d0, d1, d2, d3;
    int a0, a1, a2, a3, b0, b1, b2, b3;
    int t0, t1, t2, t3;

    d0 = (block[8*0] << 11) + 65536;
    d1 = block[8*1];
    d2 = block[8*2] << 11;
    d3 = block[8*3];
    t0 = d0 + d2;
    t1 = d0 - d2;
    BUTTERFLY (t2, t3, W6, W2, d3, d1);
    a0 = t0 + t2;
    a1 = t1 + t3;
    a2 = t1 - t3;
    a3 = t0 - t2;

    d0 = block[8*4];
    d1 = block[8*5];
    d2 = block[8*6];
    d3 = block[8*7];
    BUTTERFLY (t0, t1, W7, W1, d3, d0);
    BUTTERFLY (t2, t3, W3, W5, d1, d2);
    b0 = t0 + t2;
    b3 = t1 + t3;
    t0 -= t2;
    t1 -= t3;
    b1 = ((t0 + t1) >> 8) * 181;
    b2 = ((t0 - t1) >> 8) * 181;

    block[8*0] = (a0 + b0) >> 17;
    block[8*1] = (a1 + b1) >> 17;
    block[8*2] = (a2 + b2) >> 17;
    block[8*3] = (a3 + b3) >> 17;
    block[8*4] = (a3 - b3) >> 17;
    block[8*5] = (a2 - b2) >> 17;
    block[8*6] = (a1 - b1) >> 17;
    block[8*7] = (a0 - b0) >> 17;
}
#endif

#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
/*
 * Two-dimensional 8x8 IDCT for ColdFire using the multiply-accumulate unit.
 *
 * Outline of what the assembly does:
 *  - writes 0x80 to MACSR and does not restore the previous value.
 *    NOTE(review): confirm the meaning of this mode bit against the
 *    ColdFire (E)MAC documentation and that no caller relies on MACSR.
 *  - packs the eight coefficients 2048/2841, 2676/2408, 2048/1609 and
 *    1108/565 (the same values as W0..W7) into a0..a3, even-numbered
 *    coefficient in the upper half, odd-numbered one in the lower half.
 *  - row pass, 8 iterations: reads 16 bytes at 'block' (treated as
 *    P0 P2 P4 P6 P1 P3 P5 P7), adds 1 to the first value, evaluates the two
 *    4x4 matrix products, shifts the sums/differences right by 12 and
 *    stores them as 16-bit words starting at block + 128 bytes with a
 *    16 byte distance between the eight results and a 2 byte advance per
 *    iteration, i.e. transposed.  A row whose values 1..7 are all zero
 *    takes a shortcut and stores (first value >> 1) eight times.
 *  - column pass, 8 iterations: the same computation on the transposed
 *    intermediate data, with 32 added to the first value and a shift of
 *    17; results are stored (transposed again) into the original 128
 *    bytes of 'block'.
 *  - the register holding 'block' is modified while looping and is put
 *    back to its original value before the asm ends.
 *
 * NOTE(review): the row pass writes to the 128 bytes that follow the 8x8
 * block, so 'block' must point to at least 256 writable bytes - confirm
 * that every caller provides this.
 * NOTE(review): the labels (row, continue, shortcut, col) are ordinary
 * global assembler labels; if the compiler ever emits this asm twice
 * (e.g. by inlining the function into two callers) assembly will fail.
 *
 * The asm reads and writes memory that is not described by its operands
 * and changes the condition codes, hence the "memory" and "cc" clobbers.
 */
static  void idct(int16_t * const block) ICODE_ATTR;
static  void idct(int16_t * const block)
{
	asm volatile(

		"move.l #0x80,%%macsr			\n\t"
		/* matrix start */	

		"move.l #2048*65536+2841,%%a0				\n\t"	
		"move.l #2676*65536+2408,%%a1           	\n\t"	
		"move.l #2048*65536+1609,%%a2				\n\t"	
		"move.l #1108*65536+565,%%a3            	\n\t"	
	
		/* a4 = destination of the row pass: 128 bytes past 'block' */
		"move.l %[block],%%a4						\n\t"	
		"add.l #128,%%a4							\n\t"	
		/* init loop for i=0 to 7 */	
		"move.l #8,%%d3							\n\t"

		"row:										\n\t"
			/* load P[0] P[2] P[4] P[6] P[1] P[3] P[5] P[7] from the row */	
			"movem.l (%[block]),%%d4-%%d7	\n\t"
			
			/* d2 = OR of values 1..7 of the row; zero means shortcut */
			"clr.l	%%d2				\n\t"
			"move.w %%d4,%%d2				\n\t"
			"or.l 	%%d5,%%d2				\n\t"				
			"or.l 	%%d6,%%d2				\n\t"				
			"or.l 	%%d7,%%d2				\n\t"							
			"bne  	continue				\n\t"
				/* low word of d4 = first value of the row >> 1 */
				"move.l #17,%%d5				\n\t"
				"asr.l 	%%d5,%%d4				\n\t"
	
	       		/* store that value to all eight (transposed) outputs */
				"move.w %%d4,(%%a4)		\n\t"
				"move.w %%d4,(1*16,%%a4)	\n\t"
				"move.w %%d4,(2*16,%%a4)	\n\t"
				"move.w %%d4,(3*16,%%a4)	\n\t"
				"move.w %%d4,(4*16,%%a4)	\n\t"
				"move.w %%d4,(5*16,%%a4)	\n\t"
				"move.w %%d4,(6*16,%%a4)	\n\t"
				"move.w %%d4,(7*16,%%a4)	\n\t"	
				
			"jmp  	shortcut				\n\t"
			"continue:							\n\t"
			/* compute second matrix * P[1],P[3],P[5],P[7]*/	
			"mac.w 	%%d6u, %%a0l, %%acc0					\n\t"	
			"mac.w 	%%d6u, %%a1l, %%acc1					\n\t"	
			"mac.w 	%%d6u, %%a2l, %%acc2					\n\t"	
			"mac.w 	%%d6u, %%a3l, %%acc3					\n\t"	
	
			"mac.w 	%%d6l, %%a1l, %%acc0					\n\t"
			"msac.w %%d6l, %%a3l, %%acc1					\n\t"
			"msac.w %%d6l, %%a0l, %%acc2					\n\t"
			"msac.w %%d6l, %%a2l, %%acc3					\n\t"
	
			"mac.w 	%%d7u, %%a2l, %%acc0					\n\t"	
			"msac.w %%d7u, %%a0l, %%acc1					\n\t"	
			"mac.w 	%%d7u, %%a3l, %%acc2					\n\t"	
			"mac.w 	%%d7u, %%a1l, %%acc3					\n\t"	
	
			"mac.w 	%%d7l, %%a3l, %%acc0					\n\t"
			"msac.w %%d7l, %%a2l, %%acc1					\n\t"
			"mac.w 	%%d7l, %%a1l, %%acc2					\n\t"
			"msac.w %%d7l, %%a0l, %%acc3					\n\t"
				
			/* save second matrix row */
			"movclr.l %%acc0, %%d2							\n\t"
			"movclr.l %%acc1, %%d6							\n\t"
			"movclr.l %%acc2, %%d7							\n\t"
			"movclr.l %%acc3, %%a5							\n\t"
	
			/* rounding: add 1 to P[0] (upper half of d4) */	
			"add.l	#65536*1,%%d4			\n\t"
			/* compute first matrix * P[0],P[2],P[4],P[6]*/	
			"mac.w 	%%d4u, %%a0u, %%acc0					\n\t"
			"mac.w 	%%d4u, %%a0u, %%acc1					\n\t"
			"mac.w 	%%d4u, %%a0u, %%acc2					\n\t"
			"mac.w 	%%d4u, %%a0u, %%acc3					\n\t"
	
			"mac.w 	%%d4l, %%a1u, %%acc0					\n\t"
			"mac.w  %%d4l, %%a3u, %%acc1					\n\t"
			"msac.w %%d4l, %%a3u, %%acc2					\n\t"
			"msac.w %%d4l, %%a1u, %%acc3					\n\t"
	
			"mac.w 	%%d5u, %%a2u, %%acc0					\n\t"	
			"msac.w %%d5u, %%a2u, %%acc1					\n\t"	
			"msac.w %%d5u, %%a2u, %%acc2					\n\t"	
			"mac.w 	%%d5u, %%a2u, %%acc3					\n\t"	
	
			"mac.w 	%%d5l, %%a3u, %%acc0					\n\t"
			"msac.w %%d5l, %%a1u, %%acc1					\n\t"
			"mac.w 	%%d5l, %%a1u, %%acc2					\n\t"
			"msac.w %%d5l, %%a3u, %%acc3					\n\t"
	
			/* shift count for the row pass: divide by 4096 */
			"move.l #12, %%d4			\n\t"
	
			/* mat 1 + mat 2 */
			"move.l %%acc0, %%d5		\n\t"
			"add.l	%%d2,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(%%a4)		\n\t"
				
			"move.l %%acc1, %%d5		\n\t"
			"add.l	%%d6,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(1*16,%%a4)	\n\t"
				
			"move.l %%acc2, %%d5		\n\t"
			"add.l	%%d7,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(2*16,%%a4)	\n\t"
				
			"move.l %%acc3, %%d5		\n\t"
			"add.l	%%a5,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(3*16,%%a4)	\n\t"
				
			/* mat 1 - mat2*/
			"movclr.l %%acc0, %%d5		\n\t"
			"sub.l	%%d2,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(7*16,%%a4)	\n\t"
				
			"movclr.l %%acc1, %%d5		\n\t"
			"sub.l	%%d6,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(6*16,%%a4)	\n\t"
				
			"movclr.l %%acc2, %%d5		\n\t"
			"sub.l	%%d7,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(5*16,%%a4)	\n\t"
				
			"movclr.l %%acc3, %%d5		\n\t"
			"sub.l	%%a5,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(4*16,%%a4)	\n\t"	
	
			"shortcut:					\n\t"
			/* next line, address +16 */	
			"add.l	#16,%[block]		\n\t"
			/* next dest column */	
			"add.l	#2,%%a4				\n\t"	
			"sub.l	#1,%%d3				\n\t"			
		"bne 	row						\n\t"	
		/* for end */
			
		/* a4 back to the initial block value: destination of the column
		 * pass; %[block] now points at the transposed intermediate data */
		"sub.l	#16+128,%%a4			\n\t"
		"move.l #8,%%d3					\n\t"
		/* columns */
		"col:							\n\t"
			
			/* load P[0] P[2] P[4] P[6] P[1] P[3] P[5] P[7] from the row */	
			"movem.l (%[block]),%%d4-%%d7	\n\t"
			
			/* compute second matrix * P[1],P[3],P[5],P[7]*/	
			"mac.w 	%%d6u, %%a0l, %%acc0					\n\t"	
			"mac.w 	%%d6u, %%a1l, %%acc1					\n\t"	
			"mac.w 	%%d6u, %%a2l, %%acc2					\n\t"	
			"mac.w 	%%d6u, %%a3l, %%acc3					\n\t"	
	
			"mac.w 	%%d6l, %%a1l, %%acc0					\n\t"
			"msac.w %%d6l, %%a3l, %%acc1					\n\t"
			"msac.w %%d6l, %%a0l, %%acc2					\n\t"
			"msac.w %%d6l, %%a2l, %%acc3					\n\t"
	
			"mac.w 	%%d7u, %%a2l, %%acc0					\n\t"	
			"msac.w %%d7u, %%a0l, %%acc1					\n\t"	
			"mac.w 	%%d7u, %%a3l, %%acc2					\n\t"	
			"mac.w 	%%d7u, %%a1l, %%acc3					\n\t"	
	
			"mac.w 	%%d7l, %%a3l, %%acc0					\n\t"
			"msac.w %%d7l, %%a2l, %%acc1					\n\t"
			"mac.w 	%%d7l, %%a1l, %%acc2					\n\t"
			"msac.w %%d7l, %%a0l, %%acc3					\n\t"
				
			/* save second matrix row */
			"movclr.l %%acc0, %%d2							\n\t"
			"movclr.l %%acc1, %%d6							\n\t"
			"movclr.l %%acc2, %%d7							\n\t"
			"movclr.l %%acc3, %%a5							\n\t"
	
			/* rounding: add 32 to P[0] (upper half of d4) */	
			"add.l	#32*65536,%%d4				\n\t"
			/* compute first matrix * P[0],P[2],P[4],P[6]*/	
			"mac.w 	%%d4u, %%a0u, %%acc0					\n\t"
			"mac.w 	%%d4u, %%a0u, %%acc1					\n\t"
			"mac.w 	%%d4u, %%a0u, %%acc2					\n\t"
			"mac.w 	%%d4u, %%a0u, %%acc3					\n\t"
	
			"mac.w 	%%d4l, %%a1u, %%acc0					\n\t"
			"mac.w  %%d4l, %%a3u, %%acc1					\n\t"
			"msac.w %%d4l, %%a3u, %%acc2					\n\t"
			"msac.w %%d4l, %%a1u, %%acc3					\n\t"
	
			"mac.w 	%%d5u, %%a2u, %%acc0					\n\t"	
			"msac.w %%d5u, %%a2u, %%acc1					\n\t"	
			"msac.w %%d5u, %%a2u, %%acc2					\n\t"	
			"mac.w 	%%d5u, %%a2u, %%acc3					\n\t"	
	
			"mac.w 	%%d5l, %%a3u, %%acc0					\n\t"
			"msac.w %%d5l, %%a1u, %%acc1					\n\t"
			"mac.w 	%%d5l, %%a1u, %%acc2					\n\t"
			"msac.w %%d5l, %%a3u, %%acc3					\n\t"
	
			/* shift count for the column pass: divide by 131072 */
			"move.l #17, %%d4			\n\t"
	
			/* mat 1 + mat 2 */
			"move.l %%acc0, %%d5		\n\t"
			"add.l	%%d2,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(%%a4)		\n\t"
				
			"move.l %%acc1, %%d5		\n\t"
			"add.l	%%d6,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(1*16,%%a4)	\n\t"
				
			"move.l %%acc2, %%d5		\n\t"
			"add.l	%%d7,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(2*16,%%a4)	\n\t"
				
			"move.l %%acc3, %%d5		\n\t"
			"add.l	%%a5,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(3*16,%%a4)	\n\t"
				
			/* mat 1 - mat2*/
			"movclr.l %%acc0, %%d5		\n\t"
			"sub.l	%%d2,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(7*16,%%a4)	\n\t"
				
			"movclr.l %%acc1, %%d5		\n\t"
			"sub.l	%%d6,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(6*16,%%a4)	\n\t"
				
			"movclr.l %%acc2, %%d5		\n\t"
			"sub.l	%%d7,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(5*16,%%a4)	\n\t"
				
			"movclr.l %%acc3, %%d5		\n\t"
			"sub.l	%%a5,%%d5			\n\t"	
			"asr.l %%d4, %%d5			\n\t"		
			"move.w %%d5,(4*16,%%a4)	\n\t"	
			/****************************/
				
			/* next line, address +16 */	
			"add.l	#16,%[block]			\n\t"
			/* next dest column */	
			"add.l	#2,%%a4			\n\t"
				
			"sub.l #1,%%d3					\n\t"			
		"bne 	col						\n\t"
			
		/* both passes advanced %[block] by 128 bytes each: undo that */
		"sub.l	#16*16,%[block]			\n\t"
		:
		:	[block]"a"(block)	
		:	"%a0","%a1","%a2","%a3","%a4","%a5","%d2","%d3","%d4","%d5","%d6","%d7",
			"memory","cc"
	);

}
#endif

#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
/*
 * Clip an 8x8 block of 16-bit values to 0..255, store it as bytes at
 * 'dest' and zero the block.
 *
 * For each of the 8 rows the asm loads 8 coefficients (16 bytes) from
 * 'block', saturates each one with the cmp/bls/spl sequence, packs them
 * into two 32-bit words that are written to dest and dest + 4, clears the
 * 16 bytes just read and advances the destination by 'stride'.  The
 * register holding 'block' is advanced while looping and put back to its
 * original value at the end.
 *
 * Each coefficient is sign-extended (ext.l) before the compare.  Without
 * that, a negative 16-bit value looks like a large positive 32-bit number
 * and is saturated to 255 instead of 0.
 *
 * NOTE(review): dest is written with 32-bit stores, so it is presumably
 * expected to be suitably aligned - confirm against the callers.
 * NOTE(review): 'foradd' is an ordinary global assembler label; emitting
 * this asm twice (e.g. through inlining) would fail to assemble.
 *
 * The asm accesses memory that is not described by its operands and
 * changes the condition codes, hence the "memory" and "cc" clobbers.
 */
static  void clip_block_to_dest(int16_t * block, uint8_t * dest,const int stride)
{
	asm volatile(
		
		"move.l %[dest],%%a0					\n\t"
		/* init loop for i=0 to 7 */	
		"move.l #8,%%d1							\n\t"
		"clr.l %%d0								\n\t"
		"clr.l %%d3								\n\t"	
	"foradd:								\n\t"

		"movem.l (%[block]),%%d4-%%d7				\n\t"
	
		/* dot 2*/		
		"move.w %%d4,%%d3				\n\t"
		"ext.l   %%d3					\n\t"  /* sign-extend before the unsigned compare */
		"cmp.l   #255,%%d3   				\n\t"
		"bls.b   2f          				\n\t"
		"spl.b   %%d3						\n\t"  
		"2	:                    			\n\t"
		
		/* dot 1*/	
		"swap.w %%d4					\n\t"	
		"move.w %%d4,%%d0				\n\t"
		"ext.l   %%d0					\n\t"
		"cmp.l   #255,%%d0   				\n\t"  /* overflow? */
		"bls.b   1f          				\n\t"  /* no: return value */
		"spl.b   %%d0						\n\t"  /* yes: set low byte to appropriate boundary */
		"1	:                    			\n\t"
		
		/* save dot 1*/		
		"move.b %%d0,%%d2					\n\t"
		/* save dot 2*/		
		"lsl.l  #8,%%d2					\n\t"
		"move.b %%d3,%%d2					\n\t"

		/* dot 4*/				
		"move.w %%d5,%%d3				\n\t"
		"ext.l   %%d3					\n\t"
		"cmp.l   #255,%%d3   				\n\t"
		"bls.b   4f          				\n\t"
		"spl.b   %%d3						\n\t"  
		"4	:                    			\n\t"

		/* dot 3*/	
		"swap.w %%d5					\n\t"	
		"move.w %%d5,%%d0				\n\t"
		"ext.l   %%d0					\n\t"
		"cmp.l   #255,%%d0   				\n\t"  /* overflow? */
		"bls.b   3f          				\n\t"  /* no: return value */
		"spl.b   %%d0						\n\t"  /* yes: set low byte to appropriate boundary */
		"3	:                    			\n\t"

		/* save dot 3*/	
		"lsl.l  #8,%%d2					\n\t"	
		"move.b %%d0,%%d2					\n\t"
		/* save dot 4*/		
		"lsl.l  #8,%%d2					\n\t"
		"move.b %%d3,%%d2					\n\t"
			
		
		/* writes 4 bytes */
		"move.l %%d2,(%%a0)		\n\t"	

		/* dot 6 */	
		"move.w %%d6,%%d3				\n\t"
		"ext.l   %%d3					\n\t"
		"cmp.l   #255,%%d3   			\n\t"
		"bls.b   6f          			\n\t"
		"spl.b   %%d3					\n\t"  
		"6	:                    		\n\t"
			
		/* d4 is no longer needed: use it as the zero that clears the row;
		 * the four clearing stores are interleaved with the code below */
		"move.l #0,%%d4				\n\t"
		/* dot 5 */
		"move.l %%d4,(%[block])+		\n\t"	/* clear coefficients 0-1 */
		"swap.w %%d6					\n\t"	
		"move.w %%d6,%%d0				\n\t"
		"ext.l   %%d0					\n\t"
		"move.l %%d4,(%[block])+		\n\t"	/* clear coefficients 2-3 */
		"cmp.l	#255,%%d0   			\n\t"
		"bls.b  5f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"5	:                    		\n\t"
		
		/* save dot 5*/	
		"move.b %%d0,%%d2				\n\t"
		/* save dot 6*/							
		"lsl.l  #8,%%d2					\n\t"
		"move.b %%d3,%%d2				\n\t"

			
		/* dot 8*/	
		"move.w %%d7,%%d3				\n\t"
		"ext.l   %%d3					\n\t"
		"cmp.l   #255,%%d3   			\n\t"
		"bls.b   8f         	 		\n\t"
		"spl.b   %%d3					\n\t"  
		"8	:  		                  	\n\t"
			
		"move.l %%d4,(%[block])+		\n\t"	/* clear coefficients 4-5 */
		/* dot 7*/	
		"swap.w %%d7					\n\t"	
		"move.w %%d7,%%d0				\n\t"
		"ext.l   %%d0					\n\t"
		"cmp.l   #255,%%d0   			\n\t"
		"bls.b   7f          			\n\t"
		"spl.b   %%d0					\n\t"  
		"7	:                    		\n\t"

		"move.l %%d4,(%[block])+		\n\t"  /* clear coefficients 6-7 */
		/* save dot 7*/
		"lsl.l  #8,%%d2					\n\t"
		"move.b %%d0,%%d2				\n\t"
		/* save dot 8*/			
		"lsl.l  #8,%%d2					\n\t"
		"move.b %%d3,%%d2				\n\t"	

		
		/* write to dest */	
		"move.l %%d2,(4,%%a0)		\n\t"
		
		/* the row has already been cleared above (address +16) */
		
		/* next line, address +stride */	
		"add.l	%[stride],%%a0					\n\t"	
		"sub.l	#1,%%d1							\n\t"			
	"bne 	foradd							\n\t"	
		/* for end */
		/* undo the 8 * 16 bytes of post-increment on %[block] */
		"sub.l	#128,%[block]					\n\t"	
	
		:
		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
		:	"%a0","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7",
			"memory","cc"
	);

} 
#endif



#ifdef CPU_COLDFIRE__
/*
 * Byte-at-a-time variant of clip_block_to_dest_add.
 *
 * It is compiled only when CPU_COLDFIRE__ (with two trailing underscores)
 * is defined.
 * NOTE(review): nothing in the visible part of this file defines that
 * macro, so this looks like a parked earlier version - confirm before
 * relying on it or removing it.
 *
 * For each of the 8 rows: every 16-bit coefficient from 'block' is
 * sign-extended, added to the corresponding byte at 'dest', saturated
 * with the cmp/bls/spl sequence and packed; the eight results are written
 * back to dest with two 32-bit stores.  The 16 bytes of the row in
 * 'block' are then cleared and the destination advances by 'stride'.
 * The register holding 'block' is advanced by the clearing stores and put
 * back to its original value after the loop.
 *
 * NOTE(review): the asm reads and writes memory through its pointer
 * operands and changes the condition codes, but declares neither a
 * "memory" nor a "cc" clobber.
 */
static  void clip_block_to_dest_add(int16_t * block, uint8_t * dest,const int stride)
{
	asm volatile (
		
		"move.l %[dest],%%a1					\n\t"
		/* init loop for i=0 to 7 */	
		"move.l #8,%%d1							\n\t"
		/* d2 only ever receives byte loads, so its upper bits stay zero */
		"clr.l %%d2								\n\t"
		"foradd2:								\n\t"
		
		/* pixel 0: coefficient + dest byte, saturated, kept in d3 */
		"move.w (%[block]),%%d3					\n\t"
		"move.b (%%a1),%%d2					\n\t"
		"ext.l %%d3							\n\t"	
		"add.l %%d2,%%d3					\n\t"	
		"cmp.l   #255,%%d3   				\n\t"  /* overflow? */
		"bls.b   1f          				\n\t"  /* no: return value */
		"spl.b   %%d3						\n\t"  /* yes: set low byte to appropriate boundary */
		"1	:                    			\n\t"

			
		/* pixel 1, appended below pixel 0 in d3 */
		"move.w (2,%[block]),%%d0				\n\t"
		"move.b (1,%%a1),%%d2				\n\t"	
		"ext.l %%d0							\n\t"	
		"add.l %%d2,%%d0					\n\t"	
		"cmp.l   #255,%%d0   				\n\t"
		"bls.b   2f          				\n\t"
		"spl.b   %%d0						\n\t"  
		"2	:                    			\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3					\n\t"

			
		/* pixel 2 */
		"move.w (4,%[block]),%%d0				\n\t"
		"move.b (2,%%a1),%%d2				\n\t"	
		"ext.l %%d0							\n\t"	
		"add.l %%d2,%%d0				\n\t"
		"cmp.l   #255,%%d0   			\n\t"
		"bls.b   3f          			\n\t"
		"spl.b   %%d0					\n\t"  
		"3	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3				\n\t"

			
		/* pixel 3 */
		"move.w (6,%[block]),%%d0			\n\t"
		"move.b (3,%%a1),%%d2			\n\t"	
		"ext.l %%d0						\n\t"	
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  4f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"4	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3				\n\t"
		
		/* writes 4 bytes */
		"move.l %%d3,(%%a1)		\n\t"	
		
			
		/* pixel 4 starts a new packed word in d3 */
		"move.w (8,%[block]),%%d3		\n\t"
		"move.b (4,%%a1),%%d2			\n\t"	
		"ext.l %%d3							\n\t"	
		"add.l	%%d2,%%d3			\n\t"
		"cmp.l	#255,%%d3   			\n\t"
		"bls.b  5f          			\n\t"
		"spl.b  %%d3					\n\t"  
		"5	:                    		\n\t"
		
		/* pixel 5 */
		"move.w (10,%[block]),%%d0		\n\t"
		"move.b (5,%%a1),%%d2			\n\t"	
		"ext.l %%d0							\n\t"	
		"add.l 	%%d2,%%d0			\n\t"	
		"cmp.l   #255,%%d0   			\n\t"
		"bls.b   6f          			\n\t"
		"spl.b   %%d0					\n\t"  
		"6	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"	
		"move.b %%d0,%%d3				\n\t"

		/* pixel 6 */
		"move.w (12,%[block]),%%d0		\n\t"
		"move.b (6,%%a1),%%d2			\n\t"	
		"ext.l %%d0							\n\t"	
		"add.l 	%%d2,%%d0			\n\t"	
		"cmp.l   #255,%%d0   			\n\t"
		"bls.b   7f          			\n\t"
		"spl.b   %%d0					\n\t"  
		"7	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"	
		"move.b %%d0,%%d3				\n\t"

		/* pixel 7 */
		"move.w (14,%[block]),%%d0		\n\t"
		"move.b (7,%%a1),%%d2			\n\t"	
		"ext.l %%d0							\n\t"	
		"add.l 	%%d2,%%d0			\n\t"	
		"cmp.l   #255,%%d0   			\n\t"
		"bls.b   8f         	 		\n\t"
		"spl.b   %%d0					\n\t"  
		"8	:  		                  	\n\t"
		"lsl.l  #8,%%d3					\n\t"			
		"move.b %%d0,%%d3				\n\t"
		
		/* write to dest */	
		"move.l %%d3,(4,%%a1)		\n\t"
		
		/* clear row, address +16 */
		"move.l #0,%%d0				\n\t"
		"move.l %%d0,(%[block])+		\n\t"
		"move.l %%d0,(%[block])+		\n\t"
		"move.l %%d0,(%[block])+		\n\t"
		"move.l %%d0,(%[block])+		\n\t"
		
		/* next line, address +stride */	
		"add.l	%[stride],%%a1		\n\t"	
		"sub.l	#1,%%d1					\n\t"			
		"bne 	foradd2						\n\t"	
		/* for end */
		/* undo the 8 * 16 bytes of post-increment on %[block] */
		"sub.l	#128,%[block]					\n\t"			
		:
		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
		:	"%a1","%d0","%d1","%d2","%d3"
	);

} 
#endif

#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
/*
 * Add an 8x8 block of 16-bit values to the 8x8 bytes at 'dest', clip the
 * sums to 0..255, store them back and zero the block.
 *
 * For each of the 8 rows the asm loads 8 destination bytes (two 32-bit
 * words) and 8 coefficients (four 32-bit words), then processes each half
 * from the lowest byte upwards ("dot 4" down to "dot 1", then "dot 8" down
 * to "dot 5"): the coefficient is sign-extended, added to the zero-extended
 * destination byte and saturated with the cmp/bls/spl sequence.  The four
 * results are collected in reverse order in d3, byte-reversed with the
 * mask/shift/swap sequence and written back.  The 16 bytes of the row in
 * 'block' are cleared and the destination advances by 'stride'.  The
 * register holding 'block' is advanced by the clearing stores and put back
 * to its original value after the loop.
 *
 * d2 is cleared once and afterwards only receives byte moves, so it always
 * holds a zero-extended destination byte.
 *
 * NOTE(review): dest is accessed with 32-bit loads/stores, so it is
 * presumably expected to be suitably aligned - confirm against callers.
 * NOTE(review): 'foradd2' is an ordinary global assembler label; emitting
 * this asm twice (e.g. through inlining) would fail to assemble.
 *
 * The asm accesses memory that is not described by its operands and
 * changes the condition codes, hence the "memory" and "cc" clobbers.
 */
static  void clip_block_to_dest_add(int16_t * block, uint8_t * dest,const int stride)
{
	asm volatile (
		
		"move.l %[dest],%%a0					\n\t"
		/* init loop for i=0 to 7 */	
		"move.l #8,%%d1							\n\t"
		"clr.l %%d2								\n\t"
		"foradd2:								\n\t"
		
		/* d4-d5 = 8 destination bytes, d6-d7/a1-a2 = 8 coefficients */
		"movem.l (%%a0),%%d4-%%d5					\n\t"
		"movem.l (%[block]),%%d6-%%d7/%%a1-%%a2		\n\t"	

		/* dot 4 */
		"move.w %%d7,%%d0				\n\t"
		"ext.l %%d0						\n\t"	
		"move.b %%d4,%%d2				\n\t"
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  4f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"4	:                    		\n\t"
		"move.b %%d0,%%d3				\n\t"
			
		/* dot 3 */
		"swap.w %%d7					\n\t"	
		"ext.l %%d7						\n\t"
		"lsr.l  #8,%%d4					\n\t"	
		"move.b %%d4,%%d2				\n\t"
		"add.l 	%%d2,%%d7				\n\t"
		"cmp.l  #255,%%d7   			\n\t"
		"bls.b  3f          			\n\t"
		"spl.b  %%d7					\n\t"  
		"3	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d7,%%d3				\n\t"	

		/* dot 2 */
		"move.w %%d6,%%d0				\n\t"
		"ext.l %%d0						\n\t"	
		"lsr.l  #8,%%d4					\n\t"	
		"move.b %%d4,%%d2				\n\t"
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  2f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"2	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3				\n\t"
			
		/* dot 1 */
		"swap.w %%d6					\n\t"	
		"ext.l %%d6						\n\t"
		"lsr.l  #8,%%d4					\n\t"	
		"move.b %%d4,%%d2				\n\t"
		"add.l 	%%d2,%%d6				\n\t"
		"cmp.l  #255,%%d6   			\n\t"
		"bls.b  1f          			\n\t"
		"spl.b  %%d6					\n\t"  
		"1	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d6,%%d3				\n\t"	
	
		/* reverse the byte order of d3 (dots were collected 4,3,2,1) */
		"move.l  #0x00FF00FF,%%d7		\n\t"  /* val  = ABCD */
        "and.l   %%d3,%%d7	  			\n\t"  /* mask = .B.D */
        "eor.l   %%d7,%%d3	  			\n\t"  /* val  = A.C. */
        "lsl.l   #8,%%d7		  		\n\t"  /* mask = B.D. */
        "lsr.l   #8,%%d3			  	\n\t"  /* val  = .A.C */
        "or.l    %%d7,%%d3	  			\n\t"  /* val  = BADC */
        "swap    %%d3	          		\n\t"  /* val  = DCBA */
			
		/* writes 4 bytes */
		"move.l %%d3,(%%a0)		\n\t"	
		
		/* dot 8 */
		"move.l %%a2,%%d0				\n\t"
		"ext.l %%d0						\n\t"	
		"move.b %%d5,%%d2				\n\t"
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  8f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"8	:                    		\n\t"
		"move.b %%d0,%%d3				\n\t"
			
		/* dot 7 */
		"move.l %%a2,%%d0				\n\t"	
		"swap	%%d0					\n\t"	
		"ext.l %%d0						\n\t"
		"lsr.l  #8,%%d5					\n\t"	
		"move.b %%d5,%%d2				\n\t"
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  7f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"7	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3				\n\t"	

		/* dot 6 */
		"move.l %%a1,%%d0				\n\t"
		"ext.l %%d0						\n\t"	
		"lsr.l  #8,%%d5					\n\t"	
		"move.b %%d5,%%d2				\n\t"
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  6f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"6	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3				\n\t"
			
		/* dot 5 */
		"move.l %%a1,%%d0				\n\t"
		"swap	%%d0					\n\t"	
		"ext.l %%d0						\n\t"
		"lsr.l  #8,%%d5					\n\t"	
		"move.b %%d5,%%d2				\n\t"
		"add.l 	%%d2,%%d0				\n\t"
		"cmp.l  #255,%%d0   			\n\t"
		"bls.b  5f          			\n\t"
		"spl.b  %%d0					\n\t"  
		"5	:                    		\n\t"
		"lsl.l  #8,%%d3					\n\t"
		"move.b %%d0,%%d3				\n\t"	
	
		"move.l  #0x00FF00FF,%%d7		\n\t"  /* val  = ABCD */
		
		/* clear row, address +16; the four clearing stores are
		 * interleaved with the byte reversal of d3 */	
		"move.l #0,%%d0					\n\t"
		"move.l %%d0,(%[block])+		\n\t"
        "and.l   %%d3,%%d7	  			\n\t"  /* mask = .B.D */
		"move.l %%d0,(%[block])+		\n\t"
        "eor.l   %%d7,%%d3	  			\n\t"  /* val  = A.C. */
		"move.l %%d0,(%[block])+		\n\t"
		"lsl.l   #8,%%d7		  		\n\t"  /* mask = B.D. */
		"move.l %%d0,(%[block])+		\n\t"
		"lsr.l   #8,%%d3			  	\n\t"  /* val  = .A.C */
			
        "or.l    %%d7,%%d3	  			\n\t"  /* val  = BADC */
        "swap    %%d3	          		\n\t"  /* val  = DCBA */
			
			
		/* writes 4 bytes */
		"move.l %%d3,(4,%%a0)		\n\t"	
		

			
		/* next line, address +stride */	
		"add.l	%[stride],%%a0		\n\t"	
		"sub.l	#1,%%d1					\n\t"			
		"bne 	foradd2						\n\t"	
		/* for end */
		/* undo the 8 * 16 bytes of post-increment on %[block] */
		"sub.l	#128,%[block]					\n\t"			
		:
		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
		:	"%a0","%a1","%a2","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7",
			"memory","cc"
	);

} 
#endif

#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
/*
 * DC-only variant of the "add" path: add one constant to all 8x8 bytes at
 * 'dest', clipping each sum to 0..255.
 *
 * The constant is DC = (block[0] + 64) >> 7.  block[0] and block[63] are
 * set to zero; no other element of 'block' is touched.  For each of the 8
 * rows the asm loads 8 destination bytes as two 32-bit words, adds DC to
 * every byte (lowest byte first), saturates with the cmp/bls/spl sequence,
 * repacks the bytes in their original order and writes both words back,
 * then advances the destination by 'stride'.
 *
 * d2 holds each sum, and saturation only rewrites its low byte, so its
 * upper 24 bits may be non-zero after a pixel was clipped.  d2 is
 * therefore cleared before every byte load; otherwise the stale upper bits
 * would take part in the next addition and every following pixel would be
 * clipped as well.
 *
 * NOTE(review): dest is accessed with 32-bit loads/stores, so it is
 * presumably expected to be suitably aligned - confirm against callers.
 * NOTE(review): 'foraddDC' is an ordinary global assembler label; emitting
 * this asm twice (e.g. through inlining) would fail to assemble.
 *
 * The asm accesses memory that is not described by its operands and
 * changes the condition codes, hence the "memory" and "cc" clobbers.
 */
static  void clip_block_to_dest_add_DC(int16_t * block, uint8_t * dest,const int stride)
{
    asm volatile (
		
		"move.l %[dest],%%a1					\n\t"
    	
    	/* DC = (block[0] + 64) >> 7; */
		"move.w (%[block]),%%d0					\n\t"
    	"ext.l %%d0							\n\t"		
    	"add.l #64,%%d0						\n\t"
		"asr.l #7,%%d0						\n\t"
    		
        /*block[0] = block[63] = 0;*/
    	"move.w #0,%%d2					\n\t"		
 		"move.w %%d2,(%[block])					\n\t"
    	/* init loop for i=0 to 7 */	
		"move.l #8,%%d1						\n\t"	 	 /* pipeline on store */
 		"move.w %%d2,(63*2,%[block])				\n\t"	
	

		"foraddDC:							\n\t"
			"movem.l (%%a1),%%d4-%%d5					\n\t"
    		
			/* read dot 3, add DC, save dot */
			"clr.l  %%d2					\n\t"  /* drop stale upper bits */
			"move.b %%d4,%%d2				\n\t"	
			"add.l 	%%d0,%%d2				\n\t"
			"cmp.l  #255,%%d2   			\n\t"
			"bls.b  4f          			\n\t"
			"spl.b  %%d2					\n\t"  
			"4	:                    		\n\t"
			"move.b %%d2,%%d3				\n\t"
			
			/* read dot 2, add DC, save dot */
			"lsr.l   #8,%%d4			  	\n\t"
			"clr.l  %%d2					\n\t"
    		"move.b %%d4,%%d2			\n\t"	
			"add.l %%d0,%%d2				\n\t"
			"cmp.l   #255,%%d2   			\n\t"
			"bls.b   3f          			\n\t"
			"spl.b   %%d2					\n\t"  
			"3	:                    		\n\t"
			"move.b %%d2,%%d6				\n\t"

	
			/* read dot 1, add DC, save dot */
			"lsr.l   #8,%%d4			  	\n\t"
			"clr.l  %%d2					\n\t"
    		"move.b %%d4,%%d2			\n\t"
			"add.l %%d0,%%d2					\n\t"	
			"cmp.l   #255,%%d2   				\n\t"
			"bls.b   2f          				\n\t"
			"spl.b   %%d2						\n\t"  
			"2	:                    			\n\t"
			"move.b %%d2,%%d7				\n\t"
	
			/* read dot 0, add DC, save dot */
			"lsr.l   #8,%%d4			  	\n\t"
			"clr.l  %%d2					\n\t"
    		"move.b %%d4,%%d2			\n\t"
			"add.l %%d0,%%d2					\n\t"	
			"cmp.l   #255,%%d2   				\n\t"  /* overflow? */
			"bls.b   1f          				\n\t"  /* no: return value */
			"spl.b   %%d2						\n\t"  /* yes: set low byte to appropriate boundary */
			"1	:                    			\n\t"
    		
    		/* save dot 0 1 2 3 */
			"move.b %%d2,%%d4				\n\t"
    		"lsl.l  #8,%%d4					\n\t"
    		"move.b %%d7,%%d4				\n\t"   		
    		"lsl.l  #8,%%d4					\n\t"
    		"move.b %%d6,%%d4				\n\t"
    		"lsl.l  #8,%%d4					\n\t"
    		"move.b %%d3,%%d4				\n\t"
    		
   		
			/* read dot 7, add DC, save dot */
			"clr.l  %%d2					\n\t"
			"move.b %%d5,%%d2				\n\t"	
			"add.l 	%%d0,%%d2				\n\t"
			"cmp.l  #255,%%d2   			\n\t"
			"bls.b  8f          			\n\t"
			"spl.b  %%d2					\n\t"  
			"8	:                    		\n\t"
			"move.b %%d2,%%d3				\n\t"
			
			/* read dot 6, add DC, save dot */
			"lsr.l   #8,%%d5			  	\n\t"
			"clr.l  %%d2					\n\t"
    		"move.b %%d5,%%d2			\n\t"	
			"add.l %%d0,%%d2				\n\t"
			"cmp.l   #255,%%d2   			\n\t"
			"bls.b   7f          			\n\t"
			"spl.b   %%d2					\n\t"  
			"7	:                    		\n\t"
			"move.b %%d2,%%d6				\n\t"

	
			/* read dot 5, add DC, save dot */
			"lsr.l   #8,%%d5			  	\n\t"
			"clr.l  %%d2					\n\t"
    		"move.b %%d5,%%d2			\n\t"
			"add.l %%d0,%%d2					\n\t"	
			"cmp.l   #255,%%d2   				\n\t"
			"bls.b   6f          				\n\t"
			"spl.b   %%d2						\n\t"  
			"6	:                    			\n\t"
			"move.b %%d2,%%d7				\n\t"
	
			/* read dot 4, add DC, save dot */
			"lsr.l   #8,%%d5			  	\n\t"
			"clr.l  %%d2					\n\t"
    		"move.b %%d5,%%d2			\n\t"
			"add.l %%d0,%%d2					\n\t"	
			"cmp.l   #255,%%d2   				\n\t"  /* overflow? */
			"bls.b   5f          				\n\t"  /* no: return value */
			"spl.b   %%d2						\n\t"  /* yes: set low byte to appropriate boundary */
			"5	:                    			\n\t"
    		/* save dot 4 5 6 7 */
    		"move.b %%d2,%%d5				\n\t"
    		"lsl.l  #8,%%d5					\n\t"
    		"move.b %%d7,%%d5				\n\t"   		
    		"lsl.l  #8,%%d5					\n\t"
    		"move.b %%d6,%%d5				\n\t"
    		"lsl.l  #8,%%d5					\n\t"
    		"move.b %%d3,%%d5				\n\t"
    		
			/* write to dest */	
			"movem.l %%d4-%%d5,(%%a1)		\n\t"
			
			"add.l	%[stride],%%a1			\n\t"	
			"sub.l	#1,%%d1					\n\t"			
		"bne 	foraddDC					\n\t"	
		/* for end */
			
		:
		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
		:	"%a0","%a1","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7",
			"memory","cc"
	);

} 
#endif

#ifdef CPU_COLDFIRE__
/*
 * DC-only variant of the "add" reconstruction, written in ColdFire assembly.
 *
 * Computes DC = (block[0] + 64) >> 7, clears block[0] and block[63], then
 * adds DC to each of the 8 bytes on 8 consecutive rows of dest (rows are
 * stride bytes apart), saturating every sum to the range 0..255.
 *
 * block  - coefficient block; only elements 0 and 63 are accessed.
 * dest   - first byte of the 8x8 destination area, updated in place.
 * stride - distance in bytes between successive destination rows.
 *
 * Register use: a0 = block, a1 = current dest row, d0 = DC, d1 = row
 * counter, d2 = pixel being processed, d3 / d4 = packed output bytes
 * 0-3 / 4-7 (byte 0 ends up in the most significant position of d3).
 *
 * Notes on the details that matter:
 *  - move.b and spl.b only write the low byte of d2, so d2 is cleared before
 *    every pixel load. Without that, the upper bits left behind by a
 *    saturated pixel would leak into the next addition and force every
 *    following pixel to saturate as well.
 *  - The loop uses a numeric local label so the asm can be emitted more than
 *    once (e.g. through inlining) without a duplicate symbol.
 *  - "memory" is listed as clobbered because the asm stores to block and
 *    dest behind the compiler's back.
 */
static  void clip_block_to_dest_add_DC(int16_t * block, uint8_t * dest,const int stride)
{
    asm volatile (
        "move.l  %[block],%%a0          \n\t"
        "move.l  %[dest],%%a1           \n\t"

        /* d0 = DC = (block[0] + 64) >> 7 */
        "move.w  (%%a0),%%d0            \n\t"
        "ext.l   %%d0                   \n\t"
        "add.l   #64,%%d0               \n\t"
        "asr.l   #7,%%d0                \n\t"

        /* block[0] = block[63] = 0 (byte offset 126 is element 63) */
        "move.w  #0,(%%a0)              \n\t"
        "move.w  #0,%%d2                \n\t"
        "move.w  %%d2,(126,%%a0)        \n\t"

        /* d1 counts the 8 rows */
        "move.l  #8,%%d1                \n\t"
        "9:                             \n\t"

            /* pixel 0 */
            "clr.l   %%d2               \n\t"
            "move.b  (%%a1),%%d2        \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"  /* outside 0..255? */
            "bls.b   1f                 \n\t"  /* no: keep the sum */
            "spl.b   %%d2               \n\t"  /* yes: 0 if negative, 255 if too big */
            "1:                         \n\t"
            "move.b  %%d2,%%d3          \n\t"
            "lsl.l   #8,%%d3            \n\t"

            /* pixel 1 */
            "clr.l   %%d2               \n\t"
            "move.b  (1,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   2f                 \n\t"
            "spl.b   %%d2               \n\t"
            "2:                         \n\t"
            "move.b  %%d2,%%d3          \n\t"
            "lsl.l   #8,%%d3            \n\t"

            /* pixel 2 */
            "clr.l   %%d2               \n\t"
            "move.b  (2,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   3f                 \n\t"
            "spl.b   %%d2               \n\t"
            "3:                         \n\t"
            "move.b  %%d2,%%d3          \n\t"
            "lsl.l   #8,%%d3            \n\t"

            /* pixel 3 completes d3 */
            "clr.l   %%d2               \n\t"
            "move.b  (3,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   4f                 \n\t"
            "spl.b   %%d2               \n\t"
            "4:                         \n\t"
            "move.b  %%d2,%%d3          \n\t"

            /* pixel 4 */
            "clr.l   %%d2               \n\t"
            "move.b  (4,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   5f                 \n\t"
            "spl.b   %%d2               \n\t"
            "5:                         \n\t"
            "move.b  %%d2,%%d4          \n\t"
            "lsl.l   #8,%%d4            \n\t"

            /* pixel 5 */
            "clr.l   %%d2               \n\t"
            "move.b  (5,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   6f                 \n\t"
            "spl.b   %%d2               \n\t"
            "6:                         \n\t"
            "move.b  %%d2,%%d4          \n\t"
            "lsl.l   #8,%%d4            \n\t"

            /* pixel 6 */
            "clr.l   %%d2               \n\t"
            "move.b  (6,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   7f                 \n\t"
            "spl.b   %%d2               \n\t"
            "7:                         \n\t"
            "move.b  %%d2,%%d4          \n\t"
            "lsl.l   #8,%%d4            \n\t"

            /* pixel 7 completes d4 */
            "clr.l   %%d2               \n\t"
            "move.b  (7,%%a1),%%d2      \n\t"
            "add.l   %%d0,%%d2          \n\t"
            "cmp.l   #255,%%d2          \n\t"
            "bls.b   8f                 \n\t"
            "spl.b   %%d2               \n\t"
            "8:                         \n\t"
            "move.b  %%d2,%%d4          \n\t"

            /* store the 8 reconstructed bytes of this row */
            "movem.l %%d3-%%d4,(%%a1)   \n\t"

            /* advance to the next destination row */
            "add.l   %[stride],%%a1     \n\t"
            "sub.l   #1,%%d1            \n\t"
        "bne     9b                     \n\t"
        /* for end */

        :
        : [block] "a" (block), [dest] "a" (dest), [stride] "a" (stride)
        : "%a0", "%a1", "%d0", "%d1", "%d2", "%d3", "%d4", "cc", "memory"
    );
}
#endif

#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
/*
 * ColdFire build of the "copy" entry point: transform block in place with
 * idct(), then pass block, dest and stride on to clip_block_to_dest().
 */
static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
                               const int stride)
{
    /* step 1: in-place transform of the coefficient block */
    idct (block);

    /* step 2: output stage, implemented by the ColdFire helper */
    clip_block_to_dest (block, dest, stride);
}
#else
/*
 * Portable C version of the "copy" entry point.
 *
 * Runs idct_row() on each of the 8 rows of block and idct_col() on each of
 * its 8 columns, then, row by row, stores CLIP() of the 8 values into dest
 * and resets those 8 block entries to zero.
 *
 * block  - 64 coefficients, 8 per row; transformed in place and left
 *          all-zero on return.
 * dest   - first byte of the output area; each row receives 8 bytes.
 * stride - distance in bytes between successive output rows.
 *
 * The block is cleared with plain int16_t stores: writing it through an
 * int32_t pointer would access int16_t objects with an incompatible type.
 */
static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
                               const int stride)
{
    int i;

    for (i = 0; i < 8; i++)
        idct_row (block + 8 * i);
    for (i = 0; i < 8; i++)
        idct_col (block + i);

    for (i = 0; i < 8; i++) {
        int k;

        /* emit the whole row first, then wipe it for the next block */
        for (k = 0; k < 8; k++)
            dest[k] = CLIP (block[k]);
        for (k = 0; k < 8; k++)
            block[k] = 0;

        dest += stride;
        block += 8;
    }
}
#endif 

#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
/*
 * ColdFire build of the "add" entry point.
 *
 * When last is 129 and bits 4-6 of block[0] are anything other than 4, the
 * work is delegated to clip_block_to_dest_add_DC(). In every other case the
 * block goes through idct() and is then passed to clip_block_to_dest_add().
 */
static void mpeg2_idct_add_c (const int last, int16_t * block,
                              uint8_t * dest, const int stride)
{
    /* block[0] is only inspected when last == 129 (short-circuit) */
    const int dc_shortcut =
        (last == 129) && ((block[0] & (7 << 4)) != (4 << 4));

    if (dc_shortcut) {
        clip_block_to_dest_add_DC (block, dest, stride);
        return;
    }

    idct (block);
    clip_block_to_dest_add (block, dest, stride);
}
#else
/*
 * Portable C version of the "add" entry point.
 *
 * Full path (last != 129, or bits 4-6 of block[0] equal 4): runs idct_row()
 * on every row and idct_col() on every column of block, then for each of
 * the 8 rows stores CLIP (block[k] + dest[k]) into dest and resets that row
 * of block to zero.
 *
 * Short path (all other cases): computes DC = (block[0] + 64) >> 7, clears
 * block[0] and block[63], and stores CLIP (DC + dest[k]) for the 8 bytes of
 * each of the 8 rows.
 *
 * last   - selects the path as described above.
 * block  - 64 coefficients, 8 per row; modified in place.
 * dest   - first byte of the area updated in place, 8 bytes per row.
 * stride - distance in bytes between successive rows of dest.
 *
 * The block is cleared with plain int16_t stores: writing it through an
 * int32_t pointer would access int16_t objects with an incompatible type.
 */
static void mpeg2_idct_add_c (const int last, int16_t * block,
                              uint8_t * dest, const int stride)
{
    int i, k;

    if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
        for (i = 0; i < 8; i++)
            idct_row (block + 8 * i);
        for (i = 0; i < 8; i++)
            idct_col (block + i);

        for (i = 0; i < 8; i++) {
            /* update the whole row first, then wipe it for the next block */
            for (k = 0; k < 8; k++)
                dest[k] = CLIP (block[k] + dest[k]);
            for (k = 0; k < 8; k++)
                block[k] = 0;

            dest += stride;
            block += 8;
        }
    } else {
        const int DC = (block[0] + 64) >> 7;

        block[0] = block[63] = 0;
        for (i = 0; i < 8; i++) {
            for (k = 0; k < 8; k++)
                dest[k] = CLIP (DC + dest[k]);
            dest += stride;
        }
    }
}
#endif	
	
/*
 * Selects the implementations behind mpeg2_idct_copy / mpeg2_idct_add.
 *
 * accel - bit mask of MPEG2_ACCEL_* flags. It is only examined inside the
 *         ARCH_X86 / ARCH_PPC / ARCH_ALPHA sections; the cast to void keeps
 *         the parameter "used" when none of them is compiled in.
 *
 * When no accelerated variant is chosen, the C entry points are installed,
 * the CLIP table is filled for indices -3840 .. 3840+255 (skipped on
 * CPU_COLDFIRE and CPU_ARM builds), and every entry of mpeg2_scan_norm and
 * mpeg2_scan_alt is rewritten in place with its bits shuffled.
 *
 * NOTE(review): the scan tables are modified in place, so running this
 * generic branch a second time would shuffle them again.
 */
void mpeg2_idct_init (uint32_t accel)
{
    (void)accel;
#ifdef ARCH_X86
    if (accel & MPEG2_ACCEL_X86_MMXEXT) {
        mpeg2_idct_copy = mpeg2_idct_copy_mmxext;
        mpeg2_idct_add = mpeg2_idct_add_mmxext;
        mpeg2_idct_mmx_init ();
    } else if (accel & MPEG2_ACCEL_X86_MMX) {
        mpeg2_idct_copy = mpeg2_idct_copy_mmx;
        mpeg2_idct_add = mpeg2_idct_add_mmx;
        mpeg2_idct_mmx_init ();
    } else
#endif
#ifdef ARCH_PPC
    if (accel & MPEG2_ACCEL_PPC_ALTIVEC) {
        mpeg2_idct_copy = mpeg2_idct_copy_altivec;
        mpeg2_idct_add = mpeg2_idct_add_altivec;
        mpeg2_idct_altivec_init ();
    } else
#endif
#ifdef ARCH_ALPHA
    if (accel & MPEG2_ACCEL_ALPHA_MVI) {
        mpeg2_idct_copy = mpeg2_idct_copy_mvi;
        mpeg2_idct_add = mpeg2_idct_add_mvi;
        mpeg2_idct_alpha_init ();
    } else if (accel & MPEG2_ACCEL_ALPHA) {
        int i;

        mpeg2_idct_copy = mpeg2_idct_copy_alpha;
        mpeg2_idct_add = mpeg2_idct_add_alpha;
        mpeg2_idct_alpha_init ();
        for (i = -3840; i < 3840 + 256; i++)
            CLIP(i) = (i < 0) ? 0 : ((i > 255) ? 255 : i);
    } else
#endif
    {
        /* generic C fallback */
        extern uint8_t mpeg2_scan_norm[64];
        extern uint8_t mpeg2_scan_alt[64];
        int idx, entry;

        mpeg2_idct_copy = mpeg2_idct_copy_c;
        mpeg2_idct_add = mpeg2_idct_add_c;

#if !defined(CPU_COLDFIRE) && !defined(CPU_ARM)
        /* saturation table: 0 below zero, 255 above 255, identity between */
        for (idx = -3840; idx < 3840 + 256; idx++)
            CLIP(idx) = (idx > 255) ? 255 : ((idx < 0) ? 0 : idx);
#endif

        /*
         * Bit shuffle applied to each scan entry:
         * bits 1,2,4,5 move down one place, bits 0 and 3 move up two.
         */
        idx = 0;
        while (idx < 64) {
            entry = mpeg2_scan_norm[idx];
            mpeg2_scan_norm[idx] =
                ((entry & 0x36) >> 1) | ((entry & 0x09) << 2);

            entry = mpeg2_scan_alt[idx];
            mpeg2_scan_alt[idx] =
                ((entry & 0x36) >> 1) | ((entry & 0x09) << 2);

            idx++;
        }
    }
}
