Index: apps/plugins/mpegplayer/decode.c
===================================================================
RCS file: /cvsroot/rockbox/apps/plugins/mpegplayer/decode.c,v
retrieving revision 1.1
diff -u -r1.1 decode.c
--- apps/plugins/mpegplayer/decode.c	7 Aug 2006 22:11:07 -0000	1.1
+++ apps/plugins/mpegplayer/decode.c	25 Sep 2006 22:15:23 -0000
@@ -416,14 +416,21 @@
 
 }
 
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+static mpeg2dec_t static_mpeg2dec IBSS_ATTR; /* decoder state returned by mpeg2_init() on this target instead of a mpeg2_malloc() block */
+#endif /* CPU_COLDFIRE && !SIMULATOR */
+
 mpeg2dec_t * mpeg2_init (void)
 {
     mpeg2dec_t * mpeg2dec;
 
     mpeg2_accel (MPEG2_ACCEL_DETECT);
 
-    mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
-					    MPEG2_ALLOC_MPEG2DEC);
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+	mpeg2dec = &static_mpeg2dec;
+#else  
+	mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),MPEG2_ALLOC_MPEG2DEC);
+#endif    
     if (mpeg2dec == NULL)
 	return NULL;
 
Index: apps/plugins/mpegplayer/idct.c
===================================================================
RCS file: /cvsroot/rockbox/apps/plugins/mpegplayer/idct.c,v
retrieving revision 1.2
diff -u -r1.2 idct.c
--- apps/plugins/mpegplayer/idct.c	8 Aug 2006 22:56:35 -0000	1.2
+++ apps/plugins/mpegplayer/idct.c	25 Sep 2006 22:15:25 -0000
@@ -28,13 +28,38 @@
 #include "mpeg2.h"
 #include "attributes.h"
 #include "mpeg2_internal.h"
-
+#define W0 2048 /* 2048 * sqrt (2) * cos (4 * pi / 16) */
 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
 #define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
+#define W4 2048 /* 2048 * sqrt (2) * cos (4 * pi / 16) */
 #define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
 #define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
 #define W7 565  /* 2048 * sqrt (2) * cos (7 * pi / 16) */
+ 
+#define SC(x) #x /* turns the argument into a string literal */
+
+#define WA 1448 /* 2048 * cos (4 * pi / 16), cos = 0.707106781 */
+#define WB 2009 /* 2048 * cos (1 * pi / 16), cos = 0.980785280 */
+#define WC 1892 /* 2048 * cos (2 * pi / 16), cos = 0.923879533 */
+#define WD 1703 /* 2048 * cos (3 * pi / 16), cos = 0.831469612 */
+#define WE 1448 /* 2048 * sin (4 * pi / 16), sin = 0.707106781 */
+#define WF 1138 /* 2048 * sin (3 * pi / 16), sin = 0.555570233 */
+#define WG 784  /* 2048 * sin (2 * pi / 16), sin = 0.382683432 */
+#define WH 400  /* 2048 * sin (1 * pi / 16), sin = 0.195090322 */
+
+/* theoretical matrices used by the asm 1-D IDCT */
+/* 
+{WA,  WC,  WE,  WG,
+ WA,  WG, -WE, -WC,
+ WA, -WG, -WE,  WC,
+ WA, -WC,  WE, -WG};
+
+{WB,  WD,  WF,  WH,
+ WD, -WH, -WB, -WF,
+ WF, -WB,  WH,  WD,
+ WH, -WF,  WD, -WB};
+*/
 
 /* idct main entry point  */
 void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
@@ -51,13 +76,14 @@
 static inline unsigned CLIP(int value)
 {
     asm (  /* Note: Uses knowledge that only the low byte of the result is used */
-        "cmp.l   #255,%[v]   \n"  /* overflow? */
+       "cmp.l   #255,%[v]   \n"  /* overflow? */
         "bls.b   1f          \n"  /* no: return value */
         "spl.b   %[v]        \n"  /* yes: set low byte to appropriate boundary */
-    "1:                      \n"
+    "1	:                    \n"  /* local label: branch target when value is already within 0..255 */
         : /* outputs */
         [v]"+d"(value)
     );
+	
     return value;
 }
 #elif defined CPU_ARM
@@ -76,6 +102,21 @@
 #define CLIP(i) ((mpeg2_clip + 3840)[i])
 #endif
 
+/* 4x4 matrix times 4-vector: (p0..p3) = M * (n0..n3), M given row by row.
+ * Every argument is parenthesized, so expressions such as -W7 or a + b are
+ * safe to pass; note that n0..n3 are each evaluated four times. */
+#define MATRIX_MUL(m00,m01,m02,m03, \
+                   m10,m11,m12,m13, \
+                   m20,m21,m22,m23, \
+                   m30,m31,m32,m33, \
+                   n0,n1,n2,n3, \
+                   p0,p1,p2,p3) \
+do { \
+    (p0) = (n0)*(m00) + (n1)*(m01) + (n2)*(m02) + (n3)*(m03); \
+    (p1) = (n0)*(m10) + (n1)*(m11) + (n2)*(m12) + (n3)*(m13); \
+    (p2) = (n0)*(m20) + (n1)*(m21) + (n2)*(m22) + (n3)*(m23); \
+    (p3) = (n0)*(m30) + (n1)*(m31) + (n2)*(m32) + (n3)*(m33); \
+} while (0)
 #if 0
 #define BUTTERFLY(t0,t1,W0,W1,d0,d1)        \
 do {                                        \
@@ -91,6 +132,47 @@
 } while (0)
 #endif
 
+
+#ifdef C_VERSION_OF_ASM
+/* Row pass as two 4x4 matrix products (C mirror of the ColdFire asm core).
+ * block[0..3] are taken as the even inputs f0 f2 f4 f6 and block[4..7] as
+ * the odd inputs f1 f3 f5 f7; the eight results replace block[0..7]. */
+static inline void idct_row (int16_t * const block)
+{
+    int even_a, even_b, even_c, even_d;   /* products of the even half */
+    int odd_a, odd_b, odd_c, odd_d;       /* products of the odd half */
+
+    /* +1 on the first input becomes +W0 (2048), half of the >> 12 scale */
+    const int in0 = block[0] + 1;
+    const int in2 = block[1];
+    const int in4 = block[2];
+    const int in6 = block[3];
+    const int in1 = block[4];
+    const int in3 = block[5];
+    const int in5 = block[6];
+    const int in7 = block[7];
+
+    MATRIX_MUL(W0,  W2,  W4,  W6,
+               W0,  W6, -W4, -W2,
+               W0, -W6, -W4,  W2,
+               W0, -W2,  W4, -W6,
+               in0, in2, in4, in6,
+               even_a, even_b, even_c, even_d);
+
+    MATRIX_MUL(W1,  W3,  W5,  W7,
+               W3, -W7, -W1, -W5,
+               W5, -W1,  W7,  W3,
+               W7, -W5,  W3, -W1,
+               in1, in3, in5, in7,
+               odd_a, odd_b, odd_c, odd_d);
+
+    /* sums fill the first half, differences the mirrored second half */
+    block[0] = (even_a + odd_a) >> 12;  block[7] = (even_a - odd_a) >> 12;
+    block[1] = (even_b + odd_b) >> 12;  block[6] = (even_b - odd_b) >> 12;
+    block[2] = (even_c + odd_c) >> 12;  block[5] = (even_c - odd_c) >> 12;
+    block[3] = (even_d + odd_d) >> 12;  block[4] = (even_d - odd_d) >> 12;
+}
+#else
 static inline void idct_row (int16_t * const block)
 {
     int d0, d1, d2, d3;
@@ -143,7 +225,49 @@
     block[6] = (a1 - b1) >> 12;
     block[7] = (a0 - b0) >> 12;
 }
+#endif
+
+#ifdef C_VERSION_OF_ASM
+/* Column pass as two 4x4 matrix products (C mirror of the ColdFire asm core).
+ * Works on every 8th entry: block[0*8..3*8] are taken as the even inputs
+ * f0 f2 f4 f6, block[4*8..7*8] as the odd inputs f1 f3 f5 f7, and the eight
+ * results replace the same entries. */
+static inline void idct_col (int16_t * const block)
+{
+    int even_a, even_b, even_c, even_d;   /* products of the even half */
+    int odd_a, odd_b, odd_c, odd_d;       /* products of the odd half */
+
+    /* +32 on the first input becomes +32 * W0 (65536), half of the >> 17 scale */
+    const int in0 = block[0*8] + 32;
+    const int in2 = block[1*8];
+    const int in4 = block[2*8];
+    const int in6 = block[3*8];
+    const int in1 = block[4*8];
+    const int in3 = block[5*8];
+    const int in5 = block[6*8];
+    const int in7 = block[7*8];
+
+    MATRIX_MUL(W0,  W2,  W4,  W6,
+               W0,  W6, -W4, -W2,
+               W0, -W6, -W4,  W2,
+               W0, -W2,  W4, -W6,
+               in0, in2, in4, in6,
+               even_a, even_b, even_c, even_d);
+
+    MATRIX_MUL(W1,  W3,  W5,  W7,
+               W3, -W7, -W1, -W5,
+               W5, -W1,  W7,  W3,
+               W7, -W5,  W3, -W1,
+               in1, in3, in5, in7,
+               odd_a, odd_b, odd_c, odd_d);
+
+    /* sums fill the first half, differences the mirrored second half */
+    block[0*8] = (even_a + odd_a) >> 17;  block[7*8] = (even_a - odd_a) >> 17;
+    block[1*8] = (even_b + odd_b) >> 17;  block[6*8] = (even_b - odd_b) >> 17;
+    block[2*8] = (even_c + odd_c) >> 17;  block[5*8] = (even_c - odd_c) >> 17;
+    block[3*8] = (even_d + odd_d) >> 17;  block[4*8] = (even_d - odd_d) >> 17;
+}
+#else
 static inline void idct_col (int16_t * const block)
 {
     int d0, d1, d2, d3;
@@ -184,13 +308,962 @@
     block[8*6] = (a1 - b1) >> 17;
     block[8*7] = (a0 - b0) >> 17;
 }
+#endif
+
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+/* In-place 8x8 inverse DCT on the ColdFire EMAC unit.
+ * Pass 1 transforms the eight 16-byte rows at block and stores each result
+ * transposed into the scratch area at block + 128 bytes; pass 2 transforms
+ * the scratch rows and stores them transposed back into block.  The buffer
+ * behind block must therefore hold 128 int16_t.  Every row is read in the
+ * order P0 P2 P4 P6 P1 P3 P5 P7.
+ * NOTE(review): a0-a5 are clobbered, so %[block] can only be placed in a6;
+ * this presumably needs a build without frame pointer - confirm. */
+static  void idct(int16_t * const block) ICODE_ATTR;
+static  void idct(int16_t * const block)
+{
+	int16_t * row = block;	/* advanced inside the asm, hence an in/out operand */
+	asm volatile(
+		"move.l #0x80,%%macsr			\n\t"
+		/* the mac chains below only add to acc0-acc3: start them from zero */
+		"movclr.l %%acc0,%%d2			\n\t"
+		"movclr.l %%acc1,%%d2			\n\t"
+		"movclr.l %%acc2,%%d2			\n\t"
+		"movclr.l %%acc3,%%d2			\n\t"
+		/* coefficient pairs: upper word feeds the even matrix, lower word the odd one */
+		"move.l #2048*65536+2841,%%a0	\n\t"	/* W0 : W1 */
+		"move.l #2676*65536+2408,%%a1	\n\t"	/* W2 : W3 */
+		"move.l #2048*65536+1609,%%a2	\n\t"	/* W4 : W5 */
+		"move.l #1108*65536+565,%%a3	\n\t"	/* W6 : W7 */
+		"move.l %[block],%%a4			\n\t"
+		"add.l #128,%%a4				\n\t"	/* a4 = scratch area, destination of pass 1 */
+		"move.l #8,%%d3					\n\t"	/* 8 rows */
+		"1:								\n\t"	/* row loop */
+			/* load P[0] P[2] P[4] P[6] P[1] P[3] P[5] P[7] from the row */
+			"movem.l (%[block]),%%d4-%%d7	\n\t"
+			"clr.l	%%d2				\n\t"
+			"move.w %%d4,%%d2			\n\t"	/* second entry of the row */
+			"or.l 	%%d5,%%d2			\n\t"
+			"or.l 	%%d6,%%d2			\n\t"
+			"or.l 	%%d7,%%d2			\n\t"
+			"bne  	2f					\n\t"	/* some entry after the first is set */
+				/* only the first entry can be non-zero: all 8 results are entry >> 1 */
+				"move.l #17,%%d5			\n\t"
+				"asr.l 	%%d5,%%d4			\n\t"	/* upper word >> 1, landing in the low word */
+				/* fill this row's column of the scratch area */
+				"move.w %%d4,(%%a4)			\n\t"
+				"move.w %%d4,(1*16,%%a4)	\n\t"
+				"move.w %%d4,(2*16,%%a4)	\n\t"
+				"move.w %%d4,(3*16,%%a4)	\n\t"
+				"move.w %%d4,(4*16,%%a4)	\n\t"
+				"move.w %%d4,(5*16,%%a4)	\n\t"
+				"move.w %%d4,(6*16,%%a4)	\n\t"
+				"move.w %%d4,(7*16,%%a4)	\n\t"
+			"jmp  	3f					\n\t"
+			"2:							\n\t"
+			/* compute second matrix * P[1],P[3],P[5],P[7]*/
+			"mac.w 	%%d6u, %%a0l, %%acc0					\n\t"
+			"mac.w 	%%d6u, %%a1l, %%acc1					\n\t"
+			"mac.w 	%%d6u, %%a2l, %%acc2					\n\t"
+			"mac.w 	%%d6u, %%a3l, %%acc3					\n\t"
+
+			"mac.w 	%%d6l, %%a1l, %%acc0					\n\t"
+			"msac.w %%d6l, %%a3l, %%acc1					\n\t"
+			"msac.w %%d6l, %%a0l, %%acc2					\n\t"
+			"msac.w %%d6l, %%a2l, %%acc3					\n\t"
+
+			"mac.w 	%%d7u, %%a2l, %%acc0					\n\t"
+			"msac.w %%d7u, %%a0l, %%acc1					\n\t"
+			"mac.w 	%%d7u, %%a3l, %%acc2					\n\t"
+			"mac.w 	%%d7u, %%a1l, %%acc3					\n\t"
+
+			"mac.w 	%%d7l, %%a3l, %%acc0					\n\t"
+			"msac.w %%d7l, %%a2l, %%acc1					\n\t"
+			"mac.w 	%%d7l, %%a1l, %%acc2					\n\t"
+			"msac.w %%d7l, %%a0l, %%acc3					\n\t"
+			/* save second matrix row */
+			"movclr.l %%acc0, %%d2							\n\t"
+			"movclr.l %%acc1, %%d6							\n\t"
+			"movclr.l %%acc2, %%d7							\n\t"
+			"movclr.l %%acc3, %%a5							\n\t"
+			/* rounding: +1 in the P[0] word becomes +2048 after the multiply, half of 1 << 12 */
+			"add.l	#65536*1,%%d4			\n\t"
+			/* compute first matrix * P[0],P[2],P[4],P[6]*/
+			"mac.w 	%%d4u, %%a0u, %%acc0					\n\t"
+			"mac.w 	%%d4u, %%a0u, %%acc1					\n\t"
+			"mac.w 	%%d4u, %%a0u, %%acc2					\n\t"
+			"mac.w 	%%d4u, %%a0u, %%acc3					\n\t"
+
+			"mac.w 	%%d4l, %%a1u, %%acc0					\n\t"
+			"mac.w  %%d4l, %%a3u, %%acc1					\n\t"
+			"msac.w %%d4l, %%a3u, %%acc2					\n\t"
+			"msac.w %%d4l, %%a1u, %%acc3					\n\t"
+
+			"mac.w 	%%d5u, %%a2u, %%acc0					\n\t"
+			"msac.w %%d5u, %%a2u, %%acc1					\n\t"
+			"msac.w %%d5u, %%a2u, %%acc2					\n\t"
+			"mac.w 	%%d5u, %%a2u, %%acc3					\n\t"
+
+			"mac.w 	%%d5l, %%a3u, %%acc0					\n\t"
+			"msac.w %%d5l, %%a1u, %%acc1					\n\t"
+			"mac.w 	%%d5l, %%a1u, %%acc2					\n\t"
+			"msac.w %%d5l, %%a3u, %%acc3					\n\t"
+			/* shift count: pass 1 results are scaled down by >> 12 */
+			"move.l #12, %%d4			\n\t"
+			/* mat 1 + mat 2 */
+			"move.l %%acc0, %%d5		\n\t"
+			"add.l	%%d2,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(%%a4)		\n\t"
+
+			"move.l %%acc1, %%d5		\n\t"
+			"add.l	%%d6,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(1*16,%%a4)	\n\t"
+
+			"move.l %%acc2, %%d5		\n\t"
+			"add.l	%%d7,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(2*16,%%a4)	\n\t"
+
+			"move.l %%acc3, %%d5		\n\t"
+			"add.l	%%a5,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(3*16,%%a4)	\n\t"
+
+			/* mat 1 - mat2 (movclr also empties the accumulators for the next row) */
+			"movclr.l %%acc0, %%d5		\n\t"
+			"sub.l	%%d2,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(7*16,%%a4)	\n\t"
+
+			"movclr.l %%acc1, %%d5		\n\t"
+			"sub.l	%%d6,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(6*16,%%a4)	\n\t"
+
+			"movclr.l %%acc2, %%d5		\n\t"
+			"sub.l	%%d7,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(5*16,%%a4)	\n\t"
+
+			"movclr.l %%acc3, %%d5		\n\t"
+			"sub.l	%%a5,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(4*16,%%a4)	\n\t"
+			"3:							\n\t"
+			/* next line, address +16 */
+			"add.l	#16,%[block]		\n\t"
+			/* next dest column */
+			"add.l	#2,%%a4				\n\t"
+			"sub.l	#1,%%d3				\n\t"
+		"bne 	1b						\n\t"
+		/* for end */
+		/* %[block] now points at the scratch rows written above */
+		"sub.l	#16+128,%%a4			\n\t"	/* back to the initial block address: destination of pass 2 */
+		"move.l #8,%%d3					\n\t"
+		/* columns */
+		"4:								\n\t"
+
+			/* load P[0] P[2] P[4] P[6] P[1] P[3] P[5] P[7] from the row */
+			"movem.l (%[block]),%%d4-%%d7	\n\t"
+
+			/* compute second matrix * P[1],P[3],P[5],P[7]*/
+			"mac.w 	%%d6u, %%a0l, %%acc0					\n\t"
+			"mac.w 	%%d6u, %%a1l, %%acc1					\n\t"
+			"mac.w 	%%d6u, %%a2l, %%acc2					\n\t"
+			"mac.w 	%%d6u, %%a3l, %%acc3					\n\t"
+
+			"mac.w 	%%d6l, %%a1l, %%acc0					\n\t"
+			"msac.w %%d6l, %%a3l, %%acc1					\n\t"
+			"msac.w %%d6l, %%a0l, %%acc2					\n\t"
+			"msac.w %%d6l, %%a2l, %%acc3					\n\t"
+
+			"mac.w 	%%d7u, %%a2l, %%acc0					\n\t"
+			"msac.w %%d7u, %%a0l, %%acc1					\n\t"
+			"mac.w 	%%d7u, %%a3l, %%acc2					\n\t"
+			"mac.w 	%%d7u, %%a1l, %%acc3					\n\t"
+
+			"mac.w 	%%d7l, %%a3l, %%acc0					\n\t"
+			"msac.w %%d7l, %%a2l, %%acc1					\n\t"
+			"mac.w 	%%d7l, %%a1l, %%acc2					\n\t"
+			"msac.w %%d7l, %%a0l, %%acc3					\n\t"
+
+			/* save second matrix row */
+			"movclr.l %%acc0, %%d2							\n\t"
+			"movclr.l %%acc1, %%d6							\n\t"
+			"movclr.l %%acc2, %%d7							\n\t"
+			"movclr.l %%acc3, %%a5							\n\t"
+
+			/* rounding: +32 in the P[0] word becomes +65536 after the multiply, */
+			/* which is half of 1 << 17 */
+			"add.l	#32*65536,%%d4				\n\t"
+			/* compute first matrix * P[0],P[2],P[4],P[6]*/
+			"mac.w 	%%d4u, %%a0u, %%acc0					\n\t"
+			"mac.w 	%%d4u, %%a0u, %%acc1					\n\t"
+			"mac.w 	%%d4u, %%a0u, %%acc2					\n\t"
+			"mac.w 	%%d4u, %%a0u, %%acc3					\n\t"
+
+			"mac.w 	%%d4l, %%a1u, %%acc0					\n\t"
+			"mac.w  %%d4l, %%a3u, %%acc1					\n\t"
+			"msac.w %%d4l, %%a3u, %%acc2					\n\t"
+			"msac.w %%d4l, %%a1u, %%acc3					\n\t"
+
+			"mac.w 	%%d5u, %%a2u, %%acc0					\n\t"
+			"msac.w %%d5u, %%a2u, %%acc1					\n\t"
+			"msac.w %%d5u, %%a2u, %%acc2					\n\t"
+			"mac.w 	%%d5u, %%a2u, %%acc3					\n\t"
+
+			"mac.w 	%%d5l, %%a3u, %%acc0					\n\t"
+			"msac.w %%d5l, %%a1u, %%acc1					\n\t"
+			"mac.w 	%%d5l, %%a1u, %%acc2					\n\t"
+			"msac.w %%d5l, %%a3u, %%acc3					\n\t"
+
+			/* shift count: pass 2 results are scaled down by >> 17 */
+			"move.l #17, %%d4			\n\t"
+
+			/* mat 1 + mat 2 */
+			"move.l %%acc0, %%d5		\n\t"
+			"add.l	%%d2,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(%%a4)		\n\t"
+
+			"move.l %%acc1, %%d5		\n\t"
+			"add.l	%%d6,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(1*16,%%a4)	\n\t"
+
+			"move.l %%acc2, %%d5		\n\t"
+			"add.l	%%d7,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(2*16,%%a4)	\n\t"
+
+			"move.l %%acc3, %%d5		\n\t"
+			"add.l	%%a5,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(3*16,%%a4)	\n\t"
+
+			/* mat 1 - mat2*/
+			"movclr.l %%acc0, %%d5		\n\t"
+			"sub.l	%%d2,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(7*16,%%a4)	\n\t"
+
+			"movclr.l %%acc1, %%d5		\n\t"
+			"sub.l	%%d6,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(6*16,%%a4)	\n\t"
+
+			"movclr.l %%acc2, %%d5		\n\t"
+			"sub.l	%%d7,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(5*16,%%a4)	\n\t"
+
+			"movclr.l %%acc3, %%d5		\n\t"
+			"sub.l	%%a5,%%d5			\n\t"
+			"asr.l %%d4, %%d5			\n\t"
+			"move.w %%d5,(4*16,%%a4)	\n\t"
+			/****************************/
+
+			/* next line, address +16 */
+			"add.l	#16,%[block]			\n\t"
+			/* next dest column */
+			"add.l	#2,%%a4			\n\t"
+
+			"sub.l #1,%%d3					\n\t"
+		"bne 	4b						\n\t"
+		/* undo the 2 * 128 bytes the pointer was advanced by */
+		"sub.l	#16*16,%[block]			\n\t"
+		:	[block]"+a"(row)
+		:
+		:	"%a0","%a1","%a2","%a3","%a4","%a5","%d2","%d3","%d4","%d5","%d6","%d7","cc","memory"
+	);
+
+}
+#endif
+
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+/* Saturates the 64 int16_t values at block to 0..255, stores them as 8 rows
+ * of 8 bytes at dest (rows are stride bytes apart) and zeroes block[0..63].
+ * Each value is sign extended before the compare so that negative results
+ * clamp to 0 and not to 255. */
+static  void clip_block_to_dest(int16_t * block, uint8_t * dest,const int stride)
+{
+	asm volatile(
+		"move.l %[dest],%%a0					\n\t"
+		/* init loop for i=0 to 7 */
+		"move.l #8,%%d1							\n\t"
+	"9:										\n\t"
+		"movem.l (%[block]),%%d4-%%d7			\n\t"	/* the 8 values of this row */
+
+		/* dot 2*/
+		"move.w %%d4,%%d3				\n\t"
+		"ext.l   %%d3					\n\t"
+		"cmp.l   #255,%%d3   			\n\t"
+		"bls.b   2f          			\n\t"
+		"spl.b   %%d3					\n\t"
+		"2:                  			\n\t"
+
+		/* dot 1*/
+		"swap.w %%d4					\n\t"
+		"move.w %%d4,%%d0				\n\t"
+		"ext.l   %%d0					\n\t"
+		"cmp.l   #255,%%d0   			\n\t"  /* overflow? */
+		"bls.b   1f          			\n\t"  /* no: return value */
+		"spl.b   %%d0					\n\t"  /* yes: set low byte to appropriate boundary */
+		"1:                  			\n\t"
+
+		/* save dot 1*/
+		"move.b %%d0,%%d2				\n\t"
+		/* save dot 2*/
+		"lsl.l  #8,%%d2					\n\t"
+		"move.b %%d3,%%d2				\n\t"
+
+		/* dot 4*/
+		"move.w %%d5,%%d3				\n\t"
+		"ext.l   %%d3					\n\t"
+		"cmp.l   #255,%%d3   			\n\t"
+		"bls.b   4f          			\n\t"
+		"spl.b   %%d3					\n\t"
+		"4:                  			\n\t"
+
+		/* dot 3*/
+		"swap.w %%d5					\n\t"
+		"move.w %%d5,%%d0				\n\t"
+		"ext.l   %%d0					\n\t"
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   3f          			\n\t"
+		"spl.b   %%d0					\n\t"
+		"3:                  			\n\t"
+
+		/* save dot 3*/
+		"lsl.l  #8,%%d2					\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		/* save dot 4*/
+		"lsl.l  #8,%%d2					\n\t"
+		"move.b %%d3,%%d2				\n\t"
+
+		/* writes 4 bytes */
+		"move.l %%d2,(%%a0)				\n\t"
+
+		/* dot 6 */
+		"move.w %%d6,%%d3				\n\t"
+		"ext.l   %%d3					\n\t"
+		"cmp.l   #255,%%d3   			\n\t"
+		"bls.b   6f          			\n\t"
+		"spl.b   %%d3					\n\t"
+		"6:                  			\n\t"
+
+		"move.l #0,%%d4					\n\t"	/* d4 is free now: zero used to clear the row */
+		/* dot 5 */
+		"move.l %%d4,(%[block])+		\n\t"
+		"swap.w %%d6					\n\t"
+		"move.w %%d6,%%d0				\n\t"
+		"ext.l   %%d0					\n\t"
+		"move.l %%d4,(%[block])+		\n\t"
+		"cmp.l	#255,%%d0   			\n\t"
+		"bls.b  5f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"5:                  			\n\t"
+
+		/* save dot 5*/
+		"move.b %%d0,%%d2				\n\t"
+		/* save dot 6*/
+		"lsl.l  #8,%%d2					\n\t"
+		"move.b %%d3,%%d2				\n\t"
+
+		/* dot 8*/
+		"move.w %%d7,%%d3				\n\t"
+		"ext.l   %%d3					\n\t"
+		"cmp.l   #255,%%d3   			\n\t"
+		"bls.b   8f         	 		\n\t"
+		"spl.b   %%d3					\n\t"
+		"8:  		                  	\n\t"
+
+		"move.l %%d4,(%[block])+		\n\t"
+		/* dot 7*/
+		"swap.w %%d7					\n\t"
+		"move.w %%d7,%%d0				\n\t"
+		"ext.l   %%d0					\n\t"
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   7f          			\n\t"
+		"spl.b   %%d0					\n\t"
+		"7:                  			\n\t"
+
+		"move.l %%d4,(%[block])+		\n\t"
+		/* save dot 7*/
+		"lsl.l  #8,%%d2					\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		/* save dot 8*/
+		"lsl.l  #8,%%d2					\n\t"
+		"move.b %%d3,%%d2				\n\t"
+
+		/* write to dest */
+		"move.l %%d2,(4,%%a0)			\n\t"
+		/* next line, address +stride */
+		"add.l	%[stride],%%a0			\n\t"
+		"sub.l	#1,%%d1					\n\t"
+	"bne 	9b							\n\t"
+		/* for end: undo the 8 * 16 bytes of post-increment */
+		"sub.l	#128,%[block]			\n\t"
+		:	[block]"+a"(block)
+		:	[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7","cc","memory"
+	);
+}
+#endif
+
+
+
+#ifdef CPU_COLDFIRE__
+static  void clip_block_to_dest_add(int16_t * block, uint8_t * dest,const int stride)
+{
+	asm volatile (
+		/* variant built only when CPU_COLDFIRE__ is defined: dest[] += block[], saturated, one byte at a time */
+		"move.l %[dest],%%a1					\n\t"
+		/* init loop for i=0 to 7 */	
+		"move.l #8,%%d1							\n\t"
+		"clr.l %%d2								\n\t"
+		"foradd2:								\n\t"
+		/* pixel 0: sign extended coefficient + dest byte, clamped */
+		"move.w (%[block]),%%d3					\n\t"
+		"move.b (%%a1),%%d2					\n\t"
+		"ext.l %%d3							\n\t"	
+		"add.l %%d2,%%d3					\n\t"	
+		"cmp.l   #255,%%d3   				\n\t"  /* overflow? */
+		"bls.b   1f          				\n\t"  /* no: return value */
+		"spl.b   %%d3						\n\t"  /* yes: set low byte to appropriate boundary */
+		"1	:                    			\n\t"
+
+		/* pixel 1, shifted into d3 behind pixel 0 */
+		"move.w (2,%[block]),%%d0				\n\t"
+		"move.b (1,%%a1),%%d2				\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l %%d2,%%d0					\n\t"	
+		"cmp.l   #255,%%d0   				\n\t"
+		"bls.b   2f          				\n\t"
+		"spl.b   %%d0						\n\t"  
+		"2	:                    			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3					\n\t"
+
+		/* pixel 2 */
+		"move.w (4,%[block]),%%d0				\n\t"
+		"move.b (2,%%a1),%%d2				\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l %%d2,%%d0				\n\t"
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   3f          			\n\t"
+		"spl.b   %%d0					\n\t"  
+		"3	:                    		\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		/* pixel 3 */
+		"move.w (6,%[block]),%%d0			\n\t"
+		"move.b (3,%%a1),%%d2			\n\t"	
+		"ext.l %%d0						\n\t"	
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  4f          			\n\t"
+		"spl.b  %%d0					\n\t"  
+		"4	:                    		\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		
+		/* writes 4 bytes */
+		"move.l %%d3,(%%a1)		\n\t"	
+		
+		/* pixel 4 starts the second longword in d3 */
+		"move.w (8,%[block]),%%d3		\n\t"
+		"move.b (4,%%a1),%%d2			\n\t"	
+		"ext.l %%d3							\n\t"	
+		"add.l	%%d2,%%d3			\n\t"
+		"cmp.l	#255,%%d3   			\n\t"
+		"bls.b  5f          			\n\t"
+		"spl.b  %%d3					\n\t"  
+		"5	:                    		\n\t"
+		/* pixel 5 */
+		"move.w (10,%[block]),%%d0		\n\t"
+		"move.b (5,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"	
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   6f          			\n\t"
+		"spl.b   %%d0					\n\t"  
+		"6	:                    		\n\t"
+		"lsl.l  #8,%%d3					\n\t"	
+		"move.b %%d0,%%d3				\n\t"
+		/* pixel 6 */
+		"move.w (12,%[block]),%%d0		\n\t"
+		"move.b (6,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"	
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   7f          			\n\t"
+		"spl.b   %%d0					\n\t"  
+		"7	:                    		\n\t"
+		"lsl.l  #8,%%d3					\n\t"	
+		"move.b %%d0,%%d3				\n\t"
+		/* pixel 7 */
+		"move.w (14,%[block]),%%d0		\n\t"
+		"move.b (7,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"	
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   8f         	 		\n\t"
+		"spl.b   %%d0					\n\t"  
+		"8	:  		                  	\n\t"
+		"lsl.l  #8,%%d3					\n\t"			
+		"move.b %%d0,%%d3				\n\t"
+		
+		/* write to dest */	
+		"move.l %%d3,(4,%%a1)		\n\t"
+		
+		/* clear row, address +16 */
+		"move.l #0,%%d0				\n\t"
+		"move.l %%d0,(%[block])+		\n\t"
+		"move.l %%d0,(%[block])+		\n\t"
+		"move.l %%d0,(%[block])+		\n\t"
+		"move.l %%d0,(%[block])+		\n\t"
+		
+		/* next line, address +stride */	
+		"add.l	%[stride],%%a1		\n\t"	
+		"sub.l	#1,%%d1					\n\t"			
+		"bne 	foradd2						\n\t"	
+		/* for end: undo the 8 * 16 bytes of post-increment */
+		"sub.l	#128,%[block]					\n\t"			
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a1","%d0","%d1","%d2","%d3"
+	);
 
+} 
+#endif
+
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+/* Adds the 64 int16_t values at block to the 8x8 bytes at dest (rows are
+ * stride bytes apart), saturating every sum to 0..255, and zeroes
+ * block[0..63].  dest is read and written as two longwords per row. */
+static  void clip_block_to_dest_add(int16_t * block, uint8_t * dest,const int stride)
+{
+	asm volatile (
+		"move.l %[dest],%%a0					\n\t"
+		/* init loop for i=0 to 7 */
+		"move.l #8,%%d1							\n\t"
+		"clr.l %%d2								\n\t"	/* only written with move.b below: stays zero extended */
+		"9:										\n\t"
+
+		"movem.l (%%a0),%%d4-%%d5					\n\t"
+		"movem.l (%[block]),%%d6-%%d7/%%a1-%%a2		\n\t"
+
+		/* dot 4 */
+		"move.w %%d7,%%d0				\n\t"
+		"ext.l %%d0						\n\t"
+		"move.b %%d4,%%d2				\n\t"
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  4f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"4:                  			\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		/* dot 3 */
+		"swap.w %%d7					\n\t"
+		"ext.l %%d7						\n\t"
+		"lsr.l  #8,%%d4					\n\t"
+		"move.b %%d4,%%d2				\n\t"
+		"add.l 	%%d2,%%d7				\n\t"
+		"cmp.l  #255,%%d7   			\n\t"
+		"bls.b  3f          			\n\t"
+		"spl.b  %%d7					\n\t"
+		"3:                  			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d7,%%d3				\n\t"
+
+		/* dot 2 */
+		"move.w %%d6,%%d0				\n\t"
+		"ext.l %%d0						\n\t"
+		"lsr.l  #8,%%d4					\n\t"
+		"move.b %%d4,%%d2				\n\t"
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  2f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"2:                  			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		/* dot 1 */
+		"swap.w %%d6					\n\t"
+		"ext.l %%d6						\n\t"
+		"lsr.l  #8,%%d4					\n\t"
+		"move.b %%d4,%%d2				\n\t"
+		"add.l 	%%d2,%%d6				\n\t"
+		"cmp.l  #255,%%d6   			\n\t"
+		"bls.b  1f          			\n\t"
+		"spl.b  %%d6					\n\t"
+		"1:                  			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d6,%%d3				\n\t"
+
+		"move.l  #0x00FF00FF,%%d7		\n\t"  /* val  = ABCD */
+		"and.l   %%d3,%%d7	  			\n\t"  /* mask = .B.D */
+		"eor.l   %%d7,%%d3	  			\n\t"  /* val  = A.C. */
+		"lsl.l   #8,%%d7		  		\n\t"  /* mask = B.D. */
+		"lsr.l   #8,%%d3			  	\n\t"  /* val  = .A.C */
+		"or.l    %%d7,%%d3	  			\n\t"  /* val  = BADC */
+		"swap    %%d3	          		\n\t"  /* val  = DCBA */
+
+		/* writes 4 bytes */
+		"move.l %%d3,(%%a0)				\n\t"
+
+		/* dot 8 */
+		"move.l %%a2,%%d0				\n\t"
+		"ext.l %%d0						\n\t"
+		"move.b %%d5,%%d2				\n\t"
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  8f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"8:                  			\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		/* dot 7 */
+		"move.l %%a2,%%d0				\n\t"
+		"swap	%%d0					\n\t"
+		"ext.l %%d0						\n\t"
+		"lsr.l  #8,%%d5					\n\t"
+		"move.b %%d5,%%d2				\n\t"
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  7f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"7:                  			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		/* dot 6 */
+		"move.l %%a1,%%d0				\n\t"
+		"ext.l %%d0						\n\t"
+		"lsr.l  #8,%%d5					\n\t"
+		"move.b %%d5,%%d2				\n\t"
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  6f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"6:                  			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		/* dot 5 */
+		"move.l %%a1,%%d0				\n\t"
+		"swap	%%d0					\n\t"
+		"ext.l %%d0						\n\t"
+		"lsr.l  #8,%%d5					\n\t"
+		"move.b %%d5,%%d2				\n\t"
+		"add.l 	%%d2,%%d0				\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  5f          			\n\t"
+		"spl.b  %%d0					\n\t"
+		"5:                  			\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		"move.l  #0x00FF00FF,%%d7		\n\t"  /* val  = ABCD */
+
+		/* clear row, address +16 */ /* (pipelined below)*/
+		"move.l #0,%%d0					\n\t"
+		"move.l %%d0,(%[block])+		\n\t"
+		"and.l   %%d3,%%d7	  			\n\t"  /* mask = .B.D */
+		"move.l %%d0,(%[block])+		\n\t"
+		"eor.l   %%d7,%%d3	  			\n\t"  /* val  = A.C. */
+		"move.l %%d0,(%[block])+		\n\t"
+		"lsl.l   #8,%%d7		  		\n\t"  /* mask = B.D. */
+		"move.l %%d0,(%[block])+		\n\t"
+		"lsr.l   #8,%%d3			  	\n\t"  /* val  = .A.C */
+
+		"or.l    %%d7,%%d3	  			\n\t"  /* val  = BADC */
+		"swap    %%d3	          		\n\t"  /* val  = DCBA */
+
+
+		/* writes 4 bytes */
+		"move.l %%d3,(4,%%a0)			\n\t"
+
+		/* next line, address +stride */
+		"add.l	%[stride],%%a0			\n\t"
+		"sub.l	#1,%%d1					\n\t"
+		"bne 	9b						\n\t"
+		/* for end: undo the 8 * 16 bytes of post-increment */
+		"sub.l	#128,%[block]			\n\t"
+		:	[block]"+a"(block)
+		:	[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%a1","%a2","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7","cc","memory"
+	);
+
+}
+#endif
+
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+/* DC-only shortcut: adds DC = (block[0] + 64) >> 7 to each of the 8x8 bytes
+ * at dest (rows are stride bytes apart), saturating every sum to 0..255,
+ * and zeroes block[0] and block[63].
+ * Every sum is built in its own register from the zero extended dest byte
+ * in d2, so d2 never carries bits above the low byte from one pixel to the
+ * next; otherwise the first clamped pixel would push all later pixels of
+ * the block out of range as well. */
+static  void clip_block_to_dest_add_DC(int16_t * block, uint8_t * dest,const int stride)
+{
+    asm volatile (
+		"move.l %[dest],%%a1					\n\t"
+
+    	/* DC = (block[0] + 64) >> 7; */
+		"move.w (%[block]),%%d0					\n\t"
+    	"ext.l %%d0							\n\t"
+    	"add.l #64,%%d0						\n\t"
+		"asr.l #7,%%d0						\n\t"
+
+        /*block[0] = block[63] = 0;*/
+    	"move.w #0,%%d2					\n\t"
+ 		"move.w %%d2,(%[block])					\n\t"
+    	/* init loop for i=0 to 7 */
+		"move.l #8,%%d1						\n\t"	 	 /* pipeline on store */
+ 		"move.w %%d2,(63*2,%[block])				\n\t"
+
+		/* d2 is only written with move.b inside the loop: stays zero extended */
+		"clr.l %%d2							\n\t"
+		"9:									\n\t"
+			"movem.l (%%a1),%%d4-%%d5					\n\t"
+
+			/* read dot 3, add DC, clamp in d3 */
+			"move.b %%d4,%%d2				\n\t"
+			"move.l %%d0,%%d3				\n\t"
+			"add.l 	%%d2,%%d3				\n\t"
+			"cmp.l  #255,%%d3   			\n\t"
+			"bls.b  4f          			\n\t"
+			"spl.b  %%d3					\n\t"
+			"4:                  			\n\t"
+
+			/* read dot 2, add DC, clamp in d6 */
+			"lsr.l   #8,%%d4			  	\n\t"
+    		"move.b %%d4,%%d2				\n\t"
+			"move.l %%d0,%%d6				\n\t"
+			"add.l  %%d2,%%d6				\n\t"
+			"cmp.l   #255,%%d6   			\n\t"
+			"bls.b   3f          			\n\t"
+			"spl.b   %%d6					\n\t"
+			"3:                  			\n\t"
+
+			/* read dot 1, add DC, clamp in d7 */
+			"lsr.l   #8,%%d4			  	\n\t"
+    		"move.b %%d4,%%d2				\n\t"
+			"move.l %%d0,%%d7				\n\t"
+			"add.l  %%d2,%%d7				\n\t"
+			"cmp.l   #255,%%d7   			\n\t"
+			"bls.b   2f          			\n\t"
+			"spl.b   %%d7					\n\t"
+			"2:                  			\n\t"
+
+			/* dot 0: after the third shift d4 is the zero extended byte, clamp in place */
+			"lsr.l   #8,%%d4			  	\n\t"
+			"add.l   %%d0,%%d4				\n\t"
+			"cmp.l   #255,%%d4   			\n\t"  /* overflow? */
+			"bls.b   1f          			\n\t"  /* no: return value */
+			"spl.b   %%d4					\n\t"  /* yes: set low byte to appropriate boundary */
+			"1:                  			\n\t"
+
+    		/* save dot 0 1 2 3 (dot 0 is already in the low byte of d4) */
+    		"lsl.l  #8,%%d4					\n\t"
+    		"move.b %%d7,%%d4				\n\t"
+    		"lsl.l  #8,%%d4					\n\t"
+    		"move.b %%d6,%%d4				\n\t"
+    		"lsl.l  #8,%%d4					\n\t"
+    		"move.b %%d3,%%d4				\n\t"
+
+			/* read dot 7, add DC, clamp in d3 */
+			"move.b %%d5,%%d2				\n\t"
+			"move.l %%d0,%%d3				\n\t"
+			"add.l 	%%d2,%%d3				\n\t"
+			"cmp.l  #255,%%d3   			\n\t"
+			"bls.b  8f          			\n\t"
+			"spl.b  %%d3					\n\t"
+			"8:                  			\n\t"
+
+			/* read dot 6, add DC, clamp in d6 */
+			"lsr.l   #8,%%d5			  	\n\t"
+    		"move.b %%d5,%%d2				\n\t"
+			"move.l %%d0,%%d6				\n\t"
+			"add.l  %%d2,%%d6				\n\t"
+			"cmp.l   #255,%%d6   			\n\t"
+			"bls.b   7f          			\n\t"
+			"spl.b   %%d6					\n\t"
+			"7:                  			\n\t"
+
+			/* read dot 5, add DC, clamp in d7 */
+			"lsr.l   #8,%%d5			  	\n\t"
+    		"move.b %%d5,%%d2				\n\t"
+			"move.l %%d0,%%d7				\n\t"
+			"add.l  %%d2,%%d7				\n\t"
+			"cmp.l   #255,%%d7   			\n\t"
+			"bls.b   6f          			\n\t"
+			"spl.b   %%d7					\n\t"
+			"6:                  			\n\t"
+
+			/* dot 4: after the third shift d5 is the zero extended byte, clamp in place */
+			"lsr.l   #8,%%d5			  	\n\t"
+			"add.l   %%d0,%%d5				\n\t"
+			"cmp.l   #255,%%d5   			\n\t"  /* overflow? */
+			"bls.b   5f          			\n\t"  /* no: return value */
+			"spl.b   %%d5					\n\t"  /* yes: set low byte to appropriate boundary */
+			"5:                  			\n\t"
+
+    		/* save dot 4 5 6 7 (dot 4 is already in the low byte of d5) */
+    		"lsl.l  #8,%%d5					\n\t"
+    		"move.b %%d7,%%d5				\n\t"
+    		"lsl.l  #8,%%d5					\n\t"
+    		"move.b %%d6,%%d5				\n\t"
+    		"lsl.l  #8,%%d5					\n\t"
+    		"move.b %%d3,%%d5				\n\t"
+
+			/* write to dest */
+			"movem.l %%d4-%%d5,(%%a1)		\n\t"
+
+			"add.l	%[stride],%%a1			\n\t"
+			"sub.l	#1,%%d1					\n\t"
+		"bne 	9b							\n\t"
+		/* for end */
+
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%a1","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7","cc","memory"
+	);
+
+}
+#endif
+
+#ifdef CPU_COLDFIRE__
+static  void clip_block_to_dest_add_DC(int16_t * block, uint8_t * dest,const int stride)
+{
+    asm volatile (
+		/* variant built only when CPU_COLDFIRE__ is defined: adds DC to the 8x8 bytes at dest, byte by byte */
+		"move.l %[block],%%a0					\n\t"
+		"move.l %[dest],%%a1					\n\t"
+    	/* NOTE(review): add.l leaves bits above the low byte of d2 set once a sum clamps, and the next move.b keeps them - verify before enabling */
+    	/* DC = (block[0] + 64) >> 7, then block[0] and block[63] are zeroed */
+		"move.w (%%a0),%%d0					\n\t"
+    	"ext.l %%d0							\n\t"		
+    	"add.l #64,%%d0						\n\t"
+		"asr.l #7,%%d0						\n\t"
+ 		"move.w #0,(%%a0)					\n\t"
+    	"move.w #0,%%d2					\n\t"	
+ 		"move.w %%d2,(126,%%a0)				\n\t"	
+	
+		/* init loop for i=0 to 7 */
+		"move.l #8,%%d1						\n\t"
+		"clr.l %%d2							\n\t"
+		"foraddDC:							\n\t"
+			/* pixel 0: first byte collected in d3 */
+			"move.b (%%a1),%%d2					\n\t"
+			"add.l %%d0,%%d2					\n\t"	
+			"cmp.l   #255,%%d2   				\n\t"  /* overflow? */
+			"bls.b   1f          				\n\t"  /* no: return value */
+			"spl.b   %%d2						\n\t"  /* yes: set low byte to appropriate boundary */
+			"1	:                    			\n\t"
+			"move.b %%d2,%%d3				\n\t"
+			"lsl.l  #8,%%d3					\n\t"
+			/* pixel 1 */
+			"move.b (1,%%a1),%%d2				\n\t"	
+			"add.l %%d0,%%d2					\n\t"	
+			"cmp.l   #255,%%d2   				\n\t"
+			"bls.b   2f          				\n\t"
+			"spl.b   %%d2						\n\t"  
+			"2	:                    			\n\t"
+			"move.b %%d2,%%d3				\n\t"
+			"lsl.l  #8,%%d3					\n\t"
+			/* pixel 2 */
+			"move.b (2,%%a1),%%d2			\n\t"	
+			"add.l %%d0,%%d2			\n\t"
+			"cmp.l   #255,%%d2   			\n\t"
+			"bls.b   3f          			\n\t"
+			"spl.b   %%d2					\n\t"  
+			"3	:                    		\n\t"
+			"move.b %%d2,%%d3				\n\t"
+			"lsl.l  #8,%%d3					\n\t"
+			/* pixel 3 completes d3 */
+			"move.b (3,%%a1),%%d2			\n\t"	
+			"add.l 	%%d0,%%d2			\n\t"
+			"cmp.l  #255,%%d2   			\n\t"
+			"bls.b  4f          			\n\t"
+			"spl.b  %%d2					\n\t"  
+			"4	:                    		\n\t"
+			"move.b %%d2,%%d3				\n\t"
+			/* pixel 4: first byte collected in d4 */
+			"move.b (4,%%a1),%%d2			\n\t"	
+			"add.l	%%d0,%%d2			\n\t"
+			"cmp.l	#255,%%d2   			\n\t"
+			"bls.b  5f          			\n\t"
+			"spl.b  %%d2					\n\t"  
+			"5	:                    		\n\t"
+			"move.b %%d2,%%d4				\n\t"
+			"lsl.l  #8,%%d4					\n\t"
+			/* pixel 5 */
+			"move.b (5,%%a1),%%d2			\n\t"	
+			"add.l 	%%d0,%%d2			\n\t"	
+			"cmp.l   #255,%%d2   			\n\t"
+			"bls.b   6f          			\n\t"
+			"spl.b   %%d2					\n\t"  
+			"6	:                    		\n\t"
+			"move.b %%d2,%%d4				\n\t"
+			"lsl.l  #8,%%d4					\n\t"
+			/* pixel 6 */
+			"move.b (6,%%a1),%%d2			\n\t"	
+			"add.l 	%%d0,%%d2			\n\t"	
+			"cmp.l   #255,%%d2   			\n\t"
+			"bls.b   7f          			\n\t"
+			"spl.b   %%d2					\n\t"  
+			"7	:                    		\n\t"
+			"move.b %%d2,%%d4				\n\t"
+			"lsl.l  #8,%%d4					\n\t"
+			/* pixel 7 completes d4 */
+			"move.b (7,%%a1),%%d2			\n\t"	
+			"add.l 	%%d0,%%d2			\n\t"	
+			"cmp.l   #255,%%d2   			\n\t"
+			"bls.b   8f         	 		\n\t"
+			"spl.b   %%d2					\n\t"  
+			"8	:  		                  	\n\t"
+			"move.b %%d2,%%d4				\n\t"
+	
+			/* write to dest */	
+			"movem.l %%d3-%%d4,(%%a1)		\n\t"
+			
+			/* next line, address +16 */
+	    	"add.l	#16,%%a0				\n\t"	
+			"add.l	%[stride],%%a1			\n\t"	
+			"sub.l	#1,%%d1					\n\t"			
+		"bne 	foraddDC					\n\t"	
+		/* for end */
+			
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%a1","%d0","%d1","%d2","%d3","%d4"
+	);
+
+} 
+#endif
+
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
 static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
                                const int stride)
 {
-    int i;
-
-    for (i = 0; i < 8; i++)
+	idct (block);	/* in-place asm IDCT; also writes scratch data at block + 128 bytes */
+	clip_block_to_dest(block,dest,stride);	/* clamp to bytes, store 8 rows at dest, zero the block */
+}
+#else
+static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
+                               const int stride)
+{
+	int i;
+	for (i = 0; i < 8; i++)
         idct_row (block + 8 * i);
     for (i = 0; i < 8; i++)
         idct_col (block + i);
@@ -210,18 +1283,31 @@
         dest += stride;
         block += 8;
     } while (--i);
-}
+}	
+#endif 
 
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+static void mpeg2_idct_add_c (const int last, int16_t * block,
+                              uint8_t * dest, const int stride)
+{
+    if (last == 129 && (block[0] & (7 << 4)) != (4 << 4)) {   /* shortcut: only block[0] is used */
+        clip_block_to_dest_add_DC (block, dest, stride);
+        return;
+    }
+    idct (block);   /* full transform, then saturating add into dest */
+    clip_block_to_dest_add (block, dest, stride);
+}
+#else
 static void mpeg2_idct_add_c (const int last, int16_t * block,
                               uint8_t * dest, const int stride)
 {
     int i;
 
     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
-        for (i = 0; i < 8; i++)
-            idct_row (block + 8 * i);
-        for (i = 0; i < 8; i++)
-            idct_col (block + i);
+		for (i = 0; i < 8; i++)
+        	idct_row (block + 8 * i);
+    	for (i = 0; i < 8; i++)
+        	idct_col (block + i);
         do {
             dest[0] = CLIP (block[0] + dest[0]);
             dest[1] = CLIP (block[1] + dest[1]);
@@ -257,7 +1343,8 @@
         } while (--i);
     }
 }
-
+#endif	
+	
 void mpeg2_idct_init (uint32_t accel)
 {
     (void)accel;
Index: apps/plugins/mpegplayer/mpeg2_internal.h
===================================================================
RCS file: /cvsroot/rockbox/apps/plugins/mpegplayer/mpeg2_internal.h,v
retrieving revision 1.1
diff -u -r1.1 mpeg2_internal.h
--- apps/plugins/mpegplayer/mpeg2_internal.h	7 Aug 2006 22:11:07 -0000	1.1
+++ apps/plugins/mpegplayer/mpeg2_internal.h	25 Sep 2006 22:15:26 -0000
@@ -92,8 +92,12 @@
     int16_t dc_dct_pred[3];
 
     /* DCT coefficients */
+#if defined(CPU_COLDFIRE) && !defined(SIMULATOR)
+    /* 64 coefficients plus 64 more entries of scratch space for the asm IDCT */
+    int16_t DCTblock[128] ATTR_ALIGN(64);
+#else
     int16_t DCTblock[64] ATTR_ALIGN(64);
-
+#endif
     uint8_t * picture_dest[3];
     void (* convert) (void * convert_id, uint8_t * const * src,
 		      unsigned int v_offset);
