Index: apps/plugins/mpegplayer/decode.c
===================================================================
RCS file: /cvsroot/rockbox/apps/plugins/mpegplayer/decode.c,v
retrieving revision 1.1
diff -u -r1.1 decode.c
--- apps/plugins/mpegplayer/decode.c	7 Aug 2006 22:11:07 -0000	1.1
+++ apps/plugins/mpegplayer/decode.c	14 Sep 2006 21:49:14 -0000
@@ -416,14 +416,21 @@
 
 }
 
+#ifdef CPU_COLDFIRE
+static mpeg2dec_t static_mpeg2dec IBSS_ATTR;
+#endif
+
 mpeg2dec_t * mpeg2_init (void)
 {
     mpeg2dec_t * mpeg2dec;
 
     mpeg2_accel (MPEG2_ACCEL_DETECT);
 
-    mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),
-					    MPEG2_ALLOC_MPEG2DEC);
+#ifdef CPU_COLDFIRE
+	mpeg2dec = &static_mpeg2dec;
+#else  
+	mpeg2dec = (mpeg2dec_t *) mpeg2_malloc (sizeof (mpeg2dec_t),MPEG2_ALLOC_MPEG2DEC);
+#endif    
     if (mpeg2dec == NULL)
 	return NULL;
 
Index: apps/plugins/mpegplayer/idct.c
===================================================================
RCS file: /cvsroot/rockbox/apps/plugins/mpegplayer/idct.c,v
retrieving revision 1.2
diff -u -r1.2 idct.c
--- apps/plugins/mpegplayer/idct.c	8 Aug 2006 22:56:35 -0000	1.2
+++ apps/plugins/mpegplayer/idct.c	14 Sep 2006 21:49:16 -0000
@@ -28,13 +28,38 @@
 #include "mpeg2.h"
 #include "attributes.h"
 #include "mpeg2_internal.h"
-
+#define W0 2048 /* 2048 * sqrt (2) * cos (4 * pi / 16) */
 #define W1 2841 /* 2048 * sqrt (2) * cos (1 * pi / 16) */
 #define W2 2676 /* 2048 * sqrt (2) * cos (2 * pi / 16) */
 #define W3 2408 /* 2048 * sqrt (2) * cos (3 * pi / 16) */
+#define W4 2048 /* 2048 * sqrt (2) * cos (4 * pi / 16) */
 #define W5 1609 /* 2048 * sqrt (2) * cos (5 * pi / 16) */
 #define W6 1108 /* 2048 * sqrt (2) * cos (6 * pi / 16) */
 #define W7 565  /* 2048 * sqrt (2) * cos (7 * pi / 16) */
+ 
+#define SC(x) #x
+
+#define WA 1448 //=cos(pi/4)	2048*cos(4pi/16)  	0.707106781 
+#define WB 2009 //=cos(pi/16)	2048*cos(pi/16)	 	0.98078528	
+#define WC 1892 //=cos(pi/8)	2048*cos(2pi/16)	0.923879533
+#define WD 1703 //=cos(3pi/16)	2048*cos(3pi/16)	0.831469612
+#define WE 1448 //=sin(pi/4)	2048*sin(4pi/16)	0.707106781
+#define WF 1138 //=sin(3pi/16)	2048*sin(3pi/16)	0.555570233
+#define WG 784 //=sin(pi/8)	2048*sin(2pi/16)	0.382683432
+#define WH 400 //=sin(pi/16)	2048*sin(pi/16)		0.195090322
+
+/* theoretical matrix used in the asm 1d_idct */
+/* 
+{WA,  WC,  WE,  WG,
+ WA,  WG, -WE, -WC,
+ WA, -WG, -WE,  WC,
+ WA, -WC,  WE, -WG};
+
+{WB,  WD,  WF,  WH,
+ WD, -WH, -WB, -WF,
+ WF, -WB,  WH,  WD,
+ WH, -WF,  WD, -WB};
+*/
 
 /* idct main entry point  */
 void (* mpeg2_idct_copy) (int16_t * block, uint8_t * dest, int stride);
@@ -51,13 +76,14 @@
 static inline unsigned CLIP(int value)
 {
     asm (  /* Note: Uses knowledge that only the low byte of the result is used */
-        "cmp.l   #255,%[v]   \n"  /* overflow? */
+       "cmp.l   #255,%[v]   \n"  /* overflow? */
         "bls.b   1f          \n"  /* no: return value */
         "spl.b   %[v]        \n"  /* yes: set low byte to appropriate boundary */
-    "1:                      \n"
+    "1	:                    \n"
         : /* outputs */
         [v]"+d"(value)
     );
+	
     return value;
 }
 #elif defined CPU_ARM
@@ -76,6 +102,21 @@
 #define CLIP(i) ((mpeg2_clip + 3840)[i])
 #endif
 
+#define MATRIX_MUL(m00,m01,m02,m03,\
+				m10,m11,m12,m13, \
+				m20,m21,m22,m23, \
+				m30,m31,m32,m33, \
+				n0,n1,n2,n3,\
+				p0,p1,p2,p3) \
+do{							\
+	p0=n0*m00+n1*m01+n2*m02+n3*m03; \
+	p1=n0*m10+n1*m11+n2*m12+n3*m13; \
+	p2=n0*m20+n1*m21+n2*m22+n3*m23; \
+	p3=n0*m30+n1*m31+n2*m32+n3*m33; \
+} while(0)
+
+
+
 #if 0
 #define BUTTERFLY(t0,t1,W0,W1,d0,d1)        \
 do {                                        \
@@ -91,6 +132,47 @@
 } while (0)
 #endif
 
+
+#ifdef C_VERSION_OF_ASM
+static inline void idct_row (int16_t * const block)
+{
+	int f0,f1,f2,f3,f4,f5,f6,f7;
+	int a0,a1,a2,a3;
+	int b0,b1,b2,b3;
+	
+	f0 = block[0]+1;
+    f2 = block[1];
+    f4 = block[2];
+    f6 = block[3];
+    f1 = block[4];
+    f3 = block[5];
+    f5 = block[6];
+    f7 = block[7];
+    
+    MATRIX_MUL(W1,  W3,  W5,  W7,
+			   W3, -W7, -W1, -W5,
+			   W5, -W1,  W7,  W3,
+			   W7, -W5,  W3, -W1,
+			   f1,f3,f5,f7,
+ 			   b0,b1,b2,b3);
+    
+	MATRIX_MUL(W0,  W2,  W4,  W6,
+ 			   W0,  W6, -W4, -W2,
+ 			   W0, -W6, -W4,  W2,
+ 			   W0, -W2,  W4, -W6,
+ 			   f0,f2,f4,f6,
+ 			   a0,a1,a2,a3);
+
+	block[0]= (a0+b0) >> 12;
+	block[1]= (a1+b1) >> 12;
+	block[2]= (a2+b2) >> 12;
+	block[3]= (a3+b3) >> 12;
+	block[4]= (a3-b3) >> 12;
+	block[5]= (a2-b2) >> 12;
+	block[6]= (a1-b1) >> 12;
+	block[7]= (a0-b0) >> 12;
+}
+#else
 static inline void idct_row (int16_t * const block)
 {
     int d0, d1, d2, d3;
@@ -143,7 +225,49 @@
     block[6] = (a1 - b1) >> 12;
     block[7] = (a0 - b0) >> 12;
 }
+#endif
+
+#ifdef C_VERSION_OF_ASM
+static inline void idct_col (int16_t * const block)
+{
+	int f0,f1,f2,f3,f4,f5,f6,f7;
+	int a0,a1,a2,a3;
+	int b0,b1,b2,b3;
+	
+	f0 = block[0*8] + 32;
+    f2 = block[1*8];
+    f4 = block[2*8];
+    f6 = block[3*8];
+    f1 = block[4*8];
+    f3 = block[5*8];
+    f5 = block[6*8];
+    f7 = block[7*8];
+    
+    MATRIX_MUL(W1,  W3,  W5,  W7,
+			   W3, -W7, -W1, -W5,
+			   W5, -W1,  W7,  W3,
+			   W7, -W5,  W3, -W1,
+			   f1,f3,f5,f7,
+ 			   b0,b1,b2,b3);
+    
+	MATRIX_MUL(W0,  W2,  W4,  W6,
+ 			   W0,  W6, -W4, -W2,
+ 			   W0, -W6, -W4,  W2,
+ 			   W0, -W2,  W4, -W6,
+ 			   f0,f2,f4,f6,
+ 			   a0,a1,a2,a3);
+
+	block[0*8]= (a0+b0) >> 17;
+	block[1*8]= (a1+b1) >> 17;
+	block[2*8]= (a2+b2) >> 17;
+	block[3*8]= (a3+b3) >> 17;
+	block[4*8]= (a3-b3) >> 17;
+	block[5*8]= (a2-b2) >> 17;
+	block[6*8]= (a1-b1) >> 17;
+	block[7*8]= (a0-b0) >> 17;
 
+}
+#else
 static inline void idct_col (int16_t * const block)
 {
     int d0, d1, d2, d3;
@@ -184,13 +308,702 @@
     block[8*6] = (a1 - b1) >> 17;
     block[8*7] = (a0 - b0) >> 17;
 }
+#endif
+
+#ifdef CPU_COLDFIRE
+static  void idct(int16_t * const block) ICODE_ATTR;
+static  void idct(int16_t * const block)
+{
+	asm volatile(
+
+		"move.l #0x80,%%macsr			\n\t"
+		/* matrix start */	
+
+		"move.l #2048*65536+2841,%%a0				\n\t"	
+		"move.l #2676*65536+2408,%%a1           	\n\t"	
+		"move.l #2048*65536+1609,%%a2				\n\t"	
+		"move.l #1108*65536+565,%%a3            	\n\t"	
+	
+		"move.l %[block],%%a4						\n\t"	
+		"add.l #128,%%a4							\n\t"	
+		/* init loop for i=0 to 7 */	
+		"move.l #8,%%d3							\n\t"
+
+		"row:										\n\t"
+		/* load P[1] P[3] P[5] P[7] from the row */	
+		"movem.l (%[block]),%%d0-%%d2/%%a5	\n\t"
+
+		/* compute second matrix * P[1],P[3],P[5],P[7]*/	
+		"mac.w 	%%d2u, %%a0l, %%acc0					\n\t"	
+		"mac.w 	%%d2u, %%a1l, %%acc1					\n\t"	
+		"mac.w 	%%d2u, %%a2l, %%acc2					\n\t"	
+		"mac.w 	%%d2u, %%a3l, %%acc3					\n\t"	
+
+		"mac.w 	%%d2l, %%a1l, %%acc0					\n\t"
+		"msac.w %%d2l, %%a3l, %%acc1					\n\t"
+		"msac.w %%d2l, %%a0l, %%acc2					\n\t"
+		"msac.w %%d2l, %%a2l, %%acc3					\n\t"
+
+		"mac.w 	%%a5u, %%a2l, %%acc0					\n\t"	
+		"msac.w %%a5u, %%a0l, %%acc1					\n\t"	
+		"mac.w 	%%a5u, %%a3l, %%acc2					\n\t"	
+		"mac.w 	%%a5u, %%a1l, %%acc3					\n\t"	
+
+		"mac.w 	%%a5l, %%a3l, %%acc0					\n\t"
+		"msac.w %%a5l, %%a2l, %%acc1					\n\t"
+		"mac.w 	%%a5l, %%a1l, %%acc2					\n\t"
+		"msac.w %%a5l, %%a0l, %%acc3					\n\t"
+			
+		/* save second matrix row */
+		"movclr.l %%acc0, %%d4							\n\t"
+		"movclr.l %%acc1, %%d5							\n\t"
+		"movclr.l %%acc2, %%d6							\n\t"
+		"movclr.l %%acc3, %%d7							\n\t"
+
+		/* load P[0] P[2] P[4] P[6] from the row */	
+		/* +1 in the upper 16-bit half = block[0]+1: rounding bias for the >>12 (same as the C version) */	
+		"add.l	#65536*1,%%d0			\n\t"
+		/* compute first matrix * P[0],P[2],P[4],P[6]*/	
+		"mac.w 	%%d0u, %%a0u, %%acc0					\n\t"
+		"mac.w 	%%d0u, %%a0u, %%acc1					\n\t"
+		"mac.w 	%%d0u, %%a0u, %%acc2					\n\t"
+		"mac.w 	%%d0u, %%a0u, %%acc3					\n\t"
+
+		"mac.w 	%%d0l, %%a1u, %%acc0					\n\t"
+		"mac.w  %%d0l, %%a3u, %%acc1					\n\t"
+		"msac.w %%d0l, %%a3u, %%acc2					\n\t"
+		"msac.w %%d0l, %%a1u, %%acc3					\n\t"
+
+		"mac.w 	%%d1u, %%a2u, %%acc0					\n\t"	
+		"msac.w %%d1u, %%a2u, %%acc1					\n\t"	
+		"msac.w %%d1u, %%a2u, %%acc2					\n\t"	
+		"mac.w 	%%d1u, %%a2u, %%acc3					\n\t"	
+
+		"mac.w 	%%d1l, %%a3u, %%acc0					\n\t"
+		"msac.w %%d1l, %%a1u, %%acc1					\n\t"
+		"mac.w 	%%d1l, %%a1u, %%acc2					\n\t"
+		"msac.w %%d1l, %%a3u, %%acc3					\n\t"
+
+		/*   divide by  */
+		"move.l #12, %%d1			\n\t"
+
+		/* mat 1 + mat 2 */
+		"move.l %%acc0, %%d0		\n\t"
+		"add.l	%%d4,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(%%a4)		\n\t"
+			
+		"move.l %%acc1, %%d0		\n\t"
+		"add.l	%%d5,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(1*16,%%a4)	\n\t"
+			
+		"move.l %%acc2, %%d0		\n\t"
+		"add.l	%%d6,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(2*16,%%a4)	\n\t"
+			
+		"move.l %%acc3, %%d0		\n\t"
+		"add.l	%%d7,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(3*16,%%a4)	\n\t"
+			
+		/* mat 1 - mat2*/
+		"movclr.l %%acc0, %%d0		\n\t"
+		"sub.l	%%d4,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(7*16,%%a4)	\n\t"
+			
+		"movclr.l %%acc1, %%d0		\n\t"
+		"sub.l	%%d5,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(6*16,%%a4)	\n\t"
+			
+		"movclr.l %%acc2, %%d0		\n\t"
+		"sub.l	%%d6,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(5*16,%%a4)	\n\t"
+			
+		"movclr.l %%acc3, %%d0		\n\t"
+		"sub.l	%%d7,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(4*16,%%a4)	\n\t"	
+
+		/* next line, address +16 */	
+		"add.l	#16,%[block]			\n\t"
+		/* next dest column */	
+		"add.l	#2,%%a4			\n\t"	
+		"sub.l	#1,%%d3				\n\t"			
+		"bne 	row					\n\t"	
+		/* for end */
+			
+		"sub.l	#16+128,%%a4		\n\t"
+		"move.l #8,%%d3				\n\t"
+// columns
+		"col:					\n\t"	
+
+		/* load P[1] P[3] P[5] P[7] from the row */
+		"movem.l (%[block]),%%d0-%%d2/%%a5	\n\t"	
+		
+		/* compute second matrix * P[1],P[3],P[5],P[7]*/
+		"mac.w 	%%d2u, %%a0l, %%acc0					\n\t"	
+		"mac.w 	%%d2u, %%a1l, %%acc1					\n\t"	
+		"mac.w 	%%d2u, %%a2l, %%acc2					\n\t"	
+		"mac.w 	%%d2u, %%a3l, %%acc3					\n\t"	
+		
+		"mac.w 	%%d2l, %%a1l, %%acc0					\n\t"
+		"msac.w %%d2l, %%a3l, %%acc1					\n\t"
+		"msac.w %%d2l, %%a0l, %%acc2					\n\t"
+		"msac.w %%d2l, %%a2l, %%acc3					\n\t"
+		
+		"mac.w 	%%a5u, %%a2l, %%acc0					\n\t"	
+		"msac.w %%a5u, %%a0l, %%acc1					\n\t"	
+		"mac.w 	%%a5u, %%a3l, %%acc2					\n\t"	
+		"mac.w 	%%a5u, %%a1l, %%acc3					\n\t"	
+		
+		"mac.w 	%%a5l, %%a3l, %%acc0					\n\t"
+		"msac.w %%a5l, %%a2l, %%acc1					\n\t"
+		"mac.w 	%%a5l, %%a1l, %%acc2					\n\t"
+		"msac.w %%a5l, %%a0l, %%acc3					\n\t"
+				
+		/* save second matrix row */
+		"movclr.l %%acc0, %%d4			\n\t"
+		"movclr.l %%acc1, %%d5			\n\t"
+		"movclr.l %%acc2, %%d6			\n\t"
+		"movclr.l %%acc3, %%d7			\n\t"
+		
+		/* load P[0] P[2] P[4] P[6] from the row */
+		//+32 in the upper half = block[0]+32: rounding bias of +0.5 for the later >>17		
+		"add.l	#32*65536,%%d0				\n\t"
+		
+		/* compute first matrix * P[0],P[2],P[4],P[6]*/
+		"mac.w 	%%d0u, %%a0u, %%acc0					\n\t"
+		"mac.w 	%%d0u, %%a0u, %%acc1					\n\t"
+		"mac.w 	%%d0u, %%a0u, %%acc2					\n\t"
+		"mac.w 	%%d0u, %%a0u, %%acc3					\n\t"
+		
+		"mac.w 	%%d0l, %%a1u, %%acc0					\n\t"
+		"mac.w  %%d0l, %%a3u, %%acc1					\n\t"
+		"msac.w %%d0l, %%a3u, %%acc2					\n\t"
+		"msac.w %%d0l, %%a1u, %%acc3					\n\t"
+		
+		"mac.w 	%%d1u, %%a2u, %%acc0					\n\t"
+		"msac.w %%d1u, %%a2u, %%acc1					\n\t"
+		"msac.w %%d1u, %%a2u, %%acc2					\n\t"
+		"mac.w 	%%d1u, %%a2u, %%acc3					\n\t"
+
+		"mac.w 	%%d1l, %%a3u, %%acc0					\n\t"
+		"msac.w %%d1l, %%a1u, %%acc1					\n\t"
+		"mac.w 	%%d1l, %%a1u, %%acc2					\n\t"
+		"msac.w %%d1l, %%a3u, %%acc3					\n\t"
+				
+		/*   divide  */	
+		"move.l #17, %%d1			\n\t"
+
+		/* mat 1 + mat 2 */	
+		"move.l %%acc0, %%d0		\n\t"
+		"add.l	%%d4,%%d0			\n\t"
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(%%a4)		\n\t"
+			
+		"move.l %%acc1, %%d0		\n\t"
+		"add.l	%%d5,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(1*16,%%a4)	\n\t"
+			
+		"move.l %%acc2, %%d0		\n\t"
+		"add.l	%%d6,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(2*16,%%a4)	\n\t"
+			
+		"move.l %%acc3, %%d0		\n\t"
+		"add.l	%%d7,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(3*16,%%a4)	\n\t"
+			
+		/* mat 1 - mat2*/	
+			
+		"movclr.l %%acc0, %%d0		\n\t"
+		"sub.l	%%d4,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(7*16,%%a4)	\n\t"
+			
+		"movclr.l %%acc1, %%d0		\n\t"
+		"sub.l	%%d5,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(6*16,%%a4)	\n\t"
+			
+		"movclr.l %%acc2, %%d0		\n\t"
+		"sub.l	%%d6,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(5*16,%%a4)	\n\t"
+			
+		"movclr.l %%acc3, %%d0		\n\t"
+		"sub.l	%%d7,%%d0			\n\t"	
+		"asr.l %%d1, %%d0			\n\t"		
+		"move.w %%d0,(4*16,%%a4)	\n\t"
+		/****************************/
+			
+		/* next line, address +16 */	
+		"add.l	#16,%[block]			\n\t"
+		/* next dest column */	
+		"add.l	#2,%%a4			\n\t"
+			
+		"sub.l #1,%%d3					\n\t"			
+		"bne 	col						\n\t"
+			
+		"sub.l	#16*16,%[block]			\n\t"
+		:
+		:	[block]"a"(block)	
+		:	"%a0","%a1","%a2","%a3","%a4","%a5","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7"
+	);
+
+}
+#endif
+
+#ifdef CPU_COLDFIRE
+static  void clip_block_to_dest(int16_t * block, uint8_t * dest,const int stride)
+{
+	asm volatile(
+		
+		"move.l %[block],%%a0					\n\t"
+		"move.l %[dest],%%a1					\n\t"
+		/* init loop for i=0 to 7 */	
+		"move.l #8,%%d1							\n\t"
+		"for:									\n\t"
+				
+		"move.w (%%a0),%%d0			\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"  /* overflow? */
+   		"bls.b   1f          			\n\t"  /* no: return value */
+   		"spl.b   %%d0					\n\t"  /* yes: set low byte to appropriate boundary */
+   		"1	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		"lsl.l  #8,%%d2					\n\t"
+		
+		"move.w (2,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   2f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"2	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		"lsl.l  #8,%%d2					\n\t"
+		
+		"move.w (4,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   3f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"3	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		"lsl.l  #8,%%d2					\n\t"
+		
+		"move.w (6,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   4f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"4	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		
+		"move.w (8,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   5f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"5	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		
+		"move.w (10,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   6f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"6	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		
+		"move.w (12,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   7f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"7	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		
+		"move.w (14,%%a0),%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   8f         	 		\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"8	:  		                  	\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		
+		"move.l #0,(%%a0)+		\n\t"
+		"move.l #0,(%%a0)+		\n\t"
+		"move.l #0,(%%a0)+		\n\t"
+		"move.l #0,(%%a0)+		\n\t"
+			
+		"movem.l %%d2-%%d3,(%%a1)		\n\t"
+			
+		/* next line, address +16 */	
+		"add.l	%[stride],%%a1		\n\t"	
+		"sub.l	#1,%%d1					\n\t"			
+		"bne 	for						\n\t"	
+		/* for end */
+			
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"d"(stride)
+		:	"%a0","%a1","%d0","%d1","%d2","%d3"
+	);
+
+} 
+#endif
+#ifdef CPU_COLDFIRE__
+static  void clip_block_to_dest(int16_t * block, uint8_t * dest,const int stride)
+{
+	asm volatile(
+		
+		"move.l %[block],%%a0					\n\t"
+		"move.l %[dest],%%a1					\n\t"
+		/* init loop for i=0 to 7 */	
+		"move.l #8,%%d1							\n\t"
+		"clr.l %%d0							\n\t"	
+		"for:									\n\t"
+		"movem.l (%%a0),%%d4-%%d7				\n\t"
+		
+		"swap.w %%d4					\n\t"	
+		"move.w %%d4,%%d0				\n\t"
+		"cmp.l   #255,%%d0   			\n\t"  /* overflow? */
+   		"bls.b   1f          			\n\t"  /* no: return value */
+   		"spl.b   %%d0					\n\t"  /* yes: set low byte to appropriate boundary */
+   		"1	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		"lsl.l  #8,%%d2					\n\t"
+			
+		"swap.w %%d4			\n\t"	
+		"move.w %%d4,%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   2f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"2	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		"lsl.l  #8,%%d2					\n\t"
+		
+			
+		"swap.w %%d5					\n\t"	
+		"move.w %%d5,%%d0				\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"  /* overflow? */
+   		"bls.b   3f          			\n\t"  /* no: return value */
+   		"spl.b   %%d0					\n\t"  /* yes: set low byte to appropriate boundary */
+   		"3	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		"lsl.l  #8,%%d2					\n\t"
+			
+		"swap.w %%d5			\n\t"	
+		"move.w %%d5,%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   4f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"4	:                    		\n\t"
+		"move.b %%d0,%%d2				\n\t"
+		
+		"swap.w %%d6					\n\t"	
+		"move.w %%d6,%%d0				\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"  /* overflow? */
+   		"bls.b   5f          			\n\t"  /* no: return value */
+   		"spl.b   %%d0					\n\t"  /* yes: set low byte to appropriate boundary */
+   		"5	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+			
+		"swap.w %%d6			\n\t"	
+		"move.w %%d6,%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   6f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"6	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+		
+			
+		"swap.w %%d7					\n\t"	
+		"move.w %%d7,%%d0				\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"  /* overflow? */
+   		"bls.b   7f          			\n\t"  /* no: return value */
+   		"spl.b   %%d0					\n\t"  /* yes: set low byte to appropriate boundary */
+   		"7	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+			
+		"swap.w %%d7			\n\t"	
+		"move.w %%d7,%%d0		\n\t"
+   		"cmp.l   #255,%%d0   			\n\t"
+   		"bls.b   8f          			\n\t"
+   		"spl.b   %%d0					\n\t"  
+   		"8	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+		"move.l #0,(%%a0)+				\n\t"
+		"move.l #0,(%%a0)+				\n\t"
+		"move.l #0,(%%a0)+				\n\t"
+		"move.l #0,(%%a0)+				\n\t"
+		
+		"movem.l %%d2-%%d3,(%%a1)		\n\t"	
+			
+		/* next line, address +16 */	
+		"add.l	%[stride],%%a1		\n\t"	
+		"sub.l	#1,%%d1					\n\t"			
+		"bne 	for						\n\t"	
+		/* for end */
+			
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%a1","%d0","%d1","%d2","%d3","%d4","%d5","%d6","%d7"
+	);
+
+} 
+#endif
+
+#ifdef CPU_COLDFIRE
+static  void clip_block_to_dest_add(int16_t * block, uint8_t * dest,const int stride)
+{
+	asm volatile (
+		
+		"move.l %[block],%%a0					\n\t"
+		"move.l %[dest],%%a1					\n\t"
+		/* init loop for i=0 to 7 */	
+		"move.l #8,%%d1							\n\t"
+		"clr.l %%d2							\n\t"
+		"foradd:								\n\t"
+		
+		"move.w (%%a0),%%d0					\n\t"
+		"move.b (%%a1),%%d2					\n\t"
+		"ext.l %%d0							\n\t"	
+		"add.l %%d2,%%d0					\n\t"	
+		"cmp.l   #255,%%d0   				\n\t"  /* overflow? */
+		"bls.b   1f          				\n\t"  /* no: return value */
+		"spl.b   %%d0						\n\t"  /* yes: set low byte to appropriate boundary */
+		"1	:                    			\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+	
+		"move.b %%d0,(%%a1)					\n\t"
+		
+		"move.w (2,%%a0),%%d0				\n\t"
+		"move.b (1,%%a1),%%d2				\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l %%d2,%%d0					\n\t"	
+		"cmp.l   #255,%%d0   				\n\t"
+		"bls.b   2f          				\n\t"
+		"spl.b   %%d0						\n\t"  
+		"2	:                    			\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+	
+
+		"move.w (4,%%a0),%%d0		\n\t"
+		"move.b (2,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l %%d2,%%d0			\n\t"
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   3f          			\n\t"
+		"spl.b   %%d0					\n\t"  
+		"3	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+
+		"move.w (6,%%a0),%%d0		\n\t"
+		"move.b (3,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"
+		"cmp.l  #255,%%d0   			\n\t"
+		"bls.b  4f          			\n\t"
+		"spl.b  %%d0					\n\t"  
+		"4	:                    		\n\t"
+		"move.b %%d0,%%d3				\n\t"
+
+			
+		"move.w (8,%%a0),%%d0		\n\t"
+		"move.b (4,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l	%%d2,%%d0			\n\t"
+		"cmp.l	#255,%%d0   			\n\t"
+		"bls.b  5f          			\n\t"
+		"spl.b  %%d0					\n\t"  
+		"5	:                    		\n\t"
+		"move.b %%d0,%%d4				\n\t"
+		"lsl.l  #8,%%d4					\n\t"
+		
+		"move.w (10,%%a0),%%d0		\n\t"
+		"move.b (5,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"	
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   6f          			\n\t"
+		"spl.b   %%d0					\n\t"  
+		"6	:                    		\n\t"
+		"move.b %%d0,%%d4				\n\t"
+		"lsl.l  #8,%%d4					\n\t"
+
+		"move.w (12,%%a0),%%d0		\n\t"
+		"move.b (6,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"	
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   7f          			\n\t"
+		"spl.b   %%d0					\n\t"  
+		"7	:                    		\n\t"
+		"move.b %%d0,%%d4				\n\t"
+		"lsl.l  #8,%%d4					\n\t"
+
+		"move.w (14,%%a0),%%d0		\n\t"
+		"move.b (7,%%a1),%%d2			\n\t"	
+		"ext.l %%d0							\n\t"	
+		"add.l 	%%d2,%%d0			\n\t"	
+		"cmp.l   #255,%%d0   			\n\t"
+		"bls.b   8f         	 		\n\t"
+		"spl.b   %%d0					\n\t"  
+		"8	:  		                  	\n\t"
+		"move.b %%d0,%%d4				\n\t"
+		
+		/* write to dest */	
+		"movem.l %%d3-%%d4,(%%a1)		\n\t"
+		
+		/* clear block */	
+		"move.l #0,(%%a0)+		\n\t"
+		"move.l #0,(%%a0)+		\n\t"
+		"move.l #0,(%%a0)+		\n\t"
+		"move.l #0,(%%a0)+		\n\t"
+		
+		/* next line, address +16 */	
+		"add.l	%[stride],%%a1		\n\t"	
+		"sub.l	#1,%%d1					\n\t"			
+		"bne 	foradd						\n\t"	
+		/* for end */
+			
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%a1","%d0","%d1","%d2","%d3","%d4"
+	);
+
+} 
+#endif
+
+#ifdef CPU_COLDFIRE
+static  void clip_block_to_dest_add_DC(int16_t * block, uint8_t * dest,const int stride)
+{
+    asm volatile (
+		
+		"move.l %[block],%%a0					\n\t"
+		"move.l %[dest],%%a1					\n\t"
+    	
+    	/* DC */
+		"move.w (%%a0),%%d0					\n\t"
+    	"ext.l %%d0							\n\t"		
+    	"add.l #64,%%d0						\n\t"
+		"asr.l #7,%%d0						\n\t"
+ 		"move.w #0,(%%a0)					\n\t"
+    	"move.w #0,%%d2					\n\t"	
+ 		"move.w %%d2,(126,%%a0)				\n\t"	
+	
+		/* init loop for i=0 to 7 */
+		"move.l #8,%%d1						\n\t"
+		"clr.l %%d2							\n\t"
+		"foraddDC:							\n\t"
+
+		"move.b (%%a1),%%d2					\n\t"
+		"add.l %%d0,%%d2					\n\t"	
+		"cmp.l   #255,%%d2   				\n\t"  /* overflow? */
+		"bls.b   1f          				\n\t"  /* no: return value */
+		"spl.b   %%d2						\n\t"  /* yes: set low byte to appropriate boundary */
+		"1	:                    			\n\t"
+		"move.b %%d2,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+
+		"move.b (1,%%a1),%%d2				\n\t"	
+		"add.l %%d0,%%d2					\n\t"	
+		"cmp.l   #255,%%d2   				\n\t"
+		"bls.b   2f          				\n\t"
+		"spl.b   %%d2						\n\t"  
+		"2	:                    			\n\t"
+		"move.b %%d2,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+
+		"move.b (2,%%a1),%%d2			\n\t"	
+		"add.l %%d0,%%d2			\n\t"
+		"cmp.l   #255,%%d2   			\n\t"
+		"bls.b   3f          			\n\t"
+		"spl.b   %%d2					\n\t"  
+		"3	:                    		\n\t"
+		"move.b %%d2,%%d3				\n\t"
+		"lsl.l  #8,%%d3					\n\t"
+
+		"move.b (3,%%a1),%%d2			\n\t"	
+		"add.l 	%%d0,%%d2			\n\t"
+		"cmp.l  #255,%%d2   			\n\t"
+		"bls.b  4f          			\n\t"
+		"spl.b  %%d2					\n\t"  
+		"4	:                    		\n\t"
+		"move.b %%d2,%%d3				\n\t"
+		
+		"move.b (4,%%a1),%%d2			\n\t"	
+		"add.l	%%d0,%%d2			\n\t"
+		"cmp.l	#255,%%d2   			\n\t"
+		"bls.b  5f          			\n\t"
+		"spl.b  %%d2					\n\t"  
+		"5	:                    		\n\t"
+		"move.b %%d2,%%d4				\n\t"
+		"lsl.l  #8,%%d4					\n\t"
+		
+		"move.b (5,%%a1),%%d2			\n\t"	
+		"add.l 	%%d0,%%d2			\n\t"	
+		"cmp.l   #255,%%d2   			\n\t"
+		"bls.b   6f          			\n\t"
+		"spl.b   %%d2					\n\t"  
+		"6	:                    		\n\t"
+		"move.b %%d2,%%d4				\n\t"
+		"lsl.l  #8,%%d4					\n\t"
+
+		"move.b (6,%%a1),%%d2			\n\t"	
+		"add.l 	%%d0,%%d2			\n\t"	
+		"cmp.l   #255,%%d2   			\n\t"
+		"bls.b   7f          			\n\t"
+		"spl.b   %%d2					\n\t"  
+		"7	:                    		\n\t"
+		"move.b %%d2,%%d4				\n\t"
+		"lsl.l  #8,%%d4					\n\t"
+
+		"move.b (7,%%a1),%%d2			\n\t"	
+		"add.l 	%%d0,%%d2			\n\t"	
+		"cmp.l   #255,%%d2   			\n\t"
+		"bls.b   8f         	 		\n\t"
+		"spl.b   %%d2					\n\t"  
+		"8	:  		                  	\n\t"
+		"move.b %%d2,%%d4				\n\t"
+
+		/* write to dest */	
+		"movem.l %%d3-%%d4,(%%a1)		\n\t"
+		
+		/* next line, address +16 */
+    	"add.l	#16,%%a0				\n\t"	
+		"add.l	%[stride],%%a1			\n\t"	
+		"sub.l	#1,%%d1					\n\t"			
+		"bne 	foraddDC					\n\t"	
+		/* for end */
+			
+		:
+		:	[block]"a"(block),[dest]"a"(dest),[stride]"a"(stride)
+		:	"%a0","%a1","%d0","%d1","%d2","%d3","%d4"
+	);
+
+} 
+#endif
 
 static void mpeg2_idct_copy_c (int16_t * block, uint8_t * dest,
                                const int stride)
 {
-    int i;
-
-    for (i = 0; i < 8; i++)
+#ifdef CPU_COLDFIRE
+	idct (block);
+	clip_block_to_dest(block,dest,stride);
+#else
+	int i;
+	for (i = 0; i < 8; i++)
         idct_row (block + 8 * i);
     for (i = 0; i < 8; i++)
         idct_col (block + i);
@@ -210,6 +1023,7 @@
         dest += stride;
         block += 8;
     } while (--i);
+#endif  
 }
 
 static void mpeg2_idct_add_c (const int last, int16_t * block,
@@ -218,10 +1032,14 @@
     int i;
 
     if (last != 129 || (block[0] & (7 << 4)) == (4 << 4)) {
-        for (i = 0; i < 8; i++)
-            idct_row (block + 8 * i);
-        for (i = 0; i < 8; i++)
-            idct_col (block + i);
+#ifdef CPU_COLDFIRE
+		idct (block);
+		clip_block_to_dest_add(block,dest,stride);
+#else
+		for (i = 0; i < 8; i++)
+        	idct_row (block + 8 * i);
+    	for (i = 0; i < 8; i++)
+        	idct_col (block + i);
         do {
             dest[0] = CLIP (block[0] + dest[0]);
             dest[1] = CLIP (block[1] + dest[1]);
@@ -238,7 +1056,11 @@
             dest += stride;
             block += 8;
         } while (--i);
+#endif   
     } else {
+#ifdef CPU_COLDFIRE  	
+    	clip_block_to_dest_add_DC(block,dest,stride);
+#else
         int DC;
 
         DC = (block[0] + 64) >> 7;
@@ -255,6 +1077,7 @@
             dest[7] = CLIP (DC + dest[7]);
             dest += stride;
         } while (--i);
+#endif 
     }
 }
 
Index: apps/plugins/mpegplayer/mpeg2_internal.h
===================================================================
RCS file: /cvsroot/rockbox/apps/plugins/mpegplayer/mpeg2_internal.h,v
retrieving revision 1.1
diff -u -r1.1 mpeg2_internal.h
--- apps/plugins/mpegplayer/mpeg2_internal.h	7 Aug 2006 22:11:07 -0000	1.1
+++ apps/plugins/mpegplayer/mpeg2_internal.h	14 Sep 2006 21:49:17 -0000
@@ -92,8 +92,11 @@
     int16_t dc_dct_pred[3];
 
     /* DCT coefficients */
+#ifdef CPU_COLDFIRE
+    int16_t DCTblock[128] ATTR_ALIGN(64);
+#else
     int16_t DCTblock[64] ATTR_ALIGN(64);
-
+#endif
     uint8_t * picture_dest[3];
     void (* convert) (void * convert_id, uint8_t * const * src,
 		      unsigned int v_offset);
