--- coldfire.S.orig	2005-10-27 15:41:05.204083960 +0200
+++ coldfire.S	2005-10-27 16:44:59.888123376 +0200
@@ -25,7 +25,7 @@
  */
     .text
     .global lpc_decode_emac
-    .align 2
+    .align 16                 | cache line is 16 bytes
 lpc_decode_emac:
     lea.l (-40, %sp), %sp
     movem.l %d2-%d7/%a2-%a5, (%sp)
@@ -41,6 +41,8 @@
     move.l %d2, %d3 
     neg.l %d3 
     lea.l (%a0, %d3.l*4), %a0 | history
+    movclr.l %acc0, %d3       | acc not assumed zero on entry; the value read is discarded
+    movclr.l %acc1, %d3       | .loop8 also accumulates into %acc1, so clear it as well
     clr.l %d3
     move.l %d3, %macsr        | we'll need integer mode for this
     tst.l %d0          
@@ -61,25 +63,41 @@
 
 | last jump table entry coincides with target, so leave it out
 .order8:
+|    asr #1, %d0  | disabled: halve counter, assuming an even number of iterations. NOTE(review): the loop stores twice per pass but still counts down by one - confirm this is intended
     movem.l (%a1), %d3-%d7/%a2-%a4 | load lpc coefs
     move.l (%a0)+, %a5             | load first history sample
 .loop8:
-    mac.l %a5, %a4, (%a0)+, %a5, %acc0
-    mac.l %a5, %a3, (%a0)+, %a5, %acc0
-    mac.l %a5, %a2, (%a0)+, %a5, %acc0
-    mac.l %a5, %d7, (%a0)+, %a5, %acc0
-    mac.l %a5, %d6, (%a0)+, %a5, %acc0
-    mac.l %a5, %d5, (%a0)+, %a5, %acc0
-    mac.l %a5, %d4, (%a0)+, %a5, %acc0
-    mac.l %a5, %d3, (-7*4, %a0), %a5, %acc0 | load for the next iteration
+    mac.l %a5, %a4, (%a0)+, %a5, %acc1 | first sum of the pair goes to %acc1; fetch next history word into %a5
+    mac.l %a5, %a4                 | the (%a0)+ fetch above presumably costs wait states, so the freshly
+                                   | loaded %a5 is reused at once for the second sum (no accumulator named; presumably %acc0 - confirm)
+    mac.l %a5, %a3, (%a0)+, %a5, %acc1
+    mac.l %a5, %a3                 | reuse %a5
+    mac.l %a5, %a2, (%a0)+, %a5, %acc1
+    mac.l %a5, %a2                 | reuse %a5
+    mac.l %a5, %d7, (%a0)+, %a5, %acc1
+    mac.l %a5, %d7                 | reuse %a5
+    mac.l %a5, %d6, (%a0)+, %a5, %acc1
+    mac.l %a5, %d6                 | reuse %a5
+    mac.l %a5, %d5, (%a0)+, %a5, %acc1
+    mac.l %a5, %d5                 | reuse %a5
+    mac.l %a5, %d4, (%a0)+, %a5, %acc1
+    mac.l %a5, %d4, (%a0)+, %a5, %acc0 | second sum; this also reloads %a5 and advances %a0 once more
+    mac.l %a5, %d3                 | second sum, %d3 term
+    mac.l %a5, %d3, (-6*4, %a0), %a5, %acc1 | load for the next iteration. NOTE(review): %a5 was reloaded twice since the %d4/%acc1 term - confirm this is the intended sample
+    movclr.l %acc1, %d2    | get first sum and clear %acc1
+    asr.l %d1, %d2         | shift sum by lp_quantization bits
+| idea (not enabled): add #1<<(%d1-1), %d2 or add (table,%d1),%d2 to minimize rounding error
+    add.l %d2, (%a0)+      | add residual and save. NOTE(review): %a0 has been post-incremented eight times by now, versus seven before the store in the removed loop - confirm the target word
     movclr.l %acc0, %d2    | get sum
     asr.l %d1, %d2         | shift sum by lp_quantization bits
+| add #1<<(%d1-1), %d2 here to minimize rounding error?
     add.l %d2, (%a0)       | add residual and save
-    lea.l (-6*4, %a0), %a0 | point history back at second element
+    lea.l (-5*4, %a0), %a0 | point history back for the next pass (NOTE(review): offset not re-derived here - confirm against the two stores above)
     subq.l #1, %d0         | decrement counter
     jne .loop8             | are we done?
     jra .exit
 
+    .align 16              | cache line is 16 bytes
 .order7:
     movem.l (%a1), %d3-%d7/%a2-%a3
     move.l (%a0)+, %a5
@@ -99,6 +117,7 @@
     jne .loop7
     jra .exit
 
+    .align 16                 | cache line is 16 bytes
 .order6:
     movem.l (%a1), %d3-%d7/%a2
     move.l (%a0)+, %a5
@@ -117,6 +136,7 @@
     jne .loop6
     jra .exit
 
+    .align 16                 | cache line is 16 bytes
 .order5:
     movem.l (%a1), %d3-%d7
     move.l (%a0)+, %a5
@@ -134,6 +154,7 @@
     jne .loop5
     jra .exit
 
+    .align 16                 | cache line is 16 bytes
 .order4:
     movem.l (%a1), %d3-%d6
     move.l (%a0)+, %a5
@@ -150,6 +171,7 @@
     jne .loop4
     jra .exit
 
+    .align 16                 | cache line is 16 bytes
 .order3:
     movem.l (%a1), %d3-%d5
     move.l (%a0)+, %a5
@@ -165,6 +187,7 @@
     jne .loop3
     jra .exit
 
+    .align 16                 | cache line is 16 bytes
 .order2:
     movem.l (%a1), %d3-%d4
     move.l (%a0)+, %a5
@@ -178,6 +201,7 @@
     jne .loop2
     jra .exit
 
+    .align 16                 | cache line is 16 bytes
 .order1:
     | no point in using mac here
     move.l (%a1), %d3
@@ -189,7 +213,8 @@
     subq.l #1, %d0
     jne .loop1
     jra .exit
-    
+
+    .align 16                 | cache line is 16 bytes
 .default:
     /* we do the filtering in an unrolled by 4 loop as far as we can, and then
        do the rest in an ordinary one by one sample loop.

