Index: apps/codecs/libffmpegFLAC/arm.S
===================================================================
--- apps/codecs/libffmpegFLAC/arm.S	(revision 28300)
+++ apps/codecs/libffmpegFLAC/arm.S	(arbetskopia)
@@ -21,7 +21,320 @@
 
 #include "config.h"
 
+#if ARM_ARCH >= 5
 /* The following is an assembler optimised version of the LPC filtering
+   routines needed for FLAC decoding. It is optimised for use with ARMv5
+   processors; coefs are packed 16-bit and samples are multiplied as 16-bit.
+   All LPC filtering up to order 12 is done in specially optimised unrolled
+   loops, while every order above this is handled by a slower default routine.
+ */
+#ifdef USE_IRAM
+    .section .icode,"ax",%progbits
+#else
+    .text
+#endif
+    .global lpc_decode_arm_v5
+lpc_decode_arm_v5:
+    stmdb sp!, { r4-r11, lr } @ 9 words pushed, first stack argument is at sp+36
+    ldr r4, [sp, #36]         @ fetch the fifth argument (coeffs) from the stack
+    /* r0 = blocksize, r1 = qlevel, r2 = pred_order
+       r3 = data, r4 = coeffs
+     */
+     
+    /* the history pointer always lags behind the data pointer by 'pred_order'
+       samples. since we have one loop for each order, we can hard code this
+       and free a register by not saving data pointer.
+     */
+    sub   r3, r3, r2, lsl #2  @ r3 = history = data - pred_order words
+    cmp   r0, #0              @ no samples to process (only zero is caught here)
+    beq   .exit
+    cmp   r2, #12             @ check if order is too high for unrolled loops
+    addls pc, pc, r2, lsl #2  @ pc reads 8 bytes ahead: order 0 hits the 2nd entry
+@ jumptable:
+    b .default                @ reached only when the add above was skipped (order > 12)
+    b .exit                   @ zero order filter is not possible, exit function
+    b .order1
+    b .order2
+    b .order3
+    b .order4
+    b .order5
+    b .order6
+    b .order7
+    b .order8
+    b .order9
+    b .order10
+    b .order11
+
+@ order 12 needs no table entry: index 12 lands directly on the code below
+.order12:
+    ldmia  r4, { r5-r10 }          @ fetch coefs: 12 halfwords, two per register
+.loop12:
+    ldmia  r3!, {r4, r11-r12, r14} @ history samples 0-3 (lowest addresses first)
+    smulbt r2, r4, r10             @ start the sum: sample 0 * top half of r10
+    smlabb r2, r11, r10, r2        @ each following sample uses the next lower coef half
+    smlabt r2, r12, r9, r2
+    smlabb r2, r14, r9, r2
+    ldmia  r3!, {r4, r11-r12, r14} @ history samples 4-7
+    smlabt r2, r4, r8, r2
+    smlabb r2, r11, r8, r2
+    smlabt r2, r12, r7, r2
+    smlabb r2, r14, r7, r2
+    ldmia  r3!, {r4, r11-r12, r14} @ history samples 8-11, r3 now points at residual
+    smlabt r2, r4, r6, r2
+    smlabb r2, r11, r6, r2
+    ldr    r4, [r3]              @ r4 = residual (loaded early, r4 sample already used)
+    smlabt r2, r12, r5, r2
+    smlabb r2, r14, r5, r2       @ last sample * bottom half of r5
+    add    r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str    r2, [r3], #-11*4      @ save result; r3 ends one word past the old window start
+    subs   r0, r0, #1            @ one sample done, any left?
+    bne    .loop12               @ yes, filter the next one
+    b      .exit
+
+.order11:
+    ldmia  r4, { r5-r10 }        @ fetch coefs; the top half of r10 is loaded but unused
+.loop11:
+    ldmia  r3!, {r4, r11-r12}    @ history samples 0-2
+    smulbb r2, r4, r10           @ start the sum: sample 0 * bottom half of r10
+    smlabt r2, r11, r9, r2
+    smlabb r2, r12, r9, r2
+    ldmia  r3!, {r4, r11-r12, r14} @ history samples 3-6
+    smlabt r2, r4, r8, r2
+    smlabb r2, r11, r8, r2
+    smlabt r2, r12, r7, r2
+    smlabb r2, r14, r7, r2
+    ldmia  r3!, {r4, r11-r12, r14} @ history samples 7-10, r3 now points at residual
+    smlabt r2, r4, r6, r2
+    smlabb r2, r11, r6, r2
+    ldr    r4, [r3]              @ r4 = residual
+    smlabt r2, r12, r5, r2
+    smlabb r2, r14, r5, r2       @ last sample * bottom half of r5
+    add    r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str    r2, [r3], #-10*4      @ save result; r3 ends one word past the old window start
+    subs   r0, r0, #1            @ one sample done, any left?
+    bne    .loop11               @ yes, filter the next one
+    b      .exit
+
+.order10:
+    ldmia  r4, { r5-r9 }         @ fetch coefs: 10 halfwords, two per register
+.loop10:
+    ldmia  r3!, {r4, r10-r12, r14} @ history samples 0-4
+    smulbt r2, r4, r9            @ start the sum: sample 0 * top half of r9
+    smlabb r2, r10, r9, r2
+    smlabt r2, r11, r8, r2
+    smlabb r2, r12, r8, r2
+    smlabt r2, r14, r7, r2
+    ldmia  r3!, {r4, r10-r12, r14} @ history samples 5-9, r3 now points at residual
+    smlabb r2, r4, r7, r2
+    smlabt r2, r10, r6, r2
+    smlabb r2, r11, r6, r2
+    smlabt r2, r12, r5, r2
+    ldr    r4, [r3]              @ r4 = residual
+    smlabb r2, r14, r5, r2       @ last sample * bottom half of r5
+    add    r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str    r2, [r3], #-9*4       @ save result; r3 ends one word past the old window start
+    subs   r0, r0, #1            @ one sample done, any left?
+    bne    .loop10               @ yes, filter the next one
+    b      .exit
+
+.order9:
+    ldmia r4, { r5-r9 }         @ fetch coefs; the top half of r9 is loaded but unused
+.loop9:
+    ldmia r3!, {r4, r10-r12, r14} @ history samples 0-4
+    smulbb r2, r4, r9           @ multiply with last coef
+    smlabt r2, r10, r8, r2
+    smlabb r2, r11, r8, r2
+    smlabt r2, r12, r7, r2
+    ldmia r3!, {r4, r10-r12}    @ samples 5-8; r14 (sample 4) is left intact for the next mac
+    smlabb r2, r14, r7, r2
+    smlabt r2, r4, r6, r2
+    smlabb r2, r10, r6, r2
+    smlabt r2, r11, r5, r2
+    ldr r4, [r3]              @ r4 = residual
+    smlabb r2, r12, r5, r2    @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1    @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-8*4       @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1           @ one sample done, any left?
+    bne .loop9                @ yes, filter the next one
+    b .exit
+    
+.order8:
+    ldmia r4, { r5-r8 }         @ fetch coefs: 8 halfwords, two per register
+.loop8:
+    @ only four coef registers are needed here, so history is read four at a time
+    ldmia r3!, { r4, r9-r11 }   @ history samples 0-3
+    smulbt r2, r4, r8           @ start the sum: sample 0 * top half of r8
+    smlabb r2, r9, r8, r2
+    smlabt r2, r10, r7, r2
+    smlabb r2, r11, r7, r2
+    ldmia r3!, { r4, r9-r11 }   @ history samples 4-7, r3 now points at residual
+    smlabt r2, r4, r6, r2
+    smlabb r2, r9, r6, r2
+    ldr r4, [r3]                @ r4 = residual
+    smlabt r2, r10, r5, r2
+    smlabb r2, r11, r5, r2      @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-7*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop8
+    b .exit
+
+.order7:
+    ldmia r4, { r5-r8 }         @ fetch coefs; the top half of r8 is loaded but unused
+.loop7:
+    ldmia r3!, { r4, r12, r14 } @ history samples 0-2
+    smulbb r2, r4, r8           @ start the sum: sample 0 * bottom half of r8
+    smlabt r2, r12, r7, r2
+    smlabb r2, r14, r7, r2
+    ldmia r3!, { r4, r11-r12, r14 } @ history samples 3-6, r3 now points at residual
+    smlabt r2, r4, r6, r2
+    smlabb r2, r11, r6, r2
+    ldr r4, [r3]                @ r4 = residual
+    smlabt r2, r12, r5, r2
+    smlabb r2, r14, r5, r2      @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-6*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop7
+    b .exit
+
+.order6:
+    ldmia r4, { r5-r7 }         @ fetch coefs: 6 halfwords, two per register
+.loop6:
+    ldmia r3!, { r4, r9-r12, r14 } @ all six history samples, r3 now points at residual
+    smulbt r2, r4, r7           @ start the sum: sample 0 * top half of r7
+    smlabb r2, r9, r7, r2
+    smlabt r2, r10, r6, r2
+    smlabb r2, r11, r6, r2
+    smlabt r2, r12, r5, r2
+    ldr r4, [r3]                @ r4 = residual
+    smlabb r2, r14, r5, r2      @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-5*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop6
+    b .exit
+
+.order5:
+    ldmia r4, { r5-r7 }         @ fetch coefs; the top half of r7 is loaded but unused
+.loop5:
+    ldmia r3!, { r4, r10-r12, r14 } @ all five history samples, r3 now points at residual
+    smulbb r2, r4, r7           @ start the sum: sample 0 * bottom half of r7
+    smlabt r2, r10, r6, r2
+    smlabb r2, r11, r6, r2
+    ldr r4, [r3]                @ r4 = residual
+    smlabt r2, r12, r5, r2
+    smlabb r2, r14, r5, r2      @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-4*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop5
+    b .exit
+
+.order4:
+    ldmia r4, { r5-r6 }         @ fetch coefs: 4 halfwords, two per register
+.loop4:
+    ldmia r3!, { r4, r11-r12, r14 } @ all four history samples, r3 now points at residual
+    smulbt r2, r4, r6           @ start the sum: sample 0 * top half of r6
+    smlabb r2, r11, r6, r2
+    ldr r4, [r3]                @ r4 = residual
+    smlabt r2, r12, r5, r2
+    smlabb r2, r14, r5, r2      @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-3*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop4
+    b .exit
+
+.order3:
+    ldmia r4, { r5-r6 }         @ fetch coefs; the top half of r6 is loaded but unused
+.loop3:
+    ldmia r3!, { r4, r12, r14 } @ all three history samples, r3 now points at residual
+    smulbb r2, r4, r6           @ start the sum: sample 0 * bottom half of r6
+    ldr r4, [r3]                @ r4 = residual
+    smlabt r2, r12, r5, r2
+    smlabb r2, r14, r5, r2      @ last sample * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-2*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop3
+    b .exit
+
+.order2:
+    ldr r5, [r4]                @ both coefs in one word
+.loop2:
+    ldmia r3!, { r4, r14 }      @ both history samples, r3 now points at residual
+    smulbt r2, r4, r5           @ start the sum: sample 0 * top half of r5
+    ldr r4, [r3]                @ r4 = residual
+    smlabb r2, r14, r5, r2      @ sample 1 * bottom half of r5
+    add r2, r4, r2, asr r1      @ shift sum by qlevel bits and add residual
+    str r2, [r3], #-1*4         @ save result; r3 ends one word past the old window start
+    subs r0, r0, #1             @ one sample done, any left?
+    bne .loop2
+    b .exit
+
+.order1:
+    ldrh r5, [r4]           @ load the one coef we need
+    ldr r4, [r3], #4        @ load one history sample, r3 now points to residual
+.loop1:
+    smulbb r2, r4, r5       @ multiply coef by history sample (low halves of both)
+    ldr r4, [r3]            @ load residual
+    add r4, r4, r2, asr r1  @ shift product by qlevel bits and add it to the residual
+    str r4, [r3], #4        @ store the result and step r3 to the next residual;
+    subs r0, r0, #1         @ r4 keeps the result as history for the next iteration
+    bne .loop1
+    b .exit
+
+.default:
+    /* the pred_order & 3 leftover taps are summed first through a jump table,
+       then the remaining taps are done four at a time in .dloop1. */
+
+    /* we need to have aligned addresses in the loop so handle &3 first */
+    add r5, r4, r2, lsl #1    @ r5 = one past the last coef (2 bytes per coef)
+    mov r7, r2, lsr #2        @ r7 = coefs/4, nonzero since pred_order > 12 here
+    mov r14, #0               @ init accumulator
+    and r8, r2, #3            @ number of leftover taps to do before the loop
+    add pc, pc, r8, lsl #2    @ jump into accumulator chain
+@ jumptable:
+    b .dloop1 @ padding, never taken: pc reads 8 bytes ahead of the add
+    b .dloop1 @ no leftover taps
+    b .one
+    b .two
+@ implicit .three: falls through all three single-tap steps below
+    ldrh   r12, [r5, #-2]!       @ step back one coef and load it
+    ldr    r8, [r3], #4          @ next history sample
+    smlabb r14, r12, r8, r14
+.two:
+    ldrh   r12, [r5, #-2]!       @ step back one coef and load it
+    ldr    r8, [r3], #4          @ next history sample
+    smlabb r14, r12, r8, r14
+.one:
+    ldrh   r12, [r5, #-2]!       @ step back one coef and load it
+    ldr    r8, [r3], #4          @ next history sample
+    smlabb r14, r12, r8, r14
+
+.dloop1:
+    ldmdb  r5!, { r8-r9 }        @ step back four coefs; r9 holds the higher-addressed pair
+    ldmia  r3!, { r6, r10-r12 }  @ next four history samples
+    smlabt r14, r6, r9, r14
+    smlabb r14, r10, r9, r14
+    smlabt r14, r11, r8, r14
+    smlabb r14, r12, r8, r14
+    subs   r7, r7, #1            @ one group of four taps done
+    bne    .dloop1
+
+    ldr    r12, [r3]             @ load residual
+    add    r14, r12, r14, asr r1 @ shift sum by qlevel bits and add residual
+    str    r14, [r3], #4         @ store result
+    sub    r3, r3, r2, lsl #2    @ and wrap history pointer back to next first pos
+    subs   r0, r0, #1            @ are we done?
+    bne    .default              @ no, redo the setup (r5, r7, r14) for the next sample
+
+.exit: @ common exit; ldmpc is a macro defined outside this file, presumably pop + return
+    ldmpc regs=r4-r11
+#else
+
+/* The following is an assembler optimised version of the LPC filtering
    routines needed for FLAC decoding. It is optimised for use with ARM 
    processors.
    All LPC filtering up to order 9 is done in specially optimised unrolled
@@ -268,4 +581,4 @@
 
 .exit:
     ldmpc regs=r4-r11
-
+#endif
Index: apps/codecs/libffmpegFLAC/decoder.c
===================================================================
--- apps/codecs/libffmpegFLAC/decoder.c	(revision 28300)
+++ apps/codecs/libffmpegFLAC/decoder.c	(arbetskopia)
@@ -227,7 +227,11 @@
     int sum, i, j;
     int64_t wsum;
     int coeff_prec, qlevel;
-    int coeffs[pred_order];
+#if defined (CPU_ARM) && ARM_ARCH >= 5 /* same guard as lpc_decode_arm_v5 */
+    int16_t coeffs[32] __attribute__((aligned(4))); /* asm loads coef pairs */
+#else
+    int coeffs[32];
+#endif
 
     /* warm up samples */
     for (i = 0; i < pred_order; i++)
@@ -262,9 +266,15 @@
         lpc_decode_emac(s->blocksize - pred_order, qlevel, pred_order,
                         decoded + pred_order, coeffs);
         #elif defined(CPU_ARM)
+        #if ARM_ARCH >= 5 /* 16-bit packed coeffs, see arm.S */
         (void)sum;
+        lpc_decode_arm_v5(s->blocksize - pred_order, qlevel, pred_order,
+                          decoded + pred_order, coeffs);
+        #else /* ARM_ARCH < 5 */
+        (void)sum;
         lpc_decode_arm(s->blocksize - pred_order, qlevel, pred_order,
                        decoded + pred_order, coeffs);
+        #endif
         #else
         for (i = pred_order; i < s->blocksize; i++)
         {
Index: apps/codecs/libffmpegFLAC/arm.h
===================================================================
--- apps/codecs/libffmpegFLAC/arm.h	(revision 28300)
+++ apps/codecs/libffmpegFLAC/arm.h	(arbetskopia)
@@ -2,6 +2,8 @@
 #define _FLAC_ARM_H
 
 #include "bitstream.h"
+void lpc_decode_arm_v5(int blocksize, int qlevel, int pred_order,
+                       int32_t* data, int16_t* coeffs); /* ARM_ARCH >= 5, arm.S */
 
 void lpc_decode_arm(int blocksize, int qlevel, int pred_order, int32_t* data, int* coeffs);
 
