diff --git a/apps/codecs/demac/libdemac/predictor-arm.S b/apps/codecs/demac/libdemac/predictor-arm.S
index 1d55876..92a78ed 100644
--- a/apps/codecs/demac/libdemac/predictor-arm.S
+++ b/apps/codecs/demac/libdemac/predictor-arm.S
@@ -505,7 +505,11 @@ loop:
 done:
     str     r14, [r12]              @ Save value of p->buf
     add     sp, sp, #12             @ Don't bother restoring r1-r3 
+#ifdef ROCKBOX
+    ldmpc   regs=r4-r11             @ Restore r4-r11 and return
+#else
     ldmia   sp!, {r4 - r11, pc}
+#endif
 
 move_hist:
     @ dest = r11 (p->historybuffer)
@@ -664,7 +668,11 @@ loopm:
 donem:
     str     r14, [r12]              @ Save value of p->buf
     add     sp, sp, #8              @ Don't bother restoring r1, r2
+#ifdef ROCKBOX
+    ldmpc   regs=r4-r11             @ Restore r4-r11 and return
+#else
     ldmia   sp!, {r4 - r11, pc}
+#endif
 
 move_histm:
     @ dest = r11 (p->historybuffer)
diff --git a/apps/codecs/lib/mdct_arm.S b/apps/codecs/lib/mdct_arm.S
index f2fa1d9..515b859 100644
--- a/apps/codecs/lib/mdct_arm.S
+++ b/apps/codecs/lib/mdct_arm.S
@@ -127,7 +127,7 @@ mdct_butterfly_16:
     @ mdct_butterfly_8 increments r0 by another #8*4 here
     @ at end, r0 has been incremented by #16*4
 
-    ldr     pc, [sp], #4
+    ldrpc                           @ pop one word from the stack and return through it
 
 mdct_butterfly_32:
     stmdb   sp!, {r4-r11, lr}
@@ -257,7 +257,7 @@ mdct_butterfly_32:
     @ and we wanted to advance by #16*4 anyway, so just call again
     bl      mdct_butterfly_16
 
-    ldmia   sp!, {r4-r11, pc}
+    ldmpc   regs=r4-r11
 
     @ mdct_butterfly_generic_loop(x1, x2, T0, step, Ttop)
 mdct_butterfly_generic_loop:
@@ -433,5 +433,5 @@ mdct_butterfly_generic_loop:
     cmp     r2, r4
     bhi     1b
 
-    ldmia   sp!, {r4-r11, pc}
+    ldmpc   regs=r4-r11
 
diff --git a/apps/codecs/libatrac/atrac3_arm.S b/apps/codecs/libatrac/atrac3_arm.S
index 0908d58..80eaa79 100644
--- a/apps/codecs/libatrac/atrac3_arm.S
+++ b/apps/codecs/libatrac/atrac3_arm.S
@@ -19,6 +19,8 @@
  *
  ****************************************************************************/
 
+#include "config.h"
+
     .section .text, "ax", %progbits
 
 /****************************************************************************
@@ -63,7 +65,7 @@ atrac3_iqmf_matrixing:
     subs r3, r3, #4                 /* counter -= 4 */
     bgt .iqmf_matrixing_loop
     
-    ldmfd   sp!, {r4-r9, pc}       /* restore registers */
+    ldmpc   regs=r4-r9              /* restore registers */
 
 .atrac3_iqmf_matrixing_end:
     .size   atrac3_iqmf_matrixing,.atrac3_iqmf_matrixing_end-atrac3_iqmf_matrixing
@@ -218,7 +220,7 @@ atrac3_iqmf_dewindowing:
     subs r3, r3, #1                 /* outer loop -= 1 */
     bgt .iqmf_dewindow_outer_loop
     
-    ldmfd   sp!, {r4-r9, pc}        /* restore registers */
+    ldmpc   regs=r4-r9              /* restore registers */
     
 .atrac3_iqmf_dewindowing_end:
     .size   atrac3_iqmf_dewindowing,.atrac3_iqmf_dewindowing_end-atrac3_iqmf_dewindowing
diff --git a/apps/codecs/libffmpegFLAC/arm.S b/apps/codecs/libffmpegFLAC/arm.S
index 2a2746e..8adca77 100644
--- a/apps/codecs/libffmpegFLAC/arm.S
+++ b/apps/codecs/libffmpegFLAC/arm.S
@@ -267,5 +267,5 @@ lpc_decode_arm:
     bne .default              @ no, prepare for next sample
 
 .exit:
-    ldmia sp!, { r4-r11, pc }
+    ldmpc regs=r4-r11
 
diff --git a/apps/codecs/libmad/dct32_arm.S b/apps/codecs/libmad/dct32_arm.S
index a4eda8a..440841b 100644
--- a/apps/codecs/libmad/dct32_arm.S
+++ b/apps/codecs/libmad/dct32_arm.S
@@ -220,7 +220,7 @@ dct32:
     cmp      r0, #9
     bne      .l4
     add      sp, sp, #144
-    ldmia    sp!, {r4-r11, pc}
+    ldmpc    regs=r4-r11
 bitrev:
     .word 0x0
     .word 0x2
diff --git a/apps/codecs/libmad/imdct_l_arm.S b/apps/codecs/libmad/imdct_l_arm.S
index b86ba11..b511ff1 100644
--- a/apps/codecs/libmad/imdct_l_arm.S
+++ b/apps/codecs/libmad/imdct_l_arm.S
@@ -45,6 +45,7 @@
 *
 ****************************************************************************/
 
+#include "config.h"
 
 /*
    On entry:
@@ -823,7 +824,7 @@ normal_block_x18_to_x35:
     @----
 
     add     sp, sp, #(21*4)             @ return stack frame
-    ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return
+    ldmpc   regs=r4-r11                 @ restore callee saved regs, and return
 
     @----
 
@@ -992,7 +993,7 @@ start_block_x18_to_x35:
     @----
 
     add     sp, sp, #(21*4)             @ return stack frame
-    ldmia   sp!, { r4 - r11, pc }       @ restore callee saved regs, and return
+    ldmpc   regs=r4-r11                 @ restore callee saved regs, and return
 
     @----
     @END
diff --git a/apps/codecs/libmad/synth_full_arm.S b/apps/codecs/libmad/synth_full_arm.S
index e663b8f..dec437f 100644
--- a/apps/codecs/libmad/synth_full_arm.S
+++ b/apps/codecs/libmad/synth_full_arm.S
@@ -19,6 +19,7 @@
  *
  ****************************************************************************/
 
+#include "config.h"
 #include "mad_iram.h"
 
     .section    ICODE_SECTION_MPA_ARM,"ax",%progbits
@@ -135,7 +136,7 @@ synth_full_odd_sbsample:
 
     ldr     r5, =synth_full_sp
     ldr     sp, [r5]
-    ldmia   sp!, {r4-r11, pc}
+    ldmpc   regs=r4-r11
 
 synth_full_even_sbsample:
     stmdb   sp!, {r4-r11, lr}
@@ -241,7 +242,7 @@ synth_full_even_sbsample:
 
     ldr     r5, =synth_full_sp
     ldr     sp, [r5]
-    ldmia   sp!, {r4-r11, pc}
+    ldmpc   regs=r4-r11
 
     .global III_aliasreduce
 
@@ -289,7 +290,7 @@ III_aliasreduce:
     add     r0, r0, #72
     cmp     r0, r1
     blo     .arl1
-    ldmia   sp!, {r4-r11, pc}
+    ldmpc   regs=r4-r11
 
 csa:
     .word +0x0db84a81
@@ -332,7 +333,7 @@ III_overlap:
     stmia r1!, {r4, r5, r6, r7, r12, lr}
     ldmia r0!, {r4, r5, r6, r7, r12, lr}
     stmia r1!, {r4, r5, r6, r7, r12, lr}
-    ldmia   sp!, {r4-r7, pc}
+    ldmpc regs=r4-r7
 
     .section    IBSS_SECTION_MPA_ARM,"aw",%nobits
 synth_full_sp:
diff --git a/apps/codecs/libmusepack/synth_filter_arm.S b/apps/codecs/libmusepack/synth_filter_arm.S
index 3f78469..731a21c 100644
--- a/apps/codecs/libmusepack/synth_filter_arm.S
+++ b/apps/codecs/libmusepack/synth_filter_arm.S
@@ -19,6 +19,8 @@
  *
  ****************************************************************************/
 
+#include "config.h"
+
     .section .text, "ax", %progbits
 
 /****************************************************************************
@@ -89,7 +91,7 @@ mpc_decoder_windowing_D:
     subs lr, lr, #1
     bgt .loop32
     
-    ldmfd   sp!, {r4-r8, pc}
+    ldmpc regs=r4-r8
 #else
 mpc_decoder_windowing_D:
     /* r0 = Data[] */
@@ -287,7 +289,7 @@ mpc_decoder_windowing_D:
     str r8, [r0], #4            /* store Data */
     add r1, r1, #4              /* V++ */
     
-    ldmfd   sp!, {r4-r11, pc}
+    ldmpc regs=r4-r11
 #endif
 .mpc_dewindowing_end:
     .size   mpc_decoder_windowing_D,.mpc_dewindowing_end-mpc_decoder_windowing_D
diff --git a/apps/codecs/libspeex/filters_arm4.S b/apps/codecs/libspeex/filters_arm4.S
index 109556f..dcd1691 100644
--- a/apps/codecs/libspeex/filters_arm4.S
+++ b/apps/codecs/libspeex/filters_arm4.S
@@ -49,7 +49,7 @@ iir_mem16:
     beq     .order_10
     cmp     r5, #8
     beq     .order_8
-    ldmia   sp!, { r4-r11, pc }     @ Non-supported order, return
+    ldmpc   regs=r4-r11             @ Non-supported order, return
 
     @ TODO: try using direct form 1 filtering
 .order_8:
@@ -94,7 +94,7 @@ iir_mem16:
     bne     0b
     ldr     r4, [sp, #40]           @ r4 = mem
     stmia   r4, { r5-r12 }          @ Save back mem[]
-    ldmia   sp!, { r4-r11, pc }     @ Exit
+    ldmpc   regs=r4-r11             @ Exit
 
 .order_10:
     ldmia   r4, { r5-r9 }           @ r5-r9 = mem[0..4]
@@ -154,7 +154,7 @@ iir_mem16:
     sub     r1, r1, #10*2
     subs    r3, r3, #1
     bne     .order_10
-    ldmia   sp!, { r4-r11, pc }     @ Exit
+    ldmpc   regs=r4-r11             @ Exit
 
 
 /* void qmf_synth(const spx_word16_t *x1, const spx_word16_t *x2, const spx_word16_t *a, spx_word16_t *y, int N, int M, spx_word32_t *mem1, spx_word32_t *mem2, char *stack) */
@@ -300,7 +300,7 @@ qmf_synth:
     strh    r8, [r6], #4
     subs    r4, r4, #4
     bne     0b
-    ldmia   sp!, { r4-r11, pc }     @ Exit
+    ldmpc   regs=r4-r11             @ Exit
 
 
 /* void signal_mul(const spx_sig_t *x, spx_sig_t *y, spx_word32_t scale, int len) */
@@ -325,5 +325,5 @@ signal_mul:
     subs    r3, r3, #4                      @ Are we done?
     bne     0b
 
-    ldmia   sp!, { r4-r8, pc }              @ Exit
+    ldmpc   regs=r4-r8                      @ Exit
 
diff --git a/apps/codecs/libtta/filter_arm.S b/apps/codecs/libtta/filter_arm.S
index d34b182..f3959b8 100644
--- a/apps/codecs/libtta/filter_arm.S
+++ b/apps/codecs/libtta/filter_arm.S
@@ -167,7 +167,7 @@ hybrid_filter:
     @ set to the memory: *pA, *(pA-1), *(pA-2), *(pA-3), *pM, *(pM-1), *(pM-2), *(pM-3)
     stmneda  r2,  {r10, r11, r12, lr}
     stmneda  r3,  {r5,  r6,  r7,  r8}
-    ldmnefd  sp!, {r4-r12, pc}                   @ hybrid_filter end (when fs->index != 0)
+    ldmpc    cond=ne regs=r4-r12     @ hybrid_filter end (when fs->index != 0)
 
 .hf_memshl:
     @ memshl (fs->dl)
@@ -192,7 +192,7 @@ hybrid_filter:
     ldmia    r9,  {r1, r2, r3, r4}
     sub      r9,  r9,  #64                       @ r9 = fs->dx
     stmia    r9,  {r1 - r8}
-    ldmfd    sp!, {r4 - r12, pc}                 @ hybrid_filter end (when fs->index == 0)
+    ldmpc    regs=r4-r12                         @ hybrid_filter end (when fs->index == 0)
 
 hybrid_filter_end:
     .size    hybrid_filter, hybrid_filter_end - hybrid_filter
diff --git a/apps/codecs/libwavpack/arm.S b/apps/codecs/libwavpack/arm.S
index 90dfd46..32de1df 100644
--- a/apps/codecs/libwavpack/arm.S
+++ b/apps/codecs/libwavpack/arm.S
@@ -35,6 +35,9 @@
  * 32-bit multiply-accumulate instruction and so will overflow with 24-bit
  * WavPack files.
  */
+
+#include "config.h"
+
         .text
         .align
         .global         decorr_stereo_pass_cont_arm
@@ -470,5 +473,5 @@ term_minus_3_loop:
 common_exit:
         strh    r4, [r5, #4]
         strh    r0, [r5, #6]
-        ldmfd   sp!, {r4 - r8, r10, r11, pc}
+        ldmpc   regs="r4-r8, r10-r11"
 
diff --git a/apps/codecs/libwavpack/arml.S b/apps/codecs/libwavpack/arml.S
index 5745c81..60818aa 100644
--- a/apps/codecs/libwavpack/arml.S
+++ b/apps/codecs/libwavpack/arml.S
@@ -38,6 +38,8 @@
  * instruction.
  */
 
+#include "config.h"
+
         .text
         .align
         .global         decorr_stereo_pass_cont_arml
@@ -500,5 +502,5 @@ common_exit:
         mov     r4, r4, asr #18
         strh    r4, [r5, #4]
         strh    r0, [r5, #6]
-        ldmfd   sp!, {r4 - r8, r10, r11, pc}
+        ldmpc   regs="r4-r8, r10-r11"
 
diff --git a/apps/dsp_arm.S b/apps/dsp_arm.S
index 2150ff0..7e36074 100644
--- a/apps/dsp_arm.S
+++ b/apps/dsp_arm.S
@@ -23,6 +23,9 @@
 /****************************************************************************
  *  void channels_process_sound_chan_mono(int count, int32_t *buf[])
  */
+
+#include "config.h"
+
     .section .icode, "ax", %progbits
     .align  2
     .global channels_process_sound_chan_mono
@@ -47,7 +50,7 @@ channels_process_sound_chan_mono:
     stmia   r2!, { r12, r14 }          @ store Mo0, Mo1
     bgt     .monoloop                  @
                                        @
-    ldmltfd sp!, { r4, pc }            @ if count was even, we're done
+    ldmpc   cond=lt, regs=r4           @ if count was even, we're done
                                        @
 .mono_singlesample:                    @
     ldr     r3, [r1]                   @ r3 = Ls
@@ -57,7 +60,7 @@ channels_process_sound_chan_mono:
     str     r12, [r1]                  @ store Mo
     str     r12, [r2]                  @ store Mo
                                        @
-    ldmfd   sp!, { r4, pc }            @
+    ldmpc   regs=r4                    @
     .size   channels_process_sound_chan_mono, \
                 .-channels_process_sound_chan_mono
 
@@ -112,7 +115,7 @@ channels_process_sound_chan_custom:
 
     bgt     .custom_loop
     
-    ldmltfd sp!, { r4-r10, pc }        @ < 0? even count
+    ldmpc   cond=lt, regs=r4-r10       @ < 0? even count
     
 .custom_single_sample:
     ldr     r5, [r1]                   @ handle odd sample
@@ -131,7 +134,7 @@ channels_process_sound_chan_custom:
     str     r5, [r1]                   @ Store Lc0
     str     r7, [r2]                   @ Store Rc0
 
-    ldmfd   sp!, { r4-r10, pc }
+    ldmpc   regs=r4-r10
     .size   channels_process_sound_chan_custom, \
                 .-channels_process_sound_chan_custom
 
@@ -164,7 +167,7 @@ channels_process_sound_chan_karaoke:
     stmia   r2!, { r12, r14 }          @ store Ro0, Ro1
     bgt     .karaokeloop               @
                                        @
-    ldmltfd sp!, { r4, pc }            @ if count was even, we're done
+    ldmpc   cond=lt, regs=r4           @ if count was even, we're done
                                        @
 .karaoke_singlesample:                 @
     ldr     r3, [r1]                   @ r3 = Li
@@ -175,7 +178,7 @@ channels_process_sound_chan_karaoke:
     str     r3, [r1]                   @ store Lo
     str     r12, [r2]                  @ store Ro
                                        @
-    ldmfd   sp!, { r4, pc }            @
+    ldmpc   regs=r4                    @
     .size   channels_process_sound_chan_karaoke, \
                 .-channels_process_sound_chan_karaoke
 
@@ -225,7 +228,7 @@ sample_output_mono:
     subs    r0, r0, #2
     bgt     .somloop     
        
-    ldmltfd sp!, { r4-r6, pc }         @ even 'count'? return
+    ldmpc   cond=lt, regs=r4-r6        @ even 'count'? return
 
 .som_singlesample:
     ldr     r5, [r2]                   @ do odd sample
@@ -239,7 +242,7 @@ sample_output_mono:
     orr     r5, r5, r5, lsl #16
     str     r5, [r3]
 
-    ldmfd   sp!, { r4-r6, pc }
+    ldmpc   regs=r4-r6
     .size   sample_output_mono, .-sample_output_mono
     
 /****************************************************************************
@@ -302,7 +305,7 @@ sample_output_stereo:
     subs    r0, r0, #2
     bgt     .sosloop
 
-    ldmltfd sp!, { r4-r9, pc }         @ even 'count'? return
+    ldmpc   cond=lt, regs=r4-r9        @ even 'count'? return
 
 .sos_singlesample:    
     ldr     r6, [r2]                   @ left odd sample
@@ -324,7 +327,7 @@ sample_output_stereo:
 
     str     r8, [r3]
 
-    ldmfd   sp!, { r4-r9, pc }
+    ldmpc   regs=r4-r9
     .size   sample_output_stereo, .-sample_output_stereo
 #endif /* ARM_ARCH < 6 */    
 
@@ -387,7 +390,7 @@ apply_crossfeed:
     stmia   r12, { r8-r11 }            @ save filter history
     str     r0, [r12, #30*4]           @ save delay line index
     add     sp, sp, #8                 @ remove temp variables from stack
-    ldmia   sp!, { r4-r11, pc }
+    ldmpc   regs=r4-r11
     .size   apply_crossfeed, .-apply_crossfeed
 
 /****************************************************************************
@@ -444,7 +447,7 @@ dsp_downsample:
     ldr     r1, [r3]                @ r1 = &dst[0]
     sub     r8, r8, r1              @ dst - &dst[0]
     mov     r0, r8, lsr #2          @ convert bytes->samples
-    ldmia   sp!, { r4-r11, pc }     @ ... and we're out
+    ldmpc   regs=r4-r11             @ ... and we're out
     .size   dsp_downsample, .-dsp_downsample
 
 /****************************************************************************
@@ -507,7 +510,7 @@ dsp_upsample:
     sub     r8, r8, r2              @ dst - &dst[0]
     mov     r0, r8, lsr #2          @ convert bytes->samples
     add     sp, sp, #8              @ adjust stack for temp variables
-    ldmfd   sp!, { r4-r11, pc }     @ ... and we're out
+    ldmpc   regs=r4-r11             @ ... and we're out
     .size       dsp_upsample, .-dsp_upsample
 
 /****************************************************************************
@@ -554,5 +557,5 @@ dsp_apply_gain:
     subs    r3, r3, #1
     bgt     .dag_outerloop          @ end of outer loop
                
-    ldmfd   sp!, { r4-r8, pc }
+    ldmpc   regs=r4-r8
     .size   dsp_apply_gain, .-dsp_apply_gain
diff --git a/apps/eq_arm.S b/apps/eq_arm.S
index 92446e3..ca6ceec 100644
--- a/apps/eq_arm.S
+++ b/apps/eq_arm.S
@@ -85,5 +85,5 @@ eq_filter:
     bne .filterloop
 
     add sp, sp, #16            @ compensate for temp storage
-    ldmia sp!, { r4-r11, pc }
+    ldmpc regs=r4-r11
 
diff --git a/apps/plugins/mpegplayer/idct_arm.S b/apps/plugins/mpegplayer/idct_arm.S
index 7253d89..97a87a8 100644
--- a/apps/plugins/mpegplayer/idct_arm.S
+++ b/apps/plugins/mpegplayer/idct_arm.S
@@ -19,6 +19,8 @@
  *
  ****************************************************************************/
 
+#include "config.h"
+
     .global     mpeg2_idct_copy
     .type       mpeg2_idct_copy, %function
     .global     mpeg2_idct_add
@@ -313,7 +315,7 @@ mpeg2_idct_copy:
     add    r1, r1, r2
     cmp    r0, r12
     blo    1b
-    ldmfd  sp!, { r4-r11, pc }
+    ldmpc  regs=r4-r11
 
 mpeg2_idct_add:
     cmp    r0, #129
@@ -385,7 +387,7 @@ mpeg2_idct_add:
     add    r1, r1, r2
     cmp    r0, r12
     blo    2b
-    ldmfd  sp!, { r4-r11, pc }
+    ldmpc  regs=r4-r11
 3:
     stmfd  sp!, { r4-r5, lr }
     ldrsh  r1, [r0, #0]           /* r1 = block[0] */
@@ -438,4 +440,4 @@ mpeg2_idct_add:
     add    r2, r2, r3
     cmp    r2, r0
     blo    4b
-    ldmfd  sp!, { r4-r5, pc }
+    ldmpc  regs=r4-r5
diff --git a/apps/plugins/mpegplayer/motion_comp_arm_s.S b/apps/plugins/mpegplayer/motion_comp_arm_s.S
index fb29d59..49628c6 100644
--- a/apps/plugins/mpegplayer/motion_comp_arm_s.S
+++ b/apps/plugins/mpegplayer/motion_comp_arm_s.S
@@ -47,7 +47,7 @@ MC_put_o_16_align0:
         subs r3, r3, #1
         add r0, r0, r2
         bne MC_put_o_16_align0
-        ldmfd sp!, {r4-r7, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r7 @@ update PC with LR content.
 
 .macro  ADJ_ALIGN_QW shift, R0, R1, R2, R3, R4
         mov \R0, \R0, lsr #(\shift)
@@ -71,7 +71,7 @@ MC_put_o_16_align1:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r7, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r7 @@ update PC with LR content.
 
 MC_put_o_16_align2:
         and r1, r1, #0xFFFFFFFC
@@ -83,7 +83,7 @@ MC_put_o_16_align2:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r7, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r7 @@ update PC with LR content.
 
 MC_put_o_16_align3:
         and r1, r1, #0xFFFFFFFC
@@ -95,7 +95,7 @@ MC_put_o_16_align3:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r7, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r7 @@ update PC with LR content.
 
 @ ----------------------------------------------------------------
         .align
@@ -120,7 +120,7 @@ MC_put_o_8_align0:
         add r0, r0, r2
         subs r3, r3, #1
         bne MC_put_o_8_align0
-        ldmfd sp!, {r4, r5, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r5 @@ update PC with LR content.
 
 .macro  ADJ_ALIGN_DW shift, R0, R1, R2
         mov \R0, \R0, lsr #(\shift)
@@ -140,7 +140,7 @@ MC_put_o_8_align1:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4, r5, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r5 @@ update PC with LR content.
 
 MC_put_o_8_align2:
         and r1, r1, #0xFFFFFFFC
@@ -152,7 +152,7 @@ MC_put_o_8_align2:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4, r5, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r5 @@ update PC with LR content.
 
 MC_put_o_8_align3:
         and r1, r1, #0xFFFFFFFC
@@ -164,7 +164,7 @@ MC_put_o_8_align3:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4, r5, pc} @@ update PC with LR content.
+        ldmpc regs=r4-r5 @@ update PC with LR content.
 
 @ ----------------------------------------------------------------
 .macro  AVG_PW rW1, rW2
@@ -218,7 +218,7 @@ MC_put_x_16_align0:
         subs r3, r3, #1
         add r0, r0, r2
         bne MC_put_x_16_align0
-        ldmfd sp!, {r4-r8, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r8, HIGH_REGS" @@ update PC with LR content. NOTE(review): confirm HIGH_REGS still expands inside the quotes
 
 MC_put_x_16_align1:
         and r1, r1, #0xFFFFFFFC
@@ -234,7 +234,7 @@ MC_put_x_16_align1:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r8, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r8, HIGH_REGS" @@ update PC with LR content.
 
 MC_put_x_16_align2:
         and r1, r1, #0xFFFFFFFC
@@ -250,7 +250,7 @@ MC_put_x_16_align2:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r8, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r8, HIGH_REGS" @@ update PC with LR content.
 
 MC_put_x_16_align3:
         and r1, r1, #0xFFFFFFFC
@@ -266,7 +266,7 @@ MC_put_x_16_align3:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r8, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r8, HIGH_REGS" @@ update PC with LR content.
 
 @ ----------------------------------------------------------------
         .align
@@ -297,7 +297,7 @@ MC_put_x_8_align0:
         subs r3, r3, #1
         add r0, r0, r2
         bne MC_put_x_8_align0
-        ldmfd sp!, {r4-r6, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r6, HIGH_REGS" @@ update PC with LR content.
 
 MC_put_x_8_align1:
         and r1, r1, #0xFFFFFFFC
@@ -311,7 +311,7 @@ MC_put_x_8_align1:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r6, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r6, HIGH_REGS" @@ update PC with LR content.
 
 MC_put_x_8_align2:
         and r1, r1, #0xFFFFFFFC
@@ -325,7 +325,7 @@ MC_put_x_8_align2:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r6, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r6, HIGH_REGS" @@ update PC with LR content.
 
 MC_put_x_8_align3:
         and r1, r1, #0xFFFFFFFC
@@ -339,4 +339,4 @@ MC_put_x_8_align3:
         subs r3, r3, #1
         add r0, r0, r2
         bne 1b
-        ldmfd sp!, {r4-r6, HIGH_REGS, pc} @@ update PC with LR content.
+        ldmpc regs="r4-r6, HIGH_REGS" @@ restore saved registers and return (closing quote was missing)
diff --git a/apps/plugins/pacbox/pacbox_arm.S b/apps/plugins/pacbox/pacbox_arm.S
index 32cf2d4..87696ce 100644
--- a/apps/plugins/pacbox/pacbox_arm.S
+++ b/apps/plugins/pacbox/pacbox_arm.S
@@ -19,6 +19,7 @@
  *
  ****************************************************************************/
 
+#include "config.h"
 #include "pacbox.h"
 
     .section .icode,"ax",%progbits
@@ -120,7 +121,7 @@ loop_x:
 /* end of y loop */
          add     r1, r1, #224*3           @ vbuf += 224*3
          subs    lr, lr, #4               @ y-=4
-         ldmeqia sp!, {r4-r11, pc}
+         ldmpc   cond=eq, regs=r4-r11
          b       loop_y
 #endif
 #endif
diff --git a/apps/recorder/jpeg_idct_arm.S b/apps/recorder/jpeg_idct_arm.S
index 4739600..e7eb4b8 100644
--- a/apps/recorder/jpeg_idct_arm.S
+++ b/apps/recorder/jpeg_idct_arm.S
@@ -89,7 +89,7 @@ jpeg_idct2v:
     add    r0,  r0,  #4
     cmp    r0,  r1
     bcc    1b
-    ldmia  sp!, { r4, pc }
+    ldmpc  regs=r4
 #else
 /* ARMv6 offers partitioned adds and subtracts, used here to unroll the loop
    to two columns.
@@ -137,7 +137,7 @@ jpeg_idct2h:
     add    r1,  r1,  r3
     cmp    r0,  r2
     bcc    1b
-    ldmia  sp!, { r4-r5, pc }
+    ldmpc  regs=r4-r5
 #else
     stmdb  sp!, { r4, lr }
     ldrsh  r14, .Lpool4+2
@@ -190,7 +190,7 @@ jpeg_idct4v:
     add    r0,  r0,  #2
     cmp    r0,  r1
     bcc    1b
-    ldmia  sp!, { r4-r7, pc }
+    ldmpc  regs=r4-r7
 #elif ARM_ARCH < 6
     stmdb sp!, { r4-r8, lr }
     mov    r8,  #1024
@@ -221,7 +221,7 @@ jpeg_idct4v:
     cmp    r0,  r1
     bcc    1b
     ldmia sp!, { r4-r8, pc }
-#else
+#else /* ARMv6+ */
     stmdb  sp!, { r4-r10, lr }
     ldrd   r2,  .Lpool4
     mov    r12, #1024
@@ -325,8 +325,8 @@ jpeg_idct4h:
     add    r1,  r1,  r3
     cmp    r0,  r2
     bcc    1b
-    ldmia sp!, { r4-r10, pc }
-#elif ARM_ARCH < 6
+    ldmpc  regs=r4-r10
+#elif ARM_ARCH < 6 /* ARMv5 */
     stmdb  sp!, { r4-r9, lr }
     ldr    r4,  .Lpool4
     ldr    r5,  .Lpool4+4
@@ -367,7 +367,7 @@ jpeg_idct4h:
     cmp    r0,  r2
     bcc    1b
     ldmia sp!, { r4-r9, pc }
-#else
+#else /* ARMv6+ */
     stmdb sp!, { r4-r9, lr }
     ldrd   r4,  .Lpool4
     mov    r9,  r4,  lsr #16
@@ -424,7 +424,7 @@ jpeg_idct8v:
     cmp    r0,  r1
     add    r2,  r2,  #2
     bcc    1b
-    ldmia  sp!, { r4-r11, pc }
+    ldmpc  regs=r4-r11
 2:
     ldr    r14, =4433
     ldr    r12, =-15137
@@ -586,7 +586,7 @@ jpeg_idct8v:
     cmp    r0,  r1
     add    r2,  r2,  #2
     bcc    1b
-    ldmia  sp!, { r4-r11, pc }
+    ldmpc  regs=r4-r11
     .size jpeg_idct8v, .-jpeg_idct8v
 
 #if ARM_ARCH > 4
@@ -631,7 +631,7 @@ jpeg_idct8h:
     add    r1,  r1,  r3
     cmp    r0,  r2
     bcc    1b
-    ldmia  sp!, { r4-r11, pc }
+    ldmpc  regs=r4-r11
 2:
     ldr    r14, =4433
     ldr    r12, =-15137
@@ -826,9 +826,9 @@ jpeg_idct8h:
     add    r1,  r1,  r3
     cmp    r0,  r2
     bcc    1b
-    ldmia  sp!, { r4-r11, pc }
+    ldmpc  regs=r4-r11
     .size jpeg_idct8h, .-jpeg_idct8h
-#else
+#else /* ARMv6+ */
 jpeg_idct8v:
     stmdb  sp!, { r4-r11, lr }
     add    r2,  r0,  #128
diff --git a/firmware/export/config.h b/firmware/export/config.h
index 5947ca1..3e269c2 100644
--- a/firmware/export/config.h
+++ b/firmware/export/config.h
@@ -710,6 +710,26 @@ Lyre prototype 1 */
 #define ROCKBOX_STRICT_ALIGN 1
 #endif
 
+#if defined(CPU_ARM) && defined(__ASSEMBLER__)
+/* Function-return helpers for ARM assembly. ARMv4T doesn't switch the T bit when popping pc directly, we must use BX, so ARM_ARCH == 4 builds with USE_THUMB pop into lr and return with BX */
+.macro ldmpc cond="", order="ia", regs:req /* pop the listed registers plus the return address from sp and return; cond and order are optional mnemonic suffixes */
+#if ARM_ARCH == 4 && defined(USE_THUMB) /* pop the return address into lr, then BX */
+    ldm\cond\order sp!, { \regs, lr }
+    bx\cond lr
+#else /* pop the return address straight into pc */
+    ldm\cond\order sp!, { \regs, pc }
+#endif
+.endm
+.macro ldrpc cond="" /* pop only the return address from sp and return; cond is an optional condition suffix */
+#if ARM_ARCH == 4 && defined(USE_THUMB) /* pop the return address into lr, then BX */
+    ldr\cond lr, [sp], #4
+    bx\cond  lr
+#else /* pop the return address straight into pc */
+    ldr\cond pc, [sp], #4
+#endif
+.endm
+#endif
+
 #ifndef CODEC_SIZE
 #define CODEC_SIZE 0
 #endif
diff --git a/firmware/target/arm/as3525/lcd-as-e200v2-fuze-fuzev2.S b/firmware/target/arm/as3525/lcd-as-e200v2-fuze-fuzev2.S
index f71216c..690641c 100644
--- a/firmware/target/arm/as3525/lcd-as-e200v2-fuze-fuzev2.S
+++ b/firmware/target/arm/as3525/lcd-as-e200v2-fuze-fuzev2.S
@@ -241,7 +241,7 @@ lcd_write_yuv420_lines:
     tst         r7, #DBOP_BUSY          @ fifo not empty?
     beq         1b                      @
 
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -545,6 +545,6 @@ lcd_write_yuv420_lines_odither:
     tst         r7, #DBOP_BUSY          @ fifo not empty?
     beq         1b                      @
 
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/as3525/sansa-clip/lcd-as-clip.S b/firmware/target/arm/as3525/sansa-clip/lcd-as-clip.S
index 942ddf7..431dc62 100644
--- a/firmware/target/arm/as3525/sansa-clip/lcd-as-clip.S
+++ b/firmware/target/arm/as3525/sansa-clip/lcd-as-clip.S
@@ -19,6 +19,7 @@
  *
  ****************************************************************************/
 
+#include "config.h"
 #include "as3525.h"
 
     .text
@@ -90,5 +91,5 @@ lcd_grey_data:
     ands    r5, r5, #(1<<10)    @ wait until push fifo empties
     beq     1b
 
-    ldmfd   sp!, {r4-r7, pc}
+    ldmpc   regs=r4-r7
     .size   lcd_grey_data,.-lcd_grey_data
diff --git a/firmware/target/arm/ata-as-arm.S b/firmware/target/arm/ata-as-arm.S
index b1cafc2..101bc4d 100644
--- a/firmware/target/arm/ata-as-arm.S
+++ b/firmware/target/arm/ata-as-arm.S
@@ -146,7 +146,7 @@ copy_read_sectors:
 
     strb    r3, [r0], #1        /* store final byte */
 
-    ldmfd   sp!, {r4, r5, pc}
+    ldmpc   regs=r4-r5
 
     /* 16-bit aligned */
 .r_aligned:
@@ -195,7 +195,7 @@ copy_read_sectors:
     ldrneh  r3, [r2]
     strneh  r3, [r0], #2
 
-    ldmfd   sp!, {r4, r5, pc}
+    ldmpc   regs=r4-r5
 
 .r_end:
     .size   copy_read_sectors,.r_end-copy_read_sectors
@@ -300,7 +300,7 @@ copy_write_sectors:
     orr     r3, r3, r4, lsl #8
     strh    r3, [r2]            /* write final halfword */
 
-    ldmfd   sp!, {r4, r5, pc}
+    ldmpc   regs=r4-r5
 
     /* 16-bit aligned */
 .w_aligned:
@@ -349,7 +349,7 @@ copy_write_sectors:
     ldrneh  r3, [r0], #2
     strneh  r3, [r2]
 
-    ldmfd   sp!, {r4, r5, pc}
+    ldmpc   regs=r4-r5
 
 .w_end:
     .size   copy_write_sectors,.w_end-copy_write_sectors
diff --git a/firmware/target/arm/ipod/lcd-as-gray.S b/firmware/target/arm/ipod/lcd-as-gray.S
index 1364c1f..cfd179a 100644
--- a/firmware/target/arm/ipod/lcd-as-gray.S
+++ b/firmware/target/arm/ipod/lcd-as-gray.S
@@ -97,7 +97,7 @@ lcd_write_data_shifted:
     subs    r1, r1, #1
     bne     .sloop
 
-    ldmfd   sp!, {r4, pc}
+    ldmpc   regs=r4
     .size   lcd_write_data_shifted,.-lcd_write_data_shifted
     
 #elif defined IPOD_MINI
@@ -132,7 +132,7 @@ lcd_write_data_shifted:
     subs    r1, r1, #1
     bne     .sloop
 
-    ldr     pc, [sp], #4
+    ldrpc
     .size   lcd_write_data_shifted,.-lcd_write_data_shifted
 
 #endif
@@ -179,7 +179,7 @@ lcd_mono_data:
     subs    r1, r1, #1
     bne     .mloop
 
-    ldmfd   sp!, {r4, pc}
+    ldmpc   regs=r4
 
 .dibits:
     .byte   0x00, 0x03, 0x0C, 0x0F, 0x30, 0x33, 0x3C, 0x3F
@@ -267,6 +267,6 @@ lcd_grey_data:
     subs    r2, r2, #1
     bne     .greyloop
 
-    ldmfd   sp!, {r4-r7, pc}
+    ldmpc   regs=r4-r7
     .size   lcd_grey_data,.-lcd_grey_data
 
diff --git a/firmware/target/arm/ipod/video/lcd-as-video.S b/firmware/target/arm/ipod/video/lcd-as-video.S
index fa88dbc..47155b8 100644
--- a/firmware/target/arm/ipod/video/lcd-as-video.S
+++ b/firmware/target/arm/ipod/video/lcd-as-video.S
@@ -19,6 +19,8 @@
  *
  ****************************************************************************/
 
+#include "config.h"
+
     .section .icode, "ax", %progbits
 
 /****************************************************************************
@@ -60,7 +62,7 @@ lcd_write_data:                       /* r1 = pixel count, must be even */
     ldrne   r3, [r0], #4
     strne   r3, [lr]
 
-    ldmfd   sp!, {r4, pc}
+    ldmpc   regs=r4
 
 /****************************************************************************
  * extern void lcd_write_yuv420_lines(unsigned char const * const src[3],
@@ -294,7 +296,7 @@ lcd_write_yuv420_lines:
 
     ldr         r3, [sp, #12]
     add         sp, sp, r3            /* deallocate buffer */
-    ldmfd       sp!, { r4-r10, pc }   /* restore registers */
+    ldmpc       regs=r4-r10           /* restore registers */
 
     .ltorg
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
diff --git a/firmware/target/arm/iriver/h10/lcd-as-h10.S b/firmware/target/arm/iriver/h10/lcd-as-h10.S
index b3f12e4..8e851d8 100644
--- a/firmware/target/arm/iriver/h10/lcd-as-h10.S
+++ b/firmware/target/arm/iriver/h10/lcd-as-h10.S
@@ -232,7 +232,7 @@ lcd_write_yuv420_lines:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -533,6 +533,6 @@ lcd_write_yuv420_lines_odither:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/lcd-as-memframe.S b/firmware/target/arm/lcd-as-memframe.S
index 4532bab..87cbb61 100644
--- a/firmware/target/arm/lcd-as-memframe.S
+++ b/firmware/target/arm/lcd-as-memframe.S
@@ -101,7 +101,7 @@ lcd_copy_buffer_rect:                   @
     add     r0, r0, r4, lsl #1          @
     subs    r3, r3, #1                  @ next line
     bgt     10b @ copy line             @
-    ldmfd   sp!, { r4-r11, pc }         @ restore regs and return
+    ldmpc   regs=r4-r11                 @ restore regs and return
     .ltorg                              @ dump constant pool
     .size   lcd_copy_buffer_rect, .-lcd_copy_buffer_rect
 
@@ -344,7 +344,7 @@ lcd_write_yuv420_lines:
     subs        r2, r2, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r10, pc }     @ restore registers and return
+    ldmpc       regs=r4-r10             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -691,6 +691,6 @@ lcd_write_yuv420_lines_odither:
     subs        r2, r2, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/memcpy-arm.S b/firmware/target/arm/memcpy-arm.S
index d17d659..2a55fb5 100644
--- a/firmware/target/arm/memcpy-arm.S
+++ b/firmware/target/arm/memcpy-arm.S
@@ -98,7 +98,7 @@ memcpy:
         strcsb  r4, [r0], #1
         strcsb  ip, [r0]
 
-        ldmfd   sp!, {r0, r4, pc}
+        ldmpc   regs="r0, r4"
 
 9:      rsb ip, ip, #4
         cmp ip, #2
diff --git a/firmware/target/arm/memmove-arm.S b/firmware/target/arm/memmove-arm.S
index ce056d9..d8cab04 100644
--- a/firmware/target/arm/memmove-arm.S
+++ b/firmware/target/arm/memmove-arm.S
@@ -112,7 +112,7 @@ memmove:
         strneb  r3, [r0, #-1]!
         strcsb  r4, [r0, #-1]!
         strcsb  ip, [r0, #-1]
-        ldmfd   sp!, {r0, r4, pc}
+        ldmpc   regs="r0, r4"
 
 9:      cmp ip, #2
         ldrgtb  r3, [r1, #-1]!
diff --git a/firmware/target/arm/memset-arm.S b/firmware/target/arm/memset-arm.S
index 5dbde7a..682da87 100644
--- a/firmware/target/arm/memset-arm.S
+++ b/firmware/target/arm/memset-arm.S
@@ -69,7 +69,7 @@ memset:
         stmgedb r0!, {r1, r3, ip, lr}
         stmgedb r0!, {r1, r3, ip, lr}
         bgt     3b
-        ldreq   pc, [sp], #4            @ Now <64 bytes to go.
+        ldrpc   cond=eq                 @ Now <64 bytes to go.
 /*
  * No need to correct the count; we're only testing bits from now on
  */
diff --git a/firmware/target/arm/memset16-arm.S b/firmware/target/arm/memset16-arm.S
index 63d6264..5c787b1 100644
--- a/firmware/target/arm/memset16-arm.S
+++ b/firmware/target/arm/memset16-arm.S
@@ -59,7 +59,7 @@ memset16:
         stmgeia r0!, {r1, r3, ip, lr}
         stmgeia r0!, {r1, r3, ip, lr}
         bgt     2b
-        ldreq   pc, [sp], #4            @ Now <64 bytes to go.
+        ldrpc   cond=eq                 @ Now <64 bytes to go.
 /*
  * No need to correct the count; we're only testing bits from now on
  */
diff --git a/firmware/target/arm/olympus/mrobe-100/lcd-as-mr100.S b/firmware/target/arm/olympus/mrobe-100/lcd-as-mr100.S
index 6db6c7e..0977801 100644
--- a/firmware/target/arm/olympus/mrobe-100/lcd-as-mr100.S
+++ b/firmware/target/arm/olympus/mrobe-100/lcd-as-mr100.S
@@ -99,6 +99,6 @@ lcd_grey_data:
     subs    r2, r2, #1
     bne     .greyloop
 
-    ldmfd   sp!, {r4-r7, pc}
+    ldmpc   regs=r4-r7
     .size   lcd_grey_data,.-lcd_grey_data
 
diff --git a/firmware/target/arm/pbell/vibe500/lcd-as-vibe500.S b/firmware/target/arm/pbell/vibe500/lcd-as-vibe500.S
index d5d5157..9079be6 100644
--- a/firmware/target/arm/pbell/vibe500/lcd-as-vibe500.S
+++ b/firmware/target/arm/pbell/vibe500/lcd-as-vibe500.S
@@ -243,7 +243,7 @@ lcd_write_yuv420_lines:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -551,6 +551,6 @@ lcd_write_yuv420_lines_odither:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/pcm-pp.c b/firmware/target/arm/pcm-pp.c
index 6289b4c..bffc69f 100644
--- a/firmware/target/arm/pcm-pp.c
+++ b/firmware/target/arm/pcm-pp.c
@@ -74,9 +74,16 @@ void fiq_handler(void) ICODE_ATTR __attribute__((naked));
 void fiq_handler(void)
 {
     asm volatile (
-        "ldr pc, [pc, #-4] \n"
-    "fiq_function:         \n"
-        ".word 0           \n"
+        /* Jump to the address stored in the fiq_function word below.
+         * In ARM state pc reads as the current instruction address + 8. */
+#if ARM_ARCH == 4 && defined(USE_THUMB)
+        "ldr r12, [pc]      \n" /* . + 8: fiq_function, after the bx */
+        "bx  r12            \n"
+#else
+        "ldr pc, [pc, #-4]  \n" /* . + 4: fiq_function, next word */
+#endif
+    "fiq_function:          \n"
+        ".word 0            \n"
     );
 }
 
diff --git a/firmware/target/arm/philips/hdd1630/lcd-as-hdd1630.S b/firmware/target/arm/philips/hdd1630/lcd-as-hdd1630.S
index 73ad84a..7be807a 100644
--- a/firmware/target/arm/philips/hdd1630/lcd-as-hdd1630.S
+++ b/firmware/target/arm/philips/hdd1630/lcd-as-hdd1630.S
@@ -248,7 +248,7 @@ lcd_write_yuv420_lines:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -565,6 +565,6 @@ lcd_write_yuv420_lines_odither:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/samsung/yh820/lcd-as-yh820.S b/firmware/target/arm/samsung/yh820/lcd-as-yh820.S
index 581a4f3..d8dfccc 100644
--- a/firmware/target/arm/samsung/yh820/lcd-as-yh820.S
+++ b/firmware/target/arm/samsung/yh820/lcd-as-yh820.S
@@ -238,7 +238,7 @@ lcd_write_yuv420_lines:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r10, pc }     @ restore registers and return
+    ldmpc       regs=r4-r10             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -545,6 +545,6 @@ lcd_write_yuv420_lines_odither:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/samsung/yh920/lcd-as-yh920.S b/firmware/target/arm/samsung/yh920/lcd-as-yh920.S
index 2c7f511..130addf 100644
--- a/firmware/target/arm/samsung/yh920/lcd-as-yh920.S
+++ b/firmware/target/arm/samsung/yh920/lcd-as-yh920.S
@@ -113,6 +113,6 @@ lcd_grey_data:
     subs    r2, r2, #1
     bne     .greyloop
 
-    ldmfd   sp!, {r4-r5, pc}
+    ldmpc   regs=r4-r5
     .size   lcd_grey_data,.-lcd_grey_data
 
diff --git a/firmware/target/arm/samsung/yh925/lcd-as-yh925.S b/firmware/target/arm/samsung/yh925/lcd-as-yh925.S
index b3f12e4..8e851d8 100644
--- a/firmware/target/arm/samsung/yh925/lcd-as-yh925.S
+++ b/firmware/target/arm/samsung/yh925/lcd-as-yh925.S
@@ -232,7 +232,7 @@ lcd_write_yuv420_lines:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -533,6 +533,6 @@ lcd_write_yuv420_lines_odither:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/sandisk/sansa-c200/lcd-as-c200.S b/firmware/target/arm/sandisk/sansa-c200/lcd-as-c200.S
index 581a4f3..d8dfccc 100644
--- a/firmware/target/arm/sandisk/sansa-c200/lcd-as-c200.S
+++ b/firmware/target/arm/sandisk/sansa-c200/lcd-as-c200.S
@@ -238,7 +238,7 @@ lcd_write_yuv420_lines:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r10, pc }     @ restore registers and return
+    ldmpc       regs=r4-r10             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines, .-lcd_write_yuv420_lines
 
@@ -545,6 +545,6 @@ lcd_write_yuv420_lines_odither:
     subs        r1, r1, #2              @ subtract block from width
     bgt         10b @ loop line         @
                                         @
-    ldmfd       sp!, { r4-r11, pc }     @ restore registers and return
+    ldmpc       regs=r4-r11             @ restore registers and return
     .ltorg                              @ dump constant pool
     .size   lcd_write_yuv420_lines_odither, .-lcd_write_yuv420_lines_odither
diff --git a/firmware/target/arm/thread-arm.c b/firmware/target/arm/thread-arm.c
index c2d91ce..9ea3d0b 100644
--- a/firmware/target/arm/thread-arm.c
+++ b/firmware/target/arm/thread-arm.c
@@ -77,7 +77,15 @@ static inline void load_context(const void* addr)
     asm volatile(
         "ldr     r0, [%0, #40]          \n" /* Load start pointer */
         "cmp     r0, #0                 \n" /* Check for NULL */
-        "ldmneia %0, { r0, pc }         \n" /* If not already running, jump to start */ 
+
+        /* If not already running, jump to start */
+#if ARM_ARCH == 4 && defined(USE_THUMB)
+        "ldmneia %0, { r0, r12 }        \n"
+        "bxne    r12                    \n"
+#else
+        "ldmneia %0, { r0, pc }         \n"
+#endif
+
         "ldmia   %0, { r4-r11, sp, lr } \n" /* Load regs r4 to r14 from context */
         : : "r" (addr) : "r0" /* only! */
     );
