about summary refs log tree commit
diff options
context:
space:
mode:
authorEric Wong <normalperson@yhbt.net>2006-09-01 02:49:49 -0700
committerEric Wong <normalperson@yhbt.net>2006-09-09 18:57:40 -0700
commit3df8e38af6da6c0699da8ef040ffc80cf7dab810 (patch)
treef931e7aa499850db2992262f38aa3c1ccbd9d332
parentf345a3435dde249874fde205ef4b793184e1e7c4 (diff)
downloadflac-arm-1.1.3-3df8e38af6da6c0699da8ef040ffc80cf7dab810.tar.gz
Add ARM assembly optimizations
This was my first time writing ARM assembly, so it may not be the
best; but they're still significantly faster than the optimized
C versions[1] of the functions they replace.

A knowledgeable ARM coder should be able to make better
optimizations, but nobody has done so for FLAC yet (at least not
publicly, to my knowledge), so I decided to take a stab at it.

Also tweaked configure.in a bit to better support
cross-compilation.  We no longer enable 3dnow, sse, or altivec
optimizations unless our CPU supports them.  (note: sse and
3dnow are supported by 64-bit chips, but the code is currently
only optimized for 32-bit sse and 3dnow).

FLAC__fixed_restore_signal_asm_arm and
FLAC__lpc_restore_signal_asm_arm are pretty well-tested from
tracks in my music collection.  I do not think I have any music
that can exercise FLAC__lpc_restore_signal_asm_arm_wide
(especially the 64-bit ASR macro).

These optimization changes result in an approximately 20%
reduction in decoding time on my 3rd generation ipod (running
ipodlinux).

[1] - I still have them at hand, but they're probably not
worth it for the non-ASM optimized architectures.
-rw-r--r--configure.in19
-rw-r--r--src/libFLAC/Makefile.am4
-rw-r--r--src/libFLAC/arm/fixed_asm.s236
-rw-r--r--src/libFLAC/arm/lpc_asm.s678
-rw-r--r--src/libFLAC/cpu.c2
-rw-r--r--src/libFLAC/include/private/cpu.h5
-rw-r--r--src/libFLAC/include/private/fixed.h4
-rw-r--r--src/libFLAC/include/private/lpc.h3
-rw-r--r--src/libFLAC/stream_decoder.c14
9 files changed, 959 insertions, 6 deletions
diff --git a/configure.in b/configure.in
index 7dce4e77..bf48c4d7 100644
--- a/configure.in
+++ b/configure.in
@@ -61,6 +61,11 @@ case "$host_cpu" in
                 AC_DEFINE(FLAC__CPU_PPC)
                 AH_TEMPLATE(FLAC__CPU_PPC, [define if building for PowerPC])
                 ;;
+        arm)
+                cpu_arm=true
+                AC_DEFINE(FLAC__CPU_ARM)
+                AH_TEMPLATE(FLAC__CPU_ARM, [define if building for ARM])
+                ;;
         sparc)
                 cpu_sparc=true
                 AC_DEFINE(FLAC__CPU_SPARC)
@@ -69,6 +74,7 @@ case "$host_cpu" in
 esac
 AM_CONDITIONAL(FLaC__CPU_IA32, test "x$cpu_ia32" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_PPC, test "x$cpu_ppc" = xtrue)
+AM_CONDITIONAL(FLaC__CPU_ARM, test "x$cpu_arm" = xtrue)
 AM_CONDITIONAL(FLaC__CPU_SPARC, test "x$cpu_sparc" = xtrue)
 case "$host" in
         i386-*-openbsd3.[[0-3]]) OBJ_FORMAT=aoutb ;;
@@ -119,8 +125,8 @@ AC_HELP_STRING([--enable-sse], [Enable SSE support by asserting that the OS supp
         no)  sse_os=false ;;
         *) AC_MSG_ERROR(bad value ${enableval} for --enable-sse) ;;
 esac],[sse_os=false])
-AM_CONDITIONAL(FLaC__SSE_OS, test "x$sse_os" = xtrue)
-if test "x$sse_os" = xtrue ; then
+AM_CONDITIONAL(FLaC__SSE_OS, test "x$sse_os" = xtrue && test "x$cpu_ia32" = xtrue)
+if test "x$sse_os" = xtrue && test "x$cpu_ia32" = xtrue; then
 AC_DEFINE(FLAC__SSE_OS)
 AH_TEMPLATE(FLAC__SSE_OS, [define if your operating system supports SSE instructions])
 fi
@@ -132,8 +138,8 @@ AC_HELP_STRING([--disable-3dnow], [Disable 3DNOW! optimizations]),
         no)  use_3dnow=false ;;
         *) AC_MSG_ERROR(bad value ${enableval} for --enable-3dnow) ;;
 esac],[use_3dnow=true])
-AM_CONDITIONAL(FLaC__USE_3DNOW, test "x$use_3dnow" = xtrue)
-if test "x$use_3dnow" = xtrue ; then
+AM_CONDITIONAL(FLaC__USE_3DNOW, test "x$use_3dnow" = xtrue && test "x$cpu_ia32" = xtrue)
+if test "x$use_3dnow" = xtrue && test "x$cpu_ia32" = xtrue; then
 AC_DEFINE(FLAC__USE_3DNOW)
 AH_TEMPLATE(FLAC__USE_3DNOW, [define to enable use of 3Dnow! instructions])
 fi
@@ -145,8 +151,8 @@ AC_HELP_STRING([--disable-altivec], [Disable Altivec optimizations]),
         no)  use_altivec=false ;;
         *) AC_MSG_ERROR(bad value ${enableval} for --enable-altivec) ;;
 esac],[use_altivec=true])
-AM_CONDITIONAL(FLaC__USE_ALTIVEC, test "x$use_altivec" = xtrue)
-if test "x$use_altivec" = xtrue ; then
+AM_CONDITIONAL(FLaC__USE_ALTIVEC, test "x$use_altivec" = xtrue && test "x$cpu_ppc" = xtrue)
+if test "x$use_altivec" = xtrue && test "x$cpu_ppc" = xtrue; then
 AC_DEFINE(FLAC__USE_ALTIVEC)
 AH_TEMPLATE(FLAC__USE_ALTIVEC, [define to enable use of Altivec instructions])
 fi
@@ -281,6 +287,7 @@ AC_CONFIG_FILES([ \
         Makefile \
         src/Makefile \
         src/libFLAC/Makefile \
+        src/libFLAC/arm/Makefile \
         src/libFLAC/ia32/Makefile \
         src/libFLAC/ppc/Makefile \
         src/libFLAC/ppc/as/Makefile \
diff --git a/src/libFLAC/Makefile.am b/src/libFLAC/Makefile.am
index 395308fb..67154470 100644
--- a/src/libFLAC/Makefile.am
+++ b/src/libFLAC/Makefile.am
@@ -49,6 +49,10 @@ AM_CFLAGS = $(DEBUGCFLAGS) $(CPUCFLAGS)
 
 if FLaC__NO_ASM
 else
+if FLaC__CPU_ARM
+ARCH_SUBDIRS = arm
+libFLAC_la_LIBADD = arm/libFLAC-asm.la
+endif
 if FLaC__CPU_IA32
 if FLaC__HAS_NASM
 ARCH_SUBDIRS = ia32
diff --git a/src/libFLAC/arm/fixed_asm.s b/src/libFLAC/arm/fixed_asm.s
new file mode 100644
index 00000000..bffb2a99
--- /dev/null
+++ b/src/libFLAC/arm/fixed_asm.s
@@ -0,0 +1,236 @@
+@  libFLAC - Free Lossless Audio Codec library
+@  Copyright (C) 2001,2002,2003,2004,2005,2006  Josh Coalson
+@
+@  Redistribution and use in source and binary forms, with or without
+@  modification, are permitted provided that the following conditions
+@  are met:
+@
+@  - Redistributions of source code must retain the above copyright
+@  notice, this list of conditions and the following disclaimer.
+@
+@  - Redistributions in binary form must reproduce the above copyright
+@  notice, this list of conditions and the following disclaimer in the
+@  documentation and/or other materials provided with the distribution.
+@
+@  - Neither the name of the Xiph.org Foundation nor the names of its
+@  contributors may be used to endorse or promote products derived from
+@  this software without specific prior written permission.
+@
+@  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+@  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+@  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+@  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+@  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+@  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES@ LOSS OF USE, DATA, OR
+@  PROFITS@ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+@  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+@  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+@  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+        .text
+        .align        2
+        .global        FLAC__fixed_restore_signal_asm_arm
+        .type        FLAC__fixed_restore_signal_asm_arm, %function
+FLAC__fixed_restore_signal_asm_arm:
+        stmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+        @        r0 = residual;
+        @        r1 = data_len;
+        @        r2 = order;
+        @        r3 = data;
+
+        cmp        r2, #4
+        ldrls        r15, [r15, r2, asl #2]
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+
+        .align        2
+        .word        .Lorder0
+        .word        .Lorder1
+        .word        .Lorder2
+        .word        .Lorder3
+        .word        .Lorder4
+
+        @ data[i] = residual[i];
+.Lorder0:
+        tst        r1, #15
+        beq        .Lorder0b
+.Lorder0a:
+        @ start off slow until we get to (data_len % 16) == 0
+        ldr        r2, [r0], #4
+        str        r2, [r3], #4
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        tst        r1, #15
+        bne        .Lorder0a
+.Lorder0b:
+        @ WHEEEEEEEEEEEEEE!!!!!!!!!!!!!!!!!!!
+        ldmia        r0!, {r5 - r12}
+        stmia        r3!, {r5 - r12}
+        ldmia        r0!, {r5 - r12}
+        stmia        r3!, {r5 - r12}
+        subs        r1, r1, #16
+        bne        .Lorder0b
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+
+        @ data[i] = residual[i] + data[i-1];
+.Lorder1:
+        ldr        r14, [r3,#-4]
+        tst        r1, #7
+        beq        .Lorder1b
+.Lorder1a:
+        ldr        r2, [r0], #4
+        add        r14, r2, r14
+        str        r14, [r3], #4
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        tst        r1, #7
+        bne        .Lorder1a
+.Lorder1b:
+        ldmia        r0!, {r2, r4 - r10}
+        add        r2, r2, r14
+        add        r4, r4, r2
+        add        r5, r5, r4
+        add        r6, r6, r5
+        add        r7, r7, r6
+        add        r8, r8, r7
+        add        r9, r9, r8
+        add        r14, r10, r9
+        stmia        r3!, {r2, r4 - r9, r14}
+        subs        r1, r1, #8
+        bne        .Lorder1b
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+
+        @ data[i] = residual[i] + (data[i-1]<<1) - data[i-2];
+.Lorder2:
+        @ r12 = data[i-2], r14 = data[i-1]
+        ldmdb        r3, {r12, r14}
+        tst        r1, #7
+        beq        .Lorder2b
+.Lorder2a:
+        ldr        r2, [r0], #4
+        add        r2, r2, r14, asl #1
+        sub        r2, r2, r12
+        str        r2, [r3], #4
+        mov        r12, r14
+        mov        r14, r2
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        tst        r1, #7
+        bne        .Lorder2a
+.Lorder2b:
+        ldmia        r0!, {r2, r4 - r10}
+
+        @ r12 = data[i-2], r14 = data[i-1]
+        add        r2, r2, r14, asl #1
+        sub        r2, r2, r12
+
+        add        r4, r4, r2, asl #1
+        sub        r4, r4, r14
+
+        add        r5, r5, r4, asl #1
+        sub        r5, r5, r2
+
+        add        r6, r6, r5, asl #1
+        sub        r6, r6, r4
+
+        add        r7, r7, r6, asl #1
+        sub        r7, r7, r5
+
+        add        r8, r8, r7, asl #1
+        sub        r8, r8, r6
+
+        add        r9, r9, r8, asl #1
+        sub        r12, r9, r7
+
+        add        r10, r10, r12, asl #1
+        sub        r14, r10, r8
+
+        stmia        r3!, {r2, r4 - r8, r12, r14}
+        subs        r1, r1, #8
+        bne        .Lorder2b
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+
+        @ data[i] = residual[i] + (((data[i-1]-data[i-2])<<1)
+        @        + (data[i-1]-data[i-2])) + data[i-3];
+        .macro        do_order_3, dest, res_i, b3, b2, b1, tmp
+        add        \res_i, \res_i, \b3
+        sub        \tmp, \b1, \b2
+        add        \tmp, \tmp, \tmp, asl #1
+        add        \dest, \res_i, \tmp
+        .endm
+.Lorder3:
+        ldmdb                r3, { r11, r12, r14 }
+        tst                r1, #7
+        beq                .Lorder3b
+.Lorder3a:
+        ldr                r2, [r0], #4
+
+        do_order_3        r2, r2, r11, r12, r14, r11
+        str                r2, [r3], #4
+
+        ldmdb                r3, { r11, r12, r14 }
+
+        subs                r1, r1, #1
+        ldmeqfd                r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        tst                r1, #7
+        bne                .Lorder3a
+.Lorder3b:
+        ldmia                r0!, {r2, r4 - r10}
+
+        do_order_3        r2, r2, r11, r12, r14, r11
+        do_order_3        r4, r4, r12, r14, r2, r12
+        do_order_3        r5, r5, r14, r2, r4, r14
+        do_order_3        r6, r6, r2, r4, r5, r11
+
+        do_order_3        r7, r7, r4, r5, r6, r12
+        do_order_3        r11, r8, r5, r6, r7, r11
+        do_order_3        r12, r9, r6, r7, r11, r12
+        do_order_3        r14, r10, r7, r11, r12, r14
+
+        stmia                r3!, {r2, r4 - r7, r11, r12, r14}
+        subs                r1, r1, #8
+        bne                .Lorder3b
+        ldmfd                r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+
+        @ data[i] = residual[i] + ((data[i-1]+data[i-3])<<2)
+        @                - ((data[i-2]<<2) + (data[i-2]<<1)) - data[i-4];
+        .macro        do_order_4, dest, res_i, b4, b3, b2, b1, tmp
+        sub        \res_i, \res_i, \b4
+        add        \tmp, \b1, \b3
+        add        \res_i, \res_i, \tmp, asl #2
+        sub        \res_i, \res_i, \b2, asl #2
+        sub        \dest, \res_i, \b2, asl #1
+        .endm
+.Lorder4:
+        ldmdb                r3, {r7, r11, r12, r14}
+        tst                r1, #7
+        beq                .Lorder4b
+.Lorder4a:
+        ldr                r2, [r0], #4
+        do_order_4        r2, r2, r7, r11, r12, r14, r7
+        str                r2, [r3], #4
+        ldmdb                r3, {r7, r11, r12, r14}
+        subs                r1, r1, #1
+        ldmeqfd                r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        tst                r1, #7
+        bne                .Lorder4a
+.Lorder4b:
+        ldr                r2, [r0], #4
+        do_order_4        r2, r2, r7, r11, r12, r14, r10
+
+        ldmia                r0!, {r4 - r10}
+
+        do_order_4        r4, r4, r11, r12, r14, r2, r11
+        do_order_4        r5, r5, r12, r14, r2, r4, r12
+        do_order_4        r6, r6, r14, r2, r4, r5, r14
+
+        do_order_4        r7, r7, r2, r4, r5, r6, r11
+        do_order_4        r11, r8, r4, r5, r6, r7, r11
+        do_order_4        r12, r9, r5, r6, r7, r11, r12
+        do_order_4        r14, r10, r6, r7, r11, r12, r14
+
+        stmia                r3!, {r2, r4, r5, r6, r7, r11, r12, r14}
+        subs                r1, r1, #8
+        bne                .Lorder4b
+        ldmfd                r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        .size        FLAC__fixed_restore_signal_asm_arm, .-FLAC__fixed_restore_signal_asm_arm
diff --git a/src/libFLAC/arm/lpc_asm.s b/src/libFLAC/arm/lpc_asm.s
new file mode 100644
index 00000000..3629a7b3
--- /dev/null
+++ b/src/libFLAC/arm/lpc_asm.s
@@ -0,0 +1,678 @@
+@  libFLAC - Free Lossless Audio Codec library
+@  Copyright (C) 2001,2002,2003,2004,2005,2006  Josh Coalson
+@
+@  Redistribution and use in source and binary forms, with or without
+@  modification, are permitted provided that the following conditions
+@  are met:
+@
+@  - Redistributions of source code must retain the above copyright
+@  notice, this list of conditions and the following disclaimer.
+@
+@  - Redistributions in binary form must reproduce the above copyright
+@  notice, this list of conditions and the following disclaimer in the
+@  documentation and/or other materials provided with the distribution.
+@
+@  - Neither the name of the Xiph.org Foundation nor the names of its
+@  contributors may be used to endorse or promote products derived from
+@  this software without specific prior written permission.
+@
+@  THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+@  ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+@  LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+@  A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR
+@  CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+@  EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+@  PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES@ LOSS OF USE, DATA, OR
+@  PROFITS@ OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
+@  LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
+@  NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+@  SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+@ TODO: special cases for order 9, 10, 11, 12 may be further optimizable...
+
+        .text
+        .align 2
+        .global        FLAC__lpc_restore_signal_asm_arm
+        .type        FLAC__lpc_restore_signal_asm_arm, %function
+FLAC__lpc_restore_signal_asm_arm:
+        stmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}
+        @        r0 = residual;
+        @        r1 = data_len;
+        @        r2 = qlp_coeff;
+        @        r3 = order;
+        @        r5 = lp_quantization
+        @        r14 = &data
+        ldr        r5, [r13, #40]                @ lp_quantization
+        ldr        r14, [r13, #44]                @ &data
+
+
+        @ Special case each of the common LPC order levels used by encoders
+        @ switch (order) { ...
+        sub        r6, r3, #1
+        cmp        r6, #11
+        ldrls        r15, [r15, r6, asl #2]
+        b        .Lgeneric_restore_signal
+
+        .align  2
+        .word        .Lorder1
+        .word        .Lorder2
+        .word        .Lorder3
+        .word        .Lorder4
+        .word        .Lorder5
+        .word        .Lorder6
+        .word        .Lorder7
+        .word        .Lorder8
+        .word        .Lorder9
+        .word        .Lorder10
+        .word        .Lorder11
+        .word        .Lorder12
+.Lorder12:
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[0 - 3]
+        sub        r14, r14, #16                @ &data[-4]
+.Lorder12a:
+        ldmia        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+        mul        r12, r8, r7
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmdb        r14!, {r3, r4, r6, r7}        @ r14 = &data[-4] => &data[-8]
+        mla        r12, r8, r7, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2, {r8 - r11}                @ qlp_coeff[8 - 11]
+        ldmdb        r14, {r3, r4, r6, r7}
+        mla        r12, r8, r7, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #32]
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        add        r14, r14, #4                @ data++, r14 = &data[-4]
+
+        ldmdb        r14, {r3, r4, r6, r7}        @ data[-12, -11, -10, -9], &data[-8]
+        mul        r12, r11, r3
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmia        r14!, {r3, r4, r6, r7}        @ &data[-4]
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2, {r8 - r11}                @ qlp_coeff[0 - 3]
+        ldmia        r14, {r3, r4, r6, r7}
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        add        r14, r14, #4                @ r14 = &data[-4]
+        bne        .Lorder12a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+.Lorder11:
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[0 - 3]
+        sub        r14, r14, #16                @ &data[-4]
+.Lorder11a:
+        ldmia        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+        mul        r12, r8, r7
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmdb        r14!, {r3, r4, r6, r7}        @ r14 = &data[-4] => &data[-8]
+        mla        r12, r8, r7, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2, {r8 - r10}                @ qlp_coeff[8 - 11]
+        ldmda        r14, {r3, r4, r6, r7}        @ we'll reuse r4, r6, and r7
+        mla        r12, r8, r6, r12
+        mla        r12, r9, r4, r12
+        mla        r12, r10, r3, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #32]
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        add        r14, r14, #4                @ data++, r14 = &data[-4]
+
+        mul        r12, r10, r4
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmia        r14!, {r3, r4, r6, r7}        @ &data[-4]
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2, {r8 - r11}                @ qlp_coeff[0 - 3]
+        ldmia        r14, {r3, r4, r6, r7}
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        add        r14, r14, #4                @ r14 = &data[-4]
+        bne        .Lorder11a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+.Lorder10:
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[0 - 3]
+        sub        r14, r14, #16                @ &data[-4]
+.Lorder10a:
+        ldmia        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+        mul        r12, r8, r7
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmdb        r14!, {r3, r4, r6, r7}        @ r14 = &data[-4] => &data[-8]
+        mla        r12, r8, r7, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2, {r8, r9}
+        ldmda        r14, {r4, r6, r7}        @ we'll reuse r4, r6, and r7
+        mla        r12, r8, r6, r12
+        mla        r12, r9, r4, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #32]
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        add        r14, r14, #4                @ data++, r14 = &data[-4]
+
+        mul        r12, r9, r6
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmia        r14!, {r3, r4, r6, r7}        @ &data[-4]
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2, {r8 - r11}                @ qlp_coeff[0 - 3]
+        ldmia        r14, {r3, r4, r6, r7}
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        add        r14, r14, #4                @ r14 = &data[-4]
+        bne        .Lorder10a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+.Lorder9:
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[0 - 3]
+        sub        r14, r14, #16                @ &data[-4]
+.Lorder9a:
+        ldmia        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+        mul        r12, r8, r7
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmdb        r14!, {r3, r4, r6, r7}        @ r14 = &data[-4] => &data[-8]
+        mla        r12, r8, r7, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldr        r8, [r2]
+        ldmda        r14, {r6, r7}        @ we'll reuse r7
+        mla        r12, r8, r6, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #32]
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        add        r14, r14, #4                @ data++, r14 = &data[-4]
+
+        mul        r12, r8, r7
+
+        ldmdb        r2!, {r8 - r11}                @ qlp_coeff[4 - 7]
+        ldmia        r14!, {r3, r4, r6, r7}        @ &data[-4]
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2, {r8 - r11}                @ qlp_coeff[0 - 3]
+        ldmia        r14, {r3, r4, r6, r7}
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        add        r14, r14, #4                @ r14 = &data[-4]
+        bne        .Lorder9a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+.Lorder8:
+        ldmia        r2!, {r8 - r11}                @ qlp_coeff[0 - 3]
+        sub        r14, r14, #16
+.Lorder8a:
+        ldmia        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+        mul        r12, r8, r7
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2, {r8 - r11}                @ qlp_coeff [4 - 7]
+        ldmdb        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+        mla        r12, r8, r7, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        add        r14, r14, #4                @ data++, r14 = &data[-4]
+
+        ldmdb        r14, {r3, r4, r6, r7}        @ data[-8, -7, -6, -5]
+        mul        r12, r11, r3                @ q[7] * d[-8]
+        mla        r12, r10, r4, r12        @ q[6] * d[-7]
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2, {r8 - r11}                @ qlp_coeff[0 - 3]
+        ldmia        r14, {r3, r4, r6, r7}        @ r14 = &data[-4]
+
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        add        r14, r14, #4                @ r14 = &data[-4]
+        bne        .Lorder8a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+.Lorder7:
+        ldmia        r2!, {r8 - r11}
+        sub        r14, r14, #16
+.Lorder7a:
+        ldmia        r14, {r3, r4, r6, r7}
+        mul        r12, r8, r7
+        mla        r12, r9, r6, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r11, r3, r12
+
+        ldmia        r2, {r8 - r10}
+        ldmda        r14, {r3, r4, r6, r7} @ we reuse r4, r6, r7 below, too:
+        mla        r12, r8, r6, r12
+        mla        r12, r9, r4, r12
+        mla        r12, r10, r3, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+
+        subs        r1, r1, #1
+        ldmeqfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+        add        r14, r14, #4
+
+        mul        r12, r10, r4
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldmdb        r2, {r8 - r11}
+        ldmia        r14, {r3, r4, r6, r7}
+
+        mla        r12, r11, r3, r12
+        mla        r12, r10, r4, r12
+        mla        r12, r9, r6, r12
+        mla        r12, r8, r7, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #16]
+        subs        r1, r1, #1
+        add        r14, r14, #4
+
+        bne        .Lorder7a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}
+.Lorder6:
+        ldmia        r2, {r6 - r11}                @ qlp_coeff[0 - 5]
+        sub        r14, r14, #12                @ data[-3]
+.Lorder6a:
+        ldmia        r14, {r2 - r4}
+        mul        r12, r6, r4
+        mla        r12, r7, r3, r12
+        mla        r12, r8, r2, r12
+
+        ldmdb        r14, {r2 - r4}
+        mla        r12, r9, r4, r12
+        mla        r12, r10, r3, r12
+        mla        r12, r11, r2, r12
+
+        ldr        r3, [r0], #4
+        add        r3, r3, r12, asr r5
+        str        r3, [r14, #12]
+        add        r14, r14, #4
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lorder6a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return (pops saved lr into pc)
+.Lorder5:        @ order == 5; register roles set in the prologue above this view: r0=residual, r1=data_len, r2=qlp_coeff, r5=lp_quantization (presumably), r14=&data[0] -- confirm against prologue
+        ldmia        r2, {r6 - r10}                @ qlp_coeff[0 - 4]
+        ldr        r12, [r14, #-4]!        @ r12 = data[-1]; r14 = &data[-1]
+.Lorder5a:
+        ldmdb        r14, {r2, r3, r4, r11}        @ data[-5], data[-4], data[-3], data[-2]
+        mul        r12, r6, r12                @ sum = qlp_coeff[0] * data[-1]
+        mla        r12, r7, r11, r12        @ sum += qlp_coeff[1] * data[-2]
+        mla        r12, r8, r4, r12        @ sum += qlp_coeff[2] * data[-3]
+        mla        r12, r9, r3, r12        @ sum += qlp_coeff[3] * data[-4]
+        mla        r11, r10, r2, r12        @ r11 = sum + qlp_coeff[4] * data[-5]
+
+        ldr        r3, [r0], #4                @ r3 = *residual++
+        add        r12, r3, r11, asr r5        @ sample = residual + (sum >> lp_quantization)
+        str        r12, [r14, #4]!                @ data[0] = sample; r12 doubles as next iteration's data[-1]
+
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lorder5a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+.Lorder4:        @ order == 4
+        ldmia        r2, {r6 - r9}                @ qlp_coeff[0 - 3]
+.Lorder4a:
+        ldmdb        r14, {r2 - r4, r11}        @ data[-4], data[-3], data[-2], data[-1]
+        mul        r12, r6, r11                @ sum = qlp_coeff[0] * data[-1]
+        mla        r12, r7, r4, r12        @ sum += qlp_coeff[1] * data[-2]
+        mla        r12, r8, r3, r12        @ sum += qlp_coeff[2] * data[-3]
+        mla        r12, r9, r2, r12        @ sum += qlp_coeff[3] * data[-4]
+
+        ldr        r3, [r0], #4                @ r3 = *residual++
+        add        r3, r3, r12, asr r5        @ sample = residual + (sum >> lp_quantization)
+        str        r3, [r14], #4                @ *data++ = sample
+
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lorder4a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+.Lorder3:        @ order == 3
+        ldmia        r2, {r6 - r8}                @ qlp_coeff[0 - 2]
+.Lorder3a:
+        ldmdb        r14, {r2 - r4}                @ data[-3], data[-2], data[-1]
+        mul        r12, r6, r4                @ sum = qlp_coeff[0] * data[-1]
+        mla        r12, r7, r3, r12        @ sum += qlp_coeff[1] * data[-2]
+        mla        r12, r8, r2, r12        @ sum += qlp_coeff[2] * data[-3]
+
+        ldr        r3, [r0], #4                @ r3 = *residual++
+        add        r3, r3, r12, asr r5        @ sample = residual + (sum >> lp_quantization)
+        str        r3, [r14], #4                @ *data++ = sample
+
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lorder3a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+.Lorder2:        @ order == 2; history is kept live in r2/r3 across iterations
+        ldmia        r2, {r6, r7}                @ qlp_coeff[0, 1]
+        ldmdb        r14, {r2, r3}                @ r2 = data[-2], r3 = data[-1]
+.Lorder2a:
+        mul        r12, r6, r3                @ sum = qlp_coeff[0] * data[-1]
+        mla        r12, r7, r2, r12        @ sum += qlp_coeff[1] * data[-2]
+
+        ldr        r3, [r0], #4                @ r3 = *residual++
+        add        r3, r3, r12, asr r5        @ sample = residual + (sum >> lp_quantization)
+        ldr        r2, [r14, #-4]                @ reload data[-1]: it becomes next iteration's data[-2]
+        str        r3, [r14], #4                @ *data++ = sample; r3 stays live as next data[-1]
+
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lorder2a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+.Lorder1:        @ order == 1; r3 carries data[-1] across iterations
+        ldr        r6, [r2]                @ qlp_coeff[0]
+        ldr        r3, [r14, #-4]                @ data[-1]
+.Lorder1a:
+        mul        r12, r6, r3                @ sum = qlp_coeff[0] * data[-1]
+
+        ldr        r3, [r0], #4                @ r3 = *residual++
+        add        r3, r3, r12, asr r5        @ sample = residual + (sum >> lp_quantization); also next data[-1]
+        str        r3, [r14], #4                @ *data++ = sample
+
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lorder1a
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+
+@ this part started out as a Duff's Device in C, but now it's
+@ optimized to take advantage of the ldm instructions:
+.Lgeneric_restore_signal:        @ any order; r3 = order
+        add        r2, r2, r3, asl #2        @ qlp0 = &qlp_coeff[order]
+        add        r7, r3, #7                @ order + 7
+        mov        r8, r7, lsr #3                @ n = (order + 7) / (2^3) tap groups of up to 8
+.Lduffs_device_outer:        @ one iteration per output sample
+        sub        r14, r14, r3, asl #2        @ r14 = history = &data[-order]
+
+        mov        r7, r8                        @ r7 = n (tap-group countdown for this sample)
+        mov        r9, r2                        @ qlp = qlp0
+        mov        r12, #0                        @ sum = 0
+
+        @ switch (order % 8)
+        and        r4, r3, #7
+        cmp        r4, #7                        @ r4 <= 7 always, so the ldrls below always branches
+        ldrls        r15, [r15, r4, asl #2]        @ jump to table[order%8]; pc reads as this insn + 8 = table base
+        b        .Lduffs_end                @ not reached; fills the word between the ldr and its table
+
+        .align 2
+        .word .Lduffs_case0                @ order%8 == 0: full 8-tap groups only
+        .word .Lduffs_case1
+        .word .Lduffs_case2
+        .word .Lduffs_case3
+        .word .Lduffs_case4
+        .word .Lduffs_case5
+        .word .Lduffs_case6
+        .word .Lduffs_case7
+
+.Lduffs_case7:        @ odd remainders fall through 7 -> 5 -> 3 -> 1
+        ldmia        r14!, {r4, r6}                @ two history samples (ascending)
+        ldmdb        r9!, {r10, r11}                @ two coefficients (qlp walks downward)
+        mla        r12, r4, r11, r12        @ sum += older sample * higher-index coeff
+        mla        r12, r6, r10, r12        @ sum += newer sample * lower-index coeff
+.Lduffs_case5:
+        ldmia        r14!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        mla        r12, r4, r11, r12
+        mla        r12, r6, r10, r12
+.Lduffs_case3:
+        ldmia        r14!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        mla        r12, r4, r11, r12
+        mla        r12, r6, r10, r12
+.Lduffs_case1:        @ one tap left over
+        ldr        r10, [r9, #-4]!                @ *--qlp
+        ldr        r11, [r14], #4                @ *history++
+        mla        r12, r10, r11, r12        @ sum += last tap
+
+        subs        r7, r7, #1                @ --n
+        bne        .Lduffs_case0                @ full 8-tap groups remain
+
+.Lduffs_end:
+        ldr        r10, [r0], #4                @ r10 = *residual++
+        add        r11, r10, r12, asr r5        @ sample = residual + (sum >> lp_quantization)
+        str        r11, [r14], #4                @ *data++ = sample (r14 is back at &data[0])
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lduffs_device_outer
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+
+.Lduffs_case0:        @ even remainders (0, 6, 4, 2); also the body of the full 8-tap loop
+        ldmia        r14!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        mla        r12, r4, r11, r12
+        mla        r12, r6, r10, r12
+.Lduffs_case6:
+        ldmia        r14!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        mla        r12, r4, r11, r12
+        mla        r12, r6, r10, r12
+.Lduffs_case4:
+        ldmia        r14!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        mla        r12, r4, r11, r12
+        mla        r12, r6, r10, r12
+.Lduffs_case2:
+        ldmia        r14!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        mla        r12, r4, r11, r12
+        mla        r12, r6, r10, r12
+
+        subs        r7, r7, #1                @ --n
+        bne        .Lduffs_case0
+
+        ldr        r10, [r0], #4                @ r10 = *residual++
+        add        r11, r10, r12, asr r5        @ sample = residual + (sum >> lp_quantization)
+        str        r11, [r14], #4                @ *data++ = sample
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lduffs_device_outer
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+
+        .size        FLAC__lpc_restore_signal_asm_arm, .-FLAC__lpc_restore_signal_asm_arm
+        .align 2
+        .global        FLAC__lpc_restore_signal_asm_arm_wide
+        .type        FLAC__lpc_restore_signal_asm_arm_wide, %function
+FLAC__lpc_restore_signal_asm_arm_wide:        @ 64-bit-accumulator (smlal) variant of the generic Duff's-device loop above
+        stmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r14}        @ save 10 registers (40 bytes): stack args now start at [r13, #40]
+        @ r0 = residual;
+        @ r1 = data_len;
+        @ r2 = qlp_coeff;
+        @ r3 = order;
+
+        .macro finish_loop        @ shift the 64-bit sum down, add the residual, store, advance the outer loop
+        @ 64-bit arithmetic shift right: r7 = (r12:r7) >> r2
+        mov        r7, r7, lsr r2                @ shift lo register r2 bits right
+        rsb        r10, r2, #32                @ r10 = 32 - shift
+        mov        r11, r12, lsl r10        @ shift hi register (32 - r2) bits left
+        orr        r7, r7, r11                @ combine new-hi and new-lo in one word
+        ands        r6, r12, #0x80000000        @ mask sign bit from original hi word
+        orrne        r7, r7, r6                @ restore sign bit (NOTE(review): assumes 0 < shift < 32 and that the shifted sum fits in 32 bits -- confirm; commit log says this path is untested)
+
+        ldr        r10, [r0], #4                @ residual
+        add        r12, r10, r7                @ sample = residual + (sum >> lp_quantization)
+        str        r12, [r8], #4                @ *data++ = sample
+        subs        r1, r1, #1                @ --data_len
+        bne        .Lwide_duffs_device_outer
+        .endm
+
+        add        r5, r2, r3, asl #2        @ qlp0 = &qlp_coeff[order]
+
+        ldr        r2, [r13, #40]                @ lp_quantization (5th arg, on the stack)
+        ldr        r8, [r13, #44]                @ data (6th arg, on the stack)
+.Lwide_duffs_device_outer:        @ one iteration per output sample
+        sub        r8, r8, r3, asl #2        @ r8 = data = &data[-order]
+        add        r14, r3, #7                @ order + 7
+        mov        r14, r14, lsr #3        @ n = (order + 7) / (2^3) tap groups of up to 8
+
+        mov        r9, r5                        @ qlp = qlp0
+        mov        r7, #0                        @ sum lo = 0
+        mov        r12, #0                        @ sum hi = 0
+
+        @ switch (order % 8)
+        and        r4, r3, #7
+        cmp        r4, #7                        @ r4 <= 7 always, so the ldrls below always branches
+        ldrls        r15, [r15, r4, asl #2]        @ jump to table[order%8]; pc reads as this insn + 8 = table base
+        b        .Lwide_duffs_end        @ not reached; fills the word between the ldr and its table
+
+        .align 2
+        .word .Lwide_duffs_case0        @ order%8 == 0: full 8-tap groups only
+        .word .Lwide_duffs_case1
+        .word .Lwide_duffs_case2
+        .word .Lwide_duffs_case3
+        .word .Lwide_duffs_case4
+        .word .Lwide_duffs_case5
+        .word .Lwide_duffs_case6
+        .word .Lwide_duffs_case7
+
+.Lwide_duffs_case7:        @ odd remainders fall through 7 -> 5 -> 3 -> 1
+        ldmia        r8!, {r4, r6}                @ two history samples (ascending)
+        ldmdb        r9!, {r10, r11}                @ two coefficients (qlp walks downward)
+        smlal        r7, r12, r4, r11        @ sum64 += older sample * higher-index coeff
+        smlal        r7, r12, r6, r10        @ sum64 += newer sample * lower-index coeff
+.Lwide_duffs_case5:
+        ldmia        r8!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        smlal        r7, r12, r4, r11
+        smlal        r7, r12, r6, r10
+.Lwide_duffs_case3:
+        ldmia        r8!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        smlal        r7, r12, r4, r11
+        smlal        r7, r12, r6, r10
+.Lwide_duffs_case1:        @ one tap left over
+        ldr        r10, [r9, #-4]!                @ *--qlp
+        ldr        r11, [r8], #4                @ *history++
+        smlal        r7, r12, r10, r11        @ sum64 += last tap
+
+        subs        r14, r14, #1                @ --n
+        bne        .Lwide_duffs_case0        @ full 8-tap groups remain
+
+.Lwide_duffs_end:
+        finish_loop                        @ emit sample; loops back to outer or falls through when done
+
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return (pops saved lr into pc)
+
+.Lwide_duffs_case0:        @ even remainders (0, 6, 4, 2); also the body of the full 8-tap loop
+        ldmia        r8!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        smlal        r7, r12, r4, r11
+        smlal        r7, r12, r6, r10
+.Lwide_duffs_case6:
+        ldmia        r8!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        smlal        r7, r12, r4, r11
+        smlal        r7, r12, r6, r10
+.Lwide_duffs_case4:
+        ldmia        r8!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        smlal        r7, r12, r4, r11
+        smlal        r7, r12, r6, r10
+.Lwide_duffs_case2:
+        ldmia        r8!, {r4, r6}
+        ldmdb        r9!, {r10, r11}
+        smlal        r7, r12, r4, r11
+        smlal        r7, r12, r6, r10
+
+        subs        r14, r14, #1                @ --n
+        bne        .Lwide_duffs_case0
+
+        finish_loop                        @ emit sample; loops back to outer or falls through when done
+
+        ldmfd        r13!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, r15}        @ restore and return
+
+        .size        FLAC__lpc_restore_signal_asm_arm_wide, .-FLAC__lpc_restore_signal_asm_arm_wide
+
diff --git a/src/libFLAC/cpu.c b/src/libFLAC/cpu.c
index de2bb2a3..977cb90c 100644
--- a/src/libFLAC/cpu.c
+++ b/src/libFLAC/cpu.c
@@ -125,6 +125,8 @@ void FLAC__cpu_info(FLAC__CPUInfo *info)
 #endif
 #elif defined FLAC__CPU_PPC
         info->type = FLAC__CPUINFO_TYPE_PPC;
+#elif defined FLAC__CPU_ARM
+        info->type = FLAC__CPUINFO_TYPE_ARM;
 #if !defined FLAC__NO_ASM
         info->use_asm = true;
 #ifdef FLAC__USE_ALTIVEC
diff --git a/src/libFLAC/include/private/cpu.h b/src/libFLAC/include/private/cpu.h
index d59c779e..adda1b8a 100644
--- a/src/libFLAC/include/private/cpu.h
+++ b/src/libFLAC/include/private/cpu.h
@@ -41,6 +41,7 @@
 typedef enum {
         FLAC__CPUINFO_TYPE_IA32,
         FLAC__CPUINFO_TYPE_PPC,
+        FLAC__CPUINFO_TYPE_ARM,
         FLAC__CPUINFO_TYPE_UNKNOWN
 } FLAC__CPUInfo_Type;
 
@@ -60,6 +61,9 @@ typedef struct {
         FLAC__bool ppc64;
 } FLAC__CPUInfo_PPC;
 
+/* just generic ARM support for now */
+typedef FLAC__bool FLAC__CPUInfo_ARM;
+
 extern const unsigned FLAC__CPUINFO_IA32_CPUID_CMOV;
 extern const unsigned FLAC__CPUINFO_IA32_CPUID_MMX;
 extern const unsigned FLAC__CPUINFO_IA32_CPUID_FXSR;
@@ -76,6 +80,7 @@ typedef struct {
         union {
                 FLAC__CPUInfo_IA32 ia32;
                 FLAC__CPUInfo_PPC ppc;
+                FLAC__CPUInfo_ARM arm;
         } data;
 } FLAC__CPUInfo;
 
diff --git a/src/libFLAC/include/private/fixed.h b/src/libFLAC/include/private/fixed.h
index bb71b202..7dd40c9b 100644
--- a/src/libFLAC/include/private/fixed.h
+++ b/src/libFLAC/include/private/fixed.h
@@ -92,6 +92,10 @@ void FLAC__fixed_compute_residual(const FLAC__int32 data[], unsigned data_len, u
  *        IN  data[-order,-1]               previously-reconstructed historical samples
  *        OUT data[0,data_len-1]            original signal
  */
+#if (!defined(FLAC__NO_ASM) && defined(FLAC__CPU_ARM))
+void FLAC__fixed_restore_signal_asm_arm(const FLAC__int32 residual[], unsigned data_len, unsigned order, FLAC__int32 data[]);
+#else
 void FLAC__fixed_restore_signal(const FLAC__int32 residual[], unsigned data_len, unsigned order, FLAC__int32 data[]);
+#endif
 
 #endif
diff --git a/src/libFLAC/include/private/lpc.h b/src/libFLAC/include/private/lpc.h
index 970db8a8..b79422c2 100644
--- a/src/libFLAC/include/private/lpc.h
+++ b/src/libFLAC/include/private/lpc.h
@@ -176,6 +176,9 @@ void FLAC__lpc_restore_signal_asm_ia32_mmx(const FLAC__int32 residual[], unsigne
 #  elif defined FLAC__CPU_PPC
 void FLAC__lpc_restore_signal_asm_ppc_altivec_16(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 void FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+#  elif defined FLAC__CPU_ARM
+void FLAC__lpc_restore_signal_asm_arm(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
+void FLAC__lpc_restore_signal_asm_arm_wide(const FLAC__int32 residual[], unsigned data_len, const FLAC__int32 qlp_coeff[], unsigned order, int lp_quantization, FLAC__int32 data[]);
 #  endif/* FLAC__CPU_IA32 || FLAC__CPU_PPC */
 #endif /* FLAC__NO_ASM */
 
diff --git a/src/libFLAC/stream_decoder.c b/src/libFLAC/stream_decoder.c
index b3ac1a87..e10de5f5 100644
--- a/src/libFLAC/stream_decoder.c
+++ b/src/libFLAC/stream_decoder.c
@@ -322,6 +322,8 @@ FLAC_API FLAC__StreamDecoderState FLAC__stream_decoder_init(FLAC__StreamDecoder
                         decoder->private_->local_lpc_restore_signal_16bit = FLAC__lpc_restore_signal_asm_ppc_altivec_16;
                         decoder->private_->local_lpc_restore_signal_16bit_order8 = FLAC__lpc_restore_signal_asm_ppc_altivec_16_order8;
                 }
+#elif defined FLAC__CPU_ARM
+                FLAC__ASSERT(decoder->private_->cpuinfo.type == FLAC__CPUINFO_TYPE_ARM);
 #endif
         }
 #endif
@@ -1961,7 +1963,11 @@ FLAC__bool read_subframe_fixed_(FLAC__StreamDecoder *decoder, unsigned channel,
         /* decode the subframe */
         if(do_full_decode) {
                 memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
+#if (!defined(FLAC__NO_ASM) && defined(FLAC__CPU_ARM))
+                FLAC__fixed_restore_signal_asm_arm(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, order, decoder->private_->output[channel]+order);
+#else
                 FLAC__fixed_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, order, decoder->private_->output[channel]+order);
+#endif
         }
 
         return true;
@@ -2039,6 +2045,9 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, unsigned channel, un
         if(do_full_decode) {
                 memcpy(decoder->private_->output[channel], subframe->warmup, sizeof(FLAC__int32) * order);
                 if(bps + subframe->qlp_coeff_precision + FLAC__bitmath_ilog2(order) <= 32)
+#if (!defined(FLAC__NO_ASM) && defined(FLAC__CPU_ARM))
+                        FLAC__lpc_restore_signal_asm_arm(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+#else /* FLAC__NO_ASM || !FLAC__CPU_ARM */
                         if(bps <= 16 && subframe->qlp_coeff_precision <= 16) {
                                 if(order <= 8)
                                         decoder->private_->local_lpc_restore_signal_16bit_order8(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
@@ -2047,8 +2056,13 @@ FLAC__bool read_subframe_lpc_(FLAC__StreamDecoder *decoder, unsigned channel, un
                         }
                         else
                                 decoder->private_->local_lpc_restore_signal(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+#endif /* FLAC__NO_ASM || !FLAC__CPU_ARM */
                 else
+#if (!defined(FLAC__NO_ASM) && defined(FLAC__CPU_ARM))
+                        FLAC__lpc_restore_signal_asm_arm_wide(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+#else /* FLAC__NO_ASM || !FLAC__CPU_ARM */
+                        decoder->private_->local_lpc_restore_signal_64bit(decoder->private_->residual[channel], decoder->private_->frame.header.blocksize-order, subframe->qlp_coeff, order, subframe->quantization_level, decoder->private_->output[channel]+order);
+#endif /* FLAC__NO_ASM || !FLAC__CPU_ARM */
         }
 
         return true;