[KissFFT] Add NEON-optimized assembly routines

Replace KissFFT's kf_bfly2 and kf_bfly4 C routines with NEON-optimized
assembly implementations to improve performance.

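For reference, the scalar radix-2 butterfly being replaced looks roughly
like this in the floating-point build (where C_FIXDIV is a no-op);
kf_bfly4 is the analogous four-way combine:

    do {
        C_MUL (t, *Fout2, *tw1);   /* t = *Fout2 * *tw1  */
        tw1 += fstride;
        C_SUB (*Fout2, *Fout, t);  /* *Fout2 = *Fout - t */
        C_ADDTO (*Fout, t);        /* *Fout  = *Fout + t */
        ++Fout2;
        ++Fout;
    } while (--m);
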
Change-Id: I70dbfa5f810e121e15b0e150cac267b5d9768bb2
diff --git a/Android.mk b/Android.mk
new file mode 100644
index 0000000..007ab54
--- /dev/null
+++ b/Android.mk
@@ -0,0 +1,18 @@
+LOCAL_PATH:= $(call my-dir)
+include $(CLEAR_VARS)
+
+LOCAL_SRC_FILES := kiss_fft.c
+
+LOCAL_CFLAGS := -O3 -fvisibility=hidden
+
+ifeq ($(ARCH_ARM_HAVE_NEON),true)
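+# Defining __ARM_HAVE_NEON switches kiss_fft.c over to the external
+# kf_bfly2/kf_bfly4 symbols provided by the assembly files below.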
+LOCAL_CFLAGS += -D__ARM_HAVE_NEON
+LOCAL_SRC_FILES += kiss_fft_bfly2_neon.S
+LOCAL_SRC_FILES += kiss_fft_bfly4_neon.S
+endif
+
+LOCAL_MODULE := libkissfft
+
+include $(BUILD_STATIC_LIBRARY)
diff --git a/kiss_fft.c b/kiss_fft.c
index 465d6c9..2098086 100644
--- a/kiss_fft.c
+++ b/kiss_fft.c
@@ -18,6 +18,11 @@
  fixed or floating point complex numbers.  It also declares the kf_ internal functions.
  */
 
+#ifdef __ARM_HAVE_NEON
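+/* Implemented in kiss_fft_bfly2_neon.S and kiss_fft_bfly4_neon.S. */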
+void kf_bfly2(kiss_fft_cpx * Fout, const size_t fstride, const kiss_fft_cfg st, int m);
+void kf_bfly4(kiss_fft_cpx * Fout, const size_t fstride, const kiss_fft_cfg st, const size_t m);
+#else
 static void kf_bfly2(
         kiss_fft_cpx * Fout,
         const size_t fstride,
@@ -88,6 +93,7 @@
         ++Fout;
     }while(--k);
 }
+#endif //__ARM_HAVE_NEON
 
 static void kf_bfly3(
          kiss_fft_cpx * Fout,
diff --git a/kiss_fft_bfly2_neon.S b/kiss_fft_bfly2_neon.S
new file mode 100644
index 0000000..4f82cc0
--- /dev/null
+++ b/kiss_fft_bfly2_neon.S
@@ -0,0 +1,129 @@
+/*
+* Copyright (C) 2012, Code Aurora Forum. All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above
+*       copyright notice, this list of conditions and the following
+*       disclaimer in the documentation and/or other materials provided
+*       with the distribution.
+*     * Neither the name of Code Aurora Forum, Inc. nor the names of its
+*       contributors may be used to endorse or promote products derived
+*       from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+* ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+@ NEON-optimized assembly implementation of kf_bfly2()
+
+    .text
+    .fpu neon
+    .align 4
+    .global     kf_bfly2
+    .func       kf_bfly2
+
+kf_bfly2:
+    stmdb            sp!, {r4, r5, r6, r7, r8, r9, sl, fp, lr}
+@    vstmdb           sp!, {d8-d15}
+                                                @ r0 - Fout | r1 - fstride | r2 - st | r3 - m
+    pld             [r0, #0]
+    mov             r8, r3, asl #3              @ m as a byte count (m * sizeof(kiss_fft_cpx) = m*8)
+    add             r5, r0, r8                  @ Fout2 = Fout + m;
+    add             r6, r2, #264                @ tw1 = st->twiddles (264 = offset of twiddles[] in kiss_fft_state)
+    pld             [r6, #0]
+    mov             r1, r1, asl #3              @ fstride as a byte count (fstride*8)
+                                                @ float32x4x2_t *Fout;     q0, q1 (d0-d3)
+                                                @ float32x4x2_t tmp;         q2, q3 (d4-d7)
+                                                @ float32x4x2_t *Fout2;    q10, q11 (d20-d23)
+                                                @ float32x4x2_t *tw1;      q12, q13 (d24-d27)
+                                                @ float32x4x2_t t;      q8, q9 (d16-d19)
+    asrs            r4, r3, #2                  @ size_t k=m/4;
+    beq             .bfly2_do_while1
+    mov             r7, r1, asl #2              @ tw1 step for the 4-sample loop (fstride*8*4 bytes)
+
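+@ Each iteration processes 4 complex samples.  vld2.32 deinterleaves them
+@ so one q-register holds 4 reals and another 4 imaginaries; the complex
+@ multiply (a+bi)(c+di) = (ac-bd) + (ad+bc)i then maps to four vmul and
+@ one vsub/vadd pair, mirroring C_MUL in the scalar code.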
+.bfly2_do_while4:                               @ do { //process 4 samples per iteration
+    vld2.32         {d20-d23}, [r5]             @ load *Fout2;
+    vld2.32         {d24-d27}, [r6], r7         @ load *tw1 into q12/q13; tw1 += (fstride*4);
+    pld             [r6, #0]                    @ preload next tw1
+    vmul.f32        q2, q10, q12                @ C_MUL (t,  *Fout2 , *tw1);
+    vmul.f32        q3, q11, q13
+    vsub.f32        q8, q2, q3
+    vmul.f32        q2, q10, q13
+    vmul.f32        q3, q11, q12
+    vadd.f32        q9, q2, q3
+
+    vld2.32         {d0-d3}, [r0]               @ load *Fout;
+    vsub.f32        q10, q0, q8                 @ C_SUB( *Fout2 ,  *Fout , t );
+    vsub.f32        q11, q1, q9
+    vst2.32         {d20-d23}, [r5]!            @ store *Fout2; Fout2+=4
+    pld             [r5, #0]                    @ preload next Fout2
+
+    vadd.f32        q0, q0, q8                  @ C_ADDTO( *Fout ,  t );
+    vadd.f32        q1, q1, q9
+    vst2.32         {d0-d3}, [r0]!              @ store *Fout; Fout+=4
+    pld             [r0, #0]                    @ preload next Fout
+
+    subs            r4, r4, #1                  @ }while(--k);
+    bne             .bfly2_do_while4
+
+@.kf_bfly2_process_remaining:
+    asr             r8, r3, #31                 @ sign mask of m (0 here, m > 0)
+    lsr             r7, r8, #30                 @ rounding bias for a signed m % 4
+    add             r4, r7, r3
+    ands            r3, r4, #3                  @ k = m % 4; if (k == 0)
+    beq             .kf_bfly2_done
+                                                @ float32x2_t *Fout;       d0 {s0,s1}
+                                                @ float32x2_t tmp;         d1 {s2,s3}
+                                                @ float32x2_t *Fout2;      d2 {s4,s5}
+                                                @ float32x2_t *tw1;        d3 {s6,s7}
+                                                @ float32x2_t t;           d4 {s8,s9}
+
+
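+@ Single-sample loop: entered directly when m < 4, otherwise handles the
+@ m % 4 samples left over from the 4-wide loop above.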
+.bfly2_do_while1:                               @ do { //process 1 sample per iteration
+    vld1.32         {d2}, [r5]                  @ load *Fout2; {s4,s5}
+    vld1.32         {d3}, [r6], r1              @ load *tw1; tw1 += (fstride); {s6,s7}
+    pld             [r6, #0]                    @ preload next tw1
+    vmul.f32        d1, d2, d3                  @ C_MUL (t,  *Fout2 , *tw1);
+    vsub.f32        s8, s2, s3
+    vmul.f32        s2, s4, s7
+    vmul.f32        s3, s5, s6
+    vadd.f32        s9, s2, s3
+
+    vld1.32         {d0}, [r0]                  @ load *Fout;
+    vsub.f32        d5, d0, d4                  @ C_SUB( *Fout2 ,  *Fout , t );
+    vst1.32         {d5}, [r5]!                 @ store *Fout2; ++Fout2
+    pld             [r5, #0]                    @ preload next Fout2
+
+    vadd.f32        d0, d0, d4                  @ C_ADDTO( *Fout ,  t );
+    vst1.32         {d0}, [r0]!                 @ store *Fout; ++Fout
+    pld             [r0, #0]                    @ preload next Fout
+
+    subs            r3, r3, #1                  @ }while(--k);
+    bne             .bfly2_do_while1
+
+.kf_bfly2_done:
+@    vldmia           sp!, {d8-d15}
+    ldmia            sp!, {r4, r5, r6, r7, r8, r9, sl, fp, pc}
+    nop
+
+    .endfunc
+    .end
diff --git a/kiss_fft_bfly4_neon.S b/kiss_fft_bfly4_neon.S
new file mode 100644
index 0000000..27794c3
--- /dev/null
+++ b/kiss_fft_bfly4_neon.S
@@ -0,0 +1,257 @@
+/*
+* Copyright (C) 2012, Code Aurora Forum. All rights reserved.
+*
+* Redistribution and use in source and binary forms, with or without
+* modification, are permitted provided that the following conditions are
+* met:
+*     * Redistributions of source code must retain the above copyright
+*       notice, this list of conditions and the following disclaimer.
+*     * Redistributions in binary form must reproduce the above
+*       copyright notice, this list of conditions and the following
+*       disclaimer in the documentation and/or other materials provided
+*       with the distribution.
+*     * Neither the name of Code Aurora Forum, Inc. nor the names of its
+*       contributors may be used to endorse or promote products derived
+*       from this software without specific prior written permission.
+*
+* THIS SOFTWARE IS PROVIDED "AS IS" AND ANY EXPRESS OR IMPLIED
+* WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT
+* ARE DISCLAIMED.  IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS
+* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+* BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+* WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE
+* OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN
+* IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+*/
+
+@ NEON-optimized assembly implementation of kf_bfly4()
+
+    .text
+    .fpu neon
+    .align 4
+    .global     kf_bfly4
+    .func       kf_bfly4
+
+kf_bfly4:
+    stmdb           sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, lr}
+@   vstmdb          sp!, {d8-d15}
+                                                @ r0 - Fout | r1 - fstride | r2 - st | r3 - m
+    pld             [r0, #0]
+    mov             r5, r3
+    mov             r3, r3, asl #3              @ m as a byte count (m * sizeof(kiss_fft_cpx) = m*8)
+    add             r6, r2, #264                @ tw1 = st->twiddles (264 = offset of twiddles[] in kiss_fft_state)
+    pld             [r6, #0]
+    mov             r7, r6                      @ tw2 = st->twiddles
+    mov             r8, r7                      @ tw3 = st->twiddles
+    ldr             r2, [r2, #4]                @ st->inverse
+    mov             r1, r1, asl #3              @ fstride as a byte count (fstride*8)
+    mov             r9, r1, asl #1              @ fstride*2
+    add             r10, r1, r9                 @ fstride*3
+                                                @ float32x4x2_t rfout;       q0, q1 (d0-d3)
+                                                @ float32x4x2_t tmp;         q2, q3 (d4-d7)
+                                                @ float32x4x2_t scratch0;    q12, q13 (d24-d27)
+                                                @ float32x4x2_t scratch1;    q14, q15 (d28-d31)
+                                                @ float32x4x2_t scratch2;    q8, q9 (d16-d19)
+                                                @ float32x4x2_t scratch3;    q10, q11 (d20-d23)
+    asrs            r4, r5, #2                  @ size_t k=m/4;
+    beq             .kf_bfly4_do_while1         @ if(k==0)
+
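+@ Radix-4 butterfly, 4 complex samples per iteration: the three upper
+@ inputs are multiplied by their twiddles (scratch0..scratch2), then
+@ combined with *Fout.  Forward and inverse differ only in the sign of
+@ the j rotation applied to scratch0 for the Fout[m]/Fout[3m] outputs.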
+.kf_bfly4_do_while4:                            @ do { //process 4 samples per iteration
+    add             r11, r0, r3                 @ fom = Fout+m;
+    mov             r12, r11                    @ keep fom for the store phase below
+    pld             [r7, #0]
+    vld1.32         {d20}, [r6], r1             @ rtwd1 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
+    vld1.32         {d21}, [r6], r1             @ rtwd2 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
+    vld1.32         {d22}, [r6], r1             @ rtwd3 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
+    vld1.32         {d23}, [r6], r1             @ rtwd4 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
+    vuzp.32         q10, q11                    @ scratch3 = vuzpq_f32(vcombine_f32(rtwd1, rtwd2), vcombine_f32(rtwd3, rtwd4));
+    vld2.32         {d0-d3}, [r11], r3          @ rfout = vld2q_f32((const float32_t*)(fom1)); fom2 = Fout+m2;
+    vmul.f32        q2, q0, q10                 @ C_MUL_NEON(scratch0, rfout, scratch3);
+    vmul.f32        q3, q1, q11
+    vsub.f32        q12, q2, q3
+    vmul.f32        q2, q0, q11
+    vmul.f32        q3, q1, q10
+    vadd.f32        q13, q2, q3
+
+    pld             [r8, #0]
+    vld1.32         {d20}, [r7], r9             @ rtwd1 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
+    vld1.32         {d21}, [r7], r9             @ rtwd2 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
+    vld1.32         {d22}, [r7], r9             @ rtwd3 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
+    vld1.32         {d23}, [r7], r9             @ rtwd4 = vld1_f32((const float32_t*)tw2); tw2 += fstride*2;
+    vuzp.32         q10, q11                    @ scratch3 = vuzpq_f32(vcombine_f32(rtwd1, rtwd2), vcombine_f32(rtwd3, rtwd4));
+    vld2.32         {d0-d3}, [r11], r3          @ rfout = vld2q_f32((const float32_t*)(fom2)); fom3 = Fout+m3;
+    vmul.f32        q2, q0, q10                 @ C_MUL_NEON(scratch1, rfout, scratch3);
+    vmul.f32        q3, q1, q11
+    vsub.f32        q14, q2, q3
+    vmul.f32        q2, q0, q11
+    vmul.f32        q3, q1, q10
+    vadd.f32        q15, q2, q3
+
+    pld             [r0, #0]
+    vld1.32         {d20}, [r8], r10            @ rtwd1 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
+    vld1.32         {d21}, [r8], r10            @ rtwd2 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
+    vld1.32         {d22}, [r8], r10            @ rtwd3 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
+    vld1.32         {d23}, [r8], r10            @ rtwd4 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
+    vuzp.32         q10, q11                    @ scratch3 = vuzpq_f32(vcombine_f32(rtwd1, rtwd2), vcombine_f32(rtwd3, rtwd4));
+    vld2.32         {d0-d3}, [r11]              @ rfout = vld2q_f32((const float32_t*)(fom3));
+    vmul.f32        q2, q0, q10                 @ C_MUL_NEON(scratch2, rfout, scratch3);
+    vmul.f32        q3, q1, q11
+    vsub.f32        q8, q2, q3
+    vmul.f32        q2, q0, q11
+    vmul.f32        q3, q1, q10
+    vadd.f32        q9, q2, q3
+
+    vld2.32         {d0-d3}, [r0]               @ rfout = vld2q_f32((const float32_t*)(Fout));
+    vsub.f32        q2, q0, q14                 @ C_SUB_NEON(tmp, rfout, scratch1 );
+    vsub.f32        q3, q1, q15
+
+    vadd.f32        q0, q0, q14                 @ C_ADD_NEON(rfout, rfout, scratch1);
+    vadd.f32        q1, q1, q15
+
+    vadd.f32        q10, q12, q8                @ C_ADD_NEON(scratch3, scratch0, scratch2);
+    vadd.f32        q11, q13, q9
+
+    vsub.f32        q12, q12, q8                @ C_SUB_NEON(scratch0, scratch0, scratch2);
+    vsub.f32        q13, q13, q9
+
+    vsub.f32        q8, q0, q10                 @ C_SUB_NEON(scratch2, rfout, scratch3);
+    vsub.f32        q9, q1, q11
+
+    vadd.f32        q0, q0, q10                 @ C_ADD_NEON(rfout, rfout, scratch3);
+    vadd.f32        q1, q1, q11
+    vst2.32         {d0-d3}, [r0]!              @ vst2q_f32((float32_t*)Fout, rfout); Fout += 4;
+
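+@ Select the j-rotation direction:
+@ inverse: Fout[m] = tmp + j*scratch0,  Fout[3m] = tmp - j*scratch0
+@ forward: Fout[m] = tmp - j*scratch0,  Fout[3m] = tmp + j*scratch0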
+    cmp             r2, #0
+    beq             .not_inverse4               @ if(st->inverse) {
+    vsub.f32        q10, q2, q13                @ scratch3.val[0] = vsubq_f32(tmp.val[0], scratch0.val[1]);
+    vadd.f32        q11, q3, q12                @ scratch3.val[1] = vaddq_f32(tmp.val[1], scratch0.val[0]);
+    vadd.f32        q14, q2, q13                @ scratch1.val[0] = vaddq_f32(tmp.val[0], scratch0.val[1]);
+    vsub.f32        q15, q3, q12                @ scratch1.val[1] = vsubq_f32(tmp.val[1], scratch0.val[0]);
+    b               .c_end4
+.not_inverse4:                                  @ } else {
+    vadd.f32        q10, q2, q13                @ scratch3.val[0] = vaddq_f32(tmp.val[0], scratch0.val[1]);
+    vsub.f32        q11, q3, q12                @ scratch3.val[1] = vsubq_f32(tmp.val[1], scratch0.val[0]);
+    vsub.f32        q14, q2, q13                @ scratch1.val[0] = vsubq_f32(tmp.val[0], scratch0.val[1]);
+    vadd.f32        q15, q3, q12                @ scratch1.val[1] = vaddq_f32(tmp.val[1], scratch0.val[0]);
+                                                @ }
+.c_end4:
+    vst2.32         {d20-d23}, [r12], r3        @ vst2q_f32((float32_t*)(fom), scratch3); fom2 = Fout+m2;
+    vst2.32         {d16-d19}, [r12], r3        @ vst2q_f32((float32_t*)fom2, scratch2); fom3 = Fout+m3;
+    vst2.32         {d28-d31}, [r12]            @ vst2q_f32((float32_t*)(fom3), scratch1);
+
+    pld             [r6, #0]
+
+    subs            r4, r4, #1                  @ }while(--k);
+    bne             .kf_bfly4_do_while4
+
+@.kf_bfly4_process_singles:
+    asr             r4, r5, #31                 @ sign mask of m (0 here, m > 0)
+    lsr             r4, r4, #30                 @ rounding bias for a signed m % 4
+    add             r4, r4, r5
+    ands            r5, r4, #3                  @ k = m % 4; if (k == 0)
+    beq             .kf_bfly4_done
+
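+@ Single-sample loop: entered directly when m < 4, otherwise handles the
+@ m % 4 samples left over from the 4-wide loop.  vld1 + vuzp on lane 0
+@ of d-registers stands in for the q-register vld2 deinterleave.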
+.kf_bfly4_do_while1:                            @ do { //process 1 sample per iteration
+    pld             [r7, #0]
+    vld1.32         {d18}, [r6], r1             @ rtwd1 = vld1_f32((const float32_t*)tw1); tw1 += fstride;
+    vuzp.32         d18, d19                    @ d18[0] = tw1->r, d19[0] = tw1->i (lane 1 unused)
+    add             r12, r0, r3                 @ fom = Fout+m;
+    vld1.32         {d0}, [r12], r3             @ rfout = vld2_f32((const float32_t*)(fom1)); fom2 = Fout+m2;
+    vuzp.32         d0, d1                      @ d0[0] = re, d1[0] = im (lane 1 unused)
+    vmul.f32        q1, q0, q9                  @ C_MUL_NEON(scratch0, rfout, scratch3);
+    vsub.f32        d4, d2, d3
+    vmul.f32        d2, d0, d19
+    vmul.f32        d3, d1, d18
+    vadd.f32        d5, d2, d3
+
+    pld             [r8, #0]
+    vld1.32         {d18}, [r7], r9             @ rtwd1 = vld1_f32((const float32_t*)tw2); tw2+= fstride*2;
+    vuzp.32         d18, d19                    @ d18[0] = tw2->r, d19[0] = tw2->i (lane 1 unused)
+    vld1.32         {d0}, [r12], r3             @ rfout = vld2_f32((const float32_t*)(fom2)); fom3 = Fout+m3;
+    vuzp.32         d0, d1                      @ d0[0] = re, d1[0] = im (lane 1 unused)
+    vmul.f32        q1, q0, q9                  @ C_MUL_NEON(scratch1, rfout, scratch3);
+    vsub.f32        d6, d2, d3
+    vmul.f32        d2, d0, d19
+    vmul.f32        d3, d1, d18
+    vadd.f32        d7, d2, d3
+
+    pld             [r0, #0]
+    vld1.32         {d18}, [r8], r10            @ rtwd1 = vld1_f32((const float32_t*)tw3); tw3 += fstride*3;
+    vuzp.32         d18, d19                    @ d18[0] = tw3->r, d19[0] = tw3->i (lane 1 unused)
+    vld1.32         {d0}, [r12]                 @ rfout = vld2_f32((const float32_t*)(fom3));
+    vuzp.32         d0, d1                      @ d0[0] = re, d1[0] = im (lane 1 unused)
+    vmul.f32        q1, q0, q9                  @ C_MUL_NEON(scratch2, rfout, scratch3);
+    vsub.f32        d16, d2, d3
+    vmul.f32        d2, d0, d19
+    vmul.f32        d3, d1, d18
+    vadd.f32        d17, d2, d3
+
+    vld1.32         {d0}, [r0]                  @ rfout = vld2_f32((const float32_t*)(Fout));
+    vuzp.32         d0, d1
+    vsub.f32        q1, q0, q3                  @ C_SUB_NEON(tmp, rfout, scratch1 );
+
+    vadd.f32        q0, q0, q3                  @ C_ADD_NEON(rfout, rfout, scratch1);
+
+    vadd.f32        q9, q2, q8                  @ C_ADD_NEON(scratch3, scratch0, scratch2);
+
+    vsub.f32        q2, q2, q8                  @ C_SUB_NEON(scratch0, scratch0, scratch2);
+
+    vsub.f32        q8, q0, q9                  @ C_SUB_NEON(scratch2, rfout, scratch3);
+
+    vadd.f32        q0, q0, q9                  @ C_ADD_NEON(rfout, rfout, scratch3);
+
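+@ Same j-rotation selection as in the 4-wide loop above, one sample at a time.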
+    cmp             r2, #0
+    beq             .not_inverse1               @ if(st->inverse) {
+    vsub.f32        d18, d2, d5                 @ scratch3.val[0] = vsub_f32(tmp.val[0], scratch0.val[1]);
+    vadd.f32        d19, d3, d4                 @ scratch3.val[1] = vadd_f32(tmp.val[1], scratch0.val[0]);
+    vadd.f32        d6, d2, d5                  @ scratch1.val[0] = vadd_f32(tmp.val[0], scratch0.val[1]);
+    vsub.f32        d7, d3, d4                  @ scratch1.val[1] = vsub_f32(tmp.val[1], scratch0.val[0]);
+    b               .c_end1
+.not_inverse1:                                  @ } else {
+    vadd.f32        d18, d2, d5                 @ scratch3.val[0] = vadd_f32(tmp.val[0], scratch0.val[1]);
+    vsub.f32        d19, d3, d4                 @ scratch3.val[1] = vsub_f32(tmp.val[1], scratch0.val[0]);
+    vsub.f32        d6, d2, d5                  @ scratch1.val[0] = vsub_f32(tmp.val[0], scratch0.val[1]);
+    vadd.f32        d7, d3, d4                  @ scratch1.val[1] = vadd_f32(tmp.val[1], scratch0.val[0]);
+                                                @ }
+.c_end1:
+    mov             r12, r0
+    vzip.32         d0, d1
+    vst1.32         {d0}, [r12], r3             @ vst2_f32((float32_t*)Fout, rfout); fom = Fout+m;
+
+    vzip.32         d18, d19
+    vst1.32         {d18}, [r12], r3            @ vst2_f32((float32_t*)(fom), scratch3); fom2 = Fout+m2;
+
+    vzip.32         d16, d17
+    vst1.32         {d16}, [r12], r3            @ vst2_f32((float32_t*)fom2, scratch2);  fom3 = Fout+m3;
+
+    vzip.32         d6, d7
+    vst1.32         {d6}, [r12]                 @ vst2_f32((float32_t*)(fom3), scratch1);
+
+    add             r0, r0, #8                  @ Fout+=1;
+    pld             [r6, #0]
+
+    subs            r5, r5, #1                  @ }while(--k);
+    bne             .kf_bfly4_do_while1
+
+.kf_bfly4_done:
+@   vldmia          sp!, {d8-d15}
+    ldmia           sp!, {r4, r5, r6, r7, r8, r9, r10, r11, r12, pc}
+    nop
+
+    .endfunc
+    .end
+