neon: add alignment hints

This commit is contained in:
Wim Taymans 2020-04-03 14:26:02 +02:00
parent 167460a9bc
commit bf3ebb67aa

View file

@@ -89,16 +89,16 @@ static void inner_product_neon(float *d, const float * SPA_RESTRICT s,
asm volatile ( asm volatile (
" cmp %[n_taps], #0\n" " cmp %[n_taps], #0\n"
" bne 1f\n" " bne 1f\n"
" vld1.32 {q4}, [%[taps]]!\n" " vld1.32 {q4}, [%[taps] :128]!\n"
" vld1.32 {q8}, [%[s]]!\n" " vld1.32 {q8}, [%[s]]!\n"
" subs %[remainder], %[remainder], #4\n" " subs %[remainder], %[remainder], #4\n"
" vmul.f32 q0, q4, q8\n" " vmul.f32 q0, q4, q8\n"
" bne 4f\n" " bne 4f\n"
" b 5f\n" " b 5f\n"
"1:" "1:"
" vld1.32 {q4, q5}, [%[taps]]!\n" " vld1.32 {q4, q5}, [%[taps] :128]!\n"
" vld1.32 {q8, q9}, [%[s]]!\n" " vld1.32 {q8, q9}, [%[s]]!\n"
" vld1.32 {q6, q7}, [%[taps]]!\n" " vld1.32 {q6, q7}, [%[taps] :128]!\n"
" vld1.32 {q10, q11}, [%[s]]!\n" " vld1.32 {q10, q11}, [%[s]]!\n"
" subs %[n_taps], %[n_taps], #16\n" " subs %[n_taps], %[n_taps], #16\n"
" vmul.f32 q0, q4, q8\n" " vmul.f32 q0, q4, q8\n"
@@ -107,9 +107,9 @@ static void inner_product_neon(float *d, const float * SPA_RESTRICT s,
" vmul.f32 q3, q7, q11\n" " vmul.f32 q3, q7, q11\n"
" beq 3f\n" " beq 3f\n"
"2:" "2:"
" vld1.32 {q4, q5}, [%[taps]]!\n" " vld1.32 {q4, q5}, [%[taps] :128]!\n"
" vld1.32 {q8, q9}, [%[s]]!\n" " vld1.32 {q8, q9}, [%[s]]!\n"
" vld1.32 {q6, q7}, [%[taps]]!\n" " vld1.32 {q6, q7}, [%[taps] :128]!\n"
" vld1.32 {q10, q11}, [%[s]]!\n" " vld1.32 {q10, q11}, [%[s]]!\n"
" subs %[n_taps], %[n_taps], #16\n" " subs %[n_taps], %[n_taps], #16\n"
" vmla.f32 q0, q4, q8\n" " vmla.f32 q0, q4, q8\n"
@@ -124,7 +124,7 @@ static void inner_product_neon(float *d, const float * SPA_RESTRICT s,
" vadd.f32 q0, q4, q5\n" " vadd.f32 q0, q4, q5\n"
" beq 5f\n" " beq 5f\n"
"4:" "4:"
" vld1.32 {q6}, [%[taps]]!\n" " vld1.32 {q6}, [%[taps] :128]!\n"
" vld1.32 {q10}, [%[s]]!\n" " vld1.32 {q10}, [%[s]]!\n"
" subs %[remainder], %[remainder], #4\n" " subs %[remainder], %[remainder], #4\n"
" vmla.f32 q0, q6, q10\n" " vmla.f32 q0, q6, q10\n"
@@ -183,9 +183,9 @@ static void inner_product_ip_neon(float *d, const float * SPA_RESTRICT s,
#else #else
asm volatile( asm volatile(
" vdup.32 q10, %[x]\n" " vdup.32 q10, %[x]\n"
" vld1.32 {q4, q5}, [%[t0]]!\n" " vld1.32 {q4, q5}, [%[t0] :128]!\n"
" vld1.32 {q8, q9}, [%[s]]!\n" " vld1.32 {q8, q9}, [%[s]]!\n"
" vld1.32 {q6, q7}, [%[t1]]!\n" " vld1.32 {q6, q7}, [%[t1] :128]!\n"
" subs %[n_taps], %[n_taps], #8\n" " subs %[n_taps], %[n_taps], #8\n"
" vmul.f32 q0, q4, q8\n" " vmul.f32 q0, q4, q8\n"
" vmul.f32 q1, q5, q9\n" " vmul.f32 q1, q5, q9\n"
@@ -193,9 +193,9 @@ static void inner_product_ip_neon(float *d, const float * SPA_RESTRICT s,
" vmul.f32 q3, q7, q9\n" " vmul.f32 q3, q7, q9\n"
" beq 3f\n" " beq 3f\n"
"2:" "2:"
" vld1.32 {q4, q5}, [%[t0]]!\n" " vld1.32 {q4, q5}, [%[t0] :128]!\n"
" vld1.32 {q8, q9}, [%[s]]!\n" " vld1.32 {q8, q9}, [%[s]]!\n"
" vld1.32 {q6, q7}, [%[t1]]!\n" " vld1.32 {q6, q7}, [%[t1] :128]!\n"
" subs %[n_taps], %[n_taps], #8\n" " subs %[n_taps], %[n_taps], #8\n"
" vmla.f32 q0, q4, q8\n" " vmla.f32 q0, q4, q8\n"
" vmla.f32 q1, q5, q9\n" " vmla.f32 q1, q5, q9\n"