diff --git a/spa/plugins/audioconvert/fmt-ops-neon.c b/spa/plugins/audioconvert/fmt-ops-neon.c new file mode 100644 index 000000000..2666430fa --- /dev/null +++ b/spa/plugins/audioconvert/fmt-ops-neon.c @@ -0,0 +1,265 @@ +/* Spa + * + * Copyright © 2020 Wim Taymans + * + * Permission is hereby granted, free of charge, to any person obtaining a + * copy of this software and associated documentation files (the "Software"), + * to deal in the Software without restriction, including without limitation + * the rights to use, copy, modify, merge, publish, distribute, sublicense, + * and/or sell copies of the Software, and to permit persons to whom the + * Software is furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice (including the next + * paragraph) shall be included in all copies or substantial portions of the + * Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER + * DEALINGS IN THE SOFTWARE. + */ + +#include +#include +#include + +#include "fmt-ops.h" + +static void +conv_s16_to_f32d_2s_neon(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, + uint32_t n_channels, uint32_t n_samples) +{ + const int16_t *s = src; + float *d0 = dst[0], *d1 = dst[1]; + +#ifdef __aarch64__ + uint32_t stride = n_channels << 1; + unsigned int remainder = n_samples & 3; + n_samples -= remainder; + + asm volatile( + " cmp %[n_samples], #0\n" + " beq 2f\n" + "1:" + " ld2 { v0.h, v1.h }[0], [%[s]], %[stride]\n" + " ld2 { v0.h, v1.h }[1], [%[s]], %[stride]\n" + " ld2 { v0.h, v1.h }[2], [%[s]], %[stride]\n" + " ld2 { v0.h, v1.h }[3], [%[s]], %[stride]\n" + " subs %[n_samples], %[n_samples], #4\n" + " sshll v2.4s, v0.4h, #0\n" + " sshll v3.4s, v1.4h, #0\n" + " scvtf v0.4s, v2.4s, #15\n" + " scvtf v1.4s, v3.4s, #15\n" + " st1 { v0.4s }, [%[d0]], #16\n" + " st1 { v1.4s }, [%[d1]], #16\n" + " bne 1b\n" + "2:" + " cmp %[remainder], #0\n" + " beq 4f\n" + "3:" + " ld2 { v0.h, v1.h }[0], [%[s]], %[stride]\n" + " subs %[remainder], %[remainder], #1\n" + " sshll v2.4s, v0.4h, #0\n" + " sshll v3.4s, v1.4h, #0\n" + " scvtf v0.4s, v2.4s, #15\n" + " scvtf v1.4s, v3.4s, #15\n" + " st1 { v0.s }[0], [%[d0]], #4\n" + " st1 { v1.s }[0], [%[d1]], #4\n" + " bne 3b\n" + "4:" + : [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples), + [remainder] "+r" (remainder) + : [stride] "r" (stride) + : "cc", "v0", "v1", "v2", "v3"); +#else + uint32_t n; + for(n = 0; n < n_samples; n++) { + *d0++ = S16_TO_F32(s[0]); + *d1++ = S16_TO_F32(s[1]); + s += n_channels; + } +#endif +} + +static void +conv_s16_to_f32d_1s_neon(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src, + uint32_t n_channels, uint32_t n_samples) +{ + const int16_t *s = src; + float *d = dst[0]; +#ifdef __aarch64__ + uint32_t stride = n_channels << 1; + uint32_t remainder = n_samples & 3; + n_samples -= remainder; + + asm volatile( + " cmp %[n_samples], #0\n" + " beq 2f\n" + "1:" + " ld1 { v0.h }[0], [%[s]], %[stride]\n" + " ld1 { v0.h }[1], [%[s]], %[stride]\n" + " ld1 { v0.h }[2], [%[s]], %[stride]\n" + " ld1 { v0.h }[3], [%[s]], %[stride]\n" + " subs %[n_samples], %[n_samples], #4\n" + " sshll v1.4s, v0.4h, #0\n" + " scvtf v0.4s, v1.4s, #15\n" + " st1 { v0.4s }, [%[d]], #16\n" + " bne 1b\n" + "2:" + " cmp %[remainder], #0\n" + " beq 4f\n" + "3:" + " ld1 { v0.h }[0], [%[s]], %[stride]\n" + " subs %[remainder], %[remainder], #1\n" + " sshll v1.4s, v0.4h, #0\n" + " scvtf v0.4s, v1.4s, #15\n" + " st1 { v0.s }[0], [%[d]], #4\n" + " bne 3b\n" + "4:" + : [d] "+r" (d), [s] "+r" (s), [n_samples] "+r" (n_samples), + [remainder] "+r" (remainder) + : [stride] "r" (stride) + : "cc", "v0", "v1"); +#else + uint32_t n; + for(n = 0; n < n_samples; n++) { + *d++ = S16_TO_F32(s[0]); + s += n_channels; + } +#endif +} + +void +conv_s16_to_f32d_neon(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + const int16_t *s = src[0]; + uint32_t i = 0, n_channels = conv->n_channels; + + for(; i + 1 < n_channels; i += 2) + conv_s16_to_f32d_2s_neon(conv, &dst[i], &s[i], n_channels, n_samples); + for(; i < n_channels; i++) + conv_s16_to_f32d_1s_neon(conv, &dst[i], &s[i], n_channels, n_samples); +} + +static void +conv_f32d_to_s16_2s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], + uint32_t n_channels, uint32_t n_samples) +{ + const float *s0 = src[0], *s1 = src[1]; + int16_t *d = dst; + +#ifdef __aarch64__ + uint32_t stride = n_channels << 1; + uint32_t remainder = n_samples & 3; + n_samples -= remainder; + + asm volatile( + " cmp %[n_samples], #0\n" + " beq 2f\n" + "1:" + " ld1 { v0.4s }, [%[s0]], #16\n" + " ld1 { v1.4s }, [%[s1]], #16\n" + " subs %[n_samples], %[n_samples], #4\n" + " fcvtzs v0.4s, v0.4s, #31\n" + " fcvtzs v1.4s, v1.4s, #31\n" + " sqrshrn v0.4h, v0.4s, #16\n" + " sqrshrn v1.4h, v1.4s, #16\n" + " st2 { v0.h, v1.h }[0], [%[d]], %[stride]\n" + " st2 { v0.h, v1.h }[1], [%[d]], %[stride]\n" + " st2 { v0.h, v1.h }[2], [%[d]], %[stride]\n" + " st2 { v0.h, v1.h }[3], [%[d]], %[stride]\n" + " bne 1b\n" + "2:" + " cmp %[remainder], #0\n" + " beq 4f\n" + "3:" + " ld1 { v0.s }[0], [%[s0]], #4\n" + " ld1 { v2.s }[0], [%[s1]], #4\n" + " subs %[remainder], %[remainder], #1\n" + " fcvtzs v0.4s, v0.4s, #31\n" + " fcvtzs v1.4s, v1.4s, #31\n" + " sqrshrn v0.4h, v0.4s, #16\n" + " sqrshrn v1.4h, v1.4s, #16\n" + " st2 { v0.h, v1.h }[0], [%[d]], %[stride]\n" + " bne 3b\n" + "4:" + : [d] "+r" (d), [s0] "+r" (s0), [s1] "+r" (s1), [n_samples] "+r" (n_samples), + [remainder] "+r" (remainder) + : [stride] "r" (stride) + : "cc", "v0", "v1"); +#else + uint32_t n; + for(n = 0; n < n_samples; n++) { + d[0] = F32_TO_S16(s0[n]); + d[1] = F32_TO_S16(s1[n]); + d += n_channels; + } +#endif +} + +static void +conv_f32d_to_s16_1s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_RESTRICT src[], + uint32_t n_channels, uint32_t n_samples) +{ + const float *s = src[0]; + int16_t *d = dst; + +#ifdef __aarch64__ + uint32_t stride = n_channels << 1; + uint32_t remainder = n_samples & 3; + n_samples -= remainder; + + asm volatile( + " cmp %[n_samples], #0\n" + " beq 2f\n" + "1:" + " ld1 { v0.4s }, [%[s]], #16\n" + " subs %[n_samples], %[n_samples], #4\n" + " fcvtzs v0.4s, v0.4s, #31\n" + " sqrshrn v0.4h, v0.4s, #16\n" + " st1 { v0.h }[0], [%[d]], %[stride]\n" + " st1 { v0.h }[1], [%[d]], %[stride]\n" + " st1 { v0.h }[2], [%[d]], %[stride]\n" + " st1 { v0.h }[3], [%[d]], %[stride]\n" + " bne 1b\n" + "2:" + " cmp %[remainder], #0\n" + " beq 4f\n" + "3:" + " ld1 { v0.s }[0], [%[s]], #4\n" + " subs %[remainder], %[remainder], #1\n" + " fcvtzs v0.4s, v0.4s, #31\n" + " sqrshrn v0.4h, v0.4s, #16\n" + " st1 { v0.h }[0], [%[d]], %[stride]\n" + " bne 3b\n" + "4:" + : [d] "+r" (d), [s] "+r" (s), [n_samples] "+r" (n_samples), + [remainder] "+r" (remainder) + : [stride] "r" (stride) + : "cc", "v0"); +#else + uint32_t n; + for(n = 0; n < n_samples; n++) { + *d = F32_TO_S16(s0[n]); + d += n_channels; + } +#endif +} + +void +conv_f32d_to_s16_neon(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], + uint32_t n_samples) +{ + int16_t *d = dst[0]; + uint32_t i = 0, n_channels = conv->n_channels; + + for(; i + 1 < n_channels; i+=2) + for(; i + 1 < n_channels; i += 2) + conv_f32d_to_s16_2s_neon(conv, &d[i], &src[i], n_channels, n_samples); + for(; i < n_channels; i++) + conv_f32d_to_s16_1s_neon(conv, &d[i], &src[i], n_channels, n_samples); +} diff --git a/spa/plugins/audioconvert/fmt-ops.c b/spa/plugins/audioconvert/fmt-ops.c index f5d9c3345..512a21a77 100644 --- a/spa/plugins/audioconvert/fmt-ops.c +++ b/spa/plugins/audioconvert/fmt-ops.c @@ -55,6 +55,9 @@ static struct conv_info conv_table[] = { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, 0, conv_s16_to_f32_c }, { SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, 0, conv_s16d_to_f32d_c }, +#if defined (HAVE_NEON) + { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 0, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_neon }, +#endif #if defined (HAVE_AVX2) { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 2, SPA_CPU_FLAG_SSE2, conv_s16_to_f32d_2_avx2 }, { SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 0, SPA_CPU_FLAG_SSE2, conv_s16_to_f32d_avx2 }, @@ -115,6 +118,9 @@ static struct conv_info conv_table[] = { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S16, 0, 0, conv_f32_to_s16_c }, { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16P, 0, 0, conv_f32d_to_s16d_c }, { SPA_AUDIO_FORMAT_F32, SPA_AUDIO_FORMAT_S16P, 0, 0, conv_f32_to_s16d_c }, +#if defined (HAVE_NEON) + { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, 0, SPA_CPU_FLAG_NEON, conv_f32d_to_s16_neon }, +#endif #if defined (HAVE_AVX2) { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, 4, SPA_CPU_FLAG_SSE2, conv_f32d_to_s16_4_avx2 }, { SPA_AUDIO_FORMAT_F32P, SPA_AUDIO_FORMAT_S16, 2, SPA_CPU_FLAG_SSE2, conv_f32d_to_s16_2_avx2 }, diff --git a/spa/plugins/audioconvert/fmt-ops.h b/spa/plugins/audioconvert/fmt-ops.h index 02909c837..8871f5c6c 100644 --- a/spa/plugins/audioconvert/fmt-ops.h +++ b/spa/plugins/audioconvert/fmt-ops.h @@ -186,6 +186,10 @@ DEFINE_FUNCTION(interleave_16, c); DEFINE_FUNCTION(interleave_24, c); DEFINE_FUNCTION(interleave_32, c); +#if defined(HAVE_NEON) +DEFINE_FUNCTION(s16_to_f32d, neon); +DEFINE_FUNCTION(f32d_to_s16, neon); +#endif #if defined(HAVE_SSE2) DEFINE_FUNCTION(s16_to_f32d_2, sse2); DEFINE_FUNCTION(s16_to_f32d, sse2); diff --git a/spa/plugins/audioconvert/meson.build b/spa/plugins/audioconvert/meson.build index 9a5fe9a89..abfe6dbcc 100644 --- a/spa/plugins/audioconvert/meson.build +++ b/spa/plugins/audioconvert/meson.build @@ -87,7 +87,8 @@ endif if have_neon audioconvert_neon = static_library('audioconvert_neon', - ['resample-native-neon.c'], + ['resample-native-neon.c', + 'fmt-ops-neon.c' ], c_args : ['-O3', '-DHAVE_NEON'], include_directories : [spa_inc], install : false