core: Add ARM NEON optimized sample conversion code

final:
* includes some minor style fixes and build-time changes to allow building a single binary for neon and non-neon systems

v4:
* fix for sample length < 4

v3:
* convert from intrinsics to inline assembly

v2:
* load and store data with vld1/vld1q and vst1/vst1q, resp., to work around alignment issues of compiler-generated vldmia instruction
* remove redundant check for NEON flags

Ubuntu/Linaro gcc 4.6.3
arm-linux-gnueabi-gcc -O2 -mcpu=cortex-a8 -mfloat-abi=softfp -mfpu=neon

runtime on beagle-xm:
D: [pulseaudio] sconv_neon.c: checking NEON sconv_s16le_from_float
I: [pulseaudio] sconv_neon.c: NEON: 3754 usec.
I: [pulseaudio] sconv_neon.c: ref: 58594 usec.
D: [pulseaudio] sconv_neon.c: checking NEON sconv_s16le_to_float
I: [pulseaudio] sconv_neon.c: NEON: 1831 usec.
I: [pulseaudio] sconv_neon.c: ref: 10528 usec.
I: [pulseaudio] sconv_neon.c: Initialising ARM NEON optimized conversions.

conversion may be off by one for some samples due to rounding issues
2025-11-05 13:29:57 -05:00 · 2012-10-23 17:54:57 +02:00 · 2012-10-23 17:54:57 +02:00 · 1319c4533a
commit 1319c4533a
parent 4171df3019
6 changed files with 193 additions and 1 deletions
--- a/src/tests/cpu-test.c
+++ b/src/tests/cpu-test.c
@ -282,6 +282,35 @@ START_TEST (sconv_sse_test) {
 END_TEST
 #endif /* defined (__i386__) || defined (__amd64__) */

+#if defined (__arm__) && defined (__linux__) && HAVE_NEON /* same condition under which main() registers this test */
+START_TEST (sconv_neon_test) {
+    pa_cpu_arm_flag_t flags = 0;
+    pa_convert_func_t orig_func, neon_func;
+    /* Only exercise the NEON path when the CPU flags report NEON support. */
+    pa_cpu_get_arm_flags(&flags);
+
+    if (!(flags & PA_CPU_ARM_NEON)) {
+        pa_log_info("NEON not supported. Skipping");
+        return;
+    }
+    /* Fetch the float32ne -> S16LE converter before and after NEON init so the two can be compared. */
+    orig_func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    pa_convert_func_init_neon(flags);
+    neon_func = pa_get_convert_from_float32ne_function(PA_SAMPLE_S16LE);
+    /* Direction is float -> s16; the third argument is swept over 0..7 (presumably a buffer offset -- TODO confirm). */
+    pa_log_debug("Checking NEON sconv (float -> s16)");
+    run_conv_test_float_to_s16(neon_func, orig_func, 0, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 1, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 2, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 3, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 4, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 5, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 6, TRUE, FALSE);
+    run_conv_test_float_to_s16(neon_func, orig_func, 7, TRUE, TRUE);
+}
+END_TEST
+#endif /* defined (__arm__) && defined (__linux__) && HAVE_NEON */
+
 #undef SAMPLES
 #undef TIMES
 /* End conversion tests */
@ -313,6 +342,11 @@ int main(int argc, char *argv[]) {
    tc = tcase_create("sconv");
 #if defined (__i386__) || defined (__amd64__)
    tcase_add_test(tc, sconv_sse_test);
+#endif
+#if defined (__arm__) && defined (__linux__)
+#if HAVE_NEON
+    tcase_add_test(tc, sconv_neon_test);
+#endif
 #endif
    suite_add_tcase(s, tc);