mirror of
				https://gitlab.freedesktop.org/pipewire/pipewire.git
				synced 2025-11-03 09:01:54 -05:00 
			
		
		
		
	audioconvert: fix rounding on arm neon
Add the neon functions to the test
This commit is contained in:
		
							parent
							
								
									1db52856ff
								
							
						
					
					
						commit
						b1b8b0985a
					
				
					 2 changed files with 84 additions and 30 deletions
				
			
		| 
						 | 
				
			
			@ -26,6 +26,8 @@
 | 
			
		|||
#include <stdio.h>
 | 
			
		||||
#include <math.h>
 | 
			
		||||
 | 
			
		||||
#include <arm_neon.h>
 | 
			
		||||
 | 
			
		||||
#include "fmt-ops.h"
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
| 
						 | 
				
			
			@ -289,16 +291,19 @@ conv_f32d_to_s16_2s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_R
 | 
			
		|||
 | 
			
		||||
#ifdef __aarch64__
 | 
			
		||||
	asm volatile(
 | 
			
		||||
		"      dup v2.4s, %w[scale]\n"
 | 
			
		||||
		"      cmp %[n_samples], #0\n"
 | 
			
		||||
		"      beq 2f\n"
 | 
			
		||||
		"1:"
 | 
			
		||||
		"      ld1 { v0.4s }, [%[s0]], #16\n"
 | 
			
		||||
		"      ld1 { v1.4s }, [%[s1]], #16\n"
 | 
			
		||||
		"      subs %[n_samples], %[n_samples], #4\n"
 | 
			
		||||
		"      fcvtzs v0.4s, v0.4s, #31\n"
 | 
			
		||||
		"      fcvtzs v1.4s, v1.4s, #31\n"
 | 
			
		||||
		"      sqrshrn v0.4h, v0.4s, #16\n"
 | 
			
		||||
		"      sqrshrn v1.4h, v1.4s, #16\n"
 | 
			
		||||
		"      sqadd  v0.4s, v0.4s, v2.4s\n"
 | 
			
		||||
		"      sqadd  v1.4s, v1.4s, v2.4s\n"
 | 
			
		||||
		"      fcvtns v0.4s, v0.4s\n"
 | 
			
		||||
		"      fcvtns v1.4s, v1.4s\n"
 | 
			
		||||
		"      sqxtn  v0.4h, v0.4s\n"
 | 
			
		||||
		"      sqxtn  v1.4h, v1.4s\n"
 | 
			
		||||
		"      st2 { v0.h, v1.h }[0], [%[d]], %[stride]\n"
 | 
			
		||||
		"      st2 { v0.h, v1.h }[1], [%[d]], %[stride]\n"
 | 
			
		||||
		"      st2 { v0.h, v1.h }[2], [%[d]], %[stride]\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -311,29 +316,42 @@ conv_f32d_to_s16_2s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_R
 | 
			
		|||
		"      ld1 { v0.s }[0], [%[s0]], #4\n"
 | 
			
		||||
		"      ld1 { v2.s }[0], [%[s1]], #4\n"
 | 
			
		||||
		"      subs %[remainder], %[remainder], #1\n"
 | 
			
		||||
		"      fcvtzs v0.4s, v0.4s, #31\n"
 | 
			
		||||
		"      fcvtzs v1.4s, v1.4s, #31\n"
 | 
			
		||||
		"      sqrshrn v0.4h, v0.4s, #16\n"
 | 
			
		||||
		"      sqrshrn v1.4h, v1.4s, #16\n"
 | 
			
		||||
		"      sqadd  v0.4s, v0.4s, v2.4s\n"
 | 
			
		||||
		"      sqadd  v1.4s, v1.4s, v2.4s\n"
 | 
			
		||||
		"      fcvtns v0.4s, v0.4s\n"
 | 
			
		||||
		"      fcvtns v1.4s, v1.4s\n"
 | 
			
		||||
		"      sqxtn  v0.4h, v0.4s\n"
 | 
			
		||||
		"      sqxtn  v1.4h, v1.4s\n"
 | 
			
		||||
		"      st2 { v0.h, v1.h }[0], [%[d]], %[stride]\n"
 | 
			
		||||
		"      bne 3b\n"
 | 
			
		||||
		"4:"
 | 
			
		||||
		: [d] "+r" (d), [s0] "+r" (s0), [s1] "+r" (s1), [n_samples] "+r" (n_samples),
 | 
			
		||||
		  [remainder] "+r" (remainder)
 | 
			
		||||
		: [stride] "r" (stride)
 | 
			
		||||
		: [stride] "r" (stride),
 | 
			
		||||
		  [scale] "r" (15 << 23)
 | 
			
		||||
		: "cc", "v0", "v1");
 | 
			
		||||
#else
 | 
			
		||||
	float32x4_t pos = vdupq_n_f32(0.4999999f / S16_SCALE);
 | 
			
		||||
	float32x4_t neg = vdupq_n_f32(-0.4999999f / S16_SCALE);
 | 
			
		||||
 | 
			
		||||
	asm volatile(
 | 
			
		||||
		"      veor q2, q2, q2\n"
 | 
			
		||||
		"      cmp %[n_samples], #0\n"
 | 
			
		||||
		"      beq 2f\n"
 | 
			
		||||
		"1:"
 | 
			
		||||
		"      vld1.32 { q0 }, [%[s0]]!\n"
 | 
			
		||||
		"      vld1.32 { q1 }, [%[s1]]!\n"
 | 
			
		||||
		"      subs %[n_samples], %[n_samples], #4\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #31\n"
 | 
			
		||||
		"      vcvt.s32.f32 q1, q1, #31\n"
 | 
			
		||||
		"      vqrshrn.s32 d0, q0, #16\n"
 | 
			
		||||
		"      vqrshrn.s32 d1, q1, #16\n"
 | 
			
		||||
		"      vcgt.f32 q3, q0, q2\n"
 | 
			
		||||
		"      vcgt.f32 q4, q0, q2\n"
 | 
			
		||||
		"      vbsl q3, %q[pos], %q[neg]\n"
 | 
			
		||||
		"      vbsl q4, %q[pos], %q[neg]\n"
 | 
			
		||||
		"      vadd.f32 q0, q0, q3\n"
 | 
			
		||||
		"      vadd.f32 q1, q1, q4\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #15\n"
 | 
			
		||||
		"      vcvt.s32.f32 q1, q1, #15\n"
 | 
			
		||||
		"      vqmovn.s32 d0, q0\n"
 | 
			
		||||
		"      vqmovn.s32 d1, q1\n"
 | 
			
		||||
		"      vst2.16 { d0[0], d1[0] }, [%[d]], %[stride]\n"
 | 
			
		||||
		"      vst2.16 { d0[1], d1[1] }, [%[d]], %[stride]\n"
 | 
			
		||||
		"      vst2.16 { d0[2], d1[2] }, [%[d]], %[stride]\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -346,17 +364,25 @@ conv_f32d_to_s16_2s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_R
 | 
			
		|||
		"      vld1.32 { d0[0] }, [%[s0]]!\n"
 | 
			
		||||
		"      vld1.32 { d2[0] }, [%[s1]]!\n"
 | 
			
		||||
		"      subs %[remainder], %[remainder], #1\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #31\n"
 | 
			
		||||
		"      vcvt.s32.f32 q1, q1, #31\n"
 | 
			
		||||
		"      vqrshrn.s32 d0, q0, #16\n"
 | 
			
		||||
		"      vqrshrn.s32 d1, q1, #16\n"
 | 
			
		||||
		"      vcgt.f32 q3, q0, q2\n"
 | 
			
		||||
		"      vcgt.f32 q4, q0, q2\n"
 | 
			
		||||
		"      vbsl q3, %q[pos], %q[neg]\n"
 | 
			
		||||
		"      vbsl q4, %q[pos], %q[neg]\n"
 | 
			
		||||
		"      vadd.f32 q0, q0, q3\n"
 | 
			
		||||
		"      vadd.f32 q1, q1, q4\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #15\n"
 | 
			
		||||
		"      vcvt.s32.f32 q1, q1, #15\n"
 | 
			
		||||
		"      vqmovn.s32 d0, q0\n"
 | 
			
		||||
		"      vqmovn.s32 d1, q1\n"
 | 
			
		||||
		"      vst2.16 { d0[0], d1[0] }, [%[d]], %[stride]\n"
 | 
			
		||||
		"      bne 3b\n"
 | 
			
		||||
		"4:"
 | 
			
		||||
		: [d] "+r" (d), [s0] "+r" (s0), [s1] "+r" (s1), [n_samples] "+r" (n_samples),
 | 
			
		||||
		  [remainder] "+r" (remainder)
 | 
			
		||||
		: [stride] "r" (stride)
 | 
			
		||||
		: "cc", "q0", "q1");
 | 
			
		||||
		: [stride] "r" (stride),
 | 
			
		||||
		  [pos]"w"(pos),
 | 
			
		||||
		  [neg]"w"(neg)
 | 
			
		||||
		: "cc", "q0", "q1", "q2", "q3", "q4");
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			@ -372,13 +398,15 @@ conv_f32d_to_s16_1s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_R
 | 
			
		|||
 | 
			
		||||
#ifdef __aarch64__
 | 
			
		||||
	asm volatile(
 | 
			
		||||
		"      dup v2.4s, %w[scale]\n"
 | 
			
		||||
		"      cmp %[n_samples], #0\n"
 | 
			
		||||
		"      beq 2f\n"
 | 
			
		||||
		"1:"
 | 
			
		||||
		"      ld1 { v0.4s }, [%[s]], #16\n"
 | 
			
		||||
		"      subs %[n_samples], %[n_samples], #4\n"
 | 
			
		||||
		"      fcvtzs v0.4s, v0.4s, #31\n"
 | 
			
		||||
		"      sqrshrn v0.4h, v0.4s, #16\n"
 | 
			
		||||
		"      sqadd  v0.4s, v0.4s, v2.4s\n"
 | 
			
		||||
		"      fcvtns v0.4s, v0.4s\n"
 | 
			
		||||
		"      sqxtn  v0.4h, v0.4s\n"
 | 
			
		||||
		"      st1 { v0.h }[0], [%[d]], %[stride]\n"
 | 
			
		||||
		"      st1 { v0.h }[1], [%[d]], %[stride]\n"
 | 
			
		||||
		"      st1 { v0.h }[2], [%[d]], %[stride]\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -390,24 +418,33 @@ conv_f32d_to_s16_1s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_R
 | 
			
		|||
		"3:"
 | 
			
		||||
		"      ld1 { v0.s }[0], [%[s]], #4\n"
 | 
			
		||||
		"      subs %[remainder], %[remainder], #1\n"
 | 
			
		||||
		"      fcvtzs v0.4s, v0.4s, #31\n"
 | 
			
		||||
		"      sqrshrn v0.4h, v0.4s, #16\n"
 | 
			
		||||
		"      sqadd  v0.4s, v0.4s, v2.4s\n"
 | 
			
		||||
		"      fcvtns v0.4s, v0.4s\n"
 | 
			
		||||
		"      sqxtn  v0.4h, v0.4s\n"
 | 
			
		||||
		"      st1 { v0.h }[0], [%[d]], %[stride]\n"
 | 
			
		||||
		"      bne 3b\n"
 | 
			
		||||
		"4:"
 | 
			
		||||
		: [d] "+r" (d), [s] "+r" (s), [n_samples] "+r" (n_samples),
 | 
			
		||||
		  [remainder] "+r" (remainder)
 | 
			
		||||
		: [stride] "r" (stride)
 | 
			
		||||
		: [stride] "r" (stride),
 | 
			
		||||
		  [scale] "r" (15 << 23)
 | 
			
		||||
		: "cc", "v0");
 | 
			
		||||
#else
 | 
			
		||||
	float32x4_t pos = vdupq_n_f32(0.4999999f / S16_SCALE);
 | 
			
		||||
	float32x4_t neg = vdupq_n_f32(-0.4999999f / S16_SCALE);
 | 
			
		||||
 | 
			
		||||
	asm volatile(
 | 
			
		||||
		"      veor q1, q1, q1\n"
 | 
			
		||||
		"      cmp %[n_samples], #0\n"
 | 
			
		||||
		"      beq 2f\n"
 | 
			
		||||
		"1:"
 | 
			
		||||
		"      vld1.32 { q0 }, [%[s]]!\n"
 | 
			
		||||
		"      subs %[n_samples], %[n_samples], #4\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #31\n"
 | 
			
		||||
		"      vqrshrn.s32 d0, q0, #16\n"
 | 
			
		||||
		"      vcgt.f32 q2, q0, q1\n"
 | 
			
		||||
		"      vbsl q2, %q[pos], %q[neg]\n"
 | 
			
		||||
		"      vadd.f32 q0, q0, q2\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #15\n"
 | 
			
		||||
		"      vqmovn.s32 d0, q0\n"
 | 
			
		||||
		"      vst1.16 { d0[0] }, [%[d]], %[stride]\n"
 | 
			
		||||
		"      vst1.16 { d0[1] }, [%[d]], %[stride]\n"
 | 
			
		||||
		"      vst1.16 { d0[2] }, [%[d]], %[stride]\n"
 | 
			
		||||
| 
						 | 
				
			
			@ -419,15 +456,20 @@ conv_f32d_to_s16_1s_neon(void *data, void * SPA_RESTRICT dst, const void * SPA_R
 | 
			
		|||
		"3:"
 | 
			
		||||
		"      vld1.32 { d0[0] }, [%[s]]!\n"
 | 
			
		||||
		"      subs %[remainder], %[remainder], #1\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #31\n"
 | 
			
		||||
		"      vqrshrn.s32 d0, q0, #16\n"
 | 
			
		||||
		"      vcgt.f32 q2, q0, q1\n"
 | 
			
		||||
		"      vbsl q2, %q[pos], %q[neg]\n"
 | 
			
		||||
		"      vadd.f32 q0, q0, q2\n"
 | 
			
		||||
		"      vcvt.s32.f32 q0, q0, #15\n"
 | 
			
		||||
		"      vqmovn.s32 d0, q0\n"
 | 
			
		||||
		"      vst1.16 { d0[0] }, [%[d]], %[stride]\n"
 | 
			
		||||
		"      bne 3b\n"
 | 
			
		||||
		"4:"
 | 
			
		||||
		: [d] "+r" (d), [s] "+r" (s), [n_samples] "+r" (n_samples),
 | 
			
		||||
		  [remainder] "+r" (remainder)
 | 
			
		||||
		: [stride] "r" (stride)
 | 
			
		||||
		: "cc", "q0");
 | 
			
		||||
		: [stride] "r" (stride),
 | 
			
		||||
		  [pos]"w"(pos),
 | 
			
		||||
		  [neg]"w"(neg)
 | 
			
		||||
		: "cc", "q0", "q1", "q2");
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -242,6 +242,12 @@ static void test_f32_s16(void)
 | 
			
		|||
			false, true, conv_f32d_to_s16_avx2);
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(HAVE_NEON)
 | 
			
		||||
	if (cpu_flags & SPA_CPU_FLAG_NEON) {
 | 
			
		||||
		run_test("test_f32d_s16_neon", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 | 
			
		||||
			false, true, conv_f32d_to_s16_neon);
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void test_s16_f32(void)
 | 
			
		||||
| 
						 | 
				
			
			@ -269,6 +275,12 @@ static void test_s16_f32(void)
 | 
			
		|||
			true, false, conv_s16_to_f32d_avx2);
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
#if defined(HAVE_NEON)
 | 
			
		||||
	if (cpu_flags & SPA_CPU_FLAG_NEON) {
 | 
			
		||||
		run_test("test_s16_f32d_neon", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
 | 
			
		||||
			true, false, conv_s16_to_f32d_neon);
 | 
			
		||||
	}
 | 
			
		||||
#endif
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
static void test_f32_u32(void)
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue