mirror of
				https://gitlab.freedesktop.org/pipewire/pipewire.git
				synced 2025-11-03 09:01:54 -05:00 
			
		
		
		
	audioconvert: add stereo deinterleave neon asm
This can take some shortcuts and convert twice as many samples in one iteration as the strided stereo deinterleave one.
This commit is contained in:
		
							parent
							
								
									6fab8fabca
								
							
						
					
					
						commit
						0ace131d72
					
				
					 3 changed files with 84 additions and 0 deletions
				
			
		| 
						 | 
					@ -28,6 +28,88 @@
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#include "fmt-ops.h"
 | 
					#include "fmt-ops.h"
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					void
 | 
				
			||||||
 | 
					conv_s16_to_f32d_2_neon(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
 | 
				
			||||||
 | 
							uint32_t n_samples)
 | 
				
			||||||
 | 
					{
 | 
				
			||||||
 | 
						const int16_t *s = src[0];
 | 
				
			||||||
 | 
						float *d0 = dst[0], *d1 = dst[1];
 | 
				
			||||||
 | 
						unsigned int remainder = n_samples & 7;
 | 
				
			||||||
 | 
						n_samples -= remainder;
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					#ifdef __aarch64__
 | 
				
			||||||
 | 
						asm volatile(
 | 
				
			||||||
 | 
							"      cmp %[n_samples], #0\n"
 | 
				
			||||||
 | 
							"      beq 2f\n"
 | 
				
			||||||
 | 
							"1:"
 | 
				
			||||||
 | 
							"      ld2 {v2.8h, v3.8h}, [%[s]], #32\n"
 | 
				
			||||||
 | 
							"      subs %w[n_samples], %w[n_samples], #8\n"
 | 
				
			||||||
 | 
							"      sxtl v0.4s, v2.4h\n"
 | 
				
			||||||
 | 
							"      sxtl2 v1.4s, v2.8h\n"
 | 
				
			||||||
 | 
							"      sxtl v2.4s, v3.4h\n"
 | 
				
			||||||
 | 
							"      sxtl2 v3.4s, v3.8h\n"
 | 
				
			||||||
 | 
							"      scvtf v0.4s, v0.4s, #15\n"
 | 
				
			||||||
 | 
							"      scvtf v1.4s, v1.4s, #15\n"
 | 
				
			||||||
 | 
							"      scvtf v2.4s, v2.4s, #15\n"
 | 
				
			||||||
 | 
							"      scvtf v3.4s, v3.4s, #15\n"
 | 
				
			||||||
 | 
							"      st1 {v0.4s, v1.4s}, [%[d0]], #32\n"
 | 
				
			||||||
 | 
							"      st1 {v2.4s, v3.4s}, [%[d1]], #32\n"
 | 
				
			||||||
 | 
							"      b.ne 1b\n"
 | 
				
			||||||
 | 
							"2:"
 | 
				
			||||||
 | 
							"      cmp %[remainder], #0\n"
 | 
				
			||||||
 | 
							"      beq 4f\n"
 | 
				
			||||||
 | 
							"3:"
 | 
				
			||||||
 | 
							"      ld2 { v0.h, v1.h }[0], [%[s]], #4\n"
 | 
				
			||||||
 | 
							"      subs %[remainder], %[remainder], #1\n"
 | 
				
			||||||
 | 
							"      sshll v2.4s, v0.4h, #0\n"
 | 
				
			||||||
 | 
							"      sshll v3.4s, v1.4h, #0\n"
 | 
				
			||||||
 | 
							"      scvtf v0.4s, v2.4s, #15\n"
 | 
				
			||||||
 | 
							"      scvtf v1.4s, v3.4s, #15\n"
 | 
				
			||||||
 | 
							"      st1 { v0.s }[0], [%[d0]], #4\n"
 | 
				
			||||||
 | 
							"      st1 { v1.s }[0], [%[d1]], #4\n"
 | 
				
			||||||
 | 
							"      bne 3b\n"
 | 
				
			||||||
 | 
							"4:"
 | 
				
			||||||
 | 
							: [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples),
 | 
				
			||||||
 | 
							  [remainder] "+r" (remainder)
 | 
				
			||||||
 | 
							: : "v0", "v1", "v2", "v3", "memory", "cc");
 | 
				
			||||||
 | 
					#else
 | 
				
			||||||
 | 
						asm volatile(
 | 
				
			||||||
 | 
							"      cmp %[n_samples], #0\n"
 | 
				
			||||||
 | 
							"      beq 2f\n"
 | 
				
			||||||
 | 
							"1:"
 | 
				
			||||||
 | 
							"      vld2.16 {d0-d3}, [%[s]]!\n"
 | 
				
			||||||
 | 
							"      subs %[n_samples], #8\n"
 | 
				
			||||||
 | 
							"      vmovl.s16 q3, d3\n"
 | 
				
			||||||
 | 
							"      vmovl.s16 q2, d2\n"
 | 
				
			||||||
 | 
							"      vmovl.s16 q1, d1\n"
 | 
				
			||||||
 | 
							"      vmovl.s16 q0, d0\n"
 | 
				
			||||||
 | 
							"      vcvt.f32.s32 q3, q3, #15\n"
 | 
				
			||||||
 | 
							"      vcvt.f32.s32 q2, q2, #15\n"
 | 
				
			||||||
 | 
							"      vcvt.f32.s32 q1, q1, #15\n"
 | 
				
			||||||
 | 
							"      vcvt.f32.s32 q0, q0, #15\n"
 | 
				
			||||||
 | 
							"      vst1.32 {d4-d7}, [%[d1]]!\n"
 | 
				
			||||||
 | 
							"      vst1.32 {d0-d3}, [%[d0]]!\n"
 | 
				
			||||||
 | 
							"      bne 1b\n"
 | 
				
			||||||
 | 
							"2:"
 | 
				
			||||||
 | 
							"      cmp %[remainder], #0\n"
 | 
				
			||||||
 | 
							"      beq 4f\n"
 | 
				
			||||||
 | 
							"3:"
 | 
				
			||||||
 | 
							"      vld2.16 { d0[0], d1[0] }, [%[s]], #4\n"
 | 
				
			||||||
 | 
							"      subs %[remainder], %[remainder], #1\n"
 | 
				
			||||||
 | 
							"      vmovl.s16 q1, d1\n"
 | 
				
			||||||
 | 
							"      vmovl.s16 q0, d0\n"
 | 
				
			||||||
 | 
							"      vcvt.f32.s32 q1, q1, #15\n"
 | 
				
			||||||
 | 
							"      vcvt.f32.s32 q0, q0, #15\n"
 | 
				
			||||||
 | 
							"      vst1.32 { d2[0] }, [%[d1]]!\n"
 | 
				
			||||||
 | 
							"      vst1.32 { d0[0] }, [%[d0]]!\n"
 | 
				
			||||||
 | 
							"      bne 3b\n"
 | 
				
			||||||
 | 
							"4:"
 | 
				
			||||||
 | 
							: [d0] "+r" (d0), [d1] "+r" (d1), [s] "+r" (s), [n_samples] "+r" (n_samples),
 | 
				
			||||||
 | 
							  [remainder] "+r" (remainder)
 | 
				
			||||||
 | 
							: : "q0", "q1", "q2", "q3", "memory", "cc");
 | 
				
			||||||
 | 
					#endif
 | 
				
			||||||
 | 
					}
 | 
				
			||||||
 | 
					
 | 
				
			||||||
static void
 | 
					static void
 | 
				
			||||||
conv_s16_to_f32d_2s_neon(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 | 
					conv_s16_to_f32d_2s_neon(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
 | 
				
			||||||
		uint32_t n_channels, uint32_t n_samples)
 | 
							uint32_t n_channels, uint32_t n_samples)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -66,6 +66,7 @@ static struct conv_info conv_table[] =
 | 
				
			||||||
	{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, 0, conv_s16_to_f32_c },
 | 
						{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32, 0, 0, conv_s16_to_f32_c },
 | 
				
			||||||
	{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, 0, conv_s16d_to_f32d_c },
 | 
						{ SPA_AUDIO_FORMAT_S16P, SPA_AUDIO_FORMAT_F32P, 0, 0, conv_s16d_to_f32d_c },
 | 
				
			||||||
#if defined (HAVE_NEON)
 | 
					#if defined (HAVE_NEON)
 | 
				
			||||||
 | 
						{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 2, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_2_neon },
 | 
				
			||||||
	{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 0, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_neon },
 | 
						{ SPA_AUDIO_FORMAT_S16, SPA_AUDIO_FORMAT_F32P, 0, SPA_CPU_FLAG_NEON, conv_s16_to_f32d_neon },
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
#if defined (HAVE_AVX2)
 | 
					#if defined (HAVE_AVX2)
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -295,6 +295,7 @@ DEFINE_FUNCTION(interleave_32, c);
 | 
				
			||||||
DEFINE_FUNCTION(interleave_32s, c);
 | 
					DEFINE_FUNCTION(interleave_32s, c);
 | 
				
			||||||
 | 
					
 | 
				
			||||||
#if defined(HAVE_NEON)
 | 
					#if defined(HAVE_NEON)
 | 
				
			||||||
 | 
					DEFINE_FUNCTION(s16_to_f32d_2, neon);
 | 
				
			||||||
DEFINE_FUNCTION(s16_to_f32d, neon);
 | 
					DEFINE_FUNCTION(s16_to_f32d, neon);
 | 
				
			||||||
DEFINE_FUNCTION(f32d_to_s16, neon);
 | 
					DEFINE_FUNCTION(f32d_to_s16, neon);
 | 
				
			||||||
#endif
 | 
					#endif
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue