mirror of
				https://gitlab.freedesktop.org/pipewire/pipewire.git
				synced 2025-11-03 09:01:54 -05:00 
			
		
		
		
	audioconvert: SSE optimize delay and convolver
This commit is contained in:
		
							parent
							
								
									33fb2f04c7
								
							
						
					
					
						commit
						2c0ce6afc2
					
				
					 5 changed files with 115 additions and 69 deletions
				
			
		| 
						 | 
				
			
			@ -257,6 +257,80 @@ static void lr4_process_2_sse(struct lr4 *lr40, struct lr4 *lr41, float *dst0, f
 | 
			
		|||
#undef F
 | 
			
		||||
}
 | 
			
		||||
 | 
			
		||||
/**
 * Compute a single FIR output sample with SSE.
 *
 * Multiplies src[0 .. n_taps-1] element-wise with taps[0 .. n_taps-1],
 * accumulates the products, scales the total by lane 0 of vol and stores
 * the one resulting float in *dst.
 *
 * Exactly n_taps elements of src and taps are read. When n_taps is not a
 * multiple of 4 the remaining 1-3 elements are copied into zero-padded
 * scratch vectors, so nothing past the end of either array is loaded or
 * mixed into the sum.
 *
 * \param src    first input sample of the window (no alignment required)
 * \param dst    receives the single output sample
 * \param taps   filter coefficients (no alignment required)
 * \param n_taps number of coefficients; 0 yields an output of 0 * vol
 * \param vol    gain; only lane 0 is used
 */
static inline void convolver_run(const float *src, float *dst,
		const float *taps, uint32_t n_taps, const __m128 vol)
{
	const uint32_t n_vec = n_taps & ~3u;
	__m128 sum = _mm_setzero_ps();
	uint32_t i;

	/* whole groups of four taps */
	for (i = 0; i < n_vec; i += 4) {
		sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(&taps[i]),
				_mm_loadu_ps(&src[i])));
	}

	/* 1-3 leftover taps: pad with zeros instead of reading past n_taps */
	if (i < n_taps) {
		float tail_src[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
		float tail_taps[4] = { 0.0f, 0.0f, 0.0f, 0.0f };
		uint32_t j;

		for (j = 0; i + j < n_taps; j++) {
			tail_src[j] = src[i + j];
			tail_taps[j] = taps[i + j];
		}
		sum = _mm_add_ps(sum, _mm_mul_ps(_mm_loadu_ps(tail_taps),
				_mm_loadu_ps(tail_src)));
	}

	/* horizontal add: lanes (0+2, 1+3), then lane 0 += lane 1 */
	sum = _mm_add_ps(sum, _mm_movehl_ps(sum, sum));
	sum = _mm_add_ss(sum, _mm_shuffle_ps(sum, sum, 0x55));

	_mm_store_ss(dst, _mm_mul_ss(sum, vol));
}
 | 
			
		||||
 | 
			
		||||
/**
 * Push n_samples from src into a mirrored delay line and produce delayed
 * (and, for n_taps > 1, FIR-filtered) output scaled by vol.
 *
 * Every input sample is stored twice, at buffer[w] and buffer[w + n_buffer],
 * so a window starting anywhere in the first half can be read contiguously.
 * With *pos < n_buffer this needs buffer to hold at least 2 * n_buffer
 * floats. The read window starts at w + o, where
 * o = n_buffer - delay - n_taps - 1.
 *
 * The 4-samples-per-iteration path is taken only when src and dst are
 * 16-byte aligned AND both the write position and n_buffer are multiples
 * of 4. Otherwise a 4-wide store at w + n_buffer could run past
 * 2 * n_buffer and the "w + 4 >= n_buffer" wrap would skip positions; in
 * that case everything is processed one sample at a time.
 *
 * NOTE(review): o is computed in unsigned arithmetic; this assumes
 * delay + n_taps + 1 <= n_buffer - confirm against callers.
 *
 * \param buffer    delay line storage
 * \param pos       in/out: current write position
 * \param n_buffer  logical length of the delay line
 * \param delay     delay in samples
 * \param taps      filter coefficients, passed to convolver_run()
 * \param n_taps    number of coefficients; 1 selects the plain delay path
 * \param dst       output samples (callers in this file also pass dst == src)
 * \param src       input samples
 * \param vol       gain applied to every output sample
 * \param n_samples number of samples to process
 */
static inline void delay_convolve_run_sse(float *buffer, uint32_t *pos,
		uint32_t n_buffer, uint32_t delay,
		const float *taps, uint32_t n_taps,
		float *dst, const float *src, const float vol, uint32_t n_samples)
{
	__m128 t;
	const __m128 v = _mm_set1_ps(vol);
	uint32_t i, n, unrolled;
	uint32_t w = *pos;
	const uint32_t o = n_buffer - delay - n_taps - 1;

	if (SPA_IS_ALIGNED(src, 16) &&
	    SPA_IS_ALIGNED(dst, 16) &&
	    (w & 3u) == 0 && (n_buffer & 3u) == 0)
		unrolled = n_samples & ~3u;
	else
		unrolled = 0;

	if (n_taps == 1) {
		/* plain delay: no convolution needed */
		for (n = 0; n < unrolled; n += 4) {
			t = _mm_load_ps(&src[n]);
			_mm_storeu_ps(&buffer[w], t);
			_mm_storeu_ps(&buffer[w + n_buffer], t);
			t = _mm_loadu_ps(&buffer[w + o]);
			t = _mm_mul_ps(t, v);
			_mm_store_ps(&dst[n], t);
			w = w + 4 >= n_buffer ? 0 : w + 4;
		}
		for (; n < n_samples; n++) {
			t = _mm_load_ss(&src[n]);
			_mm_store_ss(&buffer[w], t);
			_mm_store_ss(&buffer[w + n_buffer], t);
			t = _mm_load_ss(&buffer[w + o]);
			t = _mm_mul_ss(t, v);
			_mm_store_ss(&dst[n], t);
			w = w + 1 >= n_buffer ? 0 : w + 1;
		}
	} else {
		for (n = 0; n < unrolled; n += 4) {
			/* src is read before dst is written, so dst == src is safe */
			t = _mm_load_ps(&src[n]);
			_mm_storeu_ps(&buffer[w], t);
			_mm_storeu_ps(&buffer[w + n_buffer], t);
			for (i = 0; i < 4; i++)
				convolver_run(&buffer[w + o + i], &dst[n + i], taps, n_taps, v);
			w = w + 4 >= n_buffer ? 0 : w + 4;
		}
		for (; n < n_samples; n++) {
			t = _mm_load_ss(&src[n]);
			_mm_store_ss(&buffer[w], t);
			_mm_store_ss(&buffer[w + n_buffer], t);
			convolver_run(&buffer[w + o], &dst[n], taps, n_taps, v);
			w = w + 1 >= n_buffer ? 0 : w + 1;
		}
	}
	*pos = w;
}
 | 
			
		||||
 | 
			
		||||
void
 | 
			
		||||
channelmix_f32_n_m_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
 | 
			
		||||
		   const void * SPA_RESTRICT src[], uint32_t n_samples)
 | 
			
		||||
| 
						 | 
				
			
			@ -371,9 +445,9 @@ channelmix_f32_2_5p1_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
 | 
			
		|||
		} else {
 | 
			
		||||
			sub_sse(d[4], s[0], s[1], n_samples);
 | 
			
		||||
 | 
			
		||||
			delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
 | 
			
		||||
			delay_convolve_run_sse(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
 | 
			
		||||
					mix->taps, mix->n_taps, d[5], d[4], -v5, n_samples);
 | 
			
		||||
			delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
 | 
			
		||||
			delay_convolve_run_sse(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
 | 
			
		||||
					mix->taps, mix->n_taps, d[4], d[4], v4, n_samples);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
| 
						 | 
				
			
			@ -407,9 +481,9 @@ channelmix_f32_2_7p1_sse(struct channelmix *mix, void * SPA_RESTRICT dst[],
 | 
			
		|||
		} else {
 | 
			
		||||
			sub_sse(d[6], s[0], s[1], n_samples);
 | 
			
		||||
 | 
			
		||||
			delay_convolve_run(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
 | 
			
		||||
			delay_convolve_run_sse(mix->buffer[1], &mix->pos[1], BUFFER_SIZE, mix->delay,
 | 
			
		||||
					mix->taps, mix->n_taps, d[7], d[6], -v7, n_samples);
 | 
			
		||||
			delay_convolve_run(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
 | 
			
		||||
			delay_convolve_run_sse(mix->buffer[0], &mix->pos[0], BUFFER_SIZE, mix->delay,
 | 
			
		||||
					mix->taps, mix->n_taps, d[6], d[6], v6, n_samples);
 | 
			
		||||
		}
 | 
			
		||||
	}
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		Add a link
		
	
		Reference in a new issue