mirror of
https://gitlab.freedesktop.org/pipewire/pipewire.git
synced 2026-03-22 05:33:53 -04:00
audioconvert: add avx2 optimized s32_to_f32d
Add an alternative avx2 s32_to_f32d implementation that doesn't use the gather function, for when gather is slow. Don't overwrite the original cpu_flags but store the selected flags in a new variable. Use this to debug the selected function cpu flags. Build libraries with defines from previous libraries so that we can reuse functions from them. We can then remove the SSE2 | SLOW_GATHER function selection from the list. We will now select avx2 and it will then switch implementations based on the CPU flags.
This commit is contained in:
parent
3dff64364f
commit
c02cdcb5ce
13 changed files with 218 additions and 38 deletions
|
|
@ -2125,7 +2125,7 @@ static int setup_in_convert(struct impl *this)
|
||||||
return res;
|
return res;
|
||||||
|
|
||||||
spa_log_debug(this->log, "%p: got converter features %08x:%08x passthrough:%d remap:%d %s", this,
|
spa_log_debug(this->log, "%p: got converter features %08x:%08x passthrough:%d remap:%d %s", this,
|
||||||
this->cpu_flags, in->conv.cpu_flags, in->conv.is_passthrough,
|
this->cpu_flags, in->conv.func_cpu_flags, in->conv.is_passthrough,
|
||||||
remap, in->conv.func_name);
|
remap, in->conv.func_name);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
@ -2282,7 +2282,7 @@ static int setup_channelmix(struct impl *this, uint32_t channels, uint32_t *posi
|
||||||
set_volume(this);
|
set_volume(this);
|
||||||
|
|
||||||
spa_log_debug(this->log, "%p: got channelmix features %08x:%08x flags:%08x %s",
|
spa_log_debug(this->log, "%p: got channelmix features %08x:%08x flags:%08x %s",
|
||||||
this, this->cpu_flags, this->mix.cpu_flags,
|
this, this->cpu_flags, this->mix.func_cpu_flags,
|
||||||
this->mix.flags, this->mix.func_name);
|
this->mix.flags, this->mix.func_name);
|
||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
@ -2330,7 +2330,7 @@ static int setup_resample(struct impl *this)
|
||||||
res = resample_native_init(&this->resample);
|
res = resample_native_init(&this->resample);
|
||||||
|
|
||||||
spa_log_debug(this->log, "%p: got resample features %08x:%08x %s",
|
spa_log_debug(this->log, "%p: got resample features %08x:%08x %s",
|
||||||
this, this->cpu_flags, this->resample.cpu_flags,
|
this, this->cpu_flags, this->resample.func_cpu_flags,
|
||||||
this->resample.func_name);
|
this->resample.func_name);
|
||||||
return res;
|
return res;
|
||||||
}
|
}
|
||||||
|
|
@ -2422,7 +2422,7 @@ static int setup_out_convert(struct impl *this)
|
||||||
|
|
||||||
spa_log_debug(this->log, "%p: got converter features %08x:%08x quant:%d:%d"
|
spa_log_debug(this->log, "%p: got converter features %08x:%08x quant:%d:%d"
|
||||||
" passthrough:%d remap:%d %s", this,
|
" passthrough:%d remap:%d %s", this,
|
||||||
this->cpu_flags, out->conv.cpu_flags, out->conv.method,
|
this->cpu_flags, out->conv.func_cpu_flags, out->conv.method,
|
||||||
out->conv.noise_bits, out->conv.is_passthrough, remap, out->conv.func_name);
|
out->conv.noise_bits, out->conv.is_passthrough, remap, out->conv.func_name);
|
||||||
|
|
||||||
return 0;
|
return 0;
|
||||||
|
|
|
||||||
|
|
@ -885,8 +885,8 @@ int channelmix_init(struct channelmix *mix)
|
||||||
mix->free = impl_channelmix_free;
|
mix->free = impl_channelmix_free;
|
||||||
mix->process = info->process;
|
mix->process = info->process;
|
||||||
mix->set_volume = impl_channelmix_set_volume;
|
mix->set_volume = impl_channelmix_set_volume;
|
||||||
mix->cpu_flags = info->cpu_flags;
|
|
||||||
mix->delay = (uint32_t)(mix->rear_delay * mix->freq / 1000.0f);
|
mix->delay = (uint32_t)(mix->rear_delay * mix->freq / 1000.0f);
|
||||||
|
mix->func_cpu_flags = info->cpu_flags;
|
||||||
mix->func_name = info->name;
|
mix->func_name = info->name;
|
||||||
|
|
||||||
spa_zero(mix->taps_mem);
|
spa_zero(mix->taps_mem);
|
||||||
|
|
|
||||||
|
|
@ -44,6 +44,7 @@ struct channelmix {
|
||||||
uint32_t upmix;
|
uint32_t upmix;
|
||||||
|
|
||||||
struct spa_log *log;
|
struct spa_log *log;
|
||||||
|
uint32_t func_cpu_flags;
|
||||||
const char *func_name;
|
const char *func_name;
|
||||||
|
|
||||||
#define CHANNELMIX_FLAG_ZERO (1<<0) /**< all zero components */
|
#define CHANNELMIX_FLAG_ZERO (1<<0) /**< all zero components */
|
||||||
|
|
|
||||||
|
|
@ -4,6 +4,8 @@
|
||||||
|
|
||||||
#include "fmt-ops.h"
|
#include "fmt-ops.h"
|
||||||
|
|
||||||
|
#include <spa/support/cpu.h>
|
||||||
|
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
// GCC: workaround for missing AVX intrinsic: "_mm256_setr_m128()"
|
// GCC: workaround for missing AVX intrinsic: "_mm256_setr_m128()"
|
||||||
// (see https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values)
|
// (see https://stackoverflow.com/questions/32630458/setting-m256i-to-the-value-of-two-m128i-values)
|
||||||
|
|
@ -285,7 +287,7 @@ conv_s16s_to_f32d_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
conv_s24_to_f32d_1s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
uint32_t n_channels, uint32_t n_samples)
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
{
|
{
|
||||||
const int8_t *s = src;
|
const int8_t *s = src;
|
||||||
|
|
@ -321,7 +323,7 @@ conv_s24_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
conv_s24_to_f32d_2s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
uint32_t n_channels, uint32_t n_samples)
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
{
|
{
|
||||||
const int8_t *s = src;
|
const int8_t *s = src;
|
||||||
|
|
@ -373,7 +375,7 @@ conv_s24_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static void
|
static void
|
||||||
conv_s24_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
conv_s24_to_f32d_4s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
uint32_t n_channels, uint32_t n_samples)
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
{
|
{
|
||||||
const int8_t *s = src;
|
const int8_t *s = src;
|
||||||
|
|
@ -447,16 +449,22 @@ conv_s24_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const voi
|
||||||
const int8_t *s = src[0];
|
const int8_t *s = src[0];
|
||||||
uint32_t i = 0, n_channels = conv->n_channels;
|
uint32_t i = 0, n_channels = conv->n_channels;
|
||||||
|
|
||||||
|
if (conv->cpu_flags & SPA_CPU_FLAG_SLOW_GATHER) {
|
||||||
|
#if defined (HAVE_SSE2)
|
||||||
|
conv_s24_to_f32d_sse2(conv, dst, src, n_samples);
|
||||||
|
#endif
|
||||||
|
} else {
|
||||||
for(; i + 3 < n_channels; i += 4)
|
for(; i + 3 < n_channels; i += 4)
|
||||||
conv_s24_to_f32d_4s_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
conv_s24_to_f32d_4s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
||||||
for(; i + 1 < n_channels; i += 2)
|
for(; i + 1 < n_channels; i += 2)
|
||||||
conv_s24_to_f32d_2s_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
conv_s24_to_f32d_2s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
||||||
for(; i < n_channels; i++)
|
for(; i < n_channels; i++)
|
||||||
conv_s24_to_f32d_1s_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
conv_s24_to_f32d_1s_gather_avx2(conv, &dst[i], &s[3*i], n_channels, n_samples);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
conv_s32_to_f32d_4s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
uint32_t n_channels, uint32_t n_samples)
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
{
|
{
|
||||||
const int32_t *s = src;
|
const int32_t *s = src;
|
||||||
|
|
@ -510,7 +518,7 @@ conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
conv_s32_to_f32d_2s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
uint32_t n_channels, uint32_t n_samples)
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
{
|
{
|
||||||
const int32_t *s = src;
|
const int32_t *s = src;
|
||||||
|
|
@ -555,7 +563,7 @@ conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
conv_s32_to_f32d_1s_gather_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
uint32_t n_channels, uint32_t n_samples)
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
{
|
{
|
||||||
const int32_t *s = src;
|
const int32_t *s = src;
|
||||||
|
|
@ -595,6 +603,169 @@ conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
static void
|
||||||
|
conv_s32_to_f32d_2s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
|
{
|
||||||
|
const int32_t *s = src;
|
||||||
|
float *d0 = dst[0], *d1 = dst[1];
|
||||||
|
uint32_t n, unrolled;
|
||||||
|
__m256i in[4];
|
||||||
|
__m256 out[4], t[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
||||||
|
|
||||||
|
if (SPA_IS_ALIGNED(d0, 32) &&
|
||||||
|
SPA_IS_ALIGNED(d1, 32))
|
||||||
|
unrolled = n_samples & ~7;
|
||||||
|
else
|
||||||
|
unrolled = 0;
|
||||||
|
|
||||||
|
for(n = 0; n < unrolled; n += 8) {
|
||||||
|
in[0] = _mm256_setr_epi64x(
|
||||||
|
*((uint64_t*)&s[0*n_channels]),
|
||||||
|
*((uint64_t*)&s[1*n_channels]),
|
||||||
|
*((uint64_t*)&s[4*n_channels]),
|
||||||
|
*((uint64_t*)&s[5*n_channels]));
|
||||||
|
in[1] = _mm256_setr_epi64x(
|
||||||
|
*((uint64_t*)&s[2*n_channels]),
|
||||||
|
*((uint64_t*)&s[3*n_channels]),
|
||||||
|
*((uint64_t*)&s[6*n_channels]),
|
||||||
|
*((uint64_t*)&s[7*n_channels]));
|
||||||
|
|
||||||
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
||||||
|
out[1] = _mm256_cvtepi32_ps(in[1]);
|
||||||
|
|
||||||
|
out[0] = _mm256_mul_ps(out[0], factor); /* a0 b0 a1 b1 a4 b4 a5 b5 */
|
||||||
|
out[1] = _mm256_mul_ps(out[1], factor); /* a2 b2 a3 b3 a6 b6 a7 b7 */
|
||||||
|
|
||||||
|
t[0] = _mm256_unpacklo_ps(out[0], out[1]); /* a0 a2 b0 b2 a4 a6 b4 b6 */
|
||||||
|
t[1] = _mm256_unpackhi_ps(out[0], out[1]); /* a1 a3 b1 b3 a5 a7 b5 b7 */
|
||||||
|
|
||||||
|
out[0] = _mm256_unpacklo_ps(t[0], t[1]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
||||||
|
out[1] = _mm256_unpackhi_ps(t[0], t[1]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
||||||
|
|
||||||
|
_mm256_store_ps(&d0[n], out[0]);
|
||||||
|
_mm256_store_ps(&d1[n], out[1]);
|
||||||
|
|
||||||
|
s += 8*n_channels;
|
||||||
|
}
|
||||||
|
for(; n < n_samples; n++) {
|
||||||
|
__m128 out[2], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
||||||
|
out[0] = _mm_cvtsi32_ss(factor, s[0]);
|
||||||
|
out[1] = _mm_cvtsi32_ss(factor, s[1]);
|
||||||
|
out[0] = _mm_mul_ss(out[0], factor);
|
||||||
|
out[1] = _mm_mul_ss(out[1], factor);
|
||||||
|
_mm_store_ss(&d0[n], out[0]);
|
||||||
|
_mm_store_ss(&d1[n], out[1]);
|
||||||
|
s += n_channels;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
conv_s32_to_f32d_1s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
|
{
|
||||||
|
const int32_t *s = src;
|
||||||
|
float *d0 = dst[0];
|
||||||
|
uint32_t n, unrolled;
|
||||||
|
__m256i in[2];
|
||||||
|
__m256 out[2], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
||||||
|
|
||||||
|
if (SPA_IS_ALIGNED(d0, 32))
|
||||||
|
unrolled = n_samples & ~7;
|
||||||
|
else
|
||||||
|
unrolled = 0;
|
||||||
|
|
||||||
|
for(n = 0; n < unrolled; n += 8) {
|
||||||
|
in[0] = _mm256_setr_epi32(
|
||||||
|
s[0*n_channels], s[1*n_channels],
|
||||||
|
s[2*n_channels], s[3*n_channels],
|
||||||
|
s[4*n_channels], s[5*n_channels],
|
||||||
|
s[6*n_channels], s[7*n_channels]);
|
||||||
|
out[0] = _mm256_cvtepi32_ps(in[0]);
|
||||||
|
out[0] = _mm256_mul_ps(out[0], factor);
|
||||||
|
_mm256_store_ps(&d0[n+0], out[0]);
|
||||||
|
s += 8*n_channels;
|
||||||
|
}
|
||||||
|
for(; n < n_samples; n++) {
|
||||||
|
__m128 out, factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
||||||
|
out = _mm_cvtsi32_ss(factor, s[0]);
|
||||||
|
out = _mm_mul_ss(out, factor);
|
||||||
|
_mm_store_ss(&d0[n], out);
|
||||||
|
s += n_channels;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static void
|
||||||
|
conv_s32_to_f32d_4s_avx2(void *data, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
|
||||||
|
uint32_t n_channels, uint32_t n_samples)
|
||||||
|
{
|
||||||
|
const int32_t *s = src;
|
||||||
|
float *d0 = dst[0], *d1 = dst[1], *d2 = dst[2], *d3 = dst[3];
|
||||||
|
uint32_t n, unrolled;
|
||||||
|
__m256i in[4];
|
||||||
|
__m256 out[4], t[4], factor = _mm256_set1_ps(1.0f / S32_SCALE_I2F);
|
||||||
|
|
||||||
|
if (SPA_IS_ALIGNED(d0, 32) &&
|
||||||
|
SPA_IS_ALIGNED(d1, 32) &&
|
||||||
|
SPA_IS_ALIGNED(d2, 32) &&
|
||||||
|
SPA_IS_ALIGNED(d3, 32))
|
||||||
|
unrolled = n_samples & ~7;
|
||||||
|
else
|
||||||
|
unrolled = 0;
|
||||||
|
|
||||||
|
for(n = 0; n < unrolled; n += 8) {
|
||||||
|
in[0] = _mm256_setr_m128i(
|
||||||
|
_mm_loadu_si128((__m128i*)&s[0*n_channels]),
|
||||||
|
_mm_loadu_si128((__m128i*)&s[4*n_channels]));
|
||||||
|
in[1] = _mm256_setr_m128i(
|
||||||
|
_mm_loadu_si128((__m128i*)&s[1*n_channels]),
|
||||||
|
_mm_loadu_si128((__m128i*)&s[5*n_channels]));
|
||||||
|
in[2] = _mm256_setr_m128i(
|
||||||
|
_mm_loadu_si128((__m128i*)&s[2*n_channels]),
|
||||||
|
_mm_loadu_si128((__m128i*)&s[6*n_channels]));
|
||||||
|
in[3] = _mm256_setr_m128i(
|
||||||
|
_mm_loadu_si128((__m128i*)&s[3*n_channels]),
|
||||||
|
_mm_loadu_si128((__m128i*)&s[7*n_channels]));
|
||||||
|
|
||||||
|
out[0] = _mm256_cvtepi32_ps(in[0]); /* a0 b0 c0 d0 a4 b4 c4 d4 */
|
||||||
|
out[1] = _mm256_cvtepi32_ps(in[1]); /* a1 b1 c1 d1 a5 b5 c5 d5 */
|
||||||
|
out[2] = _mm256_cvtepi32_ps(in[2]); /* a2 b2 c2 d2 a6 b6 c6 d6 */
|
||||||
|
out[3] = _mm256_cvtepi32_ps(in[3]); /* a3 b3 c3 d3 a7 b7 c7 d7 */
|
||||||
|
|
||||||
|
out[0] = _mm256_mul_ps(out[0], factor);
|
||||||
|
out[1] = _mm256_mul_ps(out[1], factor);
|
||||||
|
out[2] = _mm256_mul_ps(out[2], factor);
|
||||||
|
out[3] = _mm256_mul_ps(out[3], factor);
|
||||||
|
|
||||||
|
t[0] = _mm256_unpacklo_ps(out[0], out[2]); /* a0 a2 b0 b2 a4 a6 b4 b6 */
|
||||||
|
t[1] = _mm256_unpackhi_ps(out[0], out[2]); /* c0 c2 d0 d2 c4 c6 d4 d6 */
|
||||||
|
t[2] = _mm256_unpacklo_ps(out[1], out[3]); /* a1 a3 b1 b3 a5 a7 b5 b7 */
|
||||||
|
t[3] = _mm256_unpackhi_ps(out[1], out[3]); /* c1 c3 d1 d3 c5 c7 d5 d7 */
|
||||||
|
|
||||||
|
out[0] = _mm256_unpacklo_ps(t[0], t[2]); /* a0 a1 a2 a3 a4 a5 a6 a7 */
|
||||||
|
out[1] = _mm256_unpackhi_ps(t[0], t[2]); /* b0 b1 b2 b3 b4 b5 b6 b7 */
|
||||||
|
out[2] = _mm256_unpacklo_ps(t[1], t[3]); /* c0 c1 c2 c3 c4 c5 c6 c7 */
|
||||||
|
out[3] = _mm256_unpackhi_ps(t[1], t[3]); /* d0 d1 d2 d3 d4 d5 d6 d7 */
|
||||||
|
|
||||||
|
_mm256_store_ps(&d0[n], out[0]);
|
||||||
|
_mm256_store_ps(&d1[n], out[1]);
|
||||||
|
_mm256_store_ps(&d2[n], out[2]);
|
||||||
|
_mm256_store_ps(&d3[n], out[3]);
|
||||||
|
|
||||||
|
s += 8*n_channels;
|
||||||
|
}
|
||||||
|
for(; n < n_samples; n++) {
|
||||||
|
__m128 out[4], factor = _mm_set1_ps(1.0f / S32_SCALE_I2F);
|
||||||
|
__m128i in[1];
|
||||||
|
in[0] = _mm_setr_epi32(s[0], s[1], s[2], s[3]);
|
||||||
|
out[0] = _mm_cvtepi32_ps(in[0]);
|
||||||
|
out[0] = _mm_mul_ps(out[0], factor);
|
||||||
|
_MM_STOREM_PS(&d0[n], &d1[n], &d2[n], &d3[n], out[0]);
|
||||||
|
s += n_channels;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void
|
void
|
||||||
conv_s32_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
conv_s32_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
|
||||||
uint32_t n_samples)
|
uint32_t n_samples)
|
||||||
|
|
@ -602,12 +773,21 @@ conv_s32_to_f32d_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const voi
|
||||||
const int32_t *s = src[0];
|
const int32_t *s = src[0];
|
||||||
uint32_t i = 0, n_channels = conv->n_channels;
|
uint32_t i = 0, n_channels = conv->n_channels;
|
||||||
|
|
||||||
|
if (conv->cpu_flags & SPA_CPU_FLAG_SLOW_GATHER) {
|
||||||
for(; i + 3 < n_channels; i += 4)
|
for(; i + 3 < n_channels; i += 4)
|
||||||
conv_s32_to_f32d_4s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
conv_s32_to_f32d_4s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
||||||
for(; i + 1 < n_channels; i += 2)
|
for(; i + 1 < n_channels; i += 2)
|
||||||
conv_s32_to_f32d_2s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
conv_s32_to_f32d_2s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
||||||
for(; i < n_channels; i++)
|
for(; i < n_channels; i++)
|
||||||
conv_s32_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
conv_s32_to_f32d_1s_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
||||||
|
} else {
|
||||||
|
for(; i + 3 < n_channels; i += 4)
|
||||||
|
conv_s32_to_f32d_4s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
||||||
|
for(; i + 1 < n_channels; i += 2)
|
||||||
|
conv_s32_to_f32d_2s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
||||||
|
for(; i < n_channels; i++)
|
||||||
|
conv_s32_to_f32d_1s_gather_avx2(conv, &dst[i], &s[i], n_channels, n_samples);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
static void
|
static void
|
||||||
|
|
@ -1187,3 +1367,4 @@ conv_f32d_to_s16s_2_avx2(struct convert *conv, void * SPA_RESTRICT dst[], const
|
||||||
d += 2;
|
d += 2;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -108,9 +108,6 @@ static struct conv_info conv_table[] =
|
||||||
MAKE(U32, F32, 0, conv_u32_to_f32_c),
|
MAKE(U32, F32, 0, conv_u32_to_f32_c),
|
||||||
MAKE(U32, F32P, 0, conv_u32_to_f32d_c),
|
MAKE(U32, F32P, 0, conv_u32_to_f32d_c),
|
||||||
|
|
||||||
#if defined (HAVE_SSE2)
|
|
||||||
MAKE(S32, F32P, 0, conv_s32_to_f32d_sse2, SPA_CPU_FLAG_SSE2 | SPA_CPU_FLAG_SLOW_GATHER),
|
|
||||||
#endif
|
|
||||||
#if defined (HAVE_AVX2)
|
#if defined (HAVE_AVX2)
|
||||||
MAKE(S32, F32P, 0, conv_s32_to_f32d_avx2, SPA_CPU_FLAG_AVX2),
|
MAKE(S32, F32P, 0, conv_s32_to_f32d_avx2, SPA_CPU_FLAG_AVX2),
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -132,9 +129,6 @@ static struct conv_info conv_table[] =
|
||||||
|
|
||||||
MAKE(S24, F32, 0, conv_s24_to_f32_c),
|
MAKE(S24, F32, 0, conv_s24_to_f32_c),
|
||||||
MAKE(S24P, F32P, 0, conv_s24d_to_f32d_c),
|
MAKE(S24P, F32P, 0, conv_s24d_to_f32d_c),
|
||||||
#if defined (HAVE_SSE2)
|
|
||||||
MAKE(S24, F32P, 0, conv_s24_to_f32d_sse2, SPA_CPU_FLAG_SSE2 | SPA_CPU_FLAG_SLOW_GATHER),
|
|
||||||
#endif
|
|
||||||
#if defined (HAVE_AVX2)
|
#if defined (HAVE_AVX2)
|
||||||
MAKE(S24, F32P, 0, conv_s24_to_f32d_avx2, SPA_CPU_FLAG_AVX2),
|
MAKE(S24, F32P, 0, conv_s24_to_f32d_avx2, SPA_CPU_FLAG_AVX2),
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -637,7 +631,7 @@ int convert_init(struct convert *conv)
|
||||||
conv->random[i] = random();
|
conv->random[i] = random();
|
||||||
|
|
||||||
conv->is_passthrough = conv->src_fmt == conv->dst_fmt;
|
conv->is_passthrough = conv->src_fmt == conv->dst_fmt;
|
||||||
conv->cpu_flags = info->cpu_flags;
|
conv->func_cpu_flags = info->cpu_flags;
|
||||||
conv->update_noise = ninfo->noise;
|
conv->update_noise = ninfo->noise;
|
||||||
conv->process = info->process;
|
conv->process = info->process;
|
||||||
conv->clear = cinfo ? cinfo->clear : NULL;
|
conv->clear = cinfo ? cinfo->clear : NULL;
|
||||||
|
|
|
||||||
|
|
@ -219,6 +219,7 @@ struct convert {
|
||||||
uint32_t n_channels;
|
uint32_t n_channels;
|
||||||
uint32_t rate;
|
uint32_t rate;
|
||||||
uint32_t cpu_flags;
|
uint32_t cpu_flags;
|
||||||
|
uint32_t func_cpu_flags;
|
||||||
const char *func_name;
|
const char *func_name;
|
||||||
|
|
||||||
unsigned int is_passthrough:1;
|
unsigned int is_passthrough:1;
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,7 @@ endif
|
||||||
if have_sse2
|
if have_sse2
|
||||||
audioconvert_sse2 = static_library('audioconvert_sse2',
|
audioconvert_sse2 = static_library('audioconvert_sse2',
|
||||||
['fmt-ops-sse2.c' ],
|
['fmt-ops-sse2.c' ],
|
||||||
c_args : [sse2_args, '-O3', '-DHAVE_SSE2'],
|
c_args : [sse2_args, '-O3', '-DHAVE_SSE2', simd_cargs],
|
||||||
dependencies : [ spa_dep ],
|
dependencies : [ spa_dep ],
|
||||||
install : false
|
install : false
|
||||||
)
|
)
|
||||||
|
|
@ -55,7 +55,7 @@ if have_ssse3
|
||||||
audioconvert_ssse3 = static_library('audioconvert_ssse3',
|
audioconvert_ssse3 = static_library('audioconvert_ssse3',
|
||||||
['fmt-ops-ssse3.c',
|
['fmt-ops-ssse3.c',
|
||||||
'resample-native-ssse3.c' ],
|
'resample-native-ssse3.c' ],
|
||||||
c_args : [ssse3_args, '-O3', '-DHAVE_SSSE3'],
|
c_args : [ssse3_args, '-O3', '-DHAVE_SSSE3', simd_cargs],
|
||||||
dependencies : [ spa_dep ],
|
dependencies : [ spa_dep ],
|
||||||
install : false
|
install : false
|
||||||
)
|
)
|
||||||
|
|
@ -65,7 +65,7 @@ endif
|
||||||
if have_sse41
|
if have_sse41
|
||||||
audioconvert_sse41 = static_library('audioconvert_sse41',
|
audioconvert_sse41 = static_library('audioconvert_sse41',
|
||||||
['fmt-ops-sse41.c'],
|
['fmt-ops-sse41.c'],
|
||||||
c_args : [sse41_args, '-O3', '-DHAVE_SSE41'],
|
c_args : [sse41_args, '-O3', '-DHAVE_SSE41', simd_cargs],
|
||||||
dependencies : [ spa_dep ],
|
dependencies : [ spa_dep ],
|
||||||
install : false
|
install : false
|
||||||
)
|
)
|
||||||
|
|
@ -75,7 +75,7 @@ endif
|
||||||
if have_avx2 and have_fma
|
if have_avx2 and have_fma
|
||||||
audioconvert_avx2_fma = static_library('audioconvert_avx2_fma',
|
audioconvert_avx2_fma = static_library('audioconvert_avx2_fma',
|
||||||
['resample-native-avx2.c'],
|
['resample-native-avx2.c'],
|
||||||
c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA'],
|
c_args : [avx2_args, fma_args, '-O3', '-DHAVE_AVX2', '-DHAVE_FMA', simd_cargs],
|
||||||
dependencies : [ spa_dep ],
|
dependencies : [ spa_dep ],
|
||||||
install : false
|
install : false
|
||||||
)
|
)
|
||||||
|
|
@ -85,7 +85,7 @@ endif
|
||||||
if have_avx2
|
if have_avx2
|
||||||
audioconvert_avx2 = static_library('audioconvert_avx2',
|
audioconvert_avx2 = static_library('audioconvert_avx2',
|
||||||
['fmt-ops-avx2.c'],
|
['fmt-ops-avx2.c'],
|
||||||
c_args : [avx2_args, '-O3', '-DHAVE_AVX2'],
|
c_args : [avx2_args, '-O3', '-DHAVE_AVX2', simd_cargs],
|
||||||
dependencies : [ spa_dep ],
|
dependencies : [ spa_dep ],
|
||||||
install : false
|
install : false
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -60,7 +60,7 @@ int peaks_init(struct peaks *peaks)
|
||||||
if (info == NULL)
|
if (info == NULL)
|
||||||
return -ENOTSUP;
|
return -ENOTSUP;
|
||||||
|
|
||||||
peaks->cpu_flags = info->cpu_flags;
|
peaks->func_cpu_flags = info->cpu_flags;
|
||||||
peaks->func_name = info->name;
|
peaks->func_name = info->name;
|
||||||
peaks->free = impl_peaks_free;
|
peaks->free = impl_peaks_free;
|
||||||
peaks->min_max = info->min_max;
|
peaks->min_max = info->min_max;
|
||||||
|
|
|
||||||
|
|
@ -14,6 +14,7 @@ extern struct spa_log_topic resample_log_topic;
|
||||||
|
|
||||||
struct peaks {
|
struct peaks {
|
||||||
uint32_t cpu_flags;
|
uint32_t cpu_flags;
|
||||||
|
uint32_t func_cpu_flags;
|
||||||
const char *func_name;
|
const char *func_name;
|
||||||
|
|
||||||
struct spa_log *log;
|
struct spa_log *log;
|
||||||
|
|
|
||||||
|
|
@ -576,7 +576,7 @@ int resample_native_init(struct resample *r)
|
||||||
r, c->cutoff, r->quality, c->window, r->i_rate, r->o_rate, gcd, n_taps, n_phases,
|
r, c->cutoff, r->quality, c->window, r->i_rate, r->o_rate, gcd, n_taps, n_phases,
|
||||||
r->cpu_flags, d->info->cpu_flags);
|
r->cpu_flags, d->info->cpu_flags);
|
||||||
|
|
||||||
r->cpu_flags = d->info->cpu_flags;
|
r->func_cpu_flags = d->info->cpu_flags;
|
||||||
|
|
||||||
impl_native_reset(r);
|
impl_native_reset(r);
|
||||||
impl_native_update_rate(r, 1.0);
|
impl_native_update_rate(r, 1.0);
|
||||||
|
|
|
||||||
|
|
@ -38,6 +38,7 @@ struct resample {
|
||||||
#define RESAMPLE_OPTION_PREFILL (1<<0)
|
#define RESAMPLE_OPTION_PREFILL (1<<0)
|
||||||
uint32_t options;
|
uint32_t options;
|
||||||
uint32_t cpu_flags;
|
uint32_t cpu_flags;
|
||||||
|
uint32_t func_cpu_flags;
|
||||||
const char *func_name;
|
const char *func_name;
|
||||||
|
|
||||||
uint32_t channels;
|
uint32_t channels;
|
||||||
|
|
|
||||||
|
|
@ -56,7 +56,7 @@ int volume_init(struct volume *vol)
|
||||||
if (info == NULL)
|
if (info == NULL)
|
||||||
return -ENOTSUP;
|
return -ENOTSUP;
|
||||||
|
|
||||||
vol->cpu_flags = info->cpu_flags;
|
vol->func_cpu_flags = info->cpu_flags;
|
||||||
vol->func_name = info->name;
|
vol->func_name = info->name;
|
||||||
vol->free = impl_volume_free;
|
vol->free = impl_volume_free;
|
||||||
vol->process = info->process;
|
vol->process = info->process;
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,7 @@
|
||||||
|
|
||||||
struct volume {
|
struct volume {
|
||||||
uint32_t cpu_flags;
|
uint32_t cpu_flags;
|
||||||
|
uint32_t func_cpu_flags;
|
||||||
const char *func_name;
|
const char *func_name;
|
||||||
|
|
||||||
struct spa_log *log;
|
struct spa_log *log;
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue