audioconvert: refactor peaks resampler

Move the common code into a macro and generate the arch-specific versions from it.
Compile with -Ofast so the compiler can optimize some of the fmaxf calls.
This commit is contained in:
Wim Taymans 2022-09-07 16:00:31 +02:00
parent a79b5c86ea
commit 6e9e02b420
9 changed files with 89 additions and 110 deletions

View file

@ -27,7 +27,7 @@ if have_sse
'resample-peaks-sse.c', 'resample-peaks-sse.c',
'volume-ops-sse.c', 'volume-ops-sse.c',
'channelmix-ops-sse.c' ], 'channelmix-ops-sse.c' ],
c_args : [sse_args, '-O3', '-DHAVE_SSE'], c_args : [sse_args, '-Ofast', '-DHAVE_SSE'],
dependencies : [ spa_dep ], dependencies : [ spa_dep ],
install : false install : false
) )

View file

@ -27,7 +27,7 @@
#include <assert.h> #include <assert.h>
#include <immintrin.h> #include <immintrin.h>
static void inner_product_avx(float *d, const float * SPA_RESTRICT s, static inline void inner_product_avx(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT taps, uint32_t n_taps) const float * SPA_RESTRICT taps, uint32_t n_taps)
{ {
__m256 sy[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }, ty; __m256 sy[2] = { _mm256_setzero_ps(), _mm256_setzero_ps() }, ty;
@ -56,7 +56,7 @@ static void inner_product_avx(float *d, const float * SPA_RESTRICT s,
_mm_store_ss(d, sx[0]); _mm_store_ss(d, sx[0]);
} }
static void inner_product_ip_avx(float *d, const float * SPA_RESTRICT s, static inline void inner_product_ip_avx(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x,
uint32_t n_taps) uint32_t n_taps)
{ {

View file

@ -24,7 +24,7 @@
#include "resample-native-impl.h" #include "resample-native-impl.h"
static void inner_product_c(float *d, const float * SPA_RESTRICT s, static inline void inner_product_c(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT taps, uint32_t n_taps) const float * SPA_RESTRICT taps, uint32_t n_taps)
{ {
float sum = 0.0f; float sum = 0.0f;
@ -40,7 +40,7 @@ static void inner_product_c(float *d, const float * SPA_RESTRICT s,
*d = sum; *d = sum;
} }
static void inner_product_ip_c(float *d, const float * SPA_RESTRICT s, static inline void inner_product_ip_c(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x,
uint32_t n_taps) uint32_t n_taps)
{ {

View file

@ -26,7 +26,7 @@
#include <arm_neon.h> #include <arm_neon.h>
static void inner_product_neon(float *d, const float * SPA_RESTRICT s, static inline void inner_product_neon(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT taps, uint32_t n_taps) const float * SPA_RESTRICT taps, uint32_t n_taps)
{ {
unsigned int remainder = n_taps % 16; unsigned int remainder = n_taps % 16;
@ -137,7 +137,7 @@ static void inner_product_neon(float *d, const float * SPA_RESTRICT s,
#endif #endif
} }
static void inner_product_ip_neon(float *d, const float * SPA_RESTRICT s, static inline void inner_product_ip_neon(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x,
uint32_t n_taps) uint32_t n_taps)
{ {

View file

@ -26,7 +26,7 @@
#include <xmmintrin.h> #include <xmmintrin.h>
static void inner_product_sse(float *d, const float * SPA_RESTRICT s, static inline void inner_product_sse(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT taps, uint32_t n_taps) const float * SPA_RESTRICT taps, uint32_t n_taps)
{ {
__m128 sum = _mm_setzero_ps(); __m128 sum = _mm_setzero_ps();
@ -68,7 +68,7 @@ static void inner_product_sse(float *d, const float * SPA_RESTRICT s,
_mm_store_ss(d, sum); _mm_store_ss(d, sum);
} }
static void inner_product_ip_sse(float *d, const float * SPA_RESTRICT s, static inline void inner_product_ip_sse(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x,
uint32_t n_taps) uint32_t n_taps)
{ {

View file

@ -26,7 +26,7 @@
#include <tmmintrin.h> #include <tmmintrin.h>
static void inner_product_ssse3(float *d, const float * SPA_RESTRICT s, static inline void inner_product_ssse3(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT taps, uint32_t n_taps) const float * SPA_RESTRICT taps, uint32_t n_taps)
{ {
__m128 sum = _mm_setzero_ps(); __m128 sum = _mm_setzero_ps();
@ -97,7 +97,7 @@ static void inner_product_ssse3(float *d, const float * SPA_RESTRICT s,
_mm_store_ss(d, sum); _mm_store_ss(d, sum);
} }
static void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s, static inline void inner_product_ip_ssse3(float *d, const float * SPA_RESTRICT s,
const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x, const float * SPA_RESTRICT t0, const float * SPA_RESTRICT t1, float x,
uint32_t n_taps) uint32_t n_taps)
{ {

View file

@ -26,48 +26,12 @@
#include "resample-peaks-impl.h" #include "resample-peaks-impl.h"
void resample_peaks_process_c(struct resample *r, static inline float find_abs_max_c(const float *s, uint32_t n_samples, float m)
const void * SPA_RESTRICT src[], uint32_t *in_len,
void * SPA_RESTRICT dst[], uint32_t *out_len)
{ {
struct peaks_data *pd = r->data; uint32_t n;
uint32_t c, i, o, end, chunk, o_count, i_count; for (n = 0; n < n_samples; n++)
m = fmaxf(fabsf(s[n]), m);
if (SPA_UNLIKELY(r->channels == 0)) return m;
return;
for (c = 0; c < r->channels; c++) {
const float *s = src[c];
float *d = dst[c], m = pd->max_f[c];
o_count = pd->o_count;
i_count = pd->i_count;
o = i = 0;
while (i < *in_len && o < *out_len) {
end = ((uint64_t) (o_count + 1) * r->i_rate) / r->o_rate;
end = end > i_count ? end - i_count : 0;
chunk = SPA_MIN(end, *in_len);
for (; i < chunk; i++)
m = SPA_MAX(fabsf(s[i]), m);
if (i == end) {
d[o++] = m;
m = 0.0f;
o_count++;
}
}
pd->max_f[c] = m;
}
*out_len = o;
*in_len = i;
pd->o_count = o_count;
pd->i_count = i_count + i;
while (pd->i_count >= r->i_rate) {
pd->i_count -= r->i_rate;
pd->o_count -= r->o_rate;
}
} }
MAKE_PEAKS(c);

View file

@ -34,11 +34,59 @@ struct peaks_data {
float max_f[]; float max_f[];
}; };
void resample_peaks_process_c(struct resample *r, #define DEFINE_PEAKS(arch) \
const void * SPA_RESTRICT src[], uint32_t *in_len, void resample_peaks_process_##arch(struct resample *r, \
void * SPA_RESTRICT dst[], uint32_t *out_len); const void * SPA_RESTRICT src[], uint32_t *in_len, \
void * SPA_RESTRICT dst[], uint32_t *out_len)
/*
 * MAKE_PEAKS(arch) - emit the body of resample_peaks_process_<arch>().
 *
 * The function header comes from DEFINE_PEAKS(arch); a helper with the
 * shape
 *
 *	float find_abs_max_<arch>(const float *s, uint32_t n_samples, float m);
 *
 * must be visible where the macro is expanded. It is called with a start
 * pointer, a sample count and the running maximum, and its return value
 * becomes the new running maximum.
 *
 * For every channel the generated function:
 *   - computes `end`, the input index (relative to the start of this call's
 *     buffer) at which the next output sample is due, from the o_count and
 *     i_count counters and the i_rate/o_rate ratio;
 *   - folds the input samples up to `chunk = SPA_MIN(end, *in_len)` into the
 *     running maximum `m`;
 *   - when `end` is reached, stores `m` in the output and resets it to 0;
 *   - saves the unfinished maximum in pd->max_f[c] for the next call.
 *
 * On return *in_len holds the number of input samples consumed and *out_len
 * the number of output samples written. The counters in `pd` are advanced
 * and then reduced by i_rate/o_rate while i_count >= i_rate.
 */
#define MAKE_PEAKS(arch) \
DEFINE_PEAKS(arch) \
{ \
	struct peaks_data *pd = r->data; \
	uint32_t c, i, o, end, chunk, i_count, o_count; \
 \
	if (SPA_UNLIKELY(r->channels == 0)) \
		return; \
 \
	for (c = 0; c < r->channels; c++) { \
		const float *s = src[c]; \
		float *d = dst[c], m = pd->max_f[c]; \
 \
		o_count = pd->o_count; \
		i_count = pd->i_count; \
		o = i = 0; \
 \
		while (i < *in_len && o < *out_len) { \
			end = ((uint64_t) (o_count + 1) \
				* r->i_rate) / r->o_rate; \
			end = end > i_count ? end - i_count : 0; \
			chunk = SPA_MIN(end, *in_len); \
 \
			m = find_abs_max_##arch(&s[i], chunk - i, m); \
 \
			/* chunk is an absolute index into this buffer, not a \
			 * length: adding it to a non-zero i would overshoot \
			 * and make the next `chunk - i` wrap around. */ \
			i = chunk; \
 \
			if (i == end) { \
				d[o++] = m; \
				m = 0.0f; \
				o_count++; \
			} \
		} \
		pd->max_f[c] = m; \
	} \
	*out_len = o; \
	*in_len = i; \
	pd->o_count = o_count; \
	pd->i_count = i_count + i; \
 \
	while (pd->i_count >= r->i_rate) { \
		pd->i_count -= r->i_rate; \
		pd->o_count -= r->o_rate; \
	} \
}
DEFINE_PEAKS(c);
#if defined (HAVE_SSE) #if defined (HAVE_SSE)
void resample_peaks_process_sse(struct resample *r, DEFINE_PEAKS(sse);
const void * SPA_RESTRICT src[], uint32_t *in_len,
void * SPA_RESTRICT dst[], uint32_t *out_len);
#endif #endif

View file

@ -37,60 +37,27 @@ static inline float hmax_ps(__m128 val)
return _mm_cvtss_f32(val); return _mm_cvtss_f32(val);
} }
void resample_peaks_process_sse(struct resample *r, static inline float find_abs_max_sse(const float *s, uint32_t n_samples, float m)
const void * SPA_RESTRICT src[], uint32_t *in_len,
void * SPA_RESTRICT dst[], uint32_t *out_len)
{ {
struct peaks_data *pd = r->data; __m128 in, max;
uint32_t c, i, o, end, chunk, unrolled, i_count, o_count; uint32_t n, unrolled;
__m128 in, max, mask = _mm_andnot_ps(_mm_set_ps1(-0.0f), const __m128 mask = _mm_andnot_ps(
_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps())); _mm_set_ps1(-0.0f),
_mm_cmpeq_ps(_mm_setzero_ps(), _mm_setzero_ps()));
if (r->channels == 0) max = _mm_set1_ps(m);
return;
for (c = 0; c < r->channels; c++) { unrolled = n_samples & ~3;
const float *s = src[c];
float *d = dst[c], m = pd->max_f[c];
o_count = pd->o_count; for (n = 0; n < unrolled; n+=4) {
i_count = pd->i_count; in = _mm_loadu_ps(&s[n]);
o = i = 0; in = _mm_and_ps(mask, in);
max = _mm_max_ps(in, max);
max = _mm_set1_ps(m);
while (i < *in_len && o < *out_len) {
end = ((uint64_t) (o_count + 1) * r->i_rate) / r->o_rate;
end = end > i_count ? end - i_count : 0;
chunk = SPA_MIN(end, *in_len);
unrolled = chunk - ((chunk - i) & 3);
for (; i < unrolled; i+=4) {
in = _mm_loadu_ps(&s[i]);
in = _mm_and_ps(mask, in);
max = _mm_max_ps(in, max);
}
for (; i < chunk; i++)
m = SPA_MAX(fabsf(s[i]), m);
if (i == end) {
d[o++] = SPA_MAX(hmax_ps(max), m);
m = 0.0f;
max = _mm_set1_ps(m);
o_count++;
}
}
pd->max_f[c] = SPA_MAX(hmax_ps(max), m);
} }
for (; n < n_samples; n++)
m = fmaxf(fabsf(s[n]), m);
*out_len = o; return fmaxf(hmax_ps(max), m);
*in_len = i;
pd->o_count = o_count;
pd->i_count = i_count + i;
while (pd->i_count >= r->i_rate) {
pd->i_count -= r->i_rate;
pd->o_count -= r->o_rate;
}
} }
MAKE_PEAKS(sse);