fmt-ops: add RVV optimizations for f32d_s16d

This commit is contained in:
sunyuechi 2024-09-23 10:22:44 +08:00 committed by Wim Taymans
parent 9348ad8115
commit 852de6c35c
5 changed files with 50 additions and 25 deletions

View file

@ -136,6 +136,7 @@ static void test_f32_s16(void)
#if defined (HAVE_RVV) #if defined (HAVE_RVV)
if (cpu_flags & SPA_CPU_FLAG_RISCV_V) { if (cpu_flags & SPA_CPU_FLAG_RISCV_V) {
run_test("test_f32_s16", "rvv", true, true, conv_f32_to_s16_rvv); run_test("test_f32_s16", "rvv", true, true, conv_f32_to_s16_rvv);
run_test("test_f32d_s16d", "rvv", false, false, conv_f32d_to_s16d_rvv);
} }
#endif #endif
run_test("test_f32_s16d", "c", true, false, conv_f32_to_s16d_c); run_test("test_f32_s16d", "c", true, false, conv_f32_to_s16d_c);

View file

@ -5,6 +5,35 @@
#include "fmt-ops.h" #include "fmt-ops.h"
#if HAVE_RVV #if HAVE_RVV
/*
 * Convert n_samples contiguous 32-bit float samples at src into 16-bit
 * integer samples at dst using RISC-V Vector instructions.
 *
 * conv      - not used by this helper.
 * dst       - used directly as the base address of the 16-bit output
 *             (the asm stores through the pointer value itself, not
 *             through dst[0]); callers pass a plain sample buffer.
 * src       - base address of the 32-bit float input.
 * n_samples - number of samples to convert.
 *
 * Each sample is multiplied by 32768.0f and then narrowed to 16 bits with
 * vfncvt.x.f.w.
 */
void
f32_to_s16(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src,
		uint32_t n_samples)
{
	__asm__ __volatile__ (
		".option       arch, +v                    \n\t"
		/* 1191182336 == 0x47000000, the bit pattern of 32768.0f */
		"li            t0, 1191182336              \n\t"
		"fmv.w.x       fa5, t0                     \n\t"
		"1:                                        \n\t"
		/* t0 = number of elements handled in this iteration */
		"vsetvli       t0, %[n_samples], e32, m8, ta, ma \n\t"
		"vle32.v       v8, (%[src])                \n\t"
		"sub           %[n_samples], %[n_samples], t0 \n\t"
		"vfmul.vf      v8, v8, fa5                 \n\t"
		/* same element count, now viewed as 16-bit elements */
		"vsetvli       zero, zero, e16, m4, ta, ma \n\t"
		"vfncvt.x.f.w  v8, v8                      \n\t"
		/* t0 = 2 * count: bytes written; src advances by twice that */
		"slli          t0, t0, 1                   \n\t"
		"vse16.v       v8, (%[dst])                \n\t"
		"add           %[src], %[src], t0          \n\t"
		"add           %[dst], %[dst], t0          \n\t"
		"add           %[src], %[src], t0          \n\t"
		"bnez          %[n_samples], 1b            \n\t"
		: [n_samples] "+r" (n_samples),
		  [src] "+r" (src),
		  [dst] "+r" (dst)
		:
		/*
		 * t0 and fa5 are used as scratch and v8 with LMUL=8 covers
		 * v8..v15; they must be listed so the compiler neither
		 * allocates an operand to them nor keeps live values there.
		 */
		: "cc", "memory", "t0", "fa5",
		  "v8", "v9", "v10", "v11", "v12", "v13", "v14", "v15"
	);
}
void void
conv_f32_to_s16_rvv(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[], conv_f32_to_s16_rvv(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
uint32_t n_samples) uint32_t n_samples)
@ -14,30 +43,21 @@ conv_f32_to_s16_rvv(struct convert *conv, void * SPA_RESTRICT dst[], const void
return; return;
} }
__asm__ __volatile__ ( f32_to_s16(conv, *dst, *src, n_samples * conv -> n_channels);
".option arch, +v \n\t" }
"ld a1, (a1) \n\t"
"ld a2, (a2) \n\t" void
"lwu a0, 16(a0) \n\t" conv_f32d_to_s16d_rvv(struct convert *conv, void * SPA_RESTRICT dst[], const void * SPA_RESTRICT src[],
"li t0, 1191182336 \n\t" uint32_t n_samples)
"mul a0, a0, a3 \n\t" {
"fmv.w.x fa5, t0 \n\t" if (n_samples <= 4) {
"1: \n\t" conv_f32d_to_s16d_c(conv, dst, src, n_samples);
"vsetvli t0, a0, e32, m8, ta, ma \n\t" return;
"vle32.v v8, (a2) \n\t" }
"sub a0, a0, t0 \n\t"
"vfmul.vf v8, v8, fa5 \n\t" uint32_t i = 0, n_channels = conv->n_channels;
"vsetvli zero, zero, e16, m4, ta, ma \n\t" for(i = 0; i < n_channels; i++) {
"vfncvt.x.f.w v8, v8 \n\t" f32_to_s16(conv, dst[i], src[i], n_samples);
"slli t0, t0, 1 \n\t" }
"vse16.v v8, (a1) \n\t"
"add a2, a2, t0 \n\t"
"add a1, a1, t0 \n\t"
"add a2, a2, t0 \n\t"
"bnez a0, 1b \n\t"
:
:
: "cc", "memory"
);
} }
#endif #endif

View file

@ -179,6 +179,7 @@ static struct conv_info conv_table[] =
#endif #endif
#if defined (HAVE_RVV) #if defined (HAVE_RVV)
MAKE(F32, S16, 0, conv_f32_to_s16_rvv, SPA_CPU_FLAG_RISCV_V), MAKE(F32, S16, 0, conv_f32_to_s16_rvv, SPA_CPU_FLAG_RISCV_V),
MAKE(F32P, S16P, 0, conv_f32d_to_s16d_rvv, SPA_CPU_FLAG_RISCV_V),
#endif #endif
MAKE(F32, S16, 0, conv_f32_to_s16_c), MAKE(F32, S16, 0, conv_f32_to_s16_c),

View file

@ -442,6 +442,7 @@ DEFINE_FUNCTION(f32d_to_s16, neon);
#endif #endif
#if defined(HAVE_RVV) #if defined(HAVE_RVV)
DEFINE_FUNCTION(f32_to_s16, rvv); DEFINE_FUNCTION(f32_to_s16, rvv);
DEFINE_FUNCTION(f32d_to_s16d, rvv);
#endif #endif
#if defined(HAVE_SSE2) #if defined(HAVE_SSE2)
DEFINE_FUNCTION(s16_to_f32d_2, sse2); DEFINE_FUNCTION(s16_to_f32d_2, sse2);

View file

@ -232,6 +232,8 @@ static void test_f32_s16(void)
if (cpu_flags & SPA_CPU_FLAG_RISCV_V) { if (cpu_flags & SPA_CPU_FLAG_RISCV_V) {
run_test("test_f32_s16_rvv", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out), run_test("test_f32_s16_rvv", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
true, true, conv_f32_to_s16_rvv); true, true, conv_f32_to_s16_rvv);
run_test("test_f32d_s16d_rvv", in, sizeof(in[0]), out, sizeof(out[0]), SPA_N_ELEMENTS(out),
false, false, conv_f32d_to_s16d_rvv);
} }
#endif #endif
} }