filter-chain: move fft mem function to dsp_ops

This allows a different version per FFT. One possible optimization would be to split the complex numbers into separate real and imaginary arrays, which would speed up the multiply operations.
2025-11-04 13:30:12 -05:00 · 2024-11-05 16:12:28 +01:00 · 2024-11-05 16:12:28 +01:00 · c672327181
commit c672327181
parent 79baeb8d18
4 changed files with 98 additions and 72 deletions
--- a/src/modules/module-filter-chain/convolver.c
+++ b/src/modules/module-filter-chain/convolver.c
@ -37,37 +37,6 @@ struct convolver1 {
 	float scale;
 };

-static void *fft_alloc(int size)
-{
-	size_t nb_bytes = size * sizeof(float);
-#define ALIGNMENT 64
-	void *p, *p0 = malloc(nb_bytes + ALIGNMENT);
-	if (!p0)
-		return (void *)0;
-	p = (void *)(((size_t)p0 + ALIGNMENT) & (~((size_t)(ALIGNMENT - 1))));
-	*((void **)p - 1) = p0;
-	return p;
-}
-static void fft_free(void *p)
-{
-	if (p)
-		free(*((void **)p - 1));
-}
-
-static inline void fft_cpx_clear(float *v, int size)
-{
-	dsp_ops_clear(dsp, v, size * 2);
-}
-static float *fft_cpx_alloc(int size)
-{
-	return fft_alloc(size * 2);
-}
-
-static void fft_cpx_free(float *cpx)
-{
-	fft_free(cpx);
-}
-
 static int next_power_of_two(int val)
 {
 	int r = 1;
@ -80,11 +49,11 @@ static void convolver1_reset(struct convolver1 *conv)
 {
 	int i;
 	for (i = 0; i < conv->segCount; i++)
-		fft_cpx_clear(conv->segments[i], conv->fftComplexSize);
-	dsp_ops_clear(dsp, conv->overlap, conv->blockSize);
-	dsp_ops_clear(dsp, conv->inputBuffer, conv->segSize);
-	fft_cpx_clear(conv->pre_mult, conv->fftComplexSize);
-	fft_cpx_clear(conv->conv, conv->fftComplexSize);
+		dsp_ops_fft_memclear(dsp, conv->segments[i], conv->fftComplexSize, false);
+	dsp_ops_fft_memclear(dsp, conv->overlap, conv->blockSize, true);
+	dsp_ops_fft_memclear(dsp, conv->inputBuffer, conv->segSize, true);
+	dsp_ops_fft_memclear(dsp, conv->pre_mult, conv->fftComplexSize, false);
+	dsp_ops_fft_memclear(dsp, conv->conv, conv->fftComplexSize, false);
 	conv->inputBufferFill = 0;
 	conv->current = 0;
 }
@ -94,22 +63,22 @@ static void convolver1_free(struct convolver1 *conv)
 	int i;
 	for (i = 0; i < conv->segCount; i++) {
 		if (conv->segments)
-			fft_cpx_free(conv->segments[i]);
+			dsp_ops_fft_memfree(dsp, conv->segments[i]);
 		if (conv->segmentsIr)
-			fft_cpx_free(conv->segmentsIr[i]);
+			dsp_ops_fft_memfree(dsp, conv->segmentsIr[i]);
 	}
 	if (conv->fft)
 		dsp_ops_fft_free(dsp, conv->fft);
 	if (conv->ifft)
 		dsp_ops_fft_free(dsp, conv->ifft);
 	if (conv->fft_buffer)
-		fft_free(conv->fft_buffer);
+		dsp_ops_fft_memfree(dsp, conv->fft_buffer);
 	free(conv->segments);
 	free(conv->segmentsIr);
-	fft_cpx_free(conv->pre_mult);
-	fft_cpx_free(conv->conv);
-	fft_free(conv->overlap);
-	fft_free(conv->inputBuffer);
+	dsp_ops_fft_memfree(dsp, conv->pre_mult);
+	dsp_ops_fft_memfree(dsp, conv->conv);
+	dsp_ops_fft_memfree(dsp, conv->overlap);
+	dsp_ops_fft_memfree(dsp, conv->inputBuffer);
 	free(conv);
 }

@ -143,7 +112,7 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 	if (conv->ifft == NULL)
 		goto error;

-	conv->fft_buffer = fft_alloc(conv->segSize);
+	conv->fft_buffer = dsp_ops_fft_memalloc(dsp, conv->segSize, true);
 	if (conv->fft_buffer == NULL)
 		goto error;

@ -156,21 +125,21 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 		int left = irlen - (i * conv->blockSize);
 		int copy = SPA_MIN(conv->blockSize, left);

-		conv->segments[i] = fft_cpx_alloc(conv->fftComplexSize);
-		conv->segmentsIr[i] = fft_cpx_alloc(conv->fftComplexSize);
+		conv->segments[i] = dsp_ops_fft_memalloc(dsp, conv->fftComplexSize, false);
+		conv->segmentsIr[i] = dsp_ops_fft_memalloc(dsp, conv->fftComplexSize, false);
 		if (conv->segments[i] == NULL || conv->segmentsIr[i] == NULL)
 			goto error;

 		dsp_ops_copy(dsp, conv->fft_buffer, &ir[i * conv->blockSize], copy);
 		if (copy < conv->segSize)
-			dsp_ops_clear(dsp, conv->fft_buffer + copy, conv->segSize - copy);
+			dsp_ops_fft_memclear(dsp, conv->fft_buffer + copy, conv->segSize - copy, true);

 	        dsp_ops_fft_run(dsp, conv->fft, 1, conv->fft_buffer, conv->segmentsIr[i]);
 	}
-	conv->pre_mult = fft_cpx_alloc(conv->fftComplexSize);
-	conv->conv = fft_cpx_alloc(conv->fftComplexSize);
-	conv->overlap = fft_alloc(conv->blockSize);
-	conv->inputBuffer = fft_alloc(conv->segSize);
+	conv->pre_mult = dsp_ops_fft_memalloc(dsp, conv->fftComplexSize, false);
+	conv->conv = dsp_ops_fft_memalloc(dsp, conv->fftComplexSize, false);
+	conv->overlap = dsp_ops_fft_memalloc(dsp, conv->blockSize, true);
+	conv->inputBuffer = dsp_ops_fft_memalloc(dsp, conv->segSize, true);
 	if (conv->pre_mult == NULL || conv->conv == NULL || conv->overlap == NULL ||
 			conv->inputBuffer == NULL)
 			goto error;
@ -188,7 +157,7 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
 	int i, processed = 0;

 	if (conv == NULL || conv->segCount == 0) {
-		dsp_ops_clear(dsp, output, len);
+		dsp_ops_fft_memclear(dsp, output, len, true);
 		return len;
 	}

@ -198,7 +167,7 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou

 		dsp_ops_copy(dsp, conv->inputBuffer + inputBufferPos, input + processed, processing);
 		if (inputBufferPos == 0 && processing < conv->blockSize)
-			dsp_ops_clear(dsp, conv->inputBuffer + processing, conv->blockSize - processing);
+			dsp_ops_fft_memclear(dsp, conv->inputBuffer + processing, conv->blockSize - processing, true);

 		dsp_ops_fft_run(dsp, conv->fft, 1, conv->inputBuffer, conv->segments[conv->current]);

@ -277,13 +246,13 @@ void convolver_reset(struct convolver *conv)
 		convolver1_reset(conv->headConvolver);
 	if (conv->tailConvolver0) {
 		convolver1_reset(conv->tailConvolver0);
-		dsp_ops_clear(dsp, conv->tailOutput0, conv->tailBlockSize);
-		dsp_ops_clear(dsp, conv->tailPrecalculated0, conv->tailBlockSize);
+		dsp_ops_fft_memclear(dsp, conv->tailOutput0, conv->tailBlockSize, true);
+		dsp_ops_fft_memclear(dsp, conv->tailPrecalculated0, conv->tailBlockSize, true);
 	}
 	if (conv->tailConvolver) {
 		convolver1_reset(conv->tailConvolver);
-		dsp_ops_clear(dsp, conv->tailOutput, conv->tailBlockSize);
-		dsp_ops_clear(dsp, conv->tailPrecalculated, conv->tailBlockSize);
+		dsp_ops_fft_memclear(dsp, conv->tailOutput, conv->tailBlockSize, true);
+		dsp_ops_fft_memclear(dsp, conv->tailPrecalculated, conv->tailBlockSize, true);
 	}
 	conv->tailInputFill = 0;
 	conv->precalculatedPos = 0;
@ -324,8 +293,8 @@ struct convolver *convolver_new(struct dsp_ops *dsp_ops, int head_block, int tai
 	if (irlen > conv->tailBlockSize) {
 		int conv1IrLen = SPA_MIN(irlen - conv->tailBlockSize, conv->tailBlockSize);
 		conv->tailConvolver0 = convolver1_new(conv->headBlockSize, ir + conv->tailBlockSize, conv1IrLen);
-		conv->tailOutput0 = fft_alloc(conv->tailBlockSize);
-		conv->tailPrecalculated0 = fft_alloc(conv->tailBlockSize);
+		conv->tailOutput0 = dsp_ops_fft_memalloc(dsp, conv->tailBlockSize, true);
+		conv->tailPrecalculated0 = dsp_ops_fft_memalloc(dsp, conv->tailBlockSize, true);
 		if (conv->tailConvolver0 == NULL || conv->tailOutput0 == NULL ||
 				conv->tailPrecalculated0 == NULL)
 			goto error;
@ -334,15 +303,15 @@ struct convolver *convolver_new(struct dsp_ops *dsp_ops, int head_block, int tai
 	if (irlen > 2 * conv->tailBlockSize) {
 		int tailIrLen = irlen - (2 * conv->tailBlockSize);
 		conv->tailConvolver = convolver1_new(conv->tailBlockSize, ir + (2 * conv->tailBlockSize), tailIrLen);
-		conv->tailOutput = fft_alloc(conv->tailBlockSize);
-		conv->tailPrecalculated = fft_alloc(conv->tailBlockSize);
+		conv->tailOutput = dsp_ops_fft_memalloc(dsp, conv->tailBlockSize, true);
+		conv->tailPrecalculated = dsp_ops_fft_memalloc(dsp, conv->tailBlockSize, true);
 		if (conv->tailConvolver == NULL || conv->tailOutput == NULL ||
 				conv->tailPrecalculated == NULL)
 			goto error;
 	}

 	if (conv->tailConvolver0 || conv->tailConvolver) {
-		conv->tailInput = fft_alloc(conv->tailBlockSize);
+		conv->tailInput = dsp_ops_fft_memalloc(dsp, conv->tailBlockSize, true);
 		if (conv->tailInput == NULL)
 			goto error;
 	}
@ -363,11 +332,11 @@ void convolver_free(struct convolver *conv)
 		convolver1_free(conv->tailConvolver0);
 	if (conv->tailConvolver)
 		convolver1_free(conv->tailConvolver);
-	fft_free(conv->tailOutput0);
-	fft_free(conv->tailPrecalculated0);
-	fft_free(conv->tailOutput);
-	fft_free(conv->tailPrecalculated);
-	fft_free(conv->tailInput);
+	dsp_ops_fft_memfree(dsp, conv->tailOutput0);
+	dsp_ops_fft_memfree(dsp, conv->tailPrecalculated0);
+	dsp_ops_fft_memfree(dsp, conv->tailOutput);
+	dsp_ops_fft_memfree(dsp, conv->tailPrecalculated);
+	dsp_ops_fft_memfree(dsp, conv->tailInput);
 	free(conv);
 }

--- a/src/modules/module-filter-chain/dsp-ops-c.c
+++ b/src/modules/module-filter-chain/dsp-ops-c.c
@ -222,7 +222,7 @@ struct fft_info {
 };
 #endif

-void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real)
+void *dsp_fft_new_c(struct dsp_ops *ops, uint32_t size, bool real)
 {
 #ifdef HAVE_FFTW
 	struct fft_info *info = calloc(1, sizeof(struct fft_info));
@ -232,8 +232,8 @@ void *dsp_fft_new_c(struct dsp_ops *ops, int32_t size, bool real)
 	if (info == NULL)
 		return NULL;

-	rdata = fftwf_alloc_real (size * 2);
-	cdata = fftwf_alloc_complex (size + 1);
+	rdata = fftwf_alloc_real(size * 2);
+	cdata = fftwf_alloc_complex(size + 1);

 	info->plan_r2c = fftwf_plan_dft_r2c_1d(size, rdata, cdata, FFTW_ESTIMATE);
 	info->plan_c2r = fftwf_plan_dft_c2r_1d(size, cdata, rdata, FFTW_ESTIMATE);
@ -258,6 +258,39 @@ void dsp_fft_free_c(struct dsp_ops *ops, void *fft)
 	pffft_destroy_setup(fft);
 #endif
 }
+void *dsp_fft_memalloc_c(struct dsp_ops *ops, uint32_t size, bool real)
+{
+#ifdef HAVE_FFTW	/* FFTW backend: allocators take an element count */
+	if (real)
+		return fftwf_alloc_real(size);
+	else {
+		return fftwf_alloc_complex(size);
+	}
+#else			/* pffft backend: allocator takes bytes; a complex value is two floats */
+	if (real)
+		return pffft_aligned_malloc(size * sizeof(float));	/* must be returned: callers test the result against NULL */
+	else
+		return pffft_aligned_malloc(size * 2 * sizeof(float));
+#endif
+}
+void dsp_fft_memfree_c(struct dsp_ops *ops, void *data)
+{
+#ifdef HAVE_FFTW	/* release with the deallocator of the backend chosen at build time; `ops` is unused */
+	fftwf_free(data);
+#else			/* pffft backend */
+	pffft_aligned_free(data);
+#endif
+}
+
+void dsp_fft_memclear_c(struct dsp_ops *ops, void *data, uint32_t size, bool real)
+{
+#ifdef HAVE_FFTW	/* clears `size` values when real, `size * 2` when complex, via dsp_ops_clear */
+	dsp_ops_clear(ops, data, real ? size : size * 2);
+#else			/* NOTE(review): identical to the FFTW branch for now; presumably kept separate for future per-backend layouts */
+	dsp_ops_clear(ops, data, real ? size : size * 2);
+#endif
+}
+
 void dsp_fft_run_c(struct dsp_ops *ops, void *fft, int direction,
 	const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
 {
--- a/src/modules/module-filter-chain/dsp-ops.c
+++ b/src/modules/module-filter-chain/dsp-ops.c
@ -32,6 +32,9 @@ static struct dsp_info dsp_table[] =
 		.funcs.mult = dsp_mult_c,
 		.funcs.fft_new = dsp_fft_new_c,
 		.funcs.fft_free = dsp_fft_free_c,
+		.funcs.fft_memalloc = dsp_fft_memalloc_c,
+		.funcs.fft_memfree = dsp_fft_memfree_c,
+		.funcs.fft_memclear = dsp_fft_memclear_c,
 		.funcs.fft_run = dsp_fft_run_c,
 		.funcs.fft_cmul = dsp_fft_cmul_avx,
 		.funcs.fft_cmuladd = dsp_fft_cmuladd_avx,
@ -50,6 +53,9 @@ static struct dsp_info dsp_table[] =
 		.funcs.mult = dsp_mult_c,
 		.funcs.fft_new = dsp_fft_new_c,
 		.funcs.fft_free = dsp_fft_free_c,
+		.funcs.fft_memalloc = dsp_fft_memalloc_c,
+		.funcs.fft_memfree = dsp_fft_memfree_c,
+		.funcs.fft_memclear = dsp_fft_memclear_c,
 		.funcs.fft_run = dsp_fft_run_c,
 		.funcs.fft_cmul = dsp_fft_cmul_sse,
 		.funcs.fft_cmuladd = dsp_fft_cmuladd_sse,
@ -67,6 +73,9 @@ static struct dsp_info dsp_table[] =
 		.funcs.mult = dsp_mult_c,
 		.funcs.fft_new = dsp_fft_new_c,
 		.funcs.fft_free = dsp_fft_free_c,
+		.funcs.fft_memalloc = dsp_fft_memalloc_c,
+		.funcs.fft_memfree = dsp_fft_memfree_c,
+		.funcs.fft_memclear = dsp_fft_memclear_c,
 		.funcs.fft_run = dsp_fft_run_c,
 		.funcs.fft_cmul = dsp_fft_cmul_c,
 		.funcs.fft_cmuladd = dsp_fft_cmuladd_c,
--- a/src/modules/module-filter-chain/dsp-ops.h
+++ b/src/modules/module-filter-chain/dsp-ops.h
@ -26,8 +26,11 @@ struct dsp_ops_funcs {
 			float * dst, const float * SPA_RESTRICT a,
 			const float * SPA_RESTRICT b, uint32_t n_samples);

-	void *(*fft_new) (struct dsp_ops *ops, int32_t size, bool real);
+	void *(*fft_new) (struct dsp_ops *ops, uint32_t size, bool real);
 	void (*fft_free) (struct dsp_ops *ops, void *fft);
+	void *(*fft_memalloc) (struct dsp_ops *ops, uint32_t size, bool real);
+	void (*fft_memfree) (struct dsp_ops *ops, void *mem);
+	void (*fft_memclear) (struct dsp_ops *ops, void *mem, uint32_t size, bool real);
 	void (*fft_run) (struct dsp_ops *ops, void *fft, int direction,
 			const float * SPA_RESTRICT src, float * SPA_RESTRICT dst);
 	void (*fft_cmul) (struct dsp_ops *ops, void *fft,
@ -77,6 +80,9 @@ int dsp_ops_benchmark(void);

 #define dsp_ops_fft_new(ops,...)	(ops)->funcs.fft_new(ops, __VA_ARGS__)
 #define dsp_ops_fft_free(ops,...)	(ops)->funcs.fft_free(ops, __VA_ARGS__)
+#define dsp_ops_fft_memalloc(ops,...)	(ops)->funcs.fft_memalloc(ops, __VA_ARGS__)
+#define dsp_ops_fft_memfree(ops,...)	(ops)->funcs.fft_memfree(ops, __VA_ARGS__)
+#define dsp_ops_fft_memclear(ops,...)	(ops)->funcs.fft_memclear(ops, __VA_ARGS__)
 #define dsp_ops_fft_run(ops,...)	(ops)->funcs.fft_run(ops, __VA_ARGS__)
 #define dsp_ops_fft_cmul(ops,...)	(ops)->funcs.fft_cmul(ops, __VA_ARGS__)
 #define dsp_ops_fft_cmuladd(ops,...)	(ops)->funcs.fft_cmuladd(ops, __VA_ARGS__)
@ -109,9 +115,15 @@ void dsp_delay_##arch (struct dsp_ops *ops, float *buffer, uint32_t *pos, uint32
 		uint32_t delay, float *dst, const float *src, uint32_t n_samples)

 #define MAKE_FFT_NEW_FUNC(arch) \
-void *dsp_fft_new_##arch(struct dsp_ops *ops, int32_t size, bool real)
+void *dsp_fft_new_##arch(struct dsp_ops *ops, uint32_t size, bool real)
 #define MAKE_FFT_FREE_FUNC(arch) \
 void dsp_fft_free_##arch(struct dsp_ops *ops, void *fft)
+#define MAKE_FFT_MEMALLOC_FUNC(arch) \
+void *dsp_fft_memalloc_##arch(struct dsp_ops *ops, uint32_t size, bool real)
+#define MAKE_FFT_MEMFREE_FUNC(arch) \
+void dsp_fft_memfree_##arch(struct dsp_ops *ops, void *mem)
+#define MAKE_FFT_MEMCLEAR_FUNC(arch) \
+void dsp_fft_memclear_##arch(struct dsp_ops *ops, void *mem, uint32_t size, bool real)
 #define MAKE_FFT_RUN_FUNC(arch) \
 void dsp_fft_run_##arch(struct dsp_ops *ops, void *fft, int direction, \
 	const float * SPA_RESTRICT src, float * SPA_RESTRICT dst)
@ -138,6 +150,9 @@ MAKE_DELAY_FUNC(c);

 MAKE_FFT_NEW_FUNC(c);
 MAKE_FFT_FREE_FUNC(c);
+MAKE_FFT_MEMALLOC_FUNC(c);
+MAKE_FFT_MEMFREE_FUNC(c);
+MAKE_FFT_MEMCLEAR_FUNC(c);
 MAKE_FFT_RUN_FUNC(c);
 MAKE_FFT_CMUL_FUNC(c);
 MAKE_FFT_CMULADD_FUNC(c);