filter-chain: use pffft for the convolver

pffft is faster than the KISS FFT implementation it replaces.
2025-11-03 09:01:54 -05:00 · 2021-08-23 21:00:45 +02:00 · 2021-08-23 21:00:45 +02:00 · 0f5face73f
commit 0f5face73f
parent 9dbfa63193
9 changed files with 2550 additions and 990 deletions
--- a/src/modules/meson.build
+++ b/src/modules/meson.build
@@ -50,8 +50,7 @@ pipewire_module_filter_chain = shared_library('pipewire-module-filter-chain',
    'module-filter-chain/biquad.c',
    'module-filter-chain/ladspa_plugin.c',
    'module-filter-chain/builtin_plugin.c',
-    'module-filter-chain/kiss_fft_f32.c',
-    'module-filter-chain/kiss_fftr_f32.c',
+    'module-filter-chain/pffft.c',
    'module-filter-chain/convolver.c' ],
  include_directories : [configinc, spa_inc],
  install : true,
--- a/src/modules/module-filter-chain/_kiss_fft_guts_f32.h
+++ b/src/modules/module-filter-chain/_kiss_fft_guts_f32.h
@@ -1,173 +0,0 @@
-/*
- *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
- *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
- *
- *  SPDX-License-Identifier: BSD-3-Clause
- *  See COPYING file for more information.
- */
-
-/* kiss_fft_f32.h
-   defines kiss_fft_f32_scalar as either short or a float type
-   and defines
-   typedef struct { kiss_fft_f32_scalar r; kiss_fft_f32_scalar i; }kiss_fft_f32_cpx; */
-#include "kiss_fft_f32.h"
-#include <limits.h>
-
-/* The 2*sizeof(size_t) alignment here is borrowed from
- * GNU libc, so it should be good most everywhere.
- * It is more conservative than is needed on some 64-bit
- * platforms, but ia64 does require a 16-byte alignment.
- * The SIMD extensions for x86 and ppc32 would want a
- * larger alignment than this, but we don't need to
- * do better than malloc.
- *
- * Borrowed from GLib's gobject/gtype.c
- */
-#define STRUCT_ALIGNMENT (2 * sizeof (size_t))
-#define ALIGN_STRUCT(offset) \
-      ((offset + (STRUCT_ALIGNMENT - 1)) & -STRUCT_ALIGNMENT)
-
-#define MAXFACTORS 32
-/* e.g. an fft of length 128 has 4 factors 
- as far as kissfft is concerned
- 4*4*4*2
- */
-
-struct kiss_fft_f32_state{
-    int nfft;
-    int inverse;
-    int factors[2*MAXFACTORS];
-    kiss_fft_f32_cpx twiddles[1];
-};
-
-/*
-  Explanation of macros dealing with complex math:
-
-   C_MUL(m,a,b)         : m = a*b
-   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
-   C_SUB( res, a,b)     : res = a - b
-   C_SUBFROM( res , a)  : res -= a
-   C_ADDTO( res , a)    : res += a
- * */
-#ifdef FIXED_POINT
-#include <stdint.h>
-#if (FIXED_POINT==32)
-# define FRACBITS 31
-# define SAMPPROD int64_t
-#define SAMP_MAX INT32_MAX
-#define SAMP_MIN INT32_MIN
-#else
-# define FRACBITS 15
-# define SAMPPROD int32_t
-#define SAMP_MAX INT16_MAX
-#define SAMP_MIN INT16_MIN
-#endif
-
-#if defined(CHECK_OVERFLOW)
-#  define CHECK_OVERFLOW_OP(a,op,b)  \
-	if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \
-		g_critical("overflow @ " __FILE__ "(%d): (%d " #op" %d) = %ld",__LINE__,(a),(b),(SAMPPROD)(a) op (SAMPPROD)(b) );  }
-#endif
-
-
-#   define smul(a,b) ( (SAMPPROD)(a)*(b) )
-#   define sround( x )  (kiss_fft_f32_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS )
-
-#   define S_MUL(a,b) sround( smul(a,b) )
-
-#   define C_MUL(m,a,b) \
-      do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
-          (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)
-
-#   define DIVSCALAR(x,k) \
-	(x) = sround( smul(  x, SAMP_MAX/k ) )
-
-#   define C_FIXDIV(c,div) \
-	do {    DIVSCALAR( (c).r , div);  \
-		DIVSCALAR( (c).i  , div); }while (0)
-
-#   define C_MULBYSCALAR( c, s ) \
-    do{ (c).r =  sround( smul( (c).r , s ) ) ;\
-        (c).i =  sround( smul( (c).i , s ) ) ; }while(0)
-
-#else  /* not FIXED_POINT*/
-
-#   define S_MUL(a,b) ( (a)*(b) )
-#define C_MUL(m,a,b) \
-    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
-        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
-#   define C_FIXDIV(c,div) /* NOOP */
-#   define C_MULBYSCALAR( c, s ) \
-    do{ (c).r *= (s);\
-        (c).i *= (s); }while(0)
-#endif
-
-#ifndef CHECK_OVERFLOW_OP
-#  define CHECK_OVERFLOW_OP(a,op,b) /* noop */
-#endif
-
-#define  C_ADD( res, a,b)\
-    do { \
-	    CHECK_OVERFLOW_OP((a).r,+,(b).r)\
-	    CHECK_OVERFLOW_OP((a).i,+,(b).i)\
-	    (res).r=(a).r+(b).r;  (res).i=(a).i+(b).i; \
-    }while(0)
-#define  C_SUB( res, a,b)\
-    do { \
-	    CHECK_OVERFLOW_OP((a).r,-,(b).r)\
-	    CHECK_OVERFLOW_OP((a).i,-,(b).i)\
-	    (res).r=(a).r-(b).r;  (res).i=(a).i-(b).i; \
-    }while(0)
-#define C_ADDTO( res , a)\
-    do { \
-	    CHECK_OVERFLOW_OP((res).r,+,(a).r)\
-	    CHECK_OVERFLOW_OP((res).i,+,(a).i)\
-	    (res).r += (a).r;  (res).i += (a).i;\
-    }while(0)
-
-#define C_SUBFROM( res , a)\
-    do {\
-	    CHECK_OVERFLOW_OP((res).r,-,(a).r)\
-	    CHECK_OVERFLOW_OP((res).i,-,(a).i)\
-	    (res).r -= (a).r;  (res).i -= (a).i; \
-    }while(0)
-
-
-#ifdef FIXED_POINT
-#  define KISS_FFT_F32_COS(phase)  floor(.5+SAMP_MAX * cos (phase))
-#  define KISS_FFT_F32_SIN(phase)  floor(.5+SAMP_MAX * sin (phase))
-#  define HALF_OF(x) ((x)>>1)
-#elif defined(USE_SIMD)
-#  define KISS_FFT_F32_COS(phase) _mm_set1_ps( cos(phase) )
-#  define KISS_FFT_F32_SIN(phase) _mm_set1_ps( sin(phase) )
-#  define HALF_OF(x) ((x)*_mm_set1_ps(.5))
-#else
-#  define KISS_FFT_F32_COS(phase) (kiss_fft_f32_scalar) cos(phase)
-#  define KISS_FFT_F32_SIN(phase) (kiss_fft_f32_scalar) sin(phase)
-#  define HALF_OF(x) ((x)*.5)
-#endif
-
-#define  kf_cexp(x,phase) \
-	do{ \
-		(x)->r = KISS_FFT_F32_COS(phase);\
-		(x)->i = KISS_FFT_F32_SIN(phase);\
-	}while(0)
-
-
-/* a debugging function */
-#define pcpx(c)\
-    fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
-
-
-#ifdef KISS_FFT_F32_USE_ALLOCA
-// define this to allow use of alloca instead of malloc for temporary buffers
-// Temporary buffers are used in two case: 
-// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
-// 2. "in-place" FFTs.  Notice the quotes, since kissfft does not really do an in-place transform.
-#include <alloca.h>
-#define  KISS_FFT_F32_TMP_ALLOC(nbytes) alloca(nbytes)
-#define  KISS_FFT_F32_TMP_FREE(ptr) 
-#else
-#define  KISS_FFT_F32_TMP_ALLOC(nbytes) KISS_FFT_F32_MALLOC(nbytes)
-#define  KISS_FFT_F32_TMP_FREE(ptr) KISS_FFT_F32_FREE(ptr)
-#endif
--- a/src/modules/module-filter-chain/convolver.c
+++ b/src/modules/module-filter-chain/convolver.c
@@ -28,8 +28,14 @@

 #include <spa/utils/defs.h>

-#include "kiss_fft_f32.h"
-#include "kiss_fftr_f32.h"
+#include <math.h>
+#include <xmmintrin.h>
+
+#include "pffft.h"
+
+struct fft_cpx {
+	float *v;
+};

 struct convolver1 {
 	int blockSize;
@@ -37,16 +43,16 @@ struct convolver1 {
 	int segCount;
 	int fftComplexSize;

-	kiss_fft_f32_cpx **segments;
-	kiss_fft_f32_cpx **segmentsIr;
+	struct fft_cpx *segments;
+	struct fft_cpx *segmentsIr;

 	float *fft_buffer;

 	void *fft;
 	void *ifft;

-	kiss_fft_f32_cpx *pre_mult;
-	kiss_fft_f32_cpx *conv;
+	struct fft_cpx pre_mult;
+	struct fft_cpx conv;
 	float *overlap;

 	float *inputBuffer;
@@ -55,6 +61,38 @@ struct convolver1 {
 	int current;
 };

+static void *fft_alloc(int size)
+{
+	void *d;
+	d = pffft_aligned_malloc(size);
+	memset(d, 0, size);
+	return d;
+}
+static void fft_free(void  *data)
+{
+	pffft_aligned_free(data);
+}
+
+static void fft_cpx_init(struct fft_cpx *cpx, int size)
+{
+	cpx->v = fft_alloc(size * 2 * sizeof(float));
+}
+
+static void fft_cpx_free(struct fft_cpx *cpx)
+{
+	fft_free(cpx->v);
+}
+
+static void fft_cpx_clear(struct fft_cpx *cpx, int size)
+{
+	memset(cpx->v, 0, sizeof(float) * 2 * size);
+}
+
+static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size)
+{
+	memcpy(dst->v, src->v, sizeof(float) * 2 * size);
+}
+
 static int next_power_of_two(int val)
 {
 	int r = 1;
@@ -63,6 +101,37 @@ static int next_power_of_two(int val)
 	return r;
 }

+static inline void *fft_new(int size)
+{
+	return pffft_new_setup(size, PFFFT_REAL);
+}
+
+static inline void *ifft_new(int size)
+{
+	return pffft_new_setup(size, PFFFT_REAL);
+}
+
+static inline void fft_destroy(void *fft)
+{
+	pffft_destroy_setup(fft);
+}
+
+static inline void fft_run(void *fft, float *in, struct fft_cpx *out)
+{
+	pffft_transform(fft, in, out->v, NULL, PFFFT_FORWARD);
+}
+
+static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out)
+{
+	pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD);
+}
+
+static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
+		const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
+{
+	pffft_zconvolve_accumulate(fft, a->v, b->v, r->v, scale);
+}
+
 static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 {
 	struct convolver1 *conv;
@@ -86,37 +155,37 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 	conv->segCount = (irlen + conv->blockSize-1) / conv->blockSize;
 	conv->fftComplexSize = (conv->segSize / 2) + 1;

-        conv->fft = kiss_fftr_f32_alloc(conv->segSize, 0, NULL, NULL);
+        conv->fft = fft_new(conv->segSize);
        if (conv->fft == NULL)
                return NULL;
-        conv->ifft = kiss_fftr_f32_alloc(conv->segSize, 1, NULL, NULL);
+        conv->ifft = ifft_new(conv->segSize);
        if (conv->ifft == NULL)
                return NULL;

-	conv->fft_buffer = calloc(sizeof(float), conv->segSize);
+	conv->fft_buffer = fft_alloc(sizeof(float) * conv->segSize);
        if (conv->fft_buffer == NULL)
                return NULL;

-	conv->segments = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount);
-	conv->segmentsIr = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount);
+	conv->segments = calloc(sizeof(struct fft_cpx), conv->segCount);
+	conv->segmentsIr = calloc(sizeof(struct fft_cpx), conv->segCount);

 	for (i = 0; i < conv->segCount; i++) {
 		int left = irlen - (i * conv->blockSize);
 		int copy = SPA_MIN(conv->blockSize, left);

-		conv->segments[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
-		conv->segmentsIr[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
+		fft_cpx_init(&conv->segments[i], conv->fftComplexSize);
+		fft_cpx_init(&conv->segmentsIr[i], conv->fftComplexSize);

 		memcpy(conv->fft_buffer, &ir[i * conv->blockSize], copy * sizeof(float));
 		if (copy < conv->segSize)
 			memset(conv->fft_buffer + copy, 0, (conv->segSize - copy) * sizeof(float));

-	        kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segmentsIr[i]);
+	        fft_run(conv->fft, conv->fft_buffer, &conv->segmentsIr[i]);
 	}
-	conv->pre_mult = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
-	conv->conv = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
-	conv->overlap = calloc(sizeof(float), conv->blockSize);
-	conv->inputBuffer = calloc(sizeof(float), conv->blockSize);
+	fft_cpx_init(&conv->pre_mult, conv->fftComplexSize);
+	fft_cpx_init(&conv->conv, conv->fftComplexSize);
+	conv->overlap = fft_alloc(sizeof(float) * conv->blockSize);
+	conv->inputBuffer = fft_alloc(sizeof(float) * conv->blockSize);
 	conv->inputBufferFill = 0;
 	conv->current = 0;

@@ -127,35 +196,38 @@ static void convolver1_free(struct convolver1 *conv)
 {
 	int i;
 	for (i = 0; i < conv->segCount; i++) {
-		free(conv->segments[i]);
-		free(conv->segmentsIr[i]);
+		fft_cpx_free(&conv->segments[i]);
+		fft_cpx_free(&conv->segmentsIr[i]);
 	}
-	free(conv->fft);
-	free(conv->ifft);
-	free(conv->fft_buffer);
+	fft_destroy(conv->fft);
+	fft_destroy(conv->ifft);
+	fft_free(conv->fft_buffer);
 	free(conv->segments);
 	free(conv->segmentsIr);
-	free(conv->pre_mult);
-	free(conv->conv);
-	free(conv->overlap);
-	free(conv->inputBuffer);
+	fft_cpx_free(&conv->pre_mult);
+	fft_cpx_free(&conv->conv);
+	fft_free(conv->overlap);
+	fft_free(conv->inputBuffer);
 	free(conv);
 }

 void Sum(float* result, const float* a, const float* b, int len)
 {
 	int i;
+#if defined (__SSE__)
+	const int end4 = 4 * (len / 4);
+	for (i = 0; i < end4; i += 4) {
+		const __m128 va = _mm_load_ps(&a[i]);
+		const __m128 vb = _mm_load_ps(&b[i]);
+		_mm_store_ps(&result[i], _mm_add_ps(va,vb));
+	}
+	for (i = end4; i < len; ++i) {
+		result[i] = a[i] + b[i];
+	}
+#else
 	for (i = 0; i < len; i++)
 		result[i] = a[i] + b[i];
-}
-
-void ComplexMultiplyAccumulate(kiss_fft_f32_cpx *r, const kiss_fft_f32_cpx *a, const kiss_fft_f32_cpx *b, int len)
-{
-	int i;
-	for (i = 0; i < len; i++) {
-		r[i].r += a[i].r * b[i].r - a[i].i * b[i].i;
-		r[i].i += a[i].r * b[i].i + a[i].i * b[i].r;
-	}
+#endif
 }

 static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len)
@@ -176,30 +248,27 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
 		memcpy(conv->fft_buffer, conv->inputBuffer, conv->blockSize * sizeof(float));
 		memset(conv->fft_buffer + conv->blockSize, 0, (conv->segSize - conv->blockSize) * sizeof(float));

-		kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segments[conv->current]);
+		fft_run(conv->fft, conv->fft_buffer, &conv->segments[conv->current]);

 		if (conv->inputBufferFill == 0) {
-			memset(conv->pre_mult, 0, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize);
+			fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize);

 			for (i = 1; i < conv->segCount; i++) {
 				const int indexIr = i;
 				const int indexAudio = (conv->current + i) % conv->segCount;

-				ComplexMultiplyAccumulate(conv->pre_mult,
-						conv->segmentsIr[indexIr],
-						conv->segments[indexAudio],
-						conv->fftComplexSize);
+				fft_convolve_accum(conv->fft, &conv->pre_mult,
+						&conv->segmentsIr[indexIr],
+						&conv->segments[indexAudio],
+						conv->fftComplexSize, 1.0f / conv->segSize);
 			}
 		}
-		memcpy(conv->conv, conv->pre_mult, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize);
+		fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);

-		ComplexMultiplyAccumulate(conv->conv, conv->segments[conv->current], conv->segmentsIr[0],
-				conv->fftComplexSize);
+		fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0],
+				conv->fftComplexSize, 1.0f / conv->segSize);

-		kiss_fftri_f32(conv->ifft, conv->conv, conv->fft_buffer);
-
-		for (i = 0; i < conv->segSize; i++)
-			conv->fft_buffer[i] /= conv->segSize;
+		ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);

 		Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);

@@ -265,19 +334,19 @@ struct convolver *convolver_new(int head_block, int tail_block, const float *ir,
 	if (irlen > conv->tailBlockSize) {
 		int conv1IrLen = SPA_MIN(irlen - conv->tailBlockSize, conv->tailBlockSize);
 		conv->tailConvolver0 = convolver1_new(conv->headBlockSize, ir + conv->tailBlockSize, conv1IrLen);
-		conv->tailOutput0 = calloc(conv->tailBlockSize, sizeof(float));
-		conv->tailPrecalculated0 = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailOutput0 = fft_alloc(conv->tailBlockSize * sizeof(float));
+		conv->tailPrecalculated0 = fft_alloc(conv->tailBlockSize * sizeof(float));
 	}

 	if (irlen > 2 * conv->tailBlockSize) {
 		int tailIrLen = irlen - (2 * conv->tailBlockSize);
 		conv->tailConvolver = convolver1_new(conv->tailBlockSize, ir + (2 * conv->tailBlockSize), tailIrLen);
-		conv->tailOutput = calloc(conv->tailBlockSize, sizeof(float));
-		conv->tailPrecalculated = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailOutput = fft_alloc(conv->tailBlockSize * sizeof(float));
+		conv->tailPrecalculated = fft_alloc(conv->tailBlockSize * sizeof(float));
 	}

 	if (conv->tailConvolver0 || conv->tailConvolver)
-		conv->tailInput = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailInput = fft_alloc(conv->tailBlockSize * sizeof(float));

 	conv->tailInputFill = 0;
 	conv->precalculatedPos = 0;
@@ -293,11 +362,11 @@ void convolver_free(struct convolver *conv)
 		convolver1_free(conv->tailConvolver0);
 	if (conv->tailConvolver)
 		convolver1_free(conv->tailConvolver);
-	free(conv->tailOutput0);
-	free(conv->tailPrecalculated0);
-	free(conv->tailOutput);
-	free(conv->tailPrecalculated);
-	free(conv->tailInput);
+	fft_free(conv->tailOutput0);
+	fft_free(conv->tailPrecalculated0);
+	fft_free(conv->tailOutput);
+	fft_free(conv->tailPrecalculated);
+	fft_free(conv->tailInput);
 	free(conv);
 }

--- a/src/modules/module-filter-chain/kiss_fft_f32.c
+++ b/src/modules/module-filter-chain/kiss_fft_f32.c
@@ -1,442 +0,0 @@
-/*
- *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
- *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
- *
- *  SPDX-License-Identifier: BSD-3-Clause
- *  See COPYING file for more information.
- */
-
-
-#include "_kiss_fft_guts_f32.h"
-/* The guts header contains all the multiplication and addition macros that are defined for
- fixed or floating point complex numbers.  It also delares the kf_ internal functions.
- */
-
-static void
-kf_bfly2 (kiss_fft_f32_cpx * Fout,
-    const size_t fstride, const kiss_fft_f32_cfg st, int m)
-{
-  kiss_fft_f32_cpx *Fout2;
-  kiss_fft_f32_cpx *tw1 = st->twiddles;
-  kiss_fft_f32_cpx t;
-  Fout2 = Fout + m;
-  do {
-    C_FIXDIV (*Fout, 2);
-    C_FIXDIV (*Fout2, 2);
-
-    C_MUL (t, *Fout2, *tw1);
-    tw1 += fstride;
-    C_SUB (*Fout2, *Fout, t);
-    C_ADDTO (*Fout, t);
-    ++Fout2;
-    ++Fout;
-  } while (--m);
-}
-
-static void
-kf_bfly4 (kiss_fft_f32_cpx * Fout,
-    const size_t fstride, const kiss_fft_f32_cfg st, const size_t m)
-{
-  kiss_fft_f32_cpx *tw1, *tw2, *tw3;
-  kiss_fft_f32_cpx scratch[6];
-  size_t k = m;
-  const size_t m2 = 2 * m;
-  const size_t m3 = 3 * m;
-
-
-  tw3 = tw2 = tw1 = st->twiddles;
-
-  do {
-    C_FIXDIV (*Fout, 4);
-    C_FIXDIV (Fout[m], 4);
-    C_FIXDIV (Fout[m2], 4);
-    C_FIXDIV (Fout[m3], 4);
-
-    C_MUL (scratch[0], Fout[m], *tw1);
-    C_MUL (scratch[1], Fout[m2], *tw2);
-    C_MUL (scratch[2], Fout[m3], *tw3);
-
-    C_SUB (scratch[5], *Fout, scratch[1]);
-    C_ADDTO (*Fout, scratch[1]);
-    C_ADD (scratch[3], scratch[0], scratch[2]);
-    C_SUB (scratch[4], scratch[0], scratch[2]);
-    C_SUB (Fout[m2], *Fout, scratch[3]);
-    tw1 += fstride;
-    tw2 += fstride * 2;
-    tw3 += fstride * 3;
-    C_ADDTO (*Fout, scratch[3]);
-
-    if (st->inverse) {
-      Fout[m].r = scratch[5].r - scratch[4].i;
-      Fout[m].i = scratch[5].i + scratch[4].r;
-      Fout[m3].r = scratch[5].r + scratch[4].i;
-      Fout[m3].i = scratch[5].i - scratch[4].r;
-    } else {
-      Fout[m].r = scratch[5].r + scratch[4].i;
-      Fout[m].i = scratch[5].i - scratch[4].r;
-      Fout[m3].r = scratch[5].r - scratch[4].i;
-      Fout[m3].i = scratch[5].i + scratch[4].r;
-    }
-    ++Fout;
-  } while (--k);
-}
-
-static void
-kf_bfly3 (kiss_fft_f32_cpx * Fout,
-    const size_t fstride, const kiss_fft_f32_cfg st, size_t m)
-{
-  size_t k = m;
-  const size_t m2 = 2 * m;
-  kiss_fft_f32_cpx *tw1, *tw2;
-  kiss_fft_f32_cpx scratch[5];
-  kiss_fft_f32_cpx epi3;
-  epi3 = st->twiddles[fstride * m];
-
-  tw1 = tw2 = st->twiddles;
-
-  do {
-    C_FIXDIV (*Fout, 3);
-    C_FIXDIV (Fout[m], 3);
-    C_FIXDIV (Fout[m2], 3);
-
-    C_MUL (scratch[1], Fout[m], *tw1);
-    C_MUL (scratch[2], Fout[m2], *tw2);
-
-    C_ADD (scratch[3], scratch[1], scratch[2]);
-    C_SUB (scratch[0], scratch[1], scratch[2]);
-    tw1 += fstride;
-    tw2 += fstride * 2;
-
-    Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
-    Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
-
-    C_MULBYSCALAR (scratch[0], epi3.i);
-
-    C_ADDTO (*Fout, scratch[3]);
-
-    Fout[m2].r = Fout[m].r + scratch[0].i;
-    Fout[m2].i = Fout[m].i - scratch[0].r;
-
-    Fout[m].r -= scratch[0].i;
-    Fout[m].i += scratch[0].r;
-
-    ++Fout;
-  } while (--k);
-}
-
-static void
-kf_bfly5 (kiss_fft_f32_cpx * Fout,
-    const size_t fstride, const kiss_fft_f32_cfg st, int m)
-{
-  kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
-  int u;
-  kiss_fft_f32_cpx scratch[13];
-  kiss_fft_f32_cpx *twiddles = st->twiddles;
-  kiss_fft_f32_cpx *tw;
-  kiss_fft_f32_cpx ya, yb;
-  ya = twiddles[fstride * m];
-  yb = twiddles[fstride * 2 * m];
-
-  Fout0 = Fout;
-  Fout1 = Fout0 + m;
-  Fout2 = Fout0 + 2 * m;
-  Fout3 = Fout0 + 3 * m;
-  Fout4 = Fout0 + 4 * m;
-
-  tw = st->twiddles;
-  for (u = 0; u < m; ++u) {
-    C_FIXDIV (*Fout0, 5);
-    C_FIXDIV (*Fout1, 5);
-    C_FIXDIV (*Fout2, 5);
-    C_FIXDIV (*Fout3, 5);
-    C_FIXDIV (*Fout4, 5);
-    scratch[0] = *Fout0;
-
-    C_MUL (scratch[1], *Fout1, tw[u * fstride]);
-    C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
-    C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
-    C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
-
-    C_ADD (scratch[7], scratch[1], scratch[4]);
-    C_SUB (scratch[10], scratch[1], scratch[4]);
-    C_ADD (scratch[8], scratch[2], scratch[3]);
-    C_SUB (scratch[9], scratch[2], scratch[3]);
-
-    Fout0->r += scratch[7].r + scratch[8].r;
-    Fout0->i += scratch[7].i + scratch[8].i;
-
-    scratch[5].r =
-        scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
-    scratch[5].i =
-        scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
-
-    scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
-    scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
-
-    C_SUB (*Fout1, scratch[5], scratch[6]);
-    C_ADD (*Fout4, scratch[5], scratch[6]);
-
-    scratch[11].r =
-        scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
-    scratch[11].i =
-        scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
-    scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
-    scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
-
-    C_ADD (*Fout2, scratch[11], scratch[12]);
-    C_SUB (*Fout3, scratch[11], scratch[12]);
-
-    ++Fout0;
-    ++Fout1;
-    ++Fout2;
-    ++Fout3;
-    ++Fout4;
-  }
-}
-
-/* perform the butterfly for one stage of a mixed radix FFT */
-static void
-kf_bfly_generic (kiss_fft_f32_cpx * Fout,
-    const size_t fstride, const kiss_fft_f32_cfg st, int m, int p)
-{
-  int u, k, q1, q;
-  kiss_fft_f32_cpx *twiddles = st->twiddles;
-  kiss_fft_f32_cpx t;
-  int Norig = st->nfft;
-
-  kiss_fft_f32_cpx *scratch =
-      (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
-      p);
-
-  for (u = 0; u < m; ++u) {
-    k = u;
-    for (q1 = 0; q1 < p; ++q1) {
-      scratch[q1] = Fout[k];
-      C_FIXDIV (scratch[q1], p);
-      k += m;
-    }
-
-    k = u;
-    for (q1 = 0; q1 < p; ++q1) {
-      int twidx = 0;
-      Fout[k] = scratch[0];
-      for (q = 1; q < p; ++q) {
-        twidx += fstride * k;
-        if (twidx >= Norig)
-          twidx -= Norig;
-        C_MUL (t, scratch[q], twiddles[twidx]);
-        C_ADDTO (Fout[k], t);
-      }
-      k += m;
-    }
-  }
-  KISS_FFT_F32_TMP_FREE (scratch);
-}
-
-static void
-kf_work (kiss_fft_f32_cpx * Fout,
-    const kiss_fft_f32_cpx * f,
-    const size_t fstride, int in_stride, int *factors,
-    const kiss_fft_f32_cfg st)
-{
-  kiss_fft_f32_cpx *Fout_beg = Fout;
-  const int p = *factors++;     /* the radix  */
-  const int m = *factors++;     /* stage's fft length/p */
-  const kiss_fft_f32_cpx *Fout_end = Fout + p * m;
-
-#ifdef _OPENMP
-  // use openmp extensions at the 
-  // top-level (not recursive)
-  if (fstride == 1 && p <= 5 && m != 1) {
-    int k;
-
-    // execute the p different work units in different threads
-#       pragma omp parallel for
-    for (k = 0; k < p; ++k)
-      kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
-          in_stride, factors, st);
-    // all threads have joined by this point
-
-    switch (p) {
-      case 2:
-        kf_bfly2 (Fout, fstride, st, m);
-        break;
-      case 3:
-        kf_bfly3 (Fout, fstride, st, m);
-        break;
-      case 4:
-        kf_bfly4 (Fout, fstride, st, m);
-        break;
-      case 5:
-        kf_bfly5 (Fout, fstride, st, m);
-        break;
-      default:
-        kf_bfly_generic (Fout, fstride, st, m, p);
-        break;
-    }
-    return;
-  }
-#endif
-
-  if (m == 1) {
-    do {
-      *Fout = *f;
-      f += fstride * in_stride;
-    } while (++Fout != Fout_end);
-  } else {
-    do {
-      // recursive call:
-      // DFT of size m*p performed by doing
-      // p instances of smaller DFTs of size m, 
-      // each one takes a decimated version of the input
-      kf_work (Fout, f, fstride * p, in_stride, factors, st);
-      f += fstride * in_stride;
-    } while ((Fout += m) != Fout_end);
-  }
-
-  Fout = Fout_beg;
-
-  // recombine the p smaller DFTs 
-  switch (p) {
-    case 2:
-      kf_bfly2 (Fout, fstride, st, m);
-      break;
-    case 3:
-      kf_bfly3 (Fout, fstride, st, m);
-      break;
-    case 4:
-      kf_bfly4 (Fout, fstride, st, m);
-      break;
-    case 5:
-      kf_bfly5 (Fout, fstride, st, m);
-      break;
-    default:
-      kf_bfly_generic (Fout, fstride, st, m, p);
-      break;
-  }
-}
-
-/*  facbuf is populated by p1,m1,p2,m2, ...
-    where 
-    p[i] * m[i] = m[i-1]
-    m0 = n                  */
-static void
-kf_factor (int n, int *facbuf)
-{
-  int p = 4;
-  double floor_sqrt;
-  floor_sqrt = floor (sqrt ((double) n));
-
-  /*factor out powers of 4, powers of 2, then any remaining primes */
-  do {
-    while (n % p) {
-      switch (p) {
-        case 4:
-          p = 2;
-          break;
-        case 2:
-          p = 3;
-          break;
-        default:
-          p += 2;
-          break;
-      }
-      if (p > floor_sqrt)
-        p = n;                  /* no more factors, skip to end */
-    }
-    n /= p;
-    *facbuf++ = p;
-    *facbuf++ = n;
-  } while (n > 1);
-}
-
-/*
- *
- * User-callable function to allocate all necessary storage space for the fft.
- *
- * The return value is a contiguous block of memory, allocated with malloc.  As such,
- * It can be freed with free(), rather than a kiss_fft_f32-specific function.
- * */
-kiss_fft_f32_cfg
-kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
-{
-  kiss_fft_f32_cfg st = NULL;
-  size_t memneeded = sizeof (struct kiss_fft_f32_state)
-      + sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */
-
-  if (lenmem == NULL) {
-    st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
-  } else {
-    if (mem != NULL && *lenmem >= memneeded)
-      st = (kiss_fft_f32_cfg) mem;
-    *lenmem = memneeded;
-  }
-  if (st) {
-    int i;
-    st->nfft = nfft;
-    st->inverse = inverse_fft;
-
-    for (i = 0; i < nfft; ++i) {
-      const double pi =
-          3.141592653589793238462643383279502884197169399375105820974944;
-      double phase = -2 * pi * i / nfft;
-      if (st->inverse)
-        phase *= -1;
-      kf_cexp (st->twiddles + i, phase);
-    }
-
-    kf_factor (nfft, st->factors);
-  }
-  return st;
-}
-
-
-void
-kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin,
-    kiss_fft_f32_cpx * fout, int in_stride)
-{
-  if (fin == fout) {
-    //NOTE: this is not really an in-place FFT algorithm.
-    //It just performs an out-of-place FFT into a temp buffer
-    kiss_fft_f32_cpx *tmpbuf =
-        (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
-        st->nfft);
-    kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
-    memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft);
-    KISS_FFT_F32_TMP_FREE (tmpbuf);
-  } else {
-    kf_work (fout, fin, 1, in_stride, st->factors, st);
-  }
-}
-
-void
-kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin,
-    kiss_fft_f32_cpx * fout)
-{
-  kiss_fft_f32_stride (cfg, fin, fout, 1);
-}
-
-
-void
-kiss_fft_f32_cleanup (void)
-{
-  // nothing needed any more
-}
-
-int
-kiss_fft_f32_next_fast_size (int n)
-{
-  while (1) {
-    int m = n;
-    while ((m % 2) == 0)
-      m /= 2;
-    while ((m % 3) == 0)
-      m /= 3;
-    while ((m % 5) == 0)
-      m /= 5;
-    if (m <= 1)
-      break;                    /* n is completely factorable by twos, threes, and fives */
-    n++;
-  }
-  return n;
-}
--- a/src/modules/module-filter-chain/kiss_fft_f32.h
+++ b/src/modules/module-filter-chain/kiss_fft_f32.h
@@ -1,112 +0,0 @@
-/*
- *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
- *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
- *
- *  SPDX-License-Identifier: BSD-3-Clause
- *  See COPYING file for more information.
- */
-
-#ifndef KISS_FFT_F32_H
-#define KISS_FFT_F32_H
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <math.h>
-#include <string.h>
-#include <stdint.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-/*
- ATTENTION!
- If you would like a :
- -- a utility that will handle the caching of fft objects
- -- real-only (no imaginary time component ) FFT
- -- a multi-dimensional FFT
- -- a command-line utility to perform ffts
- -- a command-line utility to perform fast-convolution filtering
-
- Then see kfc.h kiss_fftr_f32.h kiss_fft_f32nd.h fftutil.c kiss_fastfir.c
-  in the tools/ directory.
-*/
-
-#define KISS_FFT_F32_MALLOC malloc
-#define KISS_FFT_F32_FREE free
-#define kiss_fft_f32_scalar float
-
-typedef struct {
-    kiss_fft_f32_scalar r;
-    kiss_fft_f32_scalar i;
-}kiss_fft_f32_cpx;
-
-typedef struct kiss_fft_f32_state* kiss_fft_f32_cfg;
-
-/* 
- *  kiss_fft_f32_alloc
- *  
- *  Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
- *
- *  typical usage:      kiss_fft_f32_cfg mycfg=kiss_fft_f32_alloc(1024,0,NULL,NULL);
- *
- *  The return value from fft_alloc is a cfg buffer used internally
- *  by the fft routine or NULL.
- *
- *  If lenmem is NULL, then kiss_fft_f32_alloc will allocate a cfg buffer using malloc.
- *  The returned value should be free()d when done to avoid memory leaks.
- *  
- *  The state can be placed in a user supplied buffer 'mem':
- *  If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
- *      then the function places the cfg in mem and the size used in *lenmem
- *      and returns mem.
- *  
- *  If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
- *      then the function returns NULL and places the minimum cfg 
- *      buffer size in *lenmem.
- * */
-
-kiss_fft_f32_cfg kiss_fft_f32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem); 
-
-/*
- * kiss_fft_f32(cfg,in_out_buf)
- *
- * Perform an FFT on a complex input buffer.
- * for a forward FFT,
- * fin should be  f[0] , f[1] , ... ,f[nfft-1]
- * fout will be   F[0] , F[1] , ... ,F[nfft-1]
- * Note that each element is complex and can be accessed like
-    f[k].r and f[k].i
- * */
-void kiss_fft_f32(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout);
-
-/*
- A more generic version of the above function. It reads its input from every Nth sample.
- * */
-void kiss_fft_f32_stride(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout,int fin_stride);
-
-/* If kiss_fft_f32_alloc allocated a buffer, it is one contiguous 
-   buffer and can be simply free()d when no longer needed*/
-#define kiss_fft_f32_free KISS_FFT_F32_FREE
-
-/*
- Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up 
- your compiler output to call this before you exit.
-*/
-void kiss_fft_f32_cleanup(void);
-	
-
-/*
- * Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5)
- */
-int kiss_fft_f32_next_fast_size(int n);
-
-/* for real ffts, we need an even size */
-#define kiss_fftr_f32_next_fast_size_real(n) \
-        (kiss_fft_f32_next_fast_size( ((n)+1)>>1)<<1)
-
-#ifdef __cplusplus
-} 
-#endif
-
-#endif
--- a/src/modules/module-filter-chain/kiss_fftr_f32.c
+++ b/src/modules/module-filter-chain/kiss_fftr_f32.c
@ -1,148 +0,0 @@
-/*
- *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
- *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
- *
- *  SPDX-License-Identifier: BSD-3-Clause
- *  See COPYING file for more information.
- */
-
-#include "kiss_fftr_f32.h"
-#include "_kiss_fft_guts_f32.h"
-
-struct kiss_fftr_f32_state
-{
-  kiss_fft_f32_cfg substate;
-  kiss_fft_f32_cpx *tmpbuf;
-  kiss_fft_f32_cpx *super_twiddles;
-#ifdef USE_SIMD
-  void *pad;
-#endif
-};
-
-kiss_fftr_f32_cfg
-kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
-{
-  int i;
-  kiss_fftr_f32_cfg st = NULL;
-  size_t subsize = 0, memneeded;
-
-  nfft >>= 1;
-
-  kiss_fft_f32_alloc (nfft, inverse_fft, NULL, &subsize);
-  memneeded =
-      ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)) +
-      ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 3 / 2);
-
-  if (lenmem == NULL) {
-    st = (kiss_fftr_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
-  } else {
-    if (*lenmem >= memneeded)
-      st = (kiss_fftr_f32_cfg) mem;
-    *lenmem = memneeded;
-  }
-  if (!st)
-    return NULL;
-
-  st->substate = (kiss_fft_f32_cfg) (((char *) st) + ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)));       /*just beyond kiss_fftr_f32_state struct */
-  st->tmpbuf =
-      (kiss_fft_f32_cpx *) (((char *) st->substate) + ALIGN_STRUCT (subsize));
-  st->super_twiddles = st->tmpbuf + nfft;
-  kiss_fft_f32_alloc (nfft, inverse_fft, st->substate, &subsize);
-
-  for (i = 0; i < nfft / 2; ++i) {
-    double phase =
-        -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5);
-    if (inverse_fft)
-      phase *= -1;
-    kf_cexp (st->super_twiddles + i, phase);
-  }
-  return st;
-}
-
-void
-kiss_fftr_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_scalar * timedata,
-    kiss_fft_f32_cpx * freqdata)
-{
-  /* input buffer timedata is stored row-wise */
-  int k, ncfft;
-  kiss_fft_f32_cpx fpnk, fpk, f1k, f2k, tw, tdc;
-
-  ncfft = st->substate->nfft;
-
-  /*perform the parallel fft of two real signals packed in real,imag */
-  kiss_fft_f32 (st->substate, (const kiss_fft_f32_cpx *) timedata, st->tmpbuf);
-  /* The real part of the DC element of the frequency spectrum in st->tmpbuf
-   * contains the sum of the even-numbered elements of the input time sequence
-   * The imag part is the sum of the odd-numbered elements
-   *
-   * The sum of tdc.r and tdc.i is the sum of the input time sequence. 
-   *      yielding DC of input time sequence
-   * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1... 
-   *      yielding Nyquist bin of input time sequence
-   */
-
-  tdc.r = st->tmpbuf[0].r;
-  tdc.i = st->tmpbuf[0].i;
-  C_FIXDIV (tdc, 2);
-  CHECK_OVERFLOW_OP (tdc.r, +, tdc.i);
-  CHECK_OVERFLOW_OP (tdc.r, -, tdc.i);
-  freqdata[0].r = tdc.r + tdc.i;
-  freqdata[ncfft].r = tdc.r - tdc.i;
-#ifdef USE_SIMD
-  freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps (0);
-#else
-  freqdata[ncfft].i = freqdata[0].i = 0;
-#endif
-
-  for (k = 1; k <= ncfft / 2; ++k) {
-    fpk = st->tmpbuf[k];
-    fpnk.r = st->tmpbuf[ncfft - k].r;
-    fpnk.i = -st->tmpbuf[ncfft - k].i;
-    C_FIXDIV (fpk, 2);
-    C_FIXDIV (fpnk, 2);
-
-    C_ADD (f1k, fpk, fpnk);
-    C_SUB (f2k, fpk, fpnk);
-    C_MUL (tw, f2k, st->super_twiddles[k - 1]);
-
-    freqdata[k].r = HALF_OF (f1k.r + tw.r);
-    freqdata[k].i = HALF_OF (f1k.i + tw.i);
-    freqdata[ncfft - k].r = HALF_OF (f1k.r - tw.r);
-    freqdata[ncfft - k].i = HALF_OF (tw.i - f1k.i);
-  }
-}
-
-void
-kiss_fftri_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_cpx * freqdata,
-    kiss_fft_f32_scalar * timedata)
-{
-  /* input buffer timedata is stored row-wise */
-  int k, ncfft;
-
-  ncfft = st->substate->nfft;
-
-  st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r;
-  st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
-  C_FIXDIV (st->tmpbuf[0], 2);
-
-  for (k = 1; k <= ncfft / 2; ++k) {
-    kiss_fft_f32_cpx fk, fnkc, fek, fok, tmp;
-    fk = freqdata[k];
-    fnkc.r = freqdata[ncfft - k].r;
-    fnkc.i = -freqdata[ncfft - k].i;
-    C_FIXDIV (fk, 2);
-    C_FIXDIV (fnkc, 2);
-
-    C_ADD (fek, fk, fnkc);
-    C_SUB (tmp, fk, fnkc);
-    C_MUL (fok, tmp, st->super_twiddles[k - 1]);
-    C_ADD (st->tmpbuf[k], fek, fok);
-    C_SUB (st->tmpbuf[ncfft - k], fek, fok);
-#ifdef USE_SIMD
-    st->tmpbuf[ncfft - k].i *= _mm_set1_ps (-1.0);
-#else
-    st->tmpbuf[ncfft - k].i *= -1;
-#endif
-  }
-  kiss_fft_f32 (st->substate, st->tmpbuf, (kiss_fft_f32_cpx *) timedata);
-}
--- a/src/modules/module-filter-chain/kiss_fftr_f32.h
+++ b/src/modules/module-filter-chain/kiss_fftr_f32.h
@ -1,54 +0,0 @@
-/*
- *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
- *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
- *
- *  SPDX-License-Identifier: BSD-3-Clause
- *  See COPYING file for more information.
- */
-
-#ifndef KISS_FTR_H
-#define KISS_FTR_H
-
-#include "kiss_fft_f32.h"
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-    
-/* 
- 
- Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
-
- 
- 
- */
-
-typedef struct kiss_fftr_f32_state *kiss_fftr_f32_cfg;
-
-
-kiss_fftr_f32_cfg kiss_fftr_f32_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
-/*
- nfft must be even
-
- If you don't care to allocate space, use mem = lenmem = NULL 
-*/
-
-
-void kiss_fftr_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_scalar *timedata,kiss_fft_f32_cpx *freqdata);
-/*
- input timedata has nfft scalar points
- output freqdata has nfft/2+1 complex points
-*/
-
-void kiss_fftri_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_cpx *freqdata,kiss_fft_f32_scalar *timedata);
-/*
- input freqdata has  nfft/2+1 complex points
- output timedata has nfft scalar points
-*/
-
-#define kiss_fftr_f32_free KISS_FFT_F32_FREE
-
-#ifdef __cplusplus
-}
-#endif
-#endif
--- a/src/modules/module-filter-chain/pffft.c
+++ b/src/modules/module-filter-chain/pffft.c
--- a/src/modules/module-filter-chain/pffft.h
+++ b/src/modules/module-filter-chain/pffft.h
@ -0,0 +1,177 @@
+/* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
+
+   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
+   authored by Dr Paul Swarztrauber of NCAR, in 1985.
+
+   As confirmed by the NCAR fftpack software curators, the following
+   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
+   released under the same terms.
+
+   FFTPACK license:
+
+   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
+
+   Copyright (c) 2004 the University Corporation for Atmospheric
+   Research ("UCAR"). All rights reserved. Developed by NCAR's
+   Computational and Information Systems Laboratory, UCAR,
+   www.cisl.ucar.edu.
+
+   Redistribution and use of the Software in source and binary forms,
+   with or without modification, is permitted provided that the
+   following conditions are met:
+
+   - Neither the names of NCAR's Computational and Information Systems
+   Laboratory, the University Corporation for Atmospheric Research,
+   nor the names of its sponsors or contributors may be used to
+   endorse or promote products derived from this Software without
+   specific prior written permission.
+
+   - Redistributions of source code must retain the above copyright
+   notices, this list of conditions, and the disclaimer below.
+
+   - Redistributions in binary form must reproduce the above copyright
+   notice, this list of conditions, and the disclaimer below in the
+   documentation and/or other materials provided with the
+   distribution.
+
+   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
+   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+   SOFTWARE.
+*/
+
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib taking advantage of SIMD instructions found
+   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
+
+   For architectures where no SIMD instruction is available, the code
+   falls back to a scalar version.
+
+   Restrictions:
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
+   144, 160, etc are all acceptable lengths). Performance is best for
+   128<=N<=8192.
+
+   - all (float*) pointers in the functions below are expected to
+   have a "simd-compatible" alignment, that is 16 bytes on x86 and
+   powerpc CPUs.
+
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
+   posix_memalign..)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h> // for size_t
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+  /* opaque struct holding internal stuff (precomputed twiddle factors)
+     this struct can be shared by many threads as it contains only
+     read-only data.
+  */
+  typedef struct PFFFT_Setup PFFFT_Setup;
+
+  /* direction of the transform */
+  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+
+  /* type of transform */
+  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+  /*
+    prepare for performing transforms of size N -- the returned
+    PFFFT_Setup structure is read-only so it can safely be shared by
+    multiple concurrent threads.
+  */
+  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+  void pffft_destroy_setup(PFFFT_Setup *);
+  /*
+     Perform a Fourier transform. The z-domain data is stored in the
+     most efficient order for transforming it back, or using it for
+     convolution. If you need to have its content sorted in the
+     "usual" way, that is as an array of interleaved complex numbers,
+     either use pffft_transform_ordered , or call pffft_zreorder after
+     the forward fft, and before the backward fft.
+
+     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+     Typically you will want to scale the backward transform by 1/N.
+
+     The 'work' pointer should point to an area of N (2*N for complex
+     fft) floats, properly aligned. If 'work' is NULL, then stack will
+     be used instead (this is probably the best strategy for small
+     FFTs, say for N < 16384).
+
+     input and output may alias.
+  */
+  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     Similar to pffft_transform, but makes sure that the output is
+     ordered as expected (interleaved complex numbers).  This is
+     similar to calling pffft_transform and then pffft_zreorder.
+
+     input and output may alias.
+  */
+  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+  /*
+     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+     PFFFT_FORWARD) if you want to have the frequency components in
+     the correct "canonical" order, as interleaved complex numbers.
+
+     (for real transforms, both 0-frequency and half frequency
+     components, which are real, are assembled in the first entry as
+     F(0)+i*F(n/2+1). Note that the original fftpack did place
+     F(n/2+1) at the end of the arrays).
+
+     input and output should not alias.
+  */
+  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+  /*
+     Perform a multiplication of the frequency components of dft_a and
+     dft_b and accumulate them into dft_ab. The arrays should have
+     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+     *not* have been reordered with pffft_zreorder (otherwise just
+     perform the operation yourself as the dft coefs are stored as
+     interleaved complex numbers).
+
+     the operation performed is: dft_ab += (dft_a * dft_b)*scaling
+
+     The dft_a, dft_b and dft_ab pointers may alias.
+  */
+  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+  /*
+    the float buffers must have the correct alignment (16-byte boundary
+    on intel and powerpc). This function may be used to obtain such
+    correctly aligned buffers. 
+  */
+  void *pffft_aligned_malloc(size_t nb_bytes);
+  void pffft_aligned_free(void *);
+
+  /* return 4 or 1 depending on whether support for SSE/Altivec instructions was enabled when building pffft.c */
+  int pffft_simd_size();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PFFFT_H