filter-chain: use pffft for the convolver

It is faster.
2025-11-04 13:30:12 -05:00 · 2021-08-23 21:00:45 +02:00 · 2021-08-23 21:00:45 +02:00 · 0f5face73f
commit 0f5face73f
parent 9dbfa63193
9 changed files with 2550 additions and 990 deletions
--- a/src/modules/meson.build
+++ b/src/modules/meson.build
@ -50,8 +50,7 @@ pipewire_module_filter_chain = shared_library('pipewire-module-filter-chain',
    'module-filter-chain/biquad.c',
    'module-filter-chain/ladspa_plugin.c',
    'module-filter-chain/builtin_plugin.c',
-    'module-filter-chain/kiss_fft_f32.c',
+    'module-filter-chain/pffft.c',
    'module-filter-chain/kiss_fftr_f32.c',
    'module-filter-chain/convolver.c' ],
  include_directories : [configinc, spa_inc],
  install : true,
--- a/src/modules/module-filter-chain/_kiss_fft_guts_f32.h
+++ b/src/modules/module-filter-chain/_kiss_fft_guts_f32.h
@ -1,173 +0,0 @@
 /*
 *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
 *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
 *
 *  SPDX-License-Identifier: BSD-3-Clause
 *  See COPYING file for more information.
 */
 /* kiss_fft_f32.h
   defines kiss_fft_f32_scalar as either short or a float type
   and defines
   typedef struct { kiss_fft_f32_scalar r; kiss_fft_f32_scalar i; }kiss_fft_f32_cpx; */
 #include "kiss_fft_f32.h"
 #include <limits.h>
 /* The 2*sizeof(size_t) alignment here is borrowed from
 * GNU libc, so it should be good most everywhere.
 * It is more conservative than is needed on some 64-bit
 * platforms, but ia64 does require a 16-byte alignment.
 * The SIMD extensions for x86 and ppc32 would want a
 * larger alignment than this, but we don't need to
 * do better than malloc.
 *
 * Borrowed from GLib's gobject/gtype.c
 */
 #define STRUCT_ALIGNMENT (2 * sizeof (size_t))
 #define ALIGN_STRUCT(offset) \
      ((offset + (STRUCT_ALIGNMENT - 1)) & -STRUCT_ALIGNMENT)
 #define MAXFACTORS 32
 /* e.g. an fft of length 128 has 4 factors 
 as far as kissfft is concerned
 4*4*4*2
 */
 struct kiss_fft_f32_state{
    int nfft;
    int inverse;
    int factors[2*MAXFACTORS];
    kiss_fft_f32_cpx twiddles[1];
 };
 /*
  Explanation of macros dealing with complex math:
   C_MUL(m,a,b)         : m = a*b
   C_FIXDIV( c , div )  : if a fixed point impl., c /= div. noop otherwise
   C_SUB( res, a,b)     : res = a - b
   C_SUBFROM( res , a)  : res -= a
   C_ADDTO( res , a)    : res += a
 * */
 #ifdef FIXED_POINT
 #include <stdint.h>
 #if (FIXED_POINT==32)
 # define FRACBITS 31
 # define SAMPPROD int64_t
 #define SAMP_MAX INT32_MAX
 #define SAMP_MIN INT32_MIN
 #else
 # define FRACBITS 15
 # define SAMPPROD int32_t
 #define SAMP_MAX INT16_MAX
 #define SAMP_MIN INT16_MIN
 #endif
 #if defined(CHECK_OVERFLOW)
 #  define CHECK_OVERFLOW_OP(a,op,b)  \
 	if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \
 		g_critical("overflow @ " __FILE__ "(%d): (%d " #op" %d) = %ld",__LINE__,(a),(b),(SAMPPROD)(a) op (SAMPPROD)(b) );  }
 #endif
 #   define smul(a,b) ( (SAMPPROD)(a)*(b) )
 #   define sround( x )  (kiss_fft_f32_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS )
 #   define S_MUL(a,b) sround( smul(a,b) )
 #   define C_MUL(m,a,b) \
      do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
          (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)
 #   define DIVSCALAR(x,k) \
 	(x) = sround( smul(  x, SAMP_MAX/k ) )
 #   define C_FIXDIV(c,div) \
 	do {    DIVSCALAR( (c).r , div);  \
 		DIVSCALAR( (c).i  , div); }while (0)
 #   define C_MULBYSCALAR( c, s ) \
    do{ (c).r =  sround( smul( (c).r , s ) ) ;\
        (c).i =  sround( smul( (c).i , s ) ) ; }while(0)
 #else  /* not FIXED_POINT*/
 #   define S_MUL(a,b) ( (a)*(b) )
 #define C_MUL(m,a,b) \
    do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
        (m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
 #   define C_FIXDIV(c,div) /* NOOP */
 #   define C_MULBYSCALAR( c, s ) \
    do{ (c).r *= (s);\
        (c).i *= (s); }while(0)
 #endif
 #ifndef CHECK_OVERFLOW_OP
 #  define CHECK_OVERFLOW_OP(a,op,b) /* noop */
 #endif
 #define  C_ADD( res, a,b)\
    do { \
 	    CHECK_OVERFLOW_OP((a).r,+,(b).r)\
 	    CHECK_OVERFLOW_OP((a).i,+,(b).i)\
 	    (res).r=(a).r+(b).r;  (res).i=(a).i+(b).i; \
    }while(0)
 #define  C_SUB( res, a,b)\
    do { \
 	    CHECK_OVERFLOW_OP((a).r,-,(b).r)\
 	    CHECK_OVERFLOW_OP((a).i,-,(b).i)\
 	    (res).r=(a).r-(b).r;  (res).i=(a).i-(b).i; \
    }while(0)
 #define C_ADDTO( res , a)\
    do { \
 	    CHECK_OVERFLOW_OP((res).r,+,(a).r)\
 	    CHECK_OVERFLOW_OP((res).i,+,(a).i)\
 	    (res).r += (a).r;  (res).i += (a).i;\
    }while(0)
 #define C_SUBFROM( res , a)\
    do {\
 	    CHECK_OVERFLOW_OP((res).r,-,(a).r)\
 	    CHECK_OVERFLOW_OP((res).i,-,(a).i)\
 	    (res).r -= (a).r;  (res).i -= (a).i; \
    }while(0)
 #ifdef FIXED_POINT
 #  define KISS_FFT_F32_COS(phase)  floor(.5+SAMP_MAX * cos (phase))
 #  define KISS_FFT_F32_SIN(phase)  floor(.5+SAMP_MAX * sin (phase))
 #  define HALF_OF(x) ((x)>>1)
 #elif defined(USE_SIMD)
 #  define KISS_FFT_F32_COS(phase) _mm_set1_ps( cos(phase) )
 #  define KISS_FFT_F32_SIN(phase) _mm_set1_ps( sin(phase) )
 #  define HALF_OF(x) ((x)*_mm_set1_ps(.5))
 #else
 #  define KISS_FFT_F32_COS(phase) (kiss_fft_f32_scalar) cos(phase)
 #  define KISS_FFT_F32_SIN(phase) (kiss_fft_f32_scalar) sin(phase)
 #  define HALF_OF(x) ((x)*.5)
 #endif
 #define  kf_cexp(x,phase) \
 	do{ \
 		(x)->r = KISS_FFT_F32_COS(phase);\
 		(x)->i = KISS_FFT_F32_SIN(phase);\
 	}while(0)
 /* a debugging function */
 #define pcpx(c)\
    fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
 #ifdef KISS_FFT_F32_USE_ALLOCA
 // define this to allow use of alloca instead of malloc for temporary buffers
 // Temporary buffers are used in two case: 
 // 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
 // 2. "in-place" FFTs.  Notice the quotes, since kissfft does not really do an in-place transform.
 #include <alloca.h>
 #define  KISS_FFT_F32_TMP_ALLOC(nbytes) alloca(nbytes)
 #define  KISS_FFT_F32_TMP_FREE(ptr) 
 #else
 #define  KISS_FFT_F32_TMP_ALLOC(nbytes) KISS_FFT_F32_MALLOC(nbytes)
 #define  KISS_FFT_F32_TMP_FREE(ptr) KISS_FFT_F32_FREE(ptr)
 #endif
--- a/src/modules/module-filter-chain/convolver.c
+++ b/src/modules/module-filter-chain/convolver.c
@ -28,8 +28,14 @@
 #include <spa/utils/defs.h>
-#include "kiss_fft_f32.h"
+#include <math.h>
-#include "kiss_fftr_f32.h"
+#include <xmmintrin.h>
 #include "pffft.h"
 struct fft_cpx {
 	float *v;
 };
 struct convolver1 {
 	int blockSize;
@ -37,16 +43,16 @@ struct convolver1 {
 	int segCount;
 	int fftComplexSize;
-	kiss_fft_f32_cpx **segments;
+	struct fft_cpx *segments;
-	kiss_fft_f32_cpx **segmentsIr;
+	struct fft_cpx *segmentsIr;
 	float *fft_buffer;
 	void *fft;
 	void *ifft;
-	kiss_fft_f32_cpx *pre_mult;
+	struct fft_cpx pre_mult;
-	kiss_fft_f32_cpx *conv;
+	struct fft_cpx conv;
 	float *overlap;
 	float *inputBuffer;
@ -55,6 +61,38 @@ struct convolver1 {
 	int current;
 };
 static void *fft_alloc(int size)
 {
 	void *d;
 	d = pffft_aligned_malloc(size);
 	memset(d, 0, size);
 	return d;
 }
 static void fft_free(void  *data)
 {
 	pffft_aligned_free(data);
 }
 static void fft_cpx_init(struct fft_cpx *cpx, int size)
 {
 	cpx->v = fft_alloc(size * 2 * sizeof(float));
 }
 static void fft_cpx_free(struct fft_cpx *cpx)
 {
 	fft_free(cpx->v);
 }
 static void fft_cpx_clear(struct fft_cpx *cpx, int size)
 {
 	memset(cpx->v, 0, sizeof(float) * 2 * size);
 }
 static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size)
 {
 	memcpy(dst->v, src->v, sizeof(float) * 2 * size);
 }
 static int next_power_of_two(int val)
 {
 	int r = 1;
@ -63,6 +101,37 @@ static int next_power_of_two(int val)
 	return r;
 }
 static inline void *fft_new(int size)
 {
 	return pffft_new_setup(size, PFFFT_REAL);
 }
 static inline void *ifft_new(int size)
 {
 	return pffft_new_setup(size, PFFFT_REAL);
 }
 static inline void fft_destroy(void *fft)
 {
 	pffft_destroy_setup(fft);
 }
 static inline void fft_run(void *fft, float *in, struct fft_cpx *out)
 {
 	pffft_transform(fft, in, out->v, NULL, PFFFT_FORWARD);
 }
 static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out)
 {
 	pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD);
 }
 static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
 		const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
 {
 	pffft_zconvolve_accumulate(fft, a->v, b->v, r->v, scale);
 }
 static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 {
 	struct convolver1 *conv;
@ -86,37 +155,37 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
 	conv->segCount = (irlen + conv->blockSize-1) / conv->blockSize;
 	conv->fftComplexSize = (conv->segSize / 2) + 1;
-        conv->fft = kiss_fftr_f32_alloc(conv->segSize, 0, NULL, NULL);
+        conv->fft = fft_new(conv->segSize);
        if (conv->fft == NULL)
                return NULL;
-        conv->ifft = kiss_fftr_f32_alloc(conv->segSize, 1, NULL, NULL);
+        conv->ifft = ifft_new(conv->segSize);
        if (conv->ifft == NULL)
                return NULL;
-	conv->fft_buffer = calloc(sizeof(float), conv->segSize);
+	conv->fft_buffer = fft_alloc(sizeof(float) * conv->segSize);
        if (conv->fft_buffer == NULL)
                return NULL;
-	conv->segments = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount);
+	conv->segments = calloc(sizeof(struct fft_cpx), conv->segCount);
-	conv->segmentsIr = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount);
+	conv->segmentsIr = calloc(sizeof(struct fft_cpx), conv->segCount);
 	for (i = 0; i < conv->segCount; i++) {
 		int left = irlen - (i * conv->blockSize);
 		int copy = SPA_MIN(conv->blockSize, left);
-		conv->segments[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
+		fft_cpx_init(&conv->segments[i], conv->fftComplexSize);
-		conv->segmentsIr[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
+		fft_cpx_init(&conv->segmentsIr[i], conv->fftComplexSize);
 		memcpy(conv->fft_buffer, &ir[i * conv->blockSize], copy * sizeof(float));
 		if (copy < conv->segSize)
 			memset(conv->fft_buffer + copy, 0, (conv->segSize - copy) * sizeof(float));
-	        kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segmentsIr[i]);
+	        fft_run(conv->fft, conv->fft_buffer, &conv->segmentsIr[i]);
 	}
-	conv->pre_mult = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
+	fft_cpx_init(&conv->pre_mult, conv->fftComplexSize);
-	conv->conv = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize);
+	fft_cpx_init(&conv->conv, conv->fftComplexSize);
-	conv->overlap = calloc(sizeof(float), conv->blockSize);
+	conv->overlap = fft_alloc(sizeof(float) * conv->blockSize);
-	conv->inputBuffer = calloc(sizeof(float), conv->blockSize);
+	conv->inputBuffer = fft_alloc(sizeof(float) * conv->blockSize);
 	conv->inputBufferFill = 0;
 	conv->current = 0;
@ -127,35 +196,38 @@ static void convolver1_free(struct convolver1 *conv)
 {
 	int i;
 	for (i = 0; i < conv->segCount; i++) {
-		free(conv->segments[i]);
+		fft_cpx_free(&conv->segments[i]);
-		free(conv->segmentsIr[i]);
+		fft_cpx_free(&conv->segmentsIr[i]);
 	}
-	free(conv->fft);
+	fft_destroy(conv->fft);
-	free(conv->ifft);
+	fft_destroy(conv->ifft);
-	free(conv->fft_buffer);
+	fft_free(conv->fft_buffer);
 	free(conv->segments);
 	free(conv->segmentsIr);
-	free(conv->pre_mult);
+	fft_cpx_free(&conv->pre_mult);
-	free(conv->conv);
+	fft_cpx_free(&conv->conv);
-	free(conv->overlap);
+	fft_free(conv->overlap);
-	free(conv->inputBuffer);
+	fft_free(conv->inputBuffer);
 	free(conv);
 }
 void Sum(float* result, const float* a, const float* b, int len)
 {
 	int i;
-	for (i = 0; i < len; i++)
+#if defined (__SSE__)
 	const int end4 = 4 * (len / 4);
 	for (i = 0; i < end4; i += 4) {
 		const __m128 va = _mm_load_ps(&a[i]);
 		const __m128 vb = _mm_load_ps(&b[i]);
 		_mm_store_ps(&result[i], _mm_add_ps(va,vb));
 	}
 	for (i = end4; i < len; ++i) {
 		result[i] = a[i] + b[i];
 	}
-
+#else
-void ComplexMultiplyAccumulate(kiss_fft_f32_cpx *r, const kiss_fft_f32_cpx *a, const kiss_fft_f32_cpx *b, int len)
+	for (i = 0; i < len; i++)
-{
+		result[i] = a[i] + b[i];
-	int i;
+#endif
 	for (i = 0; i < len; i++) {
 		r[i].r += a[i].r * b[i].r - a[i].i * b[i].i;
 		r[i].i += a[i].r * b[i].i + a[i].i * b[i].r;
 	}
 }
 static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len)
@ -176,30 +248,27 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
 		memcpy(conv->fft_buffer, conv->inputBuffer, conv->blockSize * sizeof(float));
 		memset(conv->fft_buffer + conv->blockSize, 0, (conv->segSize - conv->blockSize) * sizeof(float));
-		kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segments[conv->current]);
+		fft_run(conv->fft, conv->fft_buffer, &conv->segments[conv->current]);
 		if (conv->inputBufferFill == 0) {
-			memset(conv->pre_mult, 0, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize);
+			fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize);
 			for (i = 1; i < conv->segCount; i++) {
 				const int indexIr = i;
 				const int indexAudio = (conv->current + i) % conv->segCount;
-				ComplexMultiplyAccumulate(conv->pre_mult,
+				fft_convolve_accum(conv->fft, &conv->pre_mult,
-						conv->segmentsIr[indexIr],
+						&conv->segmentsIr[indexIr],
-						conv->segments[indexAudio],
+						&conv->segments[indexAudio],
-						conv->fftComplexSize);
+						conv->fftComplexSize, 1.0f / conv->segSize);
 			}
 		}
-		memcpy(conv->conv, conv->pre_mult, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize);
+		fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
-		ComplexMultiplyAccumulate(conv->conv, conv->segments[conv->current], conv->segmentsIr[0],
+		fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0],
-				conv->fftComplexSize);
+				conv->fftComplexSize, 1.0f / conv->segSize);
-		kiss_fftri_f32(conv->ifft, conv->conv, conv->fft_buffer);
+		ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);
 		for (i = 0; i < conv->segSize; i++)
 			conv->fft_buffer[i] /= conv->segSize;
 		Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);
@ -265,19 +334,19 @@ struct convolver *convolver_new(int head_block, int tail_block, const float *ir,
 	if (irlen > conv->tailBlockSize) {
 		int conv1IrLen = SPA_MIN(irlen - conv->tailBlockSize, conv->tailBlockSize);
 		conv->tailConvolver0 = convolver1_new(conv->headBlockSize, ir + conv->tailBlockSize, conv1IrLen);
-		conv->tailOutput0 = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailOutput0 = fft_alloc(conv->tailBlockSize * sizeof(float));
-		conv->tailPrecalculated0 = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailPrecalculated0 = fft_alloc(conv->tailBlockSize * sizeof(float));
 	}
 	if (irlen > 2 * conv->tailBlockSize) {
 		int tailIrLen = irlen - (2 * conv->tailBlockSize);
 		conv->tailConvolver = convolver1_new(conv->tailBlockSize, ir + (2 * conv->tailBlockSize), tailIrLen);
-		conv->tailOutput = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailOutput = fft_alloc(conv->tailBlockSize * sizeof(float));
-		conv->tailPrecalculated = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailPrecalculated = fft_alloc(conv->tailBlockSize * sizeof(float));
 	}
 	if (conv->tailConvolver0 || conv->tailConvolver)
-		conv->tailInput = calloc(conv->tailBlockSize, sizeof(float));
+		conv->tailInput = fft_alloc(conv->tailBlockSize * sizeof(float));
 	conv->tailInputFill = 0;
 	conv->precalculatedPos = 0;
@ -293,11 +362,11 @@ void convolver_free(struct convolver *conv)
 		convolver1_free(conv->tailConvolver0);
 	if (conv->tailConvolver)
 		convolver1_free(conv->tailConvolver);
-	free(conv->tailOutput0);
+	fft_free(conv->tailOutput0);
-	free(conv->tailPrecalculated0);
+	fft_free(conv->tailPrecalculated0);
-	free(conv->tailOutput);
+	fft_free(conv->tailOutput);
-	free(conv->tailPrecalculated);
+	fft_free(conv->tailPrecalculated);
-	free(conv->tailInput);
+	fft_free(conv->tailInput);
 	free(conv);
 }
--- a/src/modules/module-filter-chain/kiss_fft_f32.c
+++ b/src/modules/module-filter-chain/kiss_fft_f32.c
@ -1,442 +0,0 @@
 /*
 *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
 *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
 *
 *  SPDX-License-Identifier: BSD-3-Clause
 *  See COPYING file for more information.
 */
 #include "_kiss_fft_guts_f32.h"
 /* The guts header contains all the multiplication and addition macros that are defined for
 fixed or floating point complex numbers.  It also delares the kf_ internal functions.
 */
 static void
 kf_bfly2 (kiss_fft_f32_cpx * Fout,
    const size_t fstride, const kiss_fft_f32_cfg st, int m)
 {
  kiss_fft_f32_cpx *Fout2;
  kiss_fft_f32_cpx *tw1 = st->twiddles;
  kiss_fft_f32_cpx t;
  Fout2 = Fout + m;
  do {
    C_FIXDIV (*Fout, 2);
    C_FIXDIV (*Fout2, 2);
    C_MUL (t, *Fout2, *tw1);
    tw1 += fstride;
    C_SUB (*Fout2, *Fout, t);
    C_ADDTO (*Fout, t);
    ++Fout2;
    ++Fout;
  } while (--m);
 }
 static void
 kf_bfly4 (kiss_fft_f32_cpx * Fout,
    const size_t fstride, const kiss_fft_f32_cfg st, const size_t m)
 {
  kiss_fft_f32_cpx *tw1, *tw2, *tw3;
  kiss_fft_f32_cpx scratch[6];
  size_t k = m;
  const size_t m2 = 2 * m;
  const size_t m3 = 3 * m;
  tw3 = tw2 = tw1 = st->twiddles;
  do {
    C_FIXDIV (*Fout, 4);
    C_FIXDIV (Fout[m], 4);
    C_FIXDIV (Fout[m2], 4);
    C_FIXDIV (Fout[m3], 4);
    C_MUL (scratch[0], Fout[m], *tw1);
    C_MUL (scratch[1], Fout[m2], *tw2);
    C_MUL (scratch[2], Fout[m3], *tw3);
    C_SUB (scratch[5], *Fout, scratch[1]);
    C_ADDTO (*Fout, scratch[1]);
    C_ADD (scratch[3], scratch[0], scratch[2]);
    C_SUB (scratch[4], scratch[0], scratch[2]);
    C_SUB (Fout[m2], *Fout, scratch[3]);
    tw1 += fstride;
    tw2 += fstride * 2;
    tw3 += fstride * 3;
    C_ADDTO (*Fout, scratch[3]);
    if (st->inverse) {
      Fout[m].r = scratch[5].r - scratch[4].i;
      Fout[m].i = scratch[5].i + scratch[4].r;
      Fout[m3].r = scratch[5].r + scratch[4].i;
      Fout[m3].i = scratch[5].i - scratch[4].r;
    } else {
      Fout[m].r = scratch[5].r + scratch[4].i;
      Fout[m].i = scratch[5].i - scratch[4].r;
      Fout[m3].r = scratch[5].r - scratch[4].i;
      Fout[m3].i = scratch[5].i + scratch[4].r;
    }
    ++Fout;
  } while (--k);
 }
 static void
 kf_bfly3 (kiss_fft_f32_cpx * Fout,
    const size_t fstride, const kiss_fft_f32_cfg st, size_t m)
 {
  size_t k = m;
  const size_t m2 = 2 * m;
  kiss_fft_f32_cpx *tw1, *tw2;
  kiss_fft_f32_cpx scratch[5];
  kiss_fft_f32_cpx epi3;
  epi3 = st->twiddles[fstride * m];
  tw1 = tw2 = st->twiddles;
  do {
    C_FIXDIV (*Fout, 3);
    C_FIXDIV (Fout[m], 3);
    C_FIXDIV (Fout[m2], 3);
    C_MUL (scratch[1], Fout[m], *tw1);
    C_MUL (scratch[2], Fout[m2], *tw2);
    C_ADD (scratch[3], scratch[1], scratch[2]);
    C_SUB (scratch[0], scratch[1], scratch[2]);
    tw1 += fstride;
    tw2 += fstride * 2;
    Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
    Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
    C_MULBYSCALAR (scratch[0], epi3.i);
    C_ADDTO (*Fout, scratch[3]);
    Fout[m2].r = Fout[m].r + scratch[0].i;
    Fout[m2].i = Fout[m].i - scratch[0].r;
    Fout[m].r -= scratch[0].i;
    Fout[m].i += scratch[0].r;
    ++Fout;
  } while (--k);
 }
 static void
 kf_bfly5 (kiss_fft_f32_cpx * Fout,
    const size_t fstride, const kiss_fft_f32_cfg st, int m)
 {
  kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
  int u;
  kiss_fft_f32_cpx scratch[13];
  kiss_fft_f32_cpx *twiddles = st->twiddles;
  kiss_fft_f32_cpx *tw;
  kiss_fft_f32_cpx ya, yb;
  ya = twiddles[fstride * m];
  yb = twiddles[fstride * 2 * m];
  Fout0 = Fout;
  Fout1 = Fout0 + m;
  Fout2 = Fout0 + 2 * m;
  Fout3 = Fout0 + 3 * m;
  Fout4 = Fout0 + 4 * m;
  tw = st->twiddles;
  for (u = 0; u < m; ++u) {
    C_FIXDIV (*Fout0, 5);
    C_FIXDIV (*Fout1, 5);
    C_FIXDIV (*Fout2, 5);
    C_FIXDIV (*Fout3, 5);
    C_FIXDIV (*Fout4, 5);
    scratch[0] = *Fout0;
    C_MUL (scratch[1], *Fout1, tw[u * fstride]);
    C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
    C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
    C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
    C_ADD (scratch[7], scratch[1], scratch[4]);
    C_SUB (scratch[10], scratch[1], scratch[4]);
    C_ADD (scratch[8], scratch[2], scratch[3]);
    C_SUB (scratch[9], scratch[2], scratch[3]);
    Fout0->r += scratch[7].r + scratch[8].r;
    Fout0->i += scratch[7].i + scratch[8].i;
    scratch[5].r =
        scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
    scratch[5].i =
        scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
    scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
    scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
    C_SUB (*Fout1, scratch[5], scratch[6]);
    C_ADD (*Fout4, scratch[5], scratch[6]);
    scratch[11].r =
        scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
    scratch[11].i =
        scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
    scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
    scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
    C_ADD (*Fout2, scratch[11], scratch[12]);
    C_SUB (*Fout3, scratch[11], scratch[12]);
    ++Fout0;
    ++Fout1;
    ++Fout2;
    ++Fout3;
    ++Fout4;
  }
 }
 /* perform the butterfly for one stage of a mixed radix FFT */
 static void
 kf_bfly_generic (kiss_fft_f32_cpx * Fout,
    const size_t fstride, const kiss_fft_f32_cfg st, int m, int p)
 {
  int u, k, q1, q;
  kiss_fft_f32_cpx *twiddles = st->twiddles;
  kiss_fft_f32_cpx t;
  int Norig = st->nfft;
  kiss_fft_f32_cpx *scratch =
      (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
      p);
  for (u = 0; u < m; ++u) {
    k = u;
    for (q1 = 0; q1 < p; ++q1) {
      scratch[q1] = Fout[k];
      C_FIXDIV (scratch[q1], p);
      k += m;
    }
    k = u;
    for (q1 = 0; q1 < p; ++q1) {
      int twidx = 0;
      Fout[k] = scratch[0];
      for (q = 1; q < p; ++q) {
        twidx += fstride * k;
        if (twidx >= Norig)
          twidx -= Norig;
        C_MUL (t, scratch[q], twiddles[twidx]);
        C_ADDTO (Fout[k], t);
      }
      k += m;
    }
  }
  KISS_FFT_F32_TMP_FREE (scratch);
 }
 static void
 kf_work (kiss_fft_f32_cpx * Fout,
    const kiss_fft_f32_cpx * f,
    const size_t fstride, int in_stride, int *factors,
    const kiss_fft_f32_cfg st)
 {
  kiss_fft_f32_cpx *Fout_beg = Fout;
  const int p = *factors++;     /* the radix  */
  const int m = *factors++;     /* stage's fft length/p */
  const kiss_fft_f32_cpx *Fout_end = Fout + p * m;
 #ifdef _OPENMP
  // use openmp extensions at the 
  // top-level (not recursive)
  if (fstride == 1 && p <= 5 && m != 1) {
    int k;
    // execute the p different work units in different threads
 #       pragma omp parallel for
    for (k = 0; k < p; ++k)
      kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
          in_stride, factors, st);
    // all threads have joined by this point
    switch (p) {
      case 2:
        kf_bfly2 (Fout, fstride, st, m);
        break;
      case 3:
        kf_bfly3 (Fout, fstride, st, m);
        break;
      case 4:
        kf_bfly4 (Fout, fstride, st, m);
        break;
      case 5:
        kf_bfly5 (Fout, fstride, st, m);
        break;
      default:
        kf_bfly_generic (Fout, fstride, st, m, p);
        break;
    }
    return;
  }
 #endif
  if (m == 1) {
    do {
      *Fout = *f;
      f += fstride * in_stride;
    } while (++Fout != Fout_end);
  } else {
    do {
      // recursive call:
      // DFT of size m*p performed by doing
      // p instances of smaller DFTs of size m, 
      // each one takes a decimated version of the input
      kf_work (Fout, f, fstride * p, in_stride, factors, st);
      f += fstride * in_stride;
    } while ((Fout += m) != Fout_end);
  }
  Fout = Fout_beg;
  // recombine the p smaller DFTs 
  switch (p) {
    case 2:
      kf_bfly2 (Fout, fstride, st, m);
      break;
    case 3:
      kf_bfly3 (Fout, fstride, st, m);
      break;
    case 4:
      kf_bfly4 (Fout, fstride, st, m);
      break;
    case 5:
      kf_bfly5 (Fout, fstride, st, m);
      break;
    default:
      kf_bfly_generic (Fout, fstride, st, m, p);
      break;
  }
 }
 /*  facbuf is populated by p1,m1,p2,m2, ...
    where 
    p[i] * m[i] = m[i-1]
    m0 = n                  */
 static void
 kf_factor (int n, int *facbuf)
 {
  int p = 4;
  double floor_sqrt;
  floor_sqrt = floor (sqrt ((double) n));
  /*factor out powers of 4, powers of 2, then any remaining primes */
  do {
    while (n % p) {
      switch (p) {
        case 4:
          p = 2;
          break;
        case 2:
          p = 3;
          break;
        default:
          p += 2;
          break;
      }
      if (p > floor_sqrt)
        p = n;                  /* no more factors, skip to end */
    }
    n /= p;
    *facbuf++ = p;
    *facbuf++ = n;
  } while (n > 1);
 }
 /*
 *
 * User-callable function to allocate all necessary storage space for the fft.
 *
 * The return value is a contiguous block of memory, allocated with malloc.  As such,
 * It can be freed with free(), rather than a kiss_fft_f32-specific function.
 * */
 kiss_fft_f32_cfg
 kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
 {
  kiss_fft_f32_cfg st = NULL;
  size_t memneeded = sizeof (struct kiss_fft_f32_state)
      + sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */
  if (lenmem == NULL) {
    st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
  } else {
    if (mem != NULL && *lenmem >= memneeded)
      st = (kiss_fft_f32_cfg) mem;
    *lenmem = memneeded;
  }
  if (st) {
    int i;
    st->nfft = nfft;
    st->inverse = inverse_fft;
    for (i = 0; i < nfft; ++i) {
      const double pi =
          3.141592653589793238462643383279502884197169399375105820974944;
      double phase = -2 * pi * i / nfft;
      if (st->inverse)
        phase *= -1;
      kf_cexp (st->twiddles + i, phase);
    }
    kf_factor (nfft, st->factors);
  }
  return st;
 }
 void
 kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin,
    kiss_fft_f32_cpx * fout, int in_stride)
 {
  if (fin == fout) {
    //NOTE: this is not really an in-place FFT algorithm.
    //It just performs an out-of-place FFT into a temp buffer
    kiss_fft_f32_cpx *tmpbuf =
        (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
        st->nfft);
    kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
    memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft);
    KISS_FFT_F32_TMP_FREE (tmpbuf);
  } else {
    kf_work (fout, fin, 1, in_stride, st->factors, st);
  }
 }
 void
 kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin,
    kiss_fft_f32_cpx * fout)
 {
  kiss_fft_f32_stride (cfg, fin, fout, 1);
 }
 void
 kiss_fft_f32_cleanup (void)
 {
  // nothing needed any more
 }
 int
 kiss_fft_f32_next_fast_size (int n)
 {
  while (1) {
    int m = n;
    while ((m % 2) == 0)
      m /= 2;
    while ((m % 3) == 0)
      m /= 3;
    while ((m % 5) == 0)
      m /= 5;
    if (m <= 1)
      break;                    /* n is completely factorable by twos, threes, and fives */
    n++;
  }
  return n;
 }
--- a/src/modules/module-filter-chain/kiss_fft_f32.h
+++ b/src/modules/module-filter-chain/kiss_fft_f32.h
@ -1,112 +0,0 @@
 /*
 *  Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
 *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
 *
 *  SPDX-License-Identifier: BSD-3-Clause
 *  See COPYING file for more information.
 */
 #ifndef KISS_FFT_F32_H
 #define KISS_FFT_F32_H
 #include <stdlib.h>
 #include <stdio.h>
 #include <math.h>
 #include <string.h>
 #include <stdint.h>
 #ifdef __cplusplus
 extern "C" {
 #endif
 /*
 ATTENTION!
 If you would like a :
 -- a utility that will handle the caching of fft objects
 -- real-only (no imaginary time component ) FFT
 -- a multi-dimensional FFT
 -- a command-line utility to perform ffts
 -- a command-line utility to perform fast-convolution filtering
 Then see kfc.h kiss_fftr_f32.h kiss_fft_f32nd.h fftutil.c kiss_fastfir.c
  in the tools/ directory.
 */
 #define KISS_FFT_F32_MALLOC malloc
 #define KISS_FFT_F32_FREE free
 #define kiss_fft_f32_scalar float
 typedef struct {
    kiss_fft_f32_scalar r;
    kiss_fft_f32_scalar i;
 }kiss_fft_f32_cpx;
 typedef struct kiss_fft_f32_state* kiss_fft_f32_cfg;
 /* 
 *  kiss_fft_f32_alloc
 *  
 *  Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
 *
 *  typical usage:      kiss_fft_f32_cfg mycfg=kiss_fft_f32_alloc(1024,0,NULL,NULL);
 *
 *  The return value from fft_alloc is a cfg buffer used internally
 *  by the fft routine or NULL.
 *
 *  If lenmem is NULL, then kiss_fft_f32_alloc will allocate a cfg buffer using malloc.
 *  The returned value should be free()d when done to avoid memory leaks.
 *  
 *  The state can be placed in a user supplied buffer 'mem':
 *  If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
 *      then the function places the cfg in mem and the size used in *lenmem
 *      and returns mem.
 *  
 *  If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
 *      then the function returns NULL and places the minimum cfg 
 *      buffer size in *lenmem.
 * */
 kiss_fft_f32_cfg kiss_fft_f32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem); 
 /*
 * kiss_fft_f32(cfg,in_out_buf)
 *
 * Perform an FFT on a complex input buffer.
 * for a forward FFT,
 * fin should be  f[0] , f[1] , ... ,f[nfft-1]
 * fout will be   F[0] , F[1] , ... ,F[nfft-1]
 * Note that each element is complex and can be accessed like
    f[k].r and f[k].i
 * */
 void kiss_fft_f32(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout);
 /*
 A more generic version of the above function. It reads its input from every Nth sample.
 * */
 void kiss_fft_f32_stride(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout,int fin_stride);
 /* If kiss_fft_f32_alloc allocated a buffer, it is one contiguous 
   buffer and can be simply free()d when no longer needed*/
 #define kiss_fft_f32_free KISS_FFT_F32_FREE
 /*
 Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up 
 your compiler output to call this before you exit.
 */
 void kiss_fft_f32_cleanup(void);
 /*
 * Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5)
 */
 int kiss_fft_f32_next_fast_size(int n);
 /* for real ffts, we need an even size */
 #define kiss_fftr_f32_next_fast_size_real(n) \
        (kiss_fft_f32_next_fast_size( ((n)+1)>>1)<<1)
 #ifdef __cplusplus
 } 
 #endif
 #endif
--- a/src/modules/module-filter-chain/kiss_fftr_f32.c
+++ b/src/modules/module-filter-chain/kiss_fftr_f32.c
@ -1,148 +0,0 @@
 /*
 *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
 *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
 *
 *  SPDX-License-Identifier: BSD-3-Clause
 *  See COPYING file for more information.
 */
 #include "kiss_fftr_f32.h"
 #include "_kiss_fft_guts_f32.h"
 struct kiss_fftr_f32_state
 {
  kiss_fft_f32_cfg substate;
  kiss_fft_f32_cpx *tmpbuf;
  kiss_fft_f32_cpx *super_twiddles;
 #ifdef USE_SIMD
  void *pad;
 #endif
 };
 kiss_fftr_f32_cfg
 kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
 {
  int i;
  kiss_fftr_f32_cfg st = NULL;
  size_t subsize = 0, memneeded;
  nfft >>= 1;
  kiss_fft_f32_alloc (nfft, inverse_fft, NULL, &subsize);
  memneeded =
      ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)) +
      ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 3 / 2);
  if (lenmem == NULL) {
    st = (kiss_fftr_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
  } else {
    if (*lenmem >= memneeded)
      st = (kiss_fftr_f32_cfg) mem;
    *lenmem = memneeded;
  }
  if (!st)
    return NULL;
  st->substate = (kiss_fft_f32_cfg) (((char *) st) + ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)));       /*just beyond kiss_fftr_f32_state struct */
  st->tmpbuf =
      (kiss_fft_f32_cpx *) (((char *) st->substate) + ALIGN_STRUCT (subsize));
  st->super_twiddles = st->tmpbuf + nfft;
  kiss_fft_f32_alloc (nfft, inverse_fft, st->substate, &subsize);
  for (i = 0; i < nfft / 2; ++i) {
    double phase =
        -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5);
    if (inverse_fft)
      phase *= -1;
    kf_cexp (st->super_twiddles + i, phase);
  }
  return st;
 }
 void
 kiss_fftr_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_scalar * timedata,
    kiss_fft_f32_cpx * freqdata)
 {
  /* input buffer timedata is stored row-wise */
  int k, ncfft;
  kiss_fft_f32_cpx fpnk, fpk, f1k, f2k, tw, tdc;
  ncfft = st->substate->nfft;
  /*perform the parallel fft of two real signals packed in real,imag */
  kiss_fft_f32 (st->substate, (const kiss_fft_f32_cpx *) timedata, st->tmpbuf);
  /* The real part of the DC element of the frequency spectrum in st->tmpbuf
   * contains the sum of the even-numbered elements of the input time sequence
   * The imag part is the sum of the odd-numbered elements
   *
   * The sum of tdc.r and tdc.i is the sum of the input time sequence. 
   *      yielding DC of input time sequence
   * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1... 
   *      yielding Nyquist bin of input time sequence
   */
  tdc.r = st->tmpbuf[0].r;
  tdc.i = st->tmpbuf[0].i;
  C_FIXDIV (tdc, 2);
  CHECK_OVERFLOW_OP (tdc.r, +, tdc.i);
  CHECK_OVERFLOW_OP (tdc.r, -, tdc.i);
  freqdata[0].r = tdc.r + tdc.i;
  freqdata[ncfft].r = tdc.r - tdc.i;
 #ifdef USE_SIMD
  freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps (0);
 #else
  freqdata[ncfft].i = freqdata[0].i = 0;
 #endif
  for (k = 1; k <= ncfft / 2; ++k) {
    fpk = st->tmpbuf[k];
    fpnk.r = st->tmpbuf[ncfft - k].r;
    fpnk.i = -st->tmpbuf[ncfft - k].i;
    C_FIXDIV (fpk, 2);
    C_FIXDIV (fpnk, 2);
    C_ADD (f1k, fpk, fpnk);
    C_SUB (f2k, fpk, fpnk);
    C_MUL (tw, f2k, st->super_twiddles[k - 1]);
    freqdata[k].r = HALF_OF (f1k.r + tw.r);
    freqdata[k].i = HALF_OF (f1k.i + tw.i);
    freqdata[ncfft - k].r = HALF_OF (f1k.r - tw.r);
    freqdata[ncfft - k].i = HALF_OF (tw.i - f1k.i);
  }
 }
 void
 kiss_fftri_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_cpx * freqdata,
    kiss_fft_f32_scalar * timedata)
 {
  /* input buffer timedata is stored row-wise */
  int k, ncfft;
  ncfft = st->substate->nfft;
  st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r;
  st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
  C_FIXDIV (st->tmpbuf[0], 2);
  for (k = 1; k <= ncfft / 2; ++k) {
    kiss_fft_f32_cpx fk, fnkc, fek, fok, tmp;
    fk = freqdata[k];
    fnkc.r = freqdata[ncfft - k].r;
    fnkc.i = -freqdata[ncfft - k].i;
    C_FIXDIV (fk, 2);
    C_FIXDIV (fnkc, 2);
    C_ADD (fek, fk, fnkc);
    C_SUB (tmp, fk, fnkc);
    C_MUL (fok, tmp, st->super_twiddles[k - 1]);
    C_ADD (st->tmpbuf[k], fek, fok);
    C_SUB (st->tmpbuf[ncfft - k], fek, fok);
 #ifdef USE_SIMD
    st->tmpbuf[ncfft - k].i *= _mm_set1_ps (-1.0);
 #else
    st->tmpbuf[ncfft - k].i *= -1;
 #endif
  }
  kiss_fft_f32 (st->substate, st->tmpbuf, (kiss_fft_f32_cpx *) timedata);
 }
--- a/src/modules/module-filter-chain/kiss_fftr_f32.h
+++ b/src/modules/module-filter-chain/kiss_fftr_f32.h
@ -1,54 +0,0 @@
 /*
 *  Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
 *  This file is part of KISS FFT - https://github.com/mborgerding/kissfft
 *
 *  SPDX-License-Identifier: BSD-3-Clause
 *  See COPYING file for more information.
 */
 #ifndef KISS_FTR_H
 #define KISS_FTR_H
 #include "kiss_fft_f32.h"
 #ifdef __cplusplus
 extern "C" {
 #endif
 /* 
 Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
 */
 typedef struct kiss_fftr_f32_state *kiss_fftr_f32_cfg;
 kiss_fftr_f32_cfg kiss_fftr_f32_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
 /*
 nfft must be even
 If you don't care to allocate space, use mem = lenmem = NULL 
 */
 void kiss_fftr_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_scalar *timedata,kiss_fft_f32_cpx *freqdata);
 /*
 input timedata has nfft scalar points
 output freqdata has nfft/2+1 complex points
 */
 void kiss_fftri_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_cpx *freqdata,kiss_fft_f32_scalar *timedata);
 /*
 input freqdata has  nfft/2+1 complex points
 output timedata has nfft scalar points
 */
 #define kiss_fftr_f32_free KISS_FFT_F32_FREE
 #ifdef __cplusplus
 }
 #endif
 #endif
--- a/src/modules/module-filter-chain/pffft.c
+++ b/src/modules/module-filter-chain/pffft.c
--- a/src/modules/module-filter-chain/pffft.h
+++ b/src/modules/module-filter-chain/pffft.h
@ -0,0 +1,177 @@
 /* Copyright (c) 2013  Julien Pommier ( pommier@modartt.com )
   Based on original fortran 77 code from FFTPACKv4 from NETLIB,
   authored by Dr Paul Swarztrauber of NCAR, in 1985.
   As confirmed by the NCAR fftpack software curators, the following
   FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
   released under the same terms.
   FFTPACK license:
   http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
   Copyright (c) 2004 the University Corporation for Atmospheric
   Research ("UCAR"). All rights reserved. Developed by NCAR's
   Computational and Information Systems Laboratory, UCAR,
   www.cisl.ucar.edu.
   Redistribution and use of the Software in source and binary forms,
   with or without modification, is permitted provided that the
   following conditions are met:
   - Neither the names of NCAR's Computational and Information Systems
   Laboratory, the University Corporation for Atmospheric Research,
   nor the names of its sponsors or contributors may be used to
   endorse or promote products derived from this Software without
   specific prior written permission.
   - Redistributions of source code must retain the above copyright
   notices, this list of conditions, and the disclaimer below.
   - Redistributions in binary form must reproduce the above copyright
   notice, this list of conditions, and the disclaimer below in the
   documentation and/or other materials provided with the
   distribution.
   THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
   EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
   NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
   HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
   EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
   ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
   CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
   SOFTWARE.
 */
 /*
   PFFFT : a Pretty Fast FFT.
   This is basically an adaptation of the single precision fftpack
   (v4) as found on netlib taking advantage of SIMD instruction found
   on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
   For architectures where no SIMD instruction is available, the code
   falls back to a scalar version.
   Restrictions:
   - 1D transforms only, with 32-bit single precision.
   - supports only transforms for inputs of length N of the form
   N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
   144, 160, etc are all acceptable lengths). Performance is best for
   128<=N<=8192.
   - all (float*) pointers in the functions below are expected to
   have an "simd-compatible" alignment, that is 16 bytes on x86 and
   powerpc CPUs.
   You can allocate such buffers with the functions
   pffft_aligned_malloc / pffft_aligned_free (or with stuff like
   posix_memalign..)
 */
 #ifndef PFFFT_H
 #define PFFFT_H
 #include <stddef.h> // for size_t
 #ifdef __cplusplus
 extern "C" {
 #endif
  /* opaque struct holding internal stuff (precomputed twiddle factors)
     this struct can be shared by many threads as it contains only
     read-only data.
  */
  typedef struct PFFFT_Setup PFFFT_Setup;
  /* direction of the transform */
  typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
  /* type of transform */
  typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
  /*
    prepare for performing transforms of size N -- the returned
    PFFFT_Setup structure is read-only so it can safely be shared by
    multiple concurrent threads.
  */
  PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
  void pffft_destroy_setup(PFFFT_Setup *);
  /*
     Perform a Fourier transform , The z-domain data is stored in the
     most efficient order for transforming it back, or using it for
     convolution. If you need to have its content sorted in the
     "usual" way, that is as an array of interleaved complex numbers,
     either use pffft_transform_ordered , or call pffft_zreorder after
     the forward fft, and before the backward fft.
     Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
     Typically you will want to scale the backward transform by 1/N.
     The 'work' pointer should point to an area of N (2*N for complex
     fft) floats, properly aligned. If 'work' is NULL, then stack will
     be used instead (this is probably the best strategy for small
     FFTs, say for N < 16384).
     input and output may alias.
  */
  void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
  /*
     Similar to pffft_transform, but makes sure that the output is
     ordered as expected (interleaved complex numbers).  This is
     similar to calling pffft_transform and then pffft_zreorder.
     input and output may alias.
  */
  void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
  /*
     call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
     PFFFT_FORWARD) if you want to have the frequency components in
     the correct "canonical" order, as interleaved complex numbers.
     (for real transforms, both 0-frequency and half frequency
     components, which are real, are assembled in the first entry as
     F(0)+i*F(n/2+1). Note that the original fftpack did place
     F(n/2+1) at the end of the arrays).
     input and output should not alias.
  */
  void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
  /*
     Perform a multiplication of the frequency components of dft_a and
     dft_b and accumulate them into dft_ab. The arrays should have
     been obtained with pffft_transform(.., PFFFT_FORWARD) and should
     *not* have been reordered with pffft_zreorder (otherwise just
     perform the operation yourself as the dft coefs are stored as
     interleaved complex numbers).
     the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
     The dft_a, dft_b and dft_ab pointers may alias.
  */
  void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
  /*
    the float buffers must have the correct alignment (16-byte boundary
    on intel and powerpc). This function may be used to obtain such
    correctly aligned buffers. 
  */
  void *pffft_aligned_malloc(size_t nb_bytes);
  void pffft_aligned_free(void *);
  /* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
  int pffft_simd_size();
 #ifdef __cplusplus
 }
 #endif
 #endif // PFFFT_H