diff --git a/src/modules/meson.build b/src/modules/meson.build index cf456999e..755c8a084 100644 --- a/src/modules/meson.build +++ b/src/modules/meson.build @@ -50,8 +50,7 @@ pipewire_module_filter_chain = shared_library('pipewire-module-filter-chain', 'module-filter-chain/biquad.c', 'module-filter-chain/ladspa_plugin.c', 'module-filter-chain/builtin_plugin.c', - 'module-filter-chain/kiss_fft_f32.c', - 'module-filter-chain/kiss_fftr_f32.c', + 'module-filter-chain/pffft.c', 'module-filter-chain/convolver.c' ], include_directories : [configinc, spa_inc], install : true, diff --git a/src/modules/module-filter-chain/_kiss_fft_guts_f32.h b/src/modules/module-filter-chain/_kiss_fft_guts_f32.h deleted file mode 100644 index c0cf744f9..000000000 --- a/src/modules/module-filter-chain/_kiss_fft_guts_f32.h +++ /dev/null @@ -1,173 +0,0 @@ -/* - * Copyright (c) 2003-2010, Mark Borgerding. All rights reserved. - * This file is part of KISS FFT - https://github.com/mborgerding/kissfft - * - * SPDX-License-Identifier: BSD-3-Clause - * See COPYING file for more information. - */ - -/* kiss_fft_f32.h - defines kiss_fft_f32_scalar as either short or a float type - and defines - typedef struct { kiss_fft_f32_scalar r; kiss_fft_f32_scalar i; }kiss_fft_f32_cpx; */ -#include "kiss_fft_f32.h" -#include - -/* The 2*sizeof(size_t) alignment here is borrowed from - * GNU libc, so it should be good most everywhere. - * It is more conservative than is needed on some 64-bit - * platforms, but ia64 does require a 16-byte alignment. - * The SIMD extensions for x86 and ppc32 would want a - * larger alignment than this, but we don't need to - * do better than malloc. - * - * Borrowed from GLib's gobject/gtype.c - */ -#define STRUCT_ALIGNMENT (2 * sizeof (size_t)) -#define ALIGN_STRUCT(offset) \ - ((offset + (STRUCT_ALIGNMENT - 1)) & -STRUCT_ALIGNMENT) - -#define MAXFACTORS 32 -/* e.g. an fft of length 128 has 4 factors - as far as kissfft is concerned - 4*4*4*2 - */ - -struct kiss_fft_f32_state{ - int nfft; - int inverse; - int factors[2*MAXFACTORS]; - kiss_fft_f32_cpx twiddles[1]; -}; - -/* - Explanation of macros dealing with complex math: - - C_MUL(m,a,b) : m = a*b - C_FIXDIV( c , div ) : if a fixed point impl., c /= div. 
noop otherwise - C_SUB( res, a,b) : res = a - b - C_SUBFROM( res , a) : res -= a - C_ADDTO( res , a) : res += a - * */ -#ifdef FIXED_POINT -#include -#if (FIXED_POINT==32) -# define FRACBITS 31 -# define SAMPPROD int64_t -#define SAMP_MAX INT32_MAX -#define SAMP_MIN INT32_MIN -#else -# define FRACBITS 15 -# define SAMPPROD int32_t -#define SAMP_MAX INT16_MAX -#define SAMP_MIN INT16_MIN -#endif - -#if defined(CHECK_OVERFLOW) -# define CHECK_OVERFLOW_OP(a,op,b) \ - if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \ - g_critical("overflow @ " __FILE__ "(%d): (%d " #op" %d) = %ld",__LINE__,(a),(b),(SAMPPROD)(a) op (SAMPPROD)(b) ); } -#endif - - -# define smul(a,b) ( (SAMPPROD)(a)*(b) ) -# define sround( x ) (kiss_fft_f32_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS ) - -# define S_MUL(a,b) sround( smul(a,b) ) - -# define C_MUL(m,a,b) \ - do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \ - (m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0) - -# define DIVSCALAR(x,k) \ - (x) = sround( smul( x, SAMP_MAX/k ) ) - -# define C_FIXDIV(c,div) \ - do { DIVSCALAR( (c).r , div); \ - DIVSCALAR( (c).i , div); }while (0) - -# define C_MULBYSCALAR( c, s ) \ - do{ (c).r = sround( smul( (c).r , s ) ) ;\ - (c).i = sround( smul( (c).i , s ) ) ; }while(0) - -#else /* not FIXED_POINT*/ - -# define S_MUL(a,b) ( (a)*(b) ) -#define C_MUL(m,a,b) \ - do{ (m).r = (a).r*(b).r - (a).i*(b).i;\ - (m).i = (a).r*(b).i + (a).i*(b).r; }while(0) -# define C_FIXDIV(c,div) /* NOOP */ -# define C_MULBYSCALAR( c, s ) \ - do{ (c).r *= (s);\ - (c).i *= (s); }while(0) -#endif - -#ifndef CHECK_OVERFLOW_OP -# define CHECK_OVERFLOW_OP(a,op,b) /* noop */ -#endif - -#define C_ADD( res, a,b)\ - do { \ - CHECK_OVERFLOW_OP((a).r,+,(b).r)\ - CHECK_OVERFLOW_OP((a).i,+,(b).i)\ - (res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \ - }while(0) -#define C_SUB( res, a,b)\ - do { \ - CHECK_OVERFLOW_OP((a).r,-,(b).r)\ - CHECK_OVERFLOW_OP((a).i,-,(b).i)\ - (res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \ - }while(0) -#define C_ADDTO( res , a)\ - do { \ - CHECK_OVERFLOW_OP((res).r,+,(a).r)\ - CHECK_OVERFLOW_OP((res).i,+,(a).i)\ - (res).r += (a).r; (res).i += (a).i;\ - }while(0) - -#define C_SUBFROM( res , a)\ - do {\ - CHECK_OVERFLOW_OP((res).r,-,(a).r)\ - CHECK_OVERFLOW_OP((res).i,-,(a).i)\ - (res).r -= (a).r; (res).i -= (a).i; \ - }while(0) - - -#ifdef FIXED_POINT -# define KISS_FFT_F32_COS(phase) floor(.5+SAMP_MAX * cos (phase)) -# define KISS_FFT_F32_SIN(phase) floor(.5+SAMP_MAX * sin (phase)) -# define HALF_OF(x) ((x)>>1) -#elif defined(USE_SIMD) -# define KISS_FFT_F32_COS(phase) _mm_set1_ps( cos(phase) ) -# define KISS_FFT_F32_SIN(phase) _mm_set1_ps( sin(phase) ) -# define HALF_OF(x) ((x)*_mm_set1_ps(.5)) -#else -# define KISS_FFT_F32_COS(phase) (kiss_fft_f32_scalar) cos(phase) -# define KISS_FFT_F32_SIN(phase) (kiss_fft_f32_scalar) sin(phase) -# define HALF_OF(x) ((x)*.5) -#endif - -#define kf_cexp(x,phase) \ - do{ \ - (x)->r = KISS_FFT_F32_COS(phase);\ - (x)->i = KISS_FFT_F32_SIN(phase);\ - }while(0) - - -/* a debugging function */ -#define pcpx(c)\ - fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) ) - - -#ifdef KISS_FFT_F32_USE_ALLOCA -// define this to allow use of alloca instead of malloc for temporary buffers -// Temporary buffers are used in two case: -// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5 -// 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform. 
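
A hedged aside on the data-layout change driving the convolver.c hunks further below (illustrative only, not part of the patch): kiss_fftr_f32() produced nfft/2+1 kiss_fft_f32_cpx bins in natural frequency order, while pffft_transform() writes its spectrum in an internal, SIMD-friendly order that only pffft_zconvolve_accumulate() and the backward transform understand; pffft_transform_ordered() exists for callers that need canonical bins, at extra cost. The convolver never inspects individual bins, which is why it can wrap the unordered layout in an opaque struct fft_cpx { float *v; }. The size 256 below is hypothetical; with SIMD enabled, pffft real transforms need the size to be a multiple of 32.

#include <string.h>
#include "pffft.h"

static void spectrum_layouts_example(void)
{
	const int n = 256;
	PFFFT_Setup *s = pffft_new_setup(n, PFFFT_REAL);
	/* pffft wants 16-byte aligned buffers of n floats */
	float *in = pffft_aligned_malloc(n * sizeof(float));
	float *internal = pffft_aligned_malloc(n * sizeof(float));
	float *canonical = pffft_aligned_malloc(n * sizeof(float));

	memset(in, 0, n * sizeof(float));	/* ...fill with real samples... */

	/* internal order: only meaningful to pffft itself */
	pffft_transform(s, in, internal, NULL, PFFFT_FORWARD);
	/* canonical order: use when individual bins must be read */
	pffft_transform_ordered(s, in, canonical, NULL, PFFFT_FORWARD);

	pffft_aligned_free(canonical);
	pffft_aligned_free(internal);
	pffft_aligned_free(in);
	pffft_destroy_setup(s);
}
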
-#include -#define KISS_FFT_F32_TMP_ALLOC(nbytes) alloca(nbytes) -#define KISS_FFT_F32_TMP_FREE(ptr) -#else -#define KISS_FFT_F32_TMP_ALLOC(nbytes) KISS_FFT_F32_MALLOC(nbytes) -#define KISS_FFT_F32_TMP_FREE(ptr) KISS_FFT_F32_FREE(ptr) -#endif diff --git a/src/modules/module-filter-chain/convolver.c b/src/modules/module-filter-chain/convolver.c index 29f5d2e44..e58a4531d 100644 --- a/src/modules/module-filter-chain/convolver.c +++ b/src/modules/module-filter-chain/convolver.c @@ -28,8 +28,14 @@ #include -#include "kiss_fft_f32.h" -#include "kiss_fftr_f32.h" +#include +#include + +#include "pffft.h" + +struct fft_cpx { + float *v; +}; struct convolver1 { int blockSize; @@ -37,16 +43,16 @@ struct convolver1 { int segCount; int fftComplexSize; - kiss_fft_f32_cpx **segments; - kiss_fft_f32_cpx **segmentsIr; + struct fft_cpx *segments; + struct fft_cpx *segmentsIr; float *fft_buffer; void *fft; void *ifft; - kiss_fft_f32_cpx *pre_mult; - kiss_fft_f32_cpx *conv; + struct fft_cpx pre_mult; + struct fft_cpx conv; float *overlap; float *inputBuffer; @@ -55,6 +61,38 @@ struct convolver1 { int current; }; +static void *fft_alloc(int size) +{ + void *d; + d = pffft_aligned_malloc(size); + memset(d, 0, size); + return d; +} +static void fft_free(void *data) +{ + pffft_aligned_free(data); +} + +static void fft_cpx_init(struct fft_cpx *cpx, int size) +{ + cpx->v = fft_alloc(size * 2 * sizeof(float)); +} + +static void fft_cpx_free(struct fft_cpx *cpx) +{ + fft_free(cpx->v); +} + +static void fft_cpx_clear(struct fft_cpx *cpx, int size) +{ + memset(cpx->v, 0, sizeof(float) * 2 * size); +} + +static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size) +{ + memcpy(dst->v, src->v, sizeof(float) * 2 * size); +} + static int next_power_of_two(int val) { int r = 1; @@ -63,6 +101,37 @@ static int next_power_of_two(int val) return r; } +static inline void *fft_new(int size) +{ + return pffft_new_setup(size, PFFFT_REAL); +} + +static inline void *ifft_new(int size) +{ + return pffft_new_setup(size, PFFFT_REAL); +} + +static inline void fft_destroy(void *fft) +{ + pffft_destroy_setup(fft); +} + +static inline void fft_run(void *fft, float *in, struct fft_cpx *out) +{ + pffft_transform(fft, in, out->v, NULL, PFFFT_FORWARD); +} + +static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out) +{ + pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD); +} + +static inline void fft_convolve_accum(void *fft, struct fft_cpx *r, + const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale) +{ + pffft_zconvolve_accumulate(fft, a->v, b->v, r->v, scale); +} + static struct convolver1 *convolver1_new(int block, const float *ir, int irlen) { struct convolver1 *conv; @@ -86,37 +155,37 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen) conv->segCount = (irlen + conv->blockSize-1) / conv->blockSize; conv->fftComplexSize = (conv->segSize / 2) + 1; - conv->fft = kiss_fftr_f32_alloc(conv->segSize, 0, NULL, NULL); + conv->fft = fft_new(conv->segSize); if (conv->fft == NULL) return NULL; - conv->ifft = kiss_fftr_f32_alloc(conv->segSize, 1, NULL, NULL); + conv->ifft = ifft_new(conv->segSize); if (conv->ifft == NULL) return NULL; - conv->fft_buffer = calloc(sizeof(float), conv->segSize); + conv->fft_buffer = fft_alloc(sizeof(float) * conv->segSize); if (conv->fft_buffer == NULL) return NULL; - conv->segments = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount); - conv->segmentsIr = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount); + conv->segments = 
calloc(sizeof(struct fft_cpx), conv->segCount); + conv->segmentsIr = calloc(sizeof(struct fft_cpx), conv->segCount); for (i = 0; i < conv->segCount; i++) { int left = irlen - (i * conv->blockSize); int copy = SPA_MIN(conv->blockSize, left); - conv->segments[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); - conv->segmentsIr[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); + fft_cpx_init(&conv->segments[i], conv->fftComplexSize); + fft_cpx_init(&conv->segmentsIr[i], conv->fftComplexSize); memcpy(conv->fft_buffer, &ir[i * conv->blockSize], copy * sizeof(float)); if (copy < conv->segSize) memset(conv->fft_buffer + copy, 0, (conv->segSize - copy) * sizeof(float)); - kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segmentsIr[i]); + fft_run(conv->fft, conv->fft_buffer, &conv->segmentsIr[i]); } - conv->pre_mult = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); - conv->conv = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); - conv->overlap = calloc(sizeof(float), conv->blockSize); - conv->inputBuffer = calloc(sizeof(float), conv->blockSize); + fft_cpx_init(&conv->pre_mult, conv->fftComplexSize); + fft_cpx_init(&conv->conv, conv->fftComplexSize); + conv->overlap = fft_alloc(sizeof(float) * conv->blockSize); + conv->inputBuffer = fft_alloc(sizeof(float) * conv->blockSize); conv->inputBufferFill = 0; conv->current = 0; @@ -127,35 +196,38 @@ static void convolver1_free(struct convolver1 *conv) { int i; for (i = 0; i < conv->segCount; i++) { - free(conv->segments[i]); - free(conv->segmentsIr[i]); + fft_cpx_free(&conv->segments[i]); + fft_cpx_free(&conv->segmentsIr[i]); } - free(conv->fft); - free(conv->ifft); - free(conv->fft_buffer); + fft_destroy(conv->fft); + fft_destroy(conv->ifft); + fft_free(conv->fft_buffer); free(conv->segments); free(conv->segmentsIr); - free(conv->pre_mult); - free(conv->conv); - free(conv->overlap); - free(conv->inputBuffer); + fft_cpx_free(&conv->pre_mult); + fft_cpx_free(&conv->conv); + fft_free(conv->overlap); + fft_free(conv->inputBuffer); free(conv); } void Sum(float* result, const float* a, const float* b, int len) { int i; +#if defined (__SSE__) + const int end4 = 4 * (len / 4); + for (i = 0; i < end4; i += 4) { + const __m128 va = _mm_load_ps(&a[i]); + const __m128 vb = _mm_load_ps(&b[i]); + _mm_store_ps(&result[i], _mm_add_ps(va,vb)); + } + for (i = end4; i < len; ++i) { + result[i] = a[i] + b[i]; + } +#else for (i = 0; i < len; i++) result[i] = a[i] + b[i]; -} - -void ComplexMultiplyAccumulate(kiss_fft_f32_cpx *r, const kiss_fft_f32_cpx *a, const kiss_fft_f32_cpx *b, int len) -{ - int i; - for (i = 0; i < len; i++) { - r[i].r += a[i].r * b[i].r - a[i].i * b[i].i; - r[i].i += a[i].r * b[i].i + a[i].i * b[i].r; - } +#endif } static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len) @@ -176,30 +248,27 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou memcpy(conv->fft_buffer, conv->inputBuffer, conv->blockSize * sizeof(float)); memset(conv->fft_buffer + conv->blockSize, 0, (conv->segSize - conv->blockSize) * sizeof(float)); - kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segments[conv->current]); + fft_run(conv->fft, conv->fft_buffer, &conv->segments[conv->current]); if (conv->inputBufferFill == 0) { - memset(conv->pre_mult, 0, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize); + fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize); for (i = 1; i < conv->segCount; i++) { const int indexIr = i; const int indexAudio = (conv->current + i) % conv->segCount; - 
ComplexMultiplyAccumulate(conv->pre_mult, - conv->segmentsIr[indexIr], - conv->segments[indexAudio], - conv->fftComplexSize); + fft_convolve_accum(conv->fft, &conv->pre_mult, + &conv->segmentsIr[indexIr], + &conv->segments[indexAudio], + conv->fftComplexSize, 1.0f / conv->segSize); } } - memcpy(conv->conv, conv->pre_mult, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize); + fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize); - ComplexMultiplyAccumulate(conv->conv, conv->segments[conv->current], conv->segmentsIr[0], - conv->fftComplexSize); + fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0], + conv->fftComplexSize, 1.0f / conv->segSize); - kiss_fftri_f32(conv->ifft, conv->conv, conv->fft_buffer); - - for (i = 0; i < conv->segSize; i++) - conv->fft_buffer[i] /= conv->segSize; + ifft_run(conv->ifft, &conv->conv, conv->fft_buffer); Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing); @@ -265,19 +334,19 @@ struct convolver *convolver_new(int head_block, int tail_block, const float *ir, if (irlen > conv->tailBlockSize) { int conv1IrLen = SPA_MIN(irlen - conv->tailBlockSize, conv->tailBlockSize); conv->tailConvolver0 = convolver1_new(conv->headBlockSize, ir + conv->tailBlockSize, conv1IrLen); - conv->tailOutput0 = calloc(conv->tailBlockSize, sizeof(float)); - conv->tailPrecalculated0 = calloc(conv->tailBlockSize, sizeof(float)); + conv->tailOutput0 = fft_alloc(conv->tailBlockSize * sizeof(float)); + conv->tailPrecalculated0 = fft_alloc(conv->tailBlockSize * sizeof(float)); } if (irlen > 2 * conv->tailBlockSize) { int tailIrLen = irlen - (2 * conv->tailBlockSize); conv->tailConvolver = convolver1_new(conv->tailBlockSize, ir + (2 * conv->tailBlockSize), tailIrLen); - conv->tailOutput = calloc(conv->tailBlockSize, sizeof(float)); - conv->tailPrecalculated = calloc(conv->tailBlockSize, sizeof(float)); + conv->tailOutput = fft_alloc(conv->tailBlockSize * sizeof(float)); + conv->tailPrecalculated = fft_alloc(conv->tailBlockSize * sizeof(float)); } if (conv->tailConvolver0 || conv->tailConvolver) - conv->tailInput = calloc(conv->tailBlockSize, sizeof(float)); + conv->tailInput = fft_alloc(conv->tailBlockSize * sizeof(float)); conv->tailInputFill = 0; conv->precalculatedPos = 0; @@ -293,11 +362,11 @@ void convolver_free(struct convolver *conv) convolver1_free(conv->tailConvolver0); if (conv->tailConvolver) convolver1_free(conv->tailConvolver); - free(conv->tailOutput0); - free(conv->tailPrecalculated0); - free(conv->tailOutput); - free(conv->tailPrecalculated); - free(conv->tailInput); + fft_free(conv->tailOutput0); + fft_free(conv->tailPrecalculated0); + fft_free(conv->tailOutput); + fft_free(conv->tailPrecalculated); + fft_free(conv->tailInput); free(conv); } diff --git a/src/modules/module-filter-chain/kiss_fft_f32.c b/src/modules/module-filter-chain/kiss_fft_f32.c deleted file mode 100644 index 092713e13..000000000 --- a/src/modules/module-filter-chain/kiss_fft_f32.c +++ /dev/null @@ -1,442 +0,0 @@ -/* - * Copyright (c) 2003-2010, Mark Borgerding. All rights reserved. - * This file is part of KISS FFT - https://github.com/mborgerding/kissfft - * - * SPDX-License-Identifier: BSD-3-Clause - * See COPYING file for more information. - */ - - -#include "_kiss_fft_guts_f32.h" -/* The guts header contains all the multiplication and addition macros that are defined for - fixed or floating point complex numbers. It also delares the kf_ internal functions. 
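
The convolver1 hunks above replace kiss_fftr_f32()/kiss_fftri_f32() plus ComplexMultiplyAccumulate() with pffft calls, and fold the old per-sample division by segSize into the scale argument of fft_convolve_accum(). A minimal sketch of that round trip (illustrative only; names and the block size are hypothetical, buffers must be 16-byte aligned and hold n floats, and n must be a multiple of 32 for pffft real transforms when SIMD is enabled):

#include <string.h>
#include "pffft.h"

/* Sketch: forward FFT -> frequency-domain multiply-accumulate -> inverse FFT.
 * pffft transforms are unscaled, so the 1.0f/n factor is passed to
 * pffft_zconvolve_accumulate(), matching fft_convolve_accum() above with
 * its 1.0f / conv->segSize. */
static void fft_convolve_once(const float *x, const float *h, float *y, int n)
{
	PFFFT_Setup *setup = pffft_new_setup(n, PFFFT_REAL);
	float *X = pffft_aligned_malloc(n * sizeof(float));
	float *H = pffft_aligned_malloc(n * sizeof(float));
	float *Y = pffft_aligned_malloc(n * sizeof(float));

	memset(Y, 0, n * sizeof(float));

	/* unordered spectra are fine here: they are only combined with
	 * pffft_zconvolve_accumulate() and transformed straight back */
	pffft_transform(setup, x, X, NULL, PFFFT_FORWARD);
	pffft_transform(setup, h, H, NULL, PFFFT_FORWARD);
	pffft_zconvolve_accumulate(setup, X, H, Y, 1.0f / n);
	pffft_transform(setup, Y, y, NULL, PFFFT_BACKWARD);

	pffft_aligned_free(Y);
	pffft_aligned_free(H);
	pffft_aligned_free(X);
	pffft_destroy_setup(setup);
}
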
- */ - -static void -kf_bfly2 (kiss_fft_f32_cpx * Fout, - const size_t fstride, const kiss_fft_f32_cfg st, int m) -{ - kiss_fft_f32_cpx *Fout2; - kiss_fft_f32_cpx *tw1 = st->twiddles; - kiss_fft_f32_cpx t; - Fout2 = Fout + m; - do { - C_FIXDIV (*Fout, 2); - C_FIXDIV (*Fout2, 2); - - C_MUL (t, *Fout2, *tw1); - tw1 += fstride; - C_SUB (*Fout2, *Fout, t); - C_ADDTO (*Fout, t); - ++Fout2; - ++Fout; - } while (--m); -} - -static void -kf_bfly4 (kiss_fft_f32_cpx * Fout, - const size_t fstride, const kiss_fft_f32_cfg st, const size_t m) -{ - kiss_fft_f32_cpx *tw1, *tw2, *tw3; - kiss_fft_f32_cpx scratch[6]; - size_t k = m; - const size_t m2 = 2 * m; - const size_t m3 = 3 * m; - - - tw3 = tw2 = tw1 = st->twiddles; - - do { - C_FIXDIV (*Fout, 4); - C_FIXDIV (Fout[m], 4); - C_FIXDIV (Fout[m2], 4); - C_FIXDIV (Fout[m3], 4); - - C_MUL (scratch[0], Fout[m], *tw1); - C_MUL (scratch[1], Fout[m2], *tw2); - C_MUL (scratch[2], Fout[m3], *tw3); - - C_SUB (scratch[5], *Fout, scratch[1]); - C_ADDTO (*Fout, scratch[1]); - C_ADD (scratch[3], scratch[0], scratch[2]); - C_SUB (scratch[4], scratch[0], scratch[2]); - C_SUB (Fout[m2], *Fout, scratch[3]); - tw1 += fstride; - tw2 += fstride * 2; - tw3 += fstride * 3; - C_ADDTO (*Fout, scratch[3]); - - if (st->inverse) { - Fout[m].r = scratch[5].r - scratch[4].i; - Fout[m].i = scratch[5].i + scratch[4].r; - Fout[m3].r = scratch[5].r + scratch[4].i; - Fout[m3].i = scratch[5].i - scratch[4].r; - } else { - Fout[m].r = scratch[5].r + scratch[4].i; - Fout[m].i = scratch[5].i - scratch[4].r; - Fout[m3].r = scratch[5].r - scratch[4].i; - Fout[m3].i = scratch[5].i + scratch[4].r; - } - ++Fout; - } while (--k); -} - -static void -kf_bfly3 (kiss_fft_f32_cpx * Fout, - const size_t fstride, const kiss_fft_f32_cfg st, size_t m) -{ - size_t k = m; - const size_t m2 = 2 * m; - kiss_fft_f32_cpx *tw1, *tw2; - kiss_fft_f32_cpx scratch[5]; - kiss_fft_f32_cpx epi3; - epi3 = st->twiddles[fstride * m]; - - tw1 = tw2 = st->twiddles; - - do { - C_FIXDIV (*Fout, 3); - C_FIXDIV (Fout[m], 3); - C_FIXDIV (Fout[m2], 3); - - C_MUL (scratch[1], Fout[m], *tw1); - C_MUL (scratch[2], Fout[m2], *tw2); - - C_ADD (scratch[3], scratch[1], scratch[2]); - C_SUB (scratch[0], scratch[1], scratch[2]); - tw1 += fstride; - tw2 += fstride * 2; - - Fout[m].r = Fout->r - HALF_OF (scratch[3].r); - Fout[m].i = Fout->i - HALF_OF (scratch[3].i); - - C_MULBYSCALAR (scratch[0], epi3.i); - - C_ADDTO (*Fout, scratch[3]); - - Fout[m2].r = Fout[m].r + scratch[0].i; - Fout[m2].i = Fout[m].i - scratch[0].r; - - Fout[m].r -= scratch[0].i; - Fout[m].i += scratch[0].r; - - ++Fout; - } while (--k); -} - -static void -kf_bfly5 (kiss_fft_f32_cpx * Fout, - const size_t fstride, const kiss_fft_f32_cfg st, int m) -{ - kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4; - int u; - kiss_fft_f32_cpx scratch[13]; - kiss_fft_f32_cpx *twiddles = st->twiddles; - kiss_fft_f32_cpx *tw; - kiss_fft_f32_cpx ya, yb; - ya = twiddles[fstride * m]; - yb = twiddles[fstride * 2 * m]; - - Fout0 = Fout; - Fout1 = Fout0 + m; - Fout2 = Fout0 + 2 * m; - Fout3 = Fout0 + 3 * m; - Fout4 = Fout0 + 4 * m; - - tw = st->twiddles; - for (u = 0; u < m; ++u) { - C_FIXDIV (*Fout0, 5); - C_FIXDIV (*Fout1, 5); - C_FIXDIV (*Fout2, 5); - C_FIXDIV (*Fout3, 5); - C_FIXDIV (*Fout4, 5); - scratch[0] = *Fout0; - - C_MUL (scratch[1], *Fout1, tw[u * fstride]); - C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]); - C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]); - C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]); - - C_ADD (scratch[7], scratch[1], scratch[4]); - C_SUB 
(scratch[10], scratch[1], scratch[4]); - C_ADD (scratch[8], scratch[2], scratch[3]); - C_SUB (scratch[9], scratch[2], scratch[3]); - - Fout0->r += scratch[7].r + scratch[8].r; - Fout0->i += scratch[7].i + scratch[8].i; - - scratch[5].r = - scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r); - scratch[5].i = - scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r); - - scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i); - scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i); - - C_SUB (*Fout1, scratch[5], scratch[6]); - C_ADD (*Fout4, scratch[5], scratch[6]); - - scratch[11].r = - scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r); - scratch[11].i = - scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r); - scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i); - scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i); - - C_ADD (*Fout2, scratch[11], scratch[12]); - C_SUB (*Fout3, scratch[11], scratch[12]); - - ++Fout0; - ++Fout1; - ++Fout2; - ++Fout3; - ++Fout4; - } -} - -/* perform the butterfly for one stage of a mixed radix FFT */ -static void -kf_bfly_generic (kiss_fft_f32_cpx * Fout, - const size_t fstride, const kiss_fft_f32_cfg st, int m, int p) -{ - int u, k, q1, q; - kiss_fft_f32_cpx *twiddles = st->twiddles; - kiss_fft_f32_cpx t; - int Norig = st->nfft; - - kiss_fft_f32_cpx *scratch = - (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) * - p); - - for (u = 0; u < m; ++u) { - k = u; - for (q1 = 0; q1 < p; ++q1) { - scratch[q1] = Fout[k]; - C_FIXDIV (scratch[q1], p); - k += m; - } - - k = u; - for (q1 = 0; q1 < p; ++q1) { - int twidx = 0; - Fout[k] = scratch[0]; - for (q = 1; q < p; ++q) { - twidx += fstride * k; - if (twidx >= Norig) - twidx -= Norig; - C_MUL (t, scratch[q], twiddles[twidx]); - C_ADDTO (Fout[k], t); - } - k += m; - } - } - KISS_FFT_F32_TMP_FREE (scratch); -} - -static void -kf_work (kiss_fft_f32_cpx * Fout, - const kiss_fft_f32_cpx * f, - const size_t fstride, int in_stride, int *factors, - const kiss_fft_f32_cfg st) -{ - kiss_fft_f32_cpx *Fout_beg = Fout; - const int p = *factors++; /* the radix */ - const int m = *factors++; /* stage's fft length/p */ - const kiss_fft_f32_cpx *Fout_end = Fout + p * m; - -#ifdef _OPENMP - // use openmp extensions at the - // top-level (not recursive) - if (fstride == 1 && p <= 5 && m != 1) { - int k; - - // execute the p different work units in different threads -# pragma omp parallel for - for (k = 0; k < p; ++k) - kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p, - in_stride, factors, st); - // all threads have joined by this point - - switch (p) { - case 2: - kf_bfly2 (Fout, fstride, st, m); - break; - case 3: - kf_bfly3 (Fout, fstride, st, m); - break; - case 4: - kf_bfly4 (Fout, fstride, st, m); - break; - case 5: - kf_bfly5 (Fout, fstride, st, m); - break; - default: - kf_bfly_generic (Fout, fstride, st, m, p); - break; - } - return; - } -#endif - - if (m == 1) { - do { - *Fout = *f; - f += fstride * in_stride; - } while (++Fout != Fout_end); - } else { - do { - // recursive call: - // DFT of size m*p performed by doing - // p instances of smaller DFTs of size m, - // each one takes a decimated version of the input - kf_work (Fout, f, fstride * p, in_stride, factors, st); - f += fstride * in_stride; - } while ((Fout += m) != Fout_end); - } - - Fout = Fout_beg; - - // recombine the p smaller DFTs - switch (p) { - case 2: - kf_bfly2 (Fout, 
fstride, st, m); - break; - case 3: - kf_bfly3 (Fout, fstride, st, m); - break; - case 4: - kf_bfly4 (Fout, fstride, st, m); - break; - case 5: - kf_bfly5 (Fout, fstride, st, m); - break; - default: - kf_bfly_generic (Fout, fstride, st, m, p); - break; - } -} - -/* facbuf is populated by p1,m1,p2,m2, ... - where - p[i] * m[i] = m[i-1] - m0 = n */ -static void -kf_factor (int n, int *facbuf) -{ - int p = 4; - double floor_sqrt; - floor_sqrt = floor (sqrt ((double) n)); - - /*factor out powers of 4, powers of 2, then any remaining primes */ - do { - while (n % p) { - switch (p) { - case 4: - p = 2; - break; - case 2: - p = 3; - break; - default: - p += 2; - break; - } - if (p > floor_sqrt) - p = n; /* no more factors, skip to end */ - } - n /= p; - *facbuf++ = p; - *facbuf++ = n; - } while (n > 1); -} - -/* - * - * User-callable function to allocate all necessary storage space for the fft. - * - * The return value is a contiguous block of memory, allocated with malloc. As such, - * It can be freed with free(), rather than a kiss_fft_f32-specific function. - * */ -kiss_fft_f32_cfg -kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) -{ - kiss_fft_f32_cfg st = NULL; - size_t memneeded = sizeof (struct kiss_fft_f32_state) - + sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */ - - if (lenmem == NULL) { - st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded); - } else { - if (mem != NULL && *lenmem >= memneeded) - st = (kiss_fft_f32_cfg) mem; - *lenmem = memneeded; - } - if (st) { - int i; - st->nfft = nfft; - st->inverse = inverse_fft; - - for (i = 0; i < nfft; ++i) { - const double pi = - 3.141592653589793238462643383279502884197169399375105820974944; - double phase = -2 * pi * i / nfft; - if (st->inverse) - phase *= -1; - kf_cexp (st->twiddles + i, phase); - } - - kf_factor (nfft, st->factors); - } - return st; -} - - -void -kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin, - kiss_fft_f32_cpx * fout, int in_stride) -{ - if (fin == fout) { - //NOTE: this is not really an in-place FFT algorithm. - //It just performs an out-of-place FFT into a temp buffer - kiss_fft_f32_cpx *tmpbuf = - (kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) * - st->nfft); - kf_work (tmpbuf, fin, 1, in_stride, st->factors, st); - memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft); - KISS_FFT_F32_TMP_FREE (tmpbuf); - } else { - kf_work (fout, fin, 1, in_stride, st->factors, st); - } -} - -void -kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin, - kiss_fft_f32_cpx * fout) -{ - kiss_fft_f32_stride (cfg, fin, fout, 1); -} - - -void -kiss_fft_f32_cleanup (void) -{ - // nothing needed any more -} - -int -kiss_fft_f32_next_fast_size (int n) -{ - while (1) { - int m = n; - while ((m % 2) == 0) - m /= 2; - while ((m % 3) == 0) - m /= 3; - while ((m % 5) == 0) - m /= 5; - if (m <= 1) - break; /* n is completely factorable by twos, threes, and fives */ - n++; - } - return n; -} diff --git a/src/modules/module-filter-chain/kiss_fft_f32.h b/src/modules/module-filter-chain/kiss_fft_f32.h deleted file mode 100644 index 1f9b57b95..000000000 --- a/src/modules/module-filter-chain/kiss_fft_f32.h +++ /dev/null @@ -1,112 +0,0 @@ -/* - * Copyright (c) 2003-2010, Mark Borgerding. All rights reserved. - * This file is part of KISS FFT - https://github.com/mborgerding/kissfft - * - * SPDX-License-Identifier: BSD-3-Clause - * See COPYING file for more information. 
- */ - -#ifndef KISS_FFT_F32_H -#define KISS_FFT_F32_H - -#include -#include -#include -#include -#include - -#ifdef __cplusplus -extern "C" { -#endif - -/* - ATTENTION! - If you would like a : - -- a utility that will handle the caching of fft objects - -- real-only (no imaginary time component ) FFT - -- a multi-dimensional FFT - -- a command-line utility to perform ffts - -- a command-line utility to perform fast-convolution filtering - - Then see kfc.h kiss_fftr_f32.h kiss_fft_f32nd.h fftutil.c kiss_fastfir.c - in the tools/ directory. -*/ - -#define KISS_FFT_F32_MALLOC malloc -#define KISS_FFT_F32_FREE free -#define kiss_fft_f32_scalar float - -typedef struct { - kiss_fft_f32_scalar r; - kiss_fft_f32_scalar i; -}kiss_fft_f32_cpx; - -typedef struct kiss_fft_f32_state* kiss_fft_f32_cfg; - -/* - * kiss_fft_f32_alloc - * - * Initialize a FFT (or IFFT) algorithm's cfg/state buffer. - * - * typical usage: kiss_fft_f32_cfg mycfg=kiss_fft_f32_alloc(1024,0,NULL,NULL); - * - * The return value from fft_alloc is a cfg buffer used internally - * by the fft routine or NULL. - * - * If lenmem is NULL, then kiss_fft_f32_alloc will allocate a cfg buffer using malloc. - * The returned value should be free()d when done to avoid memory leaks. - * - * The state can be placed in a user supplied buffer 'mem': - * If lenmem is not NULL and mem is not NULL and *lenmem is large enough, - * then the function places the cfg in mem and the size used in *lenmem - * and returns mem. - * - * If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough), - * then the function returns NULL and places the minimum cfg - * buffer size in *lenmem. - * */ - -kiss_fft_f32_cfg kiss_fft_f32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem); - -/* - * kiss_fft_f32(cfg,in_out_buf) - * - * Perform an FFT on a complex input buffer. - * for a forward FFT, - * fin should be f[0] , f[1] , ... ,f[nfft-1] - * fout will be F[0] , F[1] , ... ,F[nfft-1] - * Note that each element is complex and can be accessed like - f[k].r and f[k].i - * */ -void kiss_fft_f32(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout); - -/* - A more generic version of the above function. It reads its input from every Nth sample. - * */ -void kiss_fft_f32_stride(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout,int fin_stride); - -/* If kiss_fft_f32_alloc allocated a buffer, it is one contiguous - buffer and can be simply free()d when no longer needed*/ -#define kiss_fft_f32_free KISS_FFT_F32_FREE - -/* - Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up - your compiler output to call this before you exit. -*/ -void kiss_fft_f32_cleanup(void); - - -/* - * Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5) - */ -int kiss_fft_f32_next_fast_size(int n); - -/* for real ffts, we need an even size */ -#define kiss_fftr_f32_next_fast_size_real(n) \ - (kiss_fft_f32_next_fast_size( ((n)+1)>>1)<<1) - -#ifdef __cplusplus -} -#endif - -#endif diff --git a/src/modules/module-filter-chain/kiss_fftr_f32.c b/src/modules/module-filter-chain/kiss_fftr_f32.c deleted file mode 100644 index 7227b7ce6..000000000 --- a/src/modules/module-filter-chain/kiss_fftr_f32.c +++ /dev/null @@ -1,148 +0,0 @@ -/* - * Copyright (c) 2003-2004, Mark Borgerding. All rights reserved. - * This file is part of KISS FFT - https://github.com/mborgerding/kissfft - * - * SPDX-License-Identifier: BSD-3-Clause - * See COPYING file for more information. 
- */ - -#include "kiss_fftr_f32.h" -#include "_kiss_fft_guts_f32.h" - -struct kiss_fftr_f32_state -{ - kiss_fft_f32_cfg substate; - kiss_fft_f32_cpx *tmpbuf; - kiss_fft_f32_cpx *super_twiddles; -#ifdef USE_SIMD - void *pad; -#endif -}; - -kiss_fftr_f32_cfg -kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem) -{ - int i; - kiss_fftr_f32_cfg st = NULL; - size_t subsize = 0, memneeded; - - nfft >>= 1; - - kiss_fft_f32_alloc (nfft, inverse_fft, NULL, &subsize); - memneeded = - ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)) + - ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 3 / 2); - - if (lenmem == NULL) { - st = (kiss_fftr_f32_cfg) KISS_FFT_F32_MALLOC (memneeded); - } else { - if (*lenmem >= memneeded) - st = (kiss_fftr_f32_cfg) mem; - *lenmem = memneeded; - } - if (!st) - return NULL; - - st->substate = (kiss_fft_f32_cfg) (((char *) st) + ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state))); /*just beyond kiss_fftr_f32_state struct */ - st->tmpbuf = - (kiss_fft_f32_cpx *) (((char *) st->substate) + ALIGN_STRUCT (subsize)); - st->super_twiddles = st->tmpbuf + nfft; - kiss_fft_f32_alloc (nfft, inverse_fft, st->substate, &subsize); - - for (i = 0; i < nfft / 2; ++i) { - double phase = - -3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5); - if (inverse_fft) - phase *= -1; - kf_cexp (st->super_twiddles + i, phase); - } - return st; -} - -void -kiss_fftr_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_scalar * timedata, - kiss_fft_f32_cpx * freqdata) -{ - /* input buffer timedata is stored row-wise */ - int k, ncfft; - kiss_fft_f32_cpx fpnk, fpk, f1k, f2k, tw, tdc; - - ncfft = st->substate->nfft; - - /*perform the parallel fft of two real signals packed in real,imag */ - kiss_fft_f32 (st->substate, (const kiss_fft_f32_cpx *) timedata, st->tmpbuf); - /* The real part of the DC element of the frequency spectrum in st->tmpbuf - * contains the sum of the even-numbered elements of the input time sequence - * The imag part is the sum of the odd-numbered elements - * - * The sum of tdc.r and tdc.i is the sum of the input time sequence. - * yielding DC of input time sequence - * The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1... 
- * yielding Nyquist bin of input time sequence - */ - - tdc.r = st->tmpbuf[0].r; - tdc.i = st->tmpbuf[0].i; - C_FIXDIV (tdc, 2); - CHECK_OVERFLOW_OP (tdc.r, +, tdc.i); - CHECK_OVERFLOW_OP (tdc.r, -, tdc.i); - freqdata[0].r = tdc.r + tdc.i; - freqdata[ncfft].r = tdc.r - tdc.i; -#ifdef USE_SIMD - freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps (0); -#else - freqdata[ncfft].i = freqdata[0].i = 0; -#endif - - for (k = 1; k <= ncfft / 2; ++k) { - fpk = st->tmpbuf[k]; - fpnk.r = st->tmpbuf[ncfft - k].r; - fpnk.i = -st->tmpbuf[ncfft - k].i; - C_FIXDIV (fpk, 2); - C_FIXDIV (fpnk, 2); - - C_ADD (f1k, fpk, fpnk); - C_SUB (f2k, fpk, fpnk); - C_MUL (tw, f2k, st->super_twiddles[k - 1]); - - freqdata[k].r = HALF_OF (f1k.r + tw.r); - freqdata[k].i = HALF_OF (f1k.i + tw.i); - freqdata[ncfft - k].r = HALF_OF (f1k.r - tw.r); - freqdata[ncfft - k].i = HALF_OF (tw.i - f1k.i); - } -} - -void -kiss_fftri_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_cpx * freqdata, - kiss_fft_f32_scalar * timedata) -{ - /* input buffer timedata is stored row-wise */ - int k, ncfft; - - ncfft = st->substate->nfft; - - st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r; - st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r; - C_FIXDIV (st->tmpbuf[0], 2); - - for (k = 1; k <= ncfft / 2; ++k) { - kiss_fft_f32_cpx fk, fnkc, fek, fok, tmp; - fk = freqdata[k]; - fnkc.r = freqdata[ncfft - k].r; - fnkc.i = -freqdata[ncfft - k].i; - C_FIXDIV (fk, 2); - C_FIXDIV (fnkc, 2); - - C_ADD (fek, fk, fnkc); - C_SUB (tmp, fk, fnkc); - C_MUL (fok, tmp, st->super_twiddles[k - 1]); - C_ADD (st->tmpbuf[k], fek, fok); - C_SUB (st->tmpbuf[ncfft - k], fek, fok); -#ifdef USE_SIMD - st->tmpbuf[ncfft - k].i *= _mm_set1_ps (-1.0); -#else - st->tmpbuf[ncfft - k].i *= -1; -#endif - } - kiss_fft_f32 (st->substate, st->tmpbuf, (kiss_fft_f32_cpx *) timedata); -} diff --git a/src/modules/module-filter-chain/kiss_fftr_f32.h b/src/modules/module-filter-chain/kiss_fftr_f32.h deleted file mode 100644 index da21245f5..000000000 --- a/src/modules/module-filter-chain/kiss_fftr_f32.h +++ /dev/null @@ -1,54 +0,0 @@ -/* - * Copyright (c) 2003-2004, Mark Borgerding. All rights reserved. - * This file is part of KISS FFT - https://github.com/mborgerding/kissfft - * - * SPDX-License-Identifier: BSD-3-Clause - * See COPYING file for more information. - */ - -#ifndef KISS_FTR_H -#define KISS_FTR_H - -#include "kiss_fft_f32.h" -#ifdef __cplusplus -extern "C" { -#endif - - -/* - - Real optimized version can save about 45% cpu time vs. complex fft of a real seq. 
- - - - */ - -typedef struct kiss_fftr_f32_state *kiss_fftr_f32_cfg; - - -kiss_fftr_f32_cfg kiss_fftr_f32_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem); -/* - nfft must be even - - If you don't care to allocate space, use mem = lenmem = NULL -*/ - - -void kiss_fftr_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_scalar *timedata,kiss_fft_f32_cpx *freqdata); -/* - input timedata has nfft scalar points - output freqdata has nfft/2+1 complex points -*/ - -void kiss_fftri_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_cpx *freqdata,kiss_fft_f32_scalar *timedata); -/* - input freqdata has nfft/2+1 complex points - output timedata has nfft scalar points -*/ - -#define kiss_fftr_f32_free KISS_FFT_F32_FREE - -#ifdef __cplusplus -} -#endif -#endif diff --git a/src/modules/module-filter-chain/pffft.c b/src/modules/module-filter-chain/pffft.c new file mode 100644 index 000000000..308dc97bc --- /dev/null +++ b/src/modules/module-filter-chain/pffft.c @@ -0,0 +1,2244 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB + (http://www.netlib.org/fftpack), authored by Dr Paul Swarztrauber + of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT + HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL, + EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN + ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE + SOFTWARE. + + PFFFT : a Pretty Fast FFT. + + This file is largerly based on the original FFTPACK implementation, modified in + order to take advantage of SIMD instructions of modern CPUs. +*/ + +/* + ChangeLog: + - 2011/10/02, version 1: This is the very first release of this file. 
+*/ + +#include "pffft.h" +#include +#include +#include +#include + +/* detect compiler flavour */ +#if defined(_MSC_VER) +#define COMPILER_MSVC +#elif defined(__GNUC__) +#define COMPILER_GCC +#endif + +#if defined(COMPILER_GCC) +#define ALWAYS_INLINE(return_type) inline return_type __attribute__ ((always_inline)) +#define NEVER_INLINE(return_type) return_type __attribute__ ((noinline)) +#define RESTRICT __restrict +#define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ varname__[size__]; +#elif defined(COMPILER_MSVC) +#define ALWAYS_INLINE(return_type) __forceinline return_type +#define NEVER_INLINE(return_type) __declspec(noinline) return_type +#define RESTRICT __restrict +#define VLA_ARRAY_ON_STACK(type__, varname__, size__) type__ *varname__ = (type__*)_alloca(size__ * sizeof(type__)) +#endif + +/* + vector support macros: the rest of the code is independant of + SSE/Altivec/NEON -- adding support for other platforms with 4-element + vectors should be limited to these macros +*/ + +// define PFFFT_SIMD_DISABLE if you want to use scalar code instead of simd code +//#define PFFFT_SIMD_DISABLE + +/* + Altivec support macros +*/ +#if !defined(PFFFT_SIMD_DISABLE) && (defined(__ppc__) || defined(__ppc64__)) +typedef vector float v4sf; +#define SIMD_SZ 4 +#define VZERO() ((vector float) vec_splat_u8(0)) +#define VMUL(a,b) vec_madd(a,b, VZERO()) +#define VADD(a,b) vec_add(a,b) +#define VMADD(a,b,c) vec_madd(a,b,c) +#define VSUB(a,b) vec_sub(a,b) +inline v4sf ld_ps1(const float *p) +{ + v4sf v = vec_lde(0, p); + return vec_splat(vec_perm(v, v, vec_lvsl(0, p)), 0); +} + +#define LD_PS1(p) ld_ps1(&p) +#define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = vec_mergeh(in1, in2); out2 = vec_mergel(in1, in2); out1 = tmp__; } +#define UNINTERLEAVE2(in1, in2, out1, out2) { \ + vector unsigned char vperm1 = (vector unsigned char)(0,1,2,3,8,9,10,11,16,17,18,19,24,25,26,27); \ + vector unsigned char vperm2 = (vector unsigned char)(4,5,6,7,12,13,14,15,20,21,22,23,28,29,30,31); \ + v4sf tmp__ = vec_perm(in1, in2, vperm1); out2 = vec_perm(in1, in2, vperm2); out1 = tmp__; \ + } +#define VTRANSPOSE4(x0,x1,x2,x3) { \ + v4sf y0 = vec_mergeh(x0, x2); \ + v4sf y1 = vec_mergel(x0, x2); \ + v4sf y2 = vec_mergeh(x1, x3); \ + v4sf y3 = vec_mergel(x1, x3); \ + x0 = vec_mergeh(y0, y2); \ + x1 = vec_mergel(y0, y2); \ + x2 = vec_mergeh(y1, y3); \ + x3 = vec_mergel(y1, y3); \ + } +#define VSWAPHL(a,b) vec_perm(a,b, (vector unsigned char)(16,17,18,19,20,21,22,23,8,9,10,11,12,13,14,15)) +#define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) + +/* + SSE1 support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(i386) || defined(_M_IX86)) + +#include +typedef __m128 v4sf; +#define SIMD_SZ 4 // 4 floats by simd vector -- this is pretty much hardcoded in the preprocess/finalize functions anyway so you will have to work if you want to enable AVX with its 256-bit vectors. 
+#define VZERO() _mm_setzero_ps() +#define VMUL(a,b) _mm_mul_ps(a,b) +#define VADD(a,b) _mm_add_ps(a,b) +#define VMADD(a,b,c) _mm_add_ps(_mm_mul_ps(a,b), c) +#define VSUB(a,b) _mm_sub_ps(a,b) +#define LD_PS1(p) _mm_set1_ps(p) +#define INTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_unpacklo_ps(in1, in2); out2 = _mm_unpackhi_ps(in1, in2); out1 = tmp__; } +#define UNINTERLEAVE2(in1, in2, out1, out2) { v4sf tmp__ = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(2,0,2,0)); out2 = _mm_shuffle_ps(in1, in2, _MM_SHUFFLE(3,1,3,1)); out1 = tmp__; } +#define VTRANSPOSE4(x0,x1,x2,x3) _MM_TRANSPOSE4_PS(x0,x1,x2,x3) +#define VSWAPHL(a,b) _mm_shuffle_ps(b, a, _MM_SHUFFLE(3,2,1,0)) +#define VALIGNED(ptr) ((((long long)(ptr)) & 0xF) == 0) + +/* + ARM NEON support macros +*/ +#elif !defined(PFFFT_SIMD_DISABLE) && (defined(__arm__) || defined(__aarch64__) || defined(__arm64__)) +#include +typedef float32x4_t v4sf; +#define SIMD_SZ 4 +#define VZERO() vdupq_n_f32(0) +#define VMUL(a,b) vmulq_f32(a,b) +#define VADD(a,b) vaddq_f32(a,b) +#define VMADD(a,b,c) vmlaq_f32(c,a,b) +#define VSUB(a,b) vsubq_f32(a,b) +#define LD_PS1(p) vld1q_dup_f32(&(p)) +#define INTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vzipq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +#define UNINTERLEAVE2(in1, in2, out1, out2) { float32x4x2_t tmp__ = vuzpq_f32(in1,in2); out1=tmp__.val[0]; out2=tmp__.val[1]; } +#define VTRANSPOSE4(x0,x1,x2,x3) { \ + float32x4x2_t t0_ = vzipq_f32(x0, x2); \ + float32x4x2_t t1_ = vzipq_f32(x1, x3); \ + float32x4x2_t u0_ = vzipq_f32(t0_.val[0], t1_.val[0]); \ + float32x4x2_t u1_ = vzipq_f32(t0_.val[1], t1_.val[1]); \ + x0 = u0_.val[0]; x1 = u0_.val[1]; x2 = u1_.val[0]; x3 = u1_.val[1]; \ + } +// marginally faster version +//# define VTRANSPOSE4(x0,x1,x2,x3) { asm("vtrn.32 %q0, %q1;\n vtrn.32 %q2,%q3\n vswp %f0,%e2\n vswp %f1,%e3" : "+w"(x0), "+w"(x1), "+w"(x2), "+w"(x3)::); } +#define VSWAPHL(a,b) vcombine_f32(vget_low_f32(b), vget_high_f32(a)) +#define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) +#else +#if !defined(PFFFT_SIMD_DISABLE) +#warning "building with simd disabled !\n"; +#define PFFFT_SIMD_DISABLE // fallback to scalar code +#endif +#endif + +// fallback mode for situations where SSE/Altivec are not available, use scalar mode instead +#ifdef PFFFT_SIMD_DISABLE +typedef float v4sf; +#define SIMD_SZ 1 +#define VZERO() 0.f +#define VMUL(a,b) ((a)*(b)) +#define VADD(a,b) ((a)+(b)) +#define VMADD(a,b,c) ((a)*(b)+(c)) +#define VSUB(a,b) ((a)-(b)) +#define LD_PS1(p) (p) +#define VALIGNED(ptr) ((((long long)(ptr)) & 0x3) == 0) +#endif + +// shortcuts for complex multiplcations +#define VCPLXMUL(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VSUB(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VADD(ai,tmp); } +#define VCPLXMULCONJ(ar,ai,br,bi) { v4sf tmp; tmp=VMUL(ar,bi); ar=VMUL(ar,br); ar=VADD(ar,VMUL(ai,bi)); ai=VMUL(ai,br); ai=VSUB(ai,tmp); } +#ifndef SVMUL +// multiply a scalar with a vector +#define SVMUL(f,v) VMUL(LD_PS1(f),v) +#endif + +#if !defined(PFFFT_SIMD_DISABLE) +typedef union v4sf_union { + v4sf v; + float f[4]; +} v4sf_union; + +#include + +#define assertv4(v,f0,f1,f2,f3) assert(v.f[0] == (f0) && v.f[1] == (f1) && v.f[2] == (f2) && v.f[3] == (f3)) + +/* detect bugs with the vector support macros */ +void validate_pffft_simd() +{ + float f[16] = { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 }; + v4sf_union a0, a1, a2, a3, t, u; + memcpy(a0.f, f, 4 * sizeof(float)); + memcpy(a1.f, f + 4, 4 * sizeof(float)); + memcpy(a2.f, f + 8, 4 * sizeof(float)); + memcpy(a3.f, f + 
12, 4 * sizeof(float)); + + t = a0; + u = a1; + t.v = VZERO(); + printf("VZERO=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], t.f[3]); + assertv4(t, 0, 0, 0, 0); + t.v = VADD(a1.v, a2.v); + printf("VADD(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], + t.f[3]); + assertv4(t, 12, 14, 16, 18); + t.v = VMUL(a1.v, a2.v); + printf("VMUL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], + t.f[3]); + assertv4(t, 32, 45, 60, 77); + t.v = VMADD(a1.v, a2.v, a0.v); + printf("VMADD(4:7,8:11,0:3)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], + t.f[2], t.f[3]); + assertv4(t, 32, 46, 62, 80); + + INTERLEAVE2(a1.v, a2.v, t.v, u.v); + printf("INTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 8, 5, 9); + assertv4(u, 6, 10, 7, 11); + UNINTERLEAVE2(a1.v, a2.v, t.v, u.v); + printf("UNINTERLEAVE2(4:7,8:11)=[%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + t.f[0], t.f[1], t.f[2], t.f[3], u.f[0], u.f[1], u.f[2], u.f[3]); + assertv4(t, 4, 6, 8, 10); + assertv4(u, 5, 7, 9, 11); + + t.v = LD_PS1(f[15]); + printf("LD_PS1(15)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], + t.f[3]); + assertv4(t, 15, 15, 15, 15); + t.v = VSWAPHL(a1.v, a2.v); + printf("VSWAPHL(4:7,8:11)=[%2g %2g %2g %2g]\n", t.f[0], t.f[1], t.f[2], + t.f[3]); + assertv4(t, 8, 9, 6, 7); + VTRANSPOSE4(a0.v, a1.v, a2.v, a3.v); + printf + ("VTRANSPOSE4(0:3,4:7,8:11,12:15)=[%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g] [%2g %2g %2g %2g]\n", + a0.f[0], a0.f[1], a0.f[2], a0.f[3], a1.f[0], a1.f[1], a1.f[2], + a1.f[3], a2.f[0], a2.f[1], a2.f[2], a2.f[3], a3.f[0], a3.f[1], + a3.f[2], a3.f[3]); + assertv4(a0, 0, 4, 8, 12); + assertv4(a1, 1, 5, 9, 13); + assertv4(a2, 2, 6, 10, 14); + assertv4(a3, 3, 7, 11, 15); +} +#else +void validate_pffft_simd() +{ +} // allow test_pffft.c to call this function even when simd is not available.. +#endif //!PFFFT_SIMD_DISABLE + +/* SSE and co like 16-bytes aligned pointers */ +#define MALLOC_V4SF_ALIGNMENT 64 // with a 64-byte alignment, we are even aligned on L2 cache lines... 
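
A short usage note on the allocator defined just below (an illustrative aside, not part of the patch): the pointer it returns is offset from the block malloc() actually handed out, with the original pointer stashed immediately before the aligned address, so it must be released with pffft_aligned_free() rather than free(). The convolver's fft_alloc()/fft_free() wrappers above follow that pairing.

#include "pffft.h"

/* Sketch: allocate and release an FFT work buffer with the matching calls. */
static float *work_buffer_new(size_t nfloats)
{
	return (float *) pffft_aligned_malloc(nfloats * sizeof(float));
}

static void work_buffer_free(float *buf)
{
	pffft_aligned_free(buf);	/* never plain free(buf) */
}
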
+void *pffft_aligned_malloc(size_t nb_bytes) +{ + void *p, *p0 = malloc(nb_bytes + MALLOC_V4SF_ALIGNMENT); + if (!p0) + return (void *)0; + p = (void *)(((size_t)p0 + MALLOC_V4SF_ALIGNMENT) & + (~((size_t)(MALLOC_V4SF_ALIGNMENT - 1)))); + *((void **)p - 1) = p0; + return p; +} + +void pffft_aligned_free(void *p) +{ + if (p) + free(*((void **)p - 1)); +} + +int pffft_simd_size() +{ + return SIMD_SZ; +} + +/* + passf2 and passb2 has been merged here, fsign = -1 for passf2, +1 for passb2 +*/ +static NEVER_INLINE(void) passf2_ps(int ido, int l1, const v4sf * cc, v4sf * ch, + const float *wa1, float fsign) +{ + int k, i; + int l1ido = l1 * ido; + if (ido <= 2) { + for (k = 0; k < l1ido; k += ido, ch += ido, cc += 2 * ido) { + ch[0] = VADD(cc[0], cc[ido + 0]); + ch[l1ido] = VSUB(cc[0], cc[ido + 0]); + ch[1] = VADD(cc[1], cc[ido + 1]); + ch[l1ido + 1] = VSUB(cc[1], cc[ido + 1]); + } + } else { + for (k = 0; k < l1ido; k += ido, ch += ido, cc += 2 * ido) { + for (i = 0; i < ido - 1; i += 2) { + v4sf tr2 = VSUB(cc[i + 0], cc[i + ido + 0]); + v4sf ti2 = VSUB(cc[i + 1], cc[i + ido + 1]); + v4sf wr = LD_PS1(wa1[i]), wi = + VMUL(LD_PS1(fsign), LD_PS1(wa1[i + 1])); + ch[i] = VADD(cc[i + 0], cc[i + ido + 0]); + ch[i + 1] = VADD(cc[i + 1], cc[i + ido + 1]); + VCPLXMUL(tr2, ti2, wr, wi); + ch[i + l1ido] = tr2; + ch[i + l1ido + 1] = ti2; + } + } + } +} + +/* + passf3 and passb3 has been merged here, fsign = -1 for passf3, +1 for passb3 +*/ +static NEVER_INLINE(void) passf3_ps(int ido, int l1, const v4sf * cc, v4sf * ch, + const float *wa1, const float *wa2, + float fsign) +{ + static const float taur = -0.5f; + float taui = 0.866025403784439f * fsign; + int i, k; + v4sf tr2, ti2, cr2, ci2, cr3, ci3, dr2, di2, dr3, di3; + int l1ido = l1 * ido; + float wr1, wi1, wr2, wi2; + assert(ido > 2); + for (k = 0; k < l1ido; k += ido, cc += 3 * ido, ch += ido) { + for (i = 0; i < ido - 1; i += 2) { + tr2 = VADD(cc[i + ido], cc[i + 2 * ido]); + cr2 = VADD(cc[i], SVMUL(taur, tr2)); + ch[i] = VADD(cc[i], tr2); + ti2 = VADD(cc[i + ido + 1], cc[i + 2 * ido + 1]); + ci2 = VADD(cc[i + 1], SVMUL(taur, ti2)); + ch[i + 1] = VADD(cc[i + 1], ti2); + cr3 = SVMUL(taui, VSUB(cc[i + ido], cc[i + 2 * ido])); + ci3 = + SVMUL(taui, + VSUB(cc[i + ido + 1], cc[i + 2 * ido + 1])); + dr2 = VSUB(cr2, ci3); + dr3 = VADD(cr2, ci3); + di2 = VADD(ci2, cr3); + di3 = VSUB(ci2, cr3); + wr1 = wa1[i], wi1 = fsign * wa1[i + 1], wr2 = + wa2[i], wi2 = fsign * wa2[i + 1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch[i + l1ido] = dr2; + ch[i + l1ido + 1] = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch[i + 2 * l1ido] = dr3; + ch[i + 2 * l1ido + 1] = di3; + } + } +} /* passf3 */ + +static NEVER_INLINE(void) passf4_ps(int ido, int l1, const v4sf * cc, v4sf * ch, + const float *wa1, const float *wa2, + const float *wa3, float fsign) +{ + /* isign == -1 for forward transform and +1 for backward transform */ + + int i, k; + v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, + tr4; + int l1ido = l1 * ido; + if (ido == 2) { + for (k = 0; k < l1ido; k += ido, ch += ido, cc += 4 * ido) { + tr1 = VSUB(cc[0], cc[2 * ido + 0]); + tr2 = VADD(cc[0], cc[2 * ido + 0]); + ti1 = VSUB(cc[1], cc[2 * ido + 1]); + ti2 = VADD(cc[1], cc[2 * ido + 1]); + ti4 = + VMUL(VSUB(cc[1 * ido + 0], cc[3 * ido + 0]), + LD_PS1(fsign)); + tr4 = + VMUL(VSUB(cc[3 * ido + 1], cc[1 * ido + 1]), + LD_PS1(fsign)); + tr3 = VADD(cc[ido + 0], cc[3 * ido + 0]); + ti3 = VADD(cc[ido + 1], cc[3 * ido + 1]); + + ch[0 * l1ido + 0] = VADD(tr2, tr3); + ch[0 * l1ido + 1] = 
VADD(ti2, ti3); + ch[1 * l1ido + 0] = VADD(tr1, tr4); + ch[1 * l1ido + 1] = VADD(ti1, ti4); + ch[2 * l1ido + 0] = VSUB(tr2, tr3); + ch[2 * l1ido + 1] = VSUB(ti2, ti3); + ch[3 * l1ido + 0] = VSUB(tr1, tr4); + ch[3 * l1ido + 1] = VSUB(ti1, ti4); + } + } else { + for (k = 0; k < l1ido; k += ido, ch += ido, cc += 4 * ido) { + for (i = 0; i < ido - 1; i += 2) { + float wr1, wi1, wr2, wi2, wr3, wi3; + tr1 = VSUB(cc[i + 0], cc[i + 2 * ido + 0]); + tr2 = VADD(cc[i + 0], cc[i + 2 * ido + 0]); + ti1 = VSUB(cc[i + 1], cc[i + 2 * ido + 1]); + ti2 = VADD(cc[i + 1], cc[i + 2 * ido + 1]); + tr4 = + VMUL(VSUB + (cc[i + 3 * ido + 1], + cc[i + 1 * ido + 1]), LD_PS1(fsign)); + ti4 = + VMUL(VSUB + (cc[i + 1 * ido + 0], + cc[i + 3 * ido + 0]), LD_PS1(fsign)); + tr3 = + VADD(cc[i + ido + 0], cc[i + 3 * ido + 0]); + ti3 = + VADD(cc[i + ido + 1], cc[i + 3 * ido + 1]); + + ch[i] = VADD(tr2, tr3); + cr3 = VSUB(tr2, tr3); + ch[i + 1] = VADD(ti2, ti3); + ci3 = VSUB(ti2, ti3); + + cr2 = VADD(tr1, tr4); + cr4 = VSUB(tr1, tr4); + ci2 = VADD(ti1, ti4); + ci4 = VSUB(ti1, ti4); + wr1 = wa1[i], wi1 = fsign * wa1[i + 1]; + VCPLXMUL(cr2, ci2, LD_PS1(wr1), LD_PS1(wi1)); + wr2 = wa2[i], wi2 = fsign * wa2[i + 1]; + ch[i + l1ido] = cr2; + ch[i + l1ido + 1] = ci2; + + VCPLXMUL(cr3, ci3, LD_PS1(wr2), LD_PS1(wi2)); + wr3 = wa3[i], wi3 = fsign * wa3[i + 1]; + ch[i + 2 * l1ido] = cr3; + ch[i + 2 * l1ido + 1] = ci3; + + VCPLXMUL(cr4, ci4, LD_PS1(wr3), LD_PS1(wi3)); + ch[i + 3 * l1ido] = cr4; + ch[i + 3 * l1ido + 1] = ci4; + } + } + } +} /* passf4 */ + +/* + passf5 and passb5 has been merged here, fsign = -1 for passf5, +1 for passb5 +*/ +static NEVER_INLINE(void) passf5_ps(int ido, int l1, const v4sf * cc, v4sf * ch, + const float *wa1, const float *wa2, + const float *wa3, const float *wa4, + float fsign) +{ + static const float tr11 = .309016994374947f; + const float ti11 = .951056516295154f * fsign; + static const float tr12 = -.809016994374947f; + const float ti12 = .587785252292473f * fsign; + + /* Local variables */ + int i, k; + v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, + ti3, ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5; + + float wr1, wi1, wr2, wi2, wr3, wi3, wr4, wi4; + +#define cc_ref(a_1,a_2) cc[(a_2-1)*ido + a_1 + 1] +#define ch_ref(a_1,a_3) ch[(a_3-1)*l1*ido + a_1 + 1] + + assert(ido > 2); + for (k = 0; k < l1; ++k, cc += 5 * ido, ch += ido) { + for (i = 0; i < ido - 1; i += 2) { + ti5 = VSUB(cc_ref(i, 2), cc_ref(i, 5)); + ti2 = VADD(cc_ref(i, 2), cc_ref(i, 5)); + ti4 = VSUB(cc_ref(i, 3), cc_ref(i, 4)); + ti3 = VADD(cc_ref(i, 3), cc_ref(i, 4)); + tr5 = VSUB(cc_ref(i - 1, 2), cc_ref(i - 1, 5)); + tr2 = VADD(cc_ref(i - 1, 2), cc_ref(i - 1, 5)); + tr4 = VSUB(cc_ref(i - 1, 3), cc_ref(i - 1, 4)); + tr3 = VADD(cc_ref(i - 1, 3), cc_ref(i - 1, 4)); + ch_ref(i - 1, 1) = + VADD(cc_ref(i - 1, 1), VADD(tr2, tr3)); + ch_ref(i, 1) = VADD(cc_ref(i, 1), VADD(ti2, ti3)); + cr2 = + VADD(cc_ref(i - 1, 1), + VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); + ci2 = + VADD(cc_ref(i, 1), + VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3))); + cr3 = + VADD(cc_ref(i - 1, 1), + VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); + ci3 = + VADD(cc_ref(i, 1), + VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); 
+ di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + wr1 = wa1[i], wi1 = fsign * wa1[i + 1], wr2 = + wa2[i], wi2 = fsign * wa2[i + 1]; + wr3 = wa3[i], wi3 = fsign * wa3[i + 1], wr4 = + wa4[i], wi4 = fsign * wa4[i + 1]; + VCPLXMUL(dr2, di2, LD_PS1(wr1), LD_PS1(wi1)); + ch_ref(i - 1, 2) = dr2; + ch_ref(i, 2) = di2; + VCPLXMUL(dr3, di3, LD_PS1(wr2), LD_PS1(wi2)); + ch_ref(i - 1, 3) = dr3; + ch_ref(i, 3) = di3; + VCPLXMUL(dr4, di4, LD_PS1(wr3), LD_PS1(wi3)); + ch_ref(i - 1, 4) = dr4; + ch_ref(i, 4) = di4; + VCPLXMUL(dr5, di5, LD_PS1(wr4), LD_PS1(wi4)); + ch_ref(i - 1, 5) = dr5; + ch_ref(i, 5) = di5; + } + } +#undef ch_ref +#undef cc_ref +} + +static NEVER_INLINE(void) radf2_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, const float *wa1) +{ + static const float minus_one = -1.f; + int i, k, l1ido = l1 * ido; + for (k = 0; k < l1ido; k += ido) { + v4sf a = cc[k], b = cc[k + l1ido]; + ch[2 * k] = VADD(a, b); + ch[2 * (k + ido) - 1] = VSUB(a, b); + } + if (ido < 2) + return; + if (ido != 2) { + for (k = 0; k < l1ido; k += ido) { + for (i = 2; i < ido; i += 2) { + v4sf tr2 = cc[i - 1 + k + l1ido], ti2 = + cc[i + k + l1ido]; + v4sf br = cc[i - 1 + k], bi = cc[i + k]; + VCPLXMULCONJ(tr2, ti2, LD_PS1(wa1[i - 2]), + LD_PS1(wa1[i - 1])); + ch[i + 2 * k] = VADD(bi, ti2); + ch[2 * (k + ido) - i] = VSUB(ti2, bi); + ch[i - 1 + 2 * k] = VADD(br, tr2); + ch[2 * (k + ido) - i - 1] = VSUB(br, tr2); + } + } + if (ido % 2 == 1) + return; + } + for (k = 0; k < l1ido; k += ido) { + ch[2 * k + ido] = SVMUL(minus_one, cc[ido - 1 + k + l1ido]); + ch[2 * k + ido - 1] = cc[k + ido - 1]; + } +} /* radf2 */ + +static NEVER_INLINE(void) radb2_ps(int ido, int l1, const v4sf * cc, v4sf * ch, + const float *wa1) +{ + static const float minus_two = -2; + int i, k, l1ido = l1 * ido; + v4sf a, b, c, d, tr2, ti2; + for (k = 0; k < l1ido; k += ido) { + a = cc[2 * k]; + b = cc[2 * (k + ido) - 1]; + ch[k] = VADD(a, b); + ch[k + l1ido] = VSUB(a, b); + } + if (ido < 2) + return; + if (ido != 2) { + for (k = 0; k < l1ido; k += ido) { + for (i = 2; i < ido; i += 2) { + a = cc[i - 1 + 2 * k]; + b = cc[2 * (k + ido) - i - 1]; + c = cc[i + 0 + 2 * k]; + d = cc[2 * (k + ido) - i + 0]; + ch[i - 1 + k] = VADD(a, b); + tr2 = VSUB(a, b); + ch[i + 0 + k] = VSUB(c, d); + ti2 = VADD(c, d); + VCPLXMUL(tr2, ti2, LD_PS1(wa1[i - 2]), + LD_PS1(wa1[i - 1])); + ch[i - 1 + k + l1ido] = tr2; + ch[i + 0 + k + l1ido] = ti2; + } + } + if (ido % 2 == 1) + return; + } + for (k = 0; k < l1ido; k += ido) { + a = cc[2 * k + ido - 1]; + b = cc[2 * k + ido]; + ch[k + ido - 1] = VADD(a, a); + ch[k + ido - 1 + l1ido] = SVMUL(minus_two, b); + } +} /* radb2 */ + +static void radf3_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, const float *wa1, const float *wa2) +{ + static const float taur = -0.5f; + static const float taui = 0.866025403784439f; + int i, k, ic; + v4sf ci2, di2, di3, cr2, dr2, dr3, ti2, ti3, tr2, tr3, wr1, wi1, wr2, + wi2; + for (k = 0; k < l1; k++) { + cr2 = VADD(cc[(k + l1) * ido], cc[(k + 2 * l1) * ido]); + ch[3 * k * ido] = VADD(cc[k * ido], cr2); + ch[(3 * k + 2) * ido] = + SVMUL(taui, + VSUB(cc[(k + l1 * 2) * ido], cc[(k + l1) * ido])); + ch[ido - 1 + (3 * k + 1) * ido] = + VADD(cc[k * ido], SVMUL(taur, cr2)); + } + if (ido == 1) + return; + for (k = 0; k < l1; k++) { + for (i = 2; i < ido; i += 2) { + ic = ido - i; + wr1 = LD_PS1(wa1[i - 2]); + wi1 = LD_PS1(wa1[i - 1]); + dr2 = cc[i - 1 + (k + l1) * ido]; + di2 = cc[i + (k + l1) * ido]; + VCPLXMULCONJ(dr2, di2, wr1, wi1); + + wr2 = LD_PS1(wa2[i - 2]); + wi2 
= LD_PS1(wa2[i - 1]); + dr3 = cc[i - 1 + (k + l1 * 2) * ido]; + di3 = cc[i + (k + l1 * 2) * ido]; + VCPLXMULCONJ(dr3, di3, wr2, wi2); + + cr2 = VADD(dr2, dr3); + ci2 = VADD(di2, di3); + ch[i - 1 + 3 * k * ido] = + VADD(cc[i - 1 + k * ido], cr2); + ch[i + 3 * k * ido] = VADD(cc[i + k * ido], ci2); + tr2 = VADD(cc[i - 1 + k * ido], SVMUL(taur, cr2)); + ti2 = VADD(cc[i + k * ido], SVMUL(taur, ci2)); + tr3 = SVMUL(taui, VSUB(di2, di3)); + ti3 = SVMUL(taui, VSUB(dr3, dr2)); + ch[i - 1 + (3 * k + 2) * ido] = VADD(tr2, tr3); + ch[ic - 1 + (3 * k + 1) * ido] = VSUB(tr2, tr3); + ch[i + (3 * k + 2) * ido] = VADD(ti2, ti3); + ch[ic + (3 * k + 1) * ido] = VSUB(ti3, ti2); + } + } +} /* radf3 */ + +static void radb3_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, const float *wa1, const float *wa2) +{ + static const float taur = -0.5f; + static const float taui = 0.866025403784439f; + static const float taui_2 = 0.866025403784439f * 2; + int i, k, ic; + v4sf ci2, ci3, di2, di3, cr2, cr3, dr2, dr3, ti2, tr2; + for (k = 0; k < l1; k++) { + tr2 = cc[ido - 1 + (3 * k + 1) * ido]; + tr2 = VADD(tr2, tr2); + cr2 = VMADD(LD_PS1(taur), tr2, cc[3 * k * ido]); + ch[k * ido] = VADD(cc[3 * k * ido], tr2); + ci3 = SVMUL(taui_2, cc[(3 * k + 2) * ido]); + ch[(k + l1) * ido] = VSUB(cr2, ci3); + ch[(k + 2 * l1) * ido] = VADD(cr2, ci3); + } + if (ido == 1) + return; + for (k = 0; k < l1; k++) { + for (i = 2; i < ido; i += 2) { + ic = ido - i; + tr2 = + VADD(cc[i - 1 + (3 * k + 2) * ido], + cc[ic - 1 + (3 * k + 1) * ido]); + cr2 = VMADD(LD_PS1(taur), tr2, cc[i - 1 + 3 * k * ido]); + ch[i - 1 + k * ido] = + VADD(cc[i - 1 + 3 * k * ido], tr2); + ti2 = + VSUB(cc[i + (3 * k + 2) * ido], + cc[ic + (3 * k + 1) * ido]); + ci2 = VMADD(LD_PS1(taur), ti2, cc[i + 3 * k * ido]); + ch[i + k * ido] = VADD(cc[i + 3 * k * ido], ti2); + cr3 = + SVMUL(taui, + VSUB(cc[i - 1 + (3 * k + 2) * ido], + cc[ic - 1 + (3 * k + 1) * ido])); + ci3 = + SVMUL(taui, + VADD(cc[i + (3 * k + 2) * ido], + cc[ic + (3 * k + 1) * ido])); + dr2 = VSUB(cr2, ci3); + dr3 = VADD(cr2, ci3); + di2 = VADD(ci2, cr3); + di3 = VSUB(ci2, cr3); + VCPLXMUL(dr2, di2, LD_PS1(wa1[i - 2]), + LD_PS1(wa1[i - 1])); + ch[i - 1 + (k + l1) * ido] = dr2; + ch[i + (k + l1) * ido] = di2; + VCPLXMUL(dr3, di3, LD_PS1(wa2[i - 2]), + LD_PS1(wa2[i - 1])); + ch[i - 1 + (k + 2 * l1) * ido] = dr3; + ch[i + (k + 2 * l1) * ido] = di3; + } + } +} /* radb3 */ + +static NEVER_INLINE(void) radf4_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, + const float *RESTRICT wa1, + const float *RESTRICT wa2, + const float *RESTRICT wa3) +{ + static const float minus_hsqt2 = (float)-0.7071067811865475; + int i, k, l1ido = l1 * ido; + { + const v4sf *RESTRICT cc_ = cc, *RESTRICT cc_end = cc + l1ido; + v4sf *RESTRICT ch_ = ch; + while (cc < cc_end) { + // this loop represents between 25% and 40% of total radf4_ps cost ! 
+ v4sf a0 = cc[0], a1 = cc[l1ido]; + v4sf a2 = cc[2 * l1ido], a3 = cc[3 * l1ido]; + v4sf tr1 = VADD(a1, a3); + v4sf tr2 = VADD(a0, a2); + ch[2 * ido - 1] = VSUB(a0, a2); + ch[2 * ido] = VSUB(a3, a1); + ch[0] = VADD(tr1, tr2); + ch[4 * ido - 1] = VSUB(tr2, tr1); + cc += ido; + ch += 4 * ido; + } + cc = cc_; + ch = ch_; + } + if (ido < 2) + return; + if (ido != 2) { + for (k = 0; k < l1ido; k += ido) { + const v4sf *RESTRICT pc = (v4sf *) (cc + 1 + k); + for (i = 2; i < ido; i += 2, pc += 2) { + int ic = ido - i; + v4sf wr, wi, cr2, ci2, cr3, ci3, cr4, ci4; + v4sf tr1, ti1, tr2, ti2, tr3, ti3, tr4, ti4; + + cr2 = pc[1 * l1ido + 0]; + ci2 = pc[1 * l1ido + 1]; + wr = LD_PS1(wa1[i - 2]); + wi = LD_PS1(wa1[i - 1]); + VCPLXMULCONJ(cr2, ci2, wr, wi); + + cr3 = pc[2 * l1ido + 0]; + ci3 = pc[2 * l1ido + 1]; + wr = LD_PS1(wa2[i - 2]); + wi = LD_PS1(wa2[i - 1]); + VCPLXMULCONJ(cr3, ci3, wr, wi); + + cr4 = pc[3 * l1ido]; + ci4 = pc[3 * l1ido + 1]; + wr = LD_PS1(wa3[i - 2]); + wi = LD_PS1(wa3[i - 1]); + VCPLXMULCONJ(cr4, ci4, wr, wi); + + /* at this point, on SSE, five of "cr2 cr3 cr4 ci2 ci3 ci4" should be loaded in registers */ + + tr1 = VADD(cr2, cr4); + tr4 = VSUB(cr4, cr2); + tr2 = VADD(pc[0], cr3); + tr3 = VSUB(pc[0], cr3); + ch[i - 1 + 4 * k] = VADD(tr1, tr2); + ch[ic - 1 + 4 * k + 3 * ido] = VSUB(tr2, tr1); // at this point tr1 and tr2 can be disposed + ti1 = VADD(ci2, ci4); + ti4 = VSUB(ci2, ci4); + ch[i - 1 + 4 * k + 2 * ido] = VADD(ti4, tr3); + ch[ic - 1 + 4 * k + 1 * ido] = VSUB(tr3, ti4); // dispose tr3, ti4 + ti2 = VADD(pc[1], ci3); + ti3 = VSUB(pc[1], ci3); + ch[i + 4 * k] = VADD(ti1, ti2); + ch[ic + 4 * k + 3 * ido] = VSUB(ti1, ti2); + ch[i + 4 * k + 2 * ido] = VADD(tr4, ti3); + ch[ic + 4 * k + 1 * ido] = VSUB(tr4, ti3); + } + } + if (ido % 2 == 1) + return; + } + for (k = 0; k < l1ido; k += ido) { + v4sf a = cc[ido - 1 + k + l1ido], b = + cc[ido - 1 + k + 3 * l1ido]; + v4sf c = cc[ido - 1 + k], d = cc[ido - 1 + k + 2 * l1ido]; + v4sf ti1 = SVMUL(minus_hsqt2, VADD(a, b)); + v4sf tr1 = SVMUL(minus_hsqt2, VSUB(b, a)); + ch[ido - 1 + 4 * k] = VADD(tr1, c); + ch[ido - 1 + 4 * k + 2 * ido] = VSUB(c, tr1); + ch[4 * k + 1 * ido] = VSUB(ti1, d); + ch[4 * k + 3 * ido] = VADD(ti1, d); + } +} /* radf4 */ + +static NEVER_INLINE(void) radb4_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, + const float *RESTRICT wa1, + const float *RESTRICT wa2, + const float *RESTRICT wa3) +{ + static const float minus_sqrt2 = (float)-1.414213562373095; + static const float two = 2.f; + int i, k, l1ido = l1 * ido; + v4sf ci2, ci3, ci4, cr2, cr3, cr4, ti1, ti2, ti3, ti4, tr1, tr2, tr3, + tr4; + { + const v4sf *RESTRICT cc_ = cc, *RESTRICT ch_end = ch + l1ido; + v4sf *ch_ = ch; + while (ch < ch_end) { + v4sf a = cc[0], b = cc[4 * ido - 1]; + v4sf c = cc[2 * ido], d = cc[2 * ido - 1]; + tr3 = SVMUL(two, d); + tr2 = VADD(a, b); + tr1 = VSUB(a, b); + tr4 = SVMUL(two, c); + ch[0 * l1ido] = VADD(tr2, tr3); + ch[2 * l1ido] = VSUB(tr2, tr3); + ch[1 * l1ido] = VSUB(tr1, tr4); + ch[3 * l1ido] = VADD(tr1, tr4); + + cc += 4 * ido; + ch += ido; + } + cc = cc_; + ch = ch_; + } + if (ido < 2) + return; + if (ido != 2) { + for (k = 0; k < l1ido; k += ido) { + const v4sf *RESTRICT pc = (v4sf *) (cc - 1 + 4 * k); + v4sf *RESTRICT ph = (v4sf *) (ch + k + 1); + for (i = 2; i < ido; i += 2) { + + tr1 = VSUB(pc[i], pc[4 * ido - i]); + tr2 = VADD(pc[i], pc[4 * ido - i]); + ti4 = VSUB(pc[2 * ido + i], pc[2 * ido - i]); + tr3 = VADD(pc[2 * ido + i], pc[2 * ido - i]); + ph[0] = VADD(tr2, tr3); + cr3 = VSUB(tr2, tr3); + + 
ti3 = + VSUB(pc[2 * ido + i + 1], + pc[2 * ido - i + 1]); + tr4 = + VADD(pc[2 * ido + i + 1], + pc[2 * ido - i + 1]); + cr2 = VSUB(tr1, tr4); + cr4 = VADD(tr1, tr4); + + ti1 = VADD(pc[i + 1], pc[4 * ido - i + 1]); + ti2 = VSUB(pc[i + 1], pc[4 * ido - i + 1]); + + ph[1] = VADD(ti2, ti3); + ph += l1ido; + ci3 = VSUB(ti2, ti3); + ci2 = VADD(ti1, ti4); + ci4 = VSUB(ti1, ti4); + VCPLXMUL(cr2, ci2, LD_PS1(wa1[i - 2]), + LD_PS1(wa1[i - 1])); + ph[0] = cr2; + ph[1] = ci2; + ph += l1ido; + VCPLXMUL(cr3, ci3, LD_PS1(wa2[i - 2]), + LD_PS1(wa2[i - 1])); + ph[0] = cr3; + ph[1] = ci3; + ph += l1ido; + VCPLXMUL(cr4, ci4, LD_PS1(wa3[i - 2]), + LD_PS1(wa3[i - 1])); + ph[0] = cr4; + ph[1] = ci4; + ph = ph - 3 * l1ido + 2; + } + } + if (ido % 2 == 1) + return; + } + for (k = 0; k < l1ido; k += ido) { + int i0 = 4 * k + ido; + v4sf c = cc[i0 - 1], d = cc[i0 + 2 * ido - 1]; + v4sf a = cc[i0 + 0], b = cc[i0 + 2 * ido + 0]; + tr1 = VSUB(c, d); + tr2 = VADD(c, d); + ti1 = VADD(b, a); + ti2 = VSUB(b, a); + ch[ido - 1 + k + 0 * l1ido] = VADD(tr2, tr2); + ch[ido - 1 + k + 1 * l1ido] = + SVMUL(minus_sqrt2, VSUB(ti1, tr1)); + ch[ido - 1 + k + 2 * l1ido] = VADD(ti2, ti2); + ch[ido - 1 + k + 3 * l1ido] = + SVMUL(minus_sqrt2, VADD(ti1, tr1)); + } +} /* radb4 */ + +static void radf5_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, const float *wa1, const float *wa2, + const float *wa3, const float *wa4) +{ + static const float tr11 = .309016994374947f; + static const float ti11 = .951056516295154f; + static const float tr12 = -.809016994374947f; + static const float ti12 = .587785252292473f; + + /* System generated locals */ + int cc_offset, ch_offset; + + /* Local variables */ + int i, k, ic; + v4sf ci2, di2, ci4, ci5, di3, di4, di5, ci3, cr2, cr3, dr2, dr3, dr4, + dr5, cr5, cr4, ti2, ti3, ti5, ti4, tr2, tr3, tr4, tr5; + int idp2; + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*l1 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*5 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * 6; + ch -= ch_offset; + cc_offset = 1 + ido * (1 + l1); + cc -= cc_offset; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + cr2 = VADD(cc_ref(1, k, 5), cc_ref(1, k, 2)); + ci5 = VSUB(cc_ref(1, k, 5), cc_ref(1, k, 2)); + cr3 = VADD(cc_ref(1, k, 4), cc_ref(1, k, 3)); + ci4 = VSUB(cc_ref(1, k, 4), cc_ref(1, k, 3)); + ch_ref(1, 1, k) = VADD(cc_ref(1, k, 1), VADD(cr2, cr3)); + ch_ref(ido, 2, k) = + VADD(cc_ref(1, k, 1), + VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); + ch_ref(1, 3, k) = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); + ch_ref(ido, 4, k) = + VADD(cc_ref(1, k, 1), + VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); + ch_ref(1, 5, k) = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); + //printf("pffft: radf5, k=%d ch_ref=%f, ci4=%f\n", k, ch_ref(1, 5, k), ci4); + } + if (ido == 1) { + return; + } + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + dr2 = LD_PS1(wa1[i - 3]); + di2 = LD_PS1(wa1[i - 2]); + dr3 = LD_PS1(wa2[i - 3]); + di3 = LD_PS1(wa2[i - 2]); + dr4 = LD_PS1(wa3[i - 3]); + di4 = LD_PS1(wa3[i - 2]); + dr5 = LD_PS1(wa4[i - 3]); + di5 = LD_PS1(wa4[i - 2]); + VCPLXMULCONJ(dr2, di2, cc_ref(i - 1, k, 2), + cc_ref(i, k, 2)); + VCPLXMULCONJ(dr3, di3, cc_ref(i - 1, k, 3), + cc_ref(i, k, 3)); + VCPLXMULCONJ(dr4, di4, cc_ref(i - 1, k, 4), + cc_ref(i, k, 4)); + VCPLXMULCONJ(dr5, di5, cc_ref(i - 1, k, 5), + cc_ref(i, k, 5)); + cr2 = VADD(dr2, dr5); + ci5 = VSUB(dr5, dr2); + cr5 = VSUB(di2, di5); + ci2 = VADD(di2, di5); + cr3 = VADD(dr3, dr4); + ci4 = 
VSUB(dr4, dr3); + cr4 = VSUB(di3, di4); + ci3 = VADD(di3, di4); + ch_ref(i - 1, 1, k) = + VADD(cc_ref(i - 1, k, 1), VADD(cr2, cr3)); + ch_ref(i, 1, k) = VSUB(cc_ref(i, k, 1), VADD(ci2, ci3)); // + tr2 = + VADD(cc_ref(i - 1, k, 1), + VADD(SVMUL(tr11, cr2), SVMUL(tr12, cr3))); + ti2 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr11, ci2), SVMUL(tr12, ci3))); // + tr3 = + VADD(cc_ref(i - 1, k, 1), + VADD(SVMUL(tr12, cr2), SVMUL(tr11, cr3))); + ti3 = VSUB(cc_ref(i, k, 1), VADD(SVMUL(tr12, ci2), SVMUL(tr11, ci3))); // + tr5 = VADD(SVMUL(ti11, cr5), SVMUL(ti12, cr4)); + ti5 = VADD(SVMUL(ti11, ci5), SVMUL(ti12, ci4)); + tr4 = VSUB(SVMUL(ti12, cr5), SVMUL(ti11, cr4)); + ti4 = VSUB(SVMUL(ti12, ci5), SVMUL(ti11, ci4)); + ch_ref(i - 1, 3, k) = VSUB(tr2, tr5); + ch_ref(ic - 1, 2, k) = VADD(tr2, tr5); + ch_ref(i, 3, k) = VADD(ti2, ti5); + ch_ref(ic, 2, k) = VSUB(ti5, ti2); + ch_ref(i - 1, 5, k) = VSUB(tr3, tr4); + ch_ref(ic - 1, 4, k) = VADD(tr3, tr4); + ch_ref(i, 5, k) = VADD(ti3, ti4); + ch_ref(ic, 4, k) = VSUB(ti4, ti3); + } + } +#undef cc_ref +#undef ch_ref +} /* radf5 */ + +static void radb5_ps(int ido, int l1, const v4sf * RESTRICT cc, + v4sf * RESTRICT ch, const float *wa1, const float *wa2, + const float *wa3, const float *wa4) +{ + static const float tr11 = .309016994374947f; + static const float ti11 = .951056516295154f; + static const float tr12 = -.809016994374947f; + static const float ti12 = .587785252292473f; + + int cc_offset, ch_offset; + + /* Local variables */ + int i, k, ic; + v4sf ci2, ci3, ci4, ci5, di3, di4, di5, di2, cr2, cr3, cr5, cr4, ti2, + ti3, ti4, ti5, dr3, dr4, dr5, dr2, tr2, tr3, tr4, tr5; + int idp2; + +#define cc_ref(a_1,a_2,a_3) cc[((a_3)*5 + (a_2))*ido + a_1] +#define ch_ref(a_1,a_2,a_3) ch[((a_3)*l1 + (a_2))*ido + a_1] + + /* Parameter adjustments */ + ch_offset = 1 + ido * (1 + l1); + ch -= ch_offset; + cc_offset = 1 + ido * 6; + cc -= cc_offset; + + /* Function Body */ + for (k = 1; k <= l1; ++k) { + ti5 = VADD(cc_ref(1, 3, k), cc_ref(1, 3, k)); + ti4 = VADD(cc_ref(1, 5, k), cc_ref(1, 5, k)); + tr2 = VADD(cc_ref(ido, 2, k), cc_ref(ido, 2, k)); + tr3 = VADD(cc_ref(ido, 4, k), cc_ref(ido, 4, k)); + ch_ref(1, k, 1) = VADD(cc_ref(1, 1, k), VADD(tr2, tr3)); + cr2 = + VADD(cc_ref(1, 1, k), + VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); + cr3 = + VADD(cc_ref(1, 1, k), + VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + ch_ref(1, k, 2) = VSUB(cr2, ci5); + ch_ref(1, k, 3) = VSUB(cr3, ci4); + ch_ref(1, k, 4) = VADD(cr3, ci4); + ch_ref(1, k, 5) = VADD(cr2, ci5); + } + if (ido == 1) { + return; + } + idp2 = ido + 2; + for (k = 1; k <= l1; ++k) { + for (i = 3; i <= ido; i += 2) { + ic = idp2 - i; + ti5 = VADD(cc_ref(i, 3, k), cc_ref(ic, 2, k)); + ti2 = VSUB(cc_ref(i, 3, k), cc_ref(ic, 2, k)); + ti4 = VADD(cc_ref(i, 5, k), cc_ref(ic, 4, k)); + ti3 = VSUB(cc_ref(i, 5, k), cc_ref(ic, 4, k)); + tr5 = VSUB(cc_ref(i - 1, 3, k), cc_ref(ic - 1, 2, k)); + tr2 = VADD(cc_ref(i - 1, 3, k), cc_ref(ic - 1, 2, k)); + tr4 = VSUB(cc_ref(i - 1, 5, k), cc_ref(ic - 1, 4, k)); + tr3 = VADD(cc_ref(i - 1, 5, k), cc_ref(ic - 1, 4, k)); + ch_ref(i - 1, k, 1) = + VADD(cc_ref(i - 1, 1, k), VADD(tr2, tr3)); + ch_ref(i, k, 1) = VADD(cc_ref(i, 1, k), VADD(ti2, ti3)); + cr2 = + VADD(cc_ref(i - 1, 1, k), + VADD(SVMUL(tr11, tr2), SVMUL(tr12, tr3))); + ci2 = + VADD(cc_ref(i, 1, k), + VADD(SVMUL(tr11, ti2), SVMUL(tr12, ti3))); + cr3 = + VADD(cc_ref(i - 1, 1, k), + VADD(SVMUL(tr12, tr2), SVMUL(tr11, tr3))); + ci3 = + 
VADD(cc_ref(i, 1, k), + VADD(SVMUL(tr12, ti2), SVMUL(tr11, ti3))); + cr5 = VADD(SVMUL(ti11, tr5), SVMUL(ti12, tr4)); + ci5 = VADD(SVMUL(ti11, ti5), SVMUL(ti12, ti4)); + cr4 = VSUB(SVMUL(ti12, tr5), SVMUL(ti11, tr4)); + ci4 = VSUB(SVMUL(ti12, ti5), SVMUL(ti11, ti4)); + dr3 = VSUB(cr3, ci4); + dr4 = VADD(cr3, ci4); + di3 = VADD(ci3, cr4); + di4 = VSUB(ci3, cr4); + dr5 = VADD(cr2, ci5); + dr2 = VSUB(cr2, ci5); + di5 = VSUB(ci2, cr5); + di2 = VADD(ci2, cr5); + VCPLXMUL(dr2, di2, LD_PS1(wa1[i - 3]), + LD_PS1(wa1[i - 2])); + VCPLXMUL(dr3, di3, LD_PS1(wa2[i - 3]), + LD_PS1(wa2[i - 2])); + VCPLXMUL(dr4, di4, LD_PS1(wa3[i - 3]), + LD_PS1(wa3[i - 2])); + VCPLXMUL(dr5, di5, LD_PS1(wa4[i - 3]), + LD_PS1(wa4[i - 2])); + + ch_ref(i - 1, k, 2) = dr2; + ch_ref(i, k, 2) = di2; + ch_ref(i - 1, k, 3) = dr3; + ch_ref(i, k, 3) = di3; + ch_ref(i - 1, k, 4) = dr4; + ch_ref(i, k, 4) = di4; + ch_ref(i - 1, k, 5) = dr5; + ch_ref(i, k, 5) = di5; + } + } +#undef cc_ref +#undef ch_ref +} /* radb5 */ + +static NEVER_INLINE(v4sf *) rfftf1_ps(int n, const v4sf * input_readonly, + v4sf * work1, v4sf * work2, + const float *wa, const int *ifac) +{ + v4sf *in = (v4sf *) input_readonly; + v4sf *out = (in == work2 ? work1 : work2); + int nf = ifac[1], k1; + int l2 = n; + int iw = n - 1; + assert(in != out && work1 != work2); + for (k1 = 1; k1 <= nf; ++k1) { + int kh = nf - k1; + int ip = ifac[kh + 2]; + int l1 = l2 / ip; + int ido = n / l2; + iw -= (ip - 1) * ido; + switch (ip) { + case 5:{ + int ix2 = iw + ido; + int ix3 = ix2 + ido; + int ix4 = ix3 + ido; + radf5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], + &wa[ix3], &wa[ix4]); + } break; + case 4:{ + int ix2 = iw + ido; + int ix3 = ix2 + ido; + radf4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], + &wa[ix3]); + } break; + case 3:{ + int ix2 = iw + ido; + radf3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); + } break; + case 2: + radf2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; + } + l2 = l1; + if (out == work2) { + out = work1; + in = work2; + } else { + out = work2; + in = work1; + } + } + return in; /* this is in fact the output .. */ +} /* rfftf1 */ + +static NEVER_INLINE(v4sf *) rfftb1_ps(int n, const v4sf * input_readonly, + v4sf * work1, v4sf * work2, + const float *wa, const int *ifac) +{ + v4sf *in = (v4sf *) input_readonly; + v4sf *out = (in == work2 ? work1 : work2); + int nf = ifac[1], k1; + int l1 = 1; + int iw = 0; + assert(in != out); + for (k1 = 1; k1 <= nf; k1++) { + int ip = ifac[k1 + 1]; + int l2 = ip * l1; + int ido = n / l2; + switch (ip) { + case 5:{ + int ix2 = iw + ido; + int ix3 = ix2 + ido; + int ix4 = ix3 + ido; + radb5_ps(ido, l1, in, out, &wa[iw], &wa[ix2], + &wa[ix3], &wa[ix4]); + } break; + case 4:{ + int ix2 = iw + ido; + int ix3 = ix2 + ido; + radb4_ps(ido, l1, in, out, &wa[iw], &wa[ix2], + &wa[ix3]); + } break; + case 3:{ + int ix2 = iw + ido; + radb3_ps(ido, l1, in, out, &wa[iw], &wa[ix2]); + } break; + case 2: + radb2_ps(ido, l1, in, out, &wa[iw]); + break; + default: + assert(0); + break; + } + l1 = l2; + iw += (ip - 1) * ido; + + if (out == work2) { + out = work1; + in = work2; + } else { + out = work2; + in = work1; + } + } + return in; /* this is in fact the output .. 
*/ +} + +static int decompose(int n, int *ifac, const int *ntryh) +{ + int nl = n, nf = 0, i, j = 0; + for (j = 0; ntryh[j]; ++j) { + int ntry = ntryh[j]; + while (nl != 1) { + int nq = nl / ntry; + int nr = nl - ntry * nq; + if (nr == 0) { + ifac[2 + nf++] = ntry; + nl = nq; + if (ntry == 2 && nf != 1) { + for (i = 2; i <= nf; ++i) { + int ib = nf - i + 2; + ifac[ib + 1] = ifac[ib]; + } + ifac[2] = 2; + } + } else + break; + } + } + ifac[0] = n; + ifac[1] = nf; + return nf; +} + +static void rffti1_ps(int n, float *wa, int *ifac) +{ + static const int ntryh[] = { 4, 2, 3, 5, 0 }; + int k1, j, ii; + + int nf = decompose(n, ifac, ntryh); + float argh = (2 * M_PI) / n; + int is = 0; + int nfm1 = nf - 1; + int l1 = 1; + for (k1 = 1; k1 <= nfm1; k1++) { + int ip = ifac[k1 + 1]; + int ld = 0; + int l2 = l1 * ip; + int ido = n / l2; + int ipm = ip - 1; + for (j = 1; j <= ipm; ++j) { + float argld; + int i = is, fi = 0; + ld += l1; + argld = ld * argh; + for (ii = 3; ii <= ido; ii += 2) { + i += 2; + fi += 1; + wa[i - 2] = cos(fi * argld); + wa[i - 1] = sin(fi * argld); + } + is += ido; + } + l1 = l2; + } +} /* rffti1 */ + +void cffti1_ps(int n, float *wa, int *ifac) +{ + static const int ntryh[] = { 5, 3, 4, 2, 0 }; + int k1, j, ii; + + int nf = decompose(n, ifac, ntryh); + float argh = (2 * M_PI) / (float)n; + int i = 1; + int l1 = 1; + for (k1 = 1; k1 <= nf; k1++) { + int ip = ifac[k1 + 1]; + int ld = 0; + int l2 = l1 * ip; + int ido = n / l2; + int idot = ido + ido + 2; + int ipm = ip - 1; + for (j = 1; j <= ipm; j++) { + float argld; + int i1 = i, fi = 0; + wa[i - 1] = 1; + wa[i] = 0; + ld += l1; + argld = ld * argh; + for (ii = 4; ii <= idot; ii += 2) { + i += 2; + fi += 1; + wa[i - 1] = cos(fi * argld); + wa[i] = sin(fi * argld); + } + if (ip > 5) { + wa[i1 - 1] = wa[i - 1]; + wa[i1] = wa[i]; + } + } + l1 = l2; + } +} /* cffti1 */ + +v4sf *cfftf1_ps(int n, const v4sf * input_readonly, v4sf * work1, v4sf * work2, + const float *wa, const int *ifac, int isign) +{ + v4sf *in = (v4sf *) input_readonly; + v4sf *out = (in == work2 ? work1 : work2); + int nf = ifac[1], k1; + int l1 = 1; + int iw = 0; + assert(in != out && work1 != work2); + for (k1 = 2; k1 <= nf + 1; k1++) { + int ip = ifac[k1]; + int l2 = ip * l1; + int ido = n / l2; + int idot = ido + ido; + switch (ip) { + case 5:{ + int ix2 = iw + idot; + int ix3 = ix2 + idot; + int ix4 = ix3 + idot; + passf5_ps(idot, l1, in, out, &wa[iw], &wa[ix2], + &wa[ix3], &wa[ix4], isign); + } break; + case 4:{ + int ix2 = iw + idot; + int ix3 = ix2 + idot; + passf4_ps(idot, l1, in, out, &wa[iw], &wa[ix2], + &wa[ix3], isign); + } break; + case 2:{ + passf2_ps(idot, l1, in, out, &wa[iw], isign); + } + break; + case 3:{ + int ix2 = iw + idot; + passf3_ps(idot, l1, in, out, &wa[iw], &wa[ix2], + isign); + } break; + default: + assert(0); + } + l1 = l2; + iw += (ip - 1) * idot; + if (out == work2) { + out = work1; + in = work2; + } else { + out = work2; + in = work1; + } + } + + return in; /* this is in fact the output .. 
*/ +} + +struct PFFFT_Setup { + int N; + int Ncvec; // nb of complex simd vectors (N/4 if PFFFT_COMPLEX, N/8 if PFFFT_REAL) + int ifac[15]; + pffft_transform_t transform; + v4sf *data; // allocated room for twiddle coefs + float *e; // points into 'data' , N/4*3 elements + float *twiddle; // points into 'data', N/4 elements +}; + +PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform) +{ + PFFFT_Setup *s = (PFFFT_Setup *) malloc(sizeof(PFFFT_Setup)); + int k, m; + /* unfortunately, the fft size must be a multiple of 16 for complex FFTs + and 32 for real FFTs -- a lot of stuff would need to be rewritten to + handle other cases (or maybe just switch to a scalar fft, I don't know..) */ + if (transform == PFFFT_REAL) { + assert((N % (2 * SIMD_SZ * SIMD_SZ)) == 0 && N > 0); + } + if (transform == PFFFT_COMPLEX) { + assert((N % (SIMD_SZ * SIMD_SZ)) == 0 && N > 0); + } + //assert((N % 32) == 0); + s->N = N; + s->transform = transform; + /* nb of complex simd vectors */ + s->Ncvec = (transform == PFFFT_REAL ? N / 2 : N) / SIMD_SZ; + s->data = (v4sf *) pffft_aligned_malloc(2 * s->Ncvec * sizeof(v4sf)); + s->e = (float *)s->data; + s->twiddle = + (float *)(s->data + (2 * s->Ncvec * (SIMD_SZ - 1)) / SIMD_SZ); + + if (transform == PFFFT_REAL) { + for (k = 0; k < s->Ncvec; ++k) { + int i = k / SIMD_SZ; + int j = k % SIMD_SZ; + for (m = 0; m < SIMD_SZ - 1; ++m) { + float A = -2 * M_PI * (m + 1) * k / N; + s->e[(2 * (i * 3 + m) + 0) * SIMD_SZ + j] = + cos(A); + s->e[(2 * (i * 3 + m) + 1) * SIMD_SZ + j] = + sin(A); + } + } + rffti1_ps(N / SIMD_SZ, s->twiddle, s->ifac); + } else { + for (k = 0; k < s->Ncvec; ++k) { + int i = k / SIMD_SZ; + int j = k % SIMD_SZ; + for (m = 0; m < SIMD_SZ - 1; ++m) { + float A = -2 * M_PI * (m + 1) * k / N; + s->e[(2 * (i * 3 + m) + 0) * SIMD_SZ + j] = + cos(A); + s->e[(2 * (i * 3 + m) + 1) * SIMD_SZ + j] = + sin(A); + } + } + cffti1_ps(N / SIMD_SZ, s->twiddle, s->ifac); + } + + /* check that N is decomposable with allowed prime factors */ + for (k = 0, m = 1; k < s->ifac[1]; ++k) { + m *= s->ifac[2 + k]; + } + if (m != N / SIMD_SZ) { + pffft_destroy_setup(s); + s = 0; + } + + return s; +} + +void pffft_destroy_setup(PFFFT_Setup * s) +{ + pffft_aligned_free(s->data); + free(s); +} + +#if !defined(PFFFT_SIMD_DISABLE) + +/* [0 0 1 2 3 4 5 6 7 8] -> [0 8 7 6 5 4 3 2 1] */ +static void reversed_copy(int N, const v4sf * in, int in_stride, v4sf * out) +{ + v4sf g0, g1; + int k; + INTERLEAVE2(in[0], in[1], g0, g1); + in += in_stride; + + *--out = VSWAPHL(g0, g1); // [g0l, g0h], [g1l g1h] -> [g1l, g0h] + for (k = 1; k < N; ++k) { + v4sf h0, h1; + INTERLEAVE2(in[0], in[1], h0, h1); + in += in_stride; + *--out = VSWAPHL(g1, h0); + *--out = VSWAPHL(h0, h1); + g1 = h1; + } + *--out = VSWAPHL(g1, g0); +} + +static void unreversed_copy(int N, const v4sf * in, v4sf * out, int out_stride) +{ + v4sf g0, g1, h0, h1; + int k; + g0 = g1 = in[0]; + ++in; + for (k = 1; k < N; ++k) { + h0 = *in++; + h1 = *in++; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); + out += out_stride; + g1 = h1; + } + h0 = *in++; + h1 = g0; + g1 = VSWAPHL(g1, h0); + h0 = VSWAPHL(h0, h1); + UNINTERLEAVE2(h0, g1, out[0], out[1]); +} + +void pffft_zreorder(PFFFT_Setup * setup, const float *in, float *out, + pffft_direction_t direction) +{ + int k, N = setup->N, Ncvec = setup->Ncvec; + const v4sf *vin = (const v4sf *)in; + v4sf *vout = (v4sf *) out; + assert(in != out); + if (setup->transform == PFFFT_REAL) { + int k, dk = N / 32; + if (direction == PFFFT_FORWARD) { + for (k 
= 0; k < dk; ++k) { + INTERLEAVE2(vin[k * 8 + 0], vin[k * 8 + 1], + vout[2 * (0 * dk + k) + 0], + vout[2 * (0 * dk + k) + 1]); + INTERLEAVE2(vin[k * 8 + 4], vin[k * 8 + 5], + vout[2 * (2 * dk + k) + 0], + vout[2 * (2 * dk + k) + 1]); + } + reversed_copy(dk, vin + 2, 8, (v4sf *) (out + N / 2)); + reversed_copy(dk, vin + 6, 8, (v4sf *) (out + N)); + } else { + for (k = 0; k < dk; ++k) { + UNINTERLEAVE2(vin[2 * (0 * dk + k) + 0], + vin[2 * (0 * dk + k) + 1], + vout[k * 8 + 0], vout[k * 8 + 1]); + UNINTERLEAVE2(vin[2 * (2 * dk + k) + 0], + vin[2 * (2 * dk + k) + 1], + vout[k * 8 + 4], vout[k * 8 + 5]); + } + unreversed_copy(dk, (v4sf *) (in + N / 4), + (v4sf *) (out + N - 6 * SIMD_SZ), -8); + unreversed_copy(dk, (v4sf *) (in + 3 * N / 4), + (v4sf *) (out + N - 2 * SIMD_SZ), -8); + } + } else { + if (direction == PFFFT_FORWARD) { + for (k = 0; k < Ncvec; ++k) { + int kk = (k / 4) + (k % 4) * (Ncvec / 4); + INTERLEAVE2(vin[k * 2], vin[k * 2 + 1], + vout[kk * 2], vout[kk * 2 + 1]); + } + } else { + for (k = 0; k < Ncvec; ++k) { + int kk = (k / 4) + (k % 4) * (Ncvec / 4); + UNINTERLEAVE2(vin[kk * 2], vin[kk * 2 + 1], + vout[k * 2], vout[k * 2 + 1]); + } + } + } +} + +void pffft_cplx_finalize(int Ncvec, const v4sf * in, v4sf * out, const v4sf * e) +{ + int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k = 0; k < dk; ++k) { + r0 = in[8 * k + 0]; + i0 = in[8 * k + 1]; + r1 = in[8 * k + 2]; + i1 = in[8 * k + 3]; + r2 = in[8 * k + 4]; + i2 = in[8 * k + 5]; + r3 = in[8 * k + 6]; + i3 = in[8 * k + 7]; + VTRANSPOSE4(r0, r1, r2, r3); + VTRANSPOSE4(i0, i1, i2, i3); + VCPLXMUL(r1, i1, e[k * 6 + 0], e[k * 6 + 1]); + VCPLXMUL(r2, i2, e[k * 6 + 2], e[k * 6 + 3]); + VCPLXMUL(r3, i3, e[k * 6 + 4], e[k * 6 + 5]); + + sr0 = VADD(r0, r2); + dr0 = VSUB(r0, r2); + sr1 = VADD(r1, r3); + dr1 = VSUB(r1, r3); + si0 = VADD(i0, i2); + di0 = VSUB(i0, i2); + si1 = VADD(i1, i3); + di1 = VSUB(i1, i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 -1 1 -1 0 0 0 0] [r2] + [1 0 -1 0 0 1 0 -1] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 1 0 -1 1 0 -1 0] [i1] + [0 0 0 0 1 -1 1 -1] [i2] + [0 -1 0 1 1 0 -1 0] [i3] + */ + + r0 = VADD(sr0, sr1); + i0 = VADD(si0, si1); + r1 = VADD(dr0, di1); + i1 = VSUB(di0, dr1); + r2 = VSUB(sr0, sr1); + i2 = VSUB(si0, si1); + r3 = VSUB(dr0, di1); + i3 = VADD(di0, dr1); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + } +} + +void pffft_cplx_preprocess(int Ncvec, const v4sf * in, v4sf * out, + const v4sf * e) +{ + int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + assert(in != out); + for (k = 0; k < dk; ++k) { + r0 = in[8 * k + 0]; + i0 = in[8 * k + 1]; + r1 = in[8 * k + 2]; + i1 = in[8 * k + 3]; + r2 = in[8 * k + 4]; + i2 = in[8 * k + 5]; + r3 = in[8 * k + 6]; + i3 = in[8 * k + 7]; + + sr0 = VADD(r0, r2); + dr0 = VSUB(r0, r2); + sr1 = VADD(r1, r3); + dr1 = VSUB(r1, r3); + si0 = VADD(i0, i2); + di0 = VSUB(i0, i2); + si1 = VADD(i1, i3); + di1 = VSUB(i1, i3); + + r0 = VADD(sr0, sr1); + i0 = VADD(si0, si1); + r1 = VSUB(dr0, di1); + i1 = VADD(di0, dr1); + r2 = VSUB(sr0, sr1); + i2 = VSUB(si0, si1); + r3 = VADD(dr0, di1); + i3 = VSUB(di0, dr1); + + VCPLXMULCONJ(r1, i1, e[k * 6 + 0], e[k * 6 + 1]); + VCPLXMULCONJ(r2, i2, e[k * 6 + 2], e[k * 6 + 3]); + VCPLXMULCONJ(r3, 
i3, e[k * 6 + 4], e[k * 6 + 5]); + + VTRANSPOSE4(r0, r1, r2, r3); + VTRANSPOSE4(i0, i1, i2, i3); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + } +} + +static ALWAYS_INLINE(void) pffft_real_finalize_4x4(const v4sf * in0, + const v4sf * in1, + const v4sf * in, + const v4sf * e, v4sf * out) +{ + v4sf r0, i0, r1, i1, r2, i2, r3, i3; + v4sf sr0, dr0, sr1, dr1, si0, di0, si1, di1; + r0 = *in0; + i0 = *in1; + r1 = *in++; + i1 = *in++; + r2 = *in++; + i2 = *in++; + r3 = *in++; + i3 = *in++; + VTRANSPOSE4(r0, r1, r2, r3); + VTRANSPOSE4(i0, i1, i2, i3); + + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 -1 0 0 -1 0 1] [r1] + [1 0 -1 0 0 1 0 -1] [r2] + [1 -1 1 -1 0 0 0 0] [r3] + [0 0 0 0 1 1 1 1] * [i0] + [0 -1 0 1 -1 0 1 0] [i1] + [0 -1 0 1 1 0 -1 0] [i2] + [0 0 0 0 -1 1 -1 1] [i3] + */ + + //cerr << "matrix initial, before e , REAL:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, before e, IMAG :\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + VCPLXMUL(r1, i1, e[0], e[1]); + VCPLXMUL(r2, i2, e[2], e[3]); + VCPLXMUL(r3, i3, e[4], e[5]); + + //cerr << "matrix initial, real part:\n 1: " << r0 << "\n 1: " << r1 << "\n 1: " << r2 << "\n 1: " << r3 << "\n"; + //cerr << "matrix initial, imag part:\n 1: " << i0 << "\n 1: " << i1 << "\n 1: " << i2 << "\n 1: " << i3 << "\n"; + + sr0 = VADD(r0, r2); + dr0 = VSUB(r0, r2); + sr1 = VADD(r1, r3); + dr1 = VSUB(r3, r1); + si0 = VADD(i0, i2); + di0 = VSUB(i0, i2); + si1 = VADD(i1, i3); + di1 = VSUB(i3, i1); + + r0 = VADD(sr0, sr1); + r3 = VSUB(sr0, sr1); + i0 = VADD(si0, si1); + i3 = VSUB(si1, si0); + r1 = VADD(dr0, di1); + r2 = VSUB(dr0, di1); + i1 = VSUB(dr1, di0); + i2 = VADD(dr1, di0); + + *out++ = r0; + *out++ = i0; + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; + +} + +static NEVER_INLINE(void) pffft_real_finalize(int Ncvec, const v4sf * in, + v4sf * out, const v4sf * e) +{ + int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ + + v4sf_union cr, ci, *uout = (v4sf_union *) out; + v4sf save = in[7], zero = VZERO(); + float xr0, xi0, xr1, xi1, xr2, xi2, xr3, xi3; + static const float s = M_SQRT2 / 2; + + cr.v = in[0]; + ci.v = in[Ncvec * 2 - 1]; + assert(in != out); + pffft_real_finalize_4x4(&zero, &zero, in + 1, e, out); + + /* + [cr0 cr1 cr2 cr3 ci0 ci1 ci2 ci3] + + [Xr(1)] ] [1 1 1 1 0 0 0 0] + [Xr(N/4) ] [0 0 0 0 1 s 0 -s] + [Xr(N/2) ] [1 0 -1 0 0 0 0 0] + [Xr(3N/4)] [0 0 0 0 1 -s 0 s] + [Xi(1) ] [1 -1 1 -1 0 0 0 0] + [Xi(N/4) ] [0 0 0 0 0 -s -1 -s] + [Xi(N/2) ] [0 -1 0 1 0 0 0 0] + [Xi(3N/4)] [0 0 0 0 0 -s 1 -s] + */ + + xr0 = (cr.f[0] + cr.f[2]) + (cr.f[1] + cr.f[3]); + uout[0].f[0] = xr0; + xi0 = (cr.f[0] + cr.f[2]) - (cr.f[1] + cr.f[3]); + uout[1].f[0] = xi0; + xr2 = (cr.f[0] - cr.f[2]); + uout[4].f[0] = xr2; + xi2 = (cr.f[3] - cr.f[1]); + uout[5].f[0] = xi2; + xr1 = ci.f[0] + s * (ci.f[1] - ci.f[3]); + uout[2].f[0] = xr1; + xi1 = -ci.f[2] - s * (ci.f[1] + ci.f[3]); + uout[3].f[0] = xi1; + xr3 = ci.f[0] - s * (ci.f[1] - ci.f[3]); + uout[6].f[0] = xr3; + xi3 = ci.f[2] - s * (ci.f[1] + ci.f[3]); + uout[7].f[0] = xi3; + + for (k = 1; k < dk; ++k) { + v4sf save_next = in[8 * k + 7]; + pffft_real_finalize_4x4(&save, &in[8 * k + 0], in + 8 * k + 1, + e + k * 6, out + k * 8); + save = save_next; + } + +} + +static ALWAYS_INLINE(void) pffft_real_preprocess_4x4(const v4sf * in, + const v4sf * e, v4sf * out, + int first) +{ + v4sf r0 = in[0], i0 = in[1], r1 = in[2], i1 = in[3], r2 = in[4], i2 = + in[5], r3 = in[6], i3 = in[7]; + /* + transformation for each column is: + + [1 1 1 1 0 0 0 0] [r0] + [1 0 0 -1 0 -1 -1 0] [r1] + [1 -1 -1 1 0 0 0 0] [r2] + [1 0 0 -1 0 1 1 0] [r3] + [0 0 0 0 1 -1 1 -1] * [i0] + [0 -1 1 0 1 0 0 1] [i1] + [0 0 0 0 1 1 -1 -1] [i2] + [0 1 -1 0 1 0 0 1] [i3] + */ + + v4sf sr0 = VADD(r0, r3), dr0 = VSUB(r0, r3); + v4sf sr1 = VADD(r1, r2), dr1 = VSUB(r1, r2); + v4sf si0 = VADD(i0, i3), di0 = VSUB(i0, i3); + v4sf si1 = VADD(i1, i2), di1 = VSUB(i1, i2); + + r0 = VADD(sr0, sr1); + r2 = VSUB(sr0, sr1); + r1 = VSUB(dr0, si1); + r3 = VADD(dr0, si1); + i0 = VSUB(di0, di1); + i2 = VADD(di0, di1); + i1 = VSUB(si0, dr1); + i3 = VADD(si0, dr1); + + VCPLXMULCONJ(r1, i1, e[0], e[1]); + VCPLXMULCONJ(r2, i2, e[2], e[3]); + VCPLXMULCONJ(r3, i3, e[4], e[5]); + + VTRANSPOSE4(r0, r1, r2, r3); + VTRANSPOSE4(i0, i1, i2, i3); + + if (!first) { + *out++ = r0; + *out++ = i0; + } + *out++ = r1; + *out++ = i1; + *out++ = r2; + *out++ = i2; + *out++ = r3; + *out++ = i3; +} + +static NEVER_INLINE(void) pffft_real_preprocess(int Ncvec, const v4sf * in, + v4sf * out, const v4sf * e) +{ + int k, dk = Ncvec / SIMD_SZ; // number of 4x4 matrix blocks + /* fftpack order is f0r f1r f1i f2r f2i ... 
f(n-1)r f(n-1)i f(n)r */ + + v4sf_union Xr, Xi, *uout = (v4sf_union *) out; + float cr0, ci0, cr1, ci1, cr2, ci2, cr3, ci3; + static const float s = M_SQRT2; + assert(in != out); + for (k = 0; k < 4; ++k) { + Xr.f[k] = ((float *)in)[8 * k]; + Xi.f[k] = ((float *)in)[8 * k + 4]; + } + + pffft_real_preprocess_4x4(in, e, out + 1, 1); // will write only 6 values + + /* + [Xr0 Xr1 Xr2 Xr3 Xi0 Xi1 Xi2 Xi3] + + [cr0] [1 0 2 0 1 0 0 0] + [cr1] [1 0 0 0 -1 0 -2 0] + [cr2] [1 0 -2 0 1 0 0 0] + [cr3] [1 0 0 0 -1 0 2 0] + [ci0] [0 2 0 2 0 0 0 0] + [ci1] [0 s 0 -s 0 -s 0 -s] + [ci2] [0 0 0 0 0 -2 0 2] + [ci3] [0 -s 0 s 0 -s 0 -s] + */ + for (k = 1; k < dk; ++k) { + pffft_real_preprocess_4x4(in + 8 * k, e + k * 6, + out - 1 + k * 8, 0); + } + + cr0 = (Xr.f[0] + Xi.f[0]) + 2 * Xr.f[2]; + uout[0].f[0] = cr0; + cr1 = (Xr.f[0] - Xi.f[0]) - 2 * Xi.f[2]; + uout[0].f[1] = cr1; + cr2 = (Xr.f[0] + Xi.f[0]) - 2 * Xr.f[2]; + uout[0].f[2] = cr2; + cr3 = (Xr.f[0] - Xi.f[0]) + 2 * Xi.f[2]; + uout[0].f[3] = cr3; + ci0 = 2 * (Xr.f[1] + Xr.f[3]); + uout[2 * Ncvec - 1].f[0] = ci0; + ci1 = s * (Xr.f[1] - Xr.f[3]) - s * (Xi.f[1] + Xi.f[3]); + uout[2 * Ncvec - 1].f[1] = ci1; + ci2 = 2 * (Xi.f[3] - Xi.f[1]); + uout[2 * Ncvec - 1].f[2] = ci2; + ci3 = -s * (Xr.f[1] - Xr.f[3]) - s * (Xi.f[1] + Xi.f[3]); + uout[2 * Ncvec - 1].f[3] = ci3; +} + +void pffft_transform_internal(PFFFT_Setup * setup, const float *finput, + float *foutput, v4sf * scratch, + pffft_direction_t direction, int ordered) +{ + int k, Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec * 2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + + const v4sf *vinput = (const v4sf *)finput; + v4sf *voutput = (v4sf *) foutput; + v4sf *buff[2] = { voutput, scratch ? scratch : scratch_on_stack }; + int ib = (nf_odd ^ ordered ? 1 : 0); + + assert(VALIGNED(finput) && VALIGNED(foutput)); + + //assert(finput != foutput); + if (direction == PFFFT_FORWARD) { + ib = !ib; + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec * 2, vinput, buff[ib], buff[!ib], + setup->twiddle, + &setup->ifac[0]) == buff[0] ? 0 : 1); + pffft_real_finalize(Ncvec, buff[ib], buff[!ib], + (v4sf *) setup->e); + } else { + v4sf *tmp = buff[ib]; + for (k = 0; k < Ncvec; ++k) { + UNINTERLEAVE2(vinput[k * 2], vinput[k * 2 + 1], + tmp[k * 2], tmp[k * 2 + 1]); + } + ib = (cfftf1_ps(Ncvec, buff[ib], buff[!ib], buff[ib], + setup->twiddle, &setup->ifac[0], + -1) == buff[0] ? 0 : 1); + pffft_cplx_finalize(Ncvec, buff[ib], buff[!ib], + (v4sf *) setup->e); + } + if (ordered) { + pffft_zreorder(setup, (float *)buff[!ib], + (float *)buff[ib], PFFFT_FORWARD); + } else + ib = !ib; + } else { + if (vinput == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, (float *)vinput, + (float *)buff[ib], PFFFT_BACKWARD); + vinput = buff[ib]; + ib = !ib; + } + if (setup->transform == PFFFT_REAL) { + pffft_real_preprocess(Ncvec, vinput, buff[ib], + (v4sf *) setup->e); + ib = (rfftb1_ps + (Ncvec * 2, buff[ib], buff[0], buff[1], + setup->twiddle, + &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + pffft_cplx_preprocess(Ncvec, vinput, buff[ib], + (v4sf *) setup->e); + ib = (cfftf1_ps + (Ncvec, buff[ib], buff[0], buff[1], + setup->twiddle, &setup->ifac[0], + +1) == buff[0] ? 
0 : 1); + for (k = 0; k < Ncvec; ++k) { + INTERLEAVE2(buff[ib][k * 2], + buff[ib][k * 2 + 1], + buff[ib][k * 2], + buff[ib][k * 2 + 1]); + } + } + } + + if (buff[ib] != voutput) { + /* extra copy required -- this situation should only happen when finput == foutput */ + assert(finput == foutput); + for (k = 0; k < Ncvec; ++k) { + v4sf a = buff[ib][2 * k], b = buff[ib][2 * k + 1]; + voutput[2 * k] = a; + voutput[2 * k + 1] = b; + } + ib = !ib; + } + assert(buff[ib] == voutput); +} + +void pffft_zconvolve_accumulate(PFFFT_Setup * s, const float *a, const float *b, + float *ab, float scaling) +{ + int Ncvec = s->Ncvec; + const v4sf *RESTRICT va = (const v4sf *)a; + const v4sf *RESTRICT vb = (const v4sf *)b; + v4sf *RESTRICT vab = (v4sf *) ab; + +#ifdef __arm__ + __builtin_prefetch(va); + __builtin_prefetch(vb); + __builtin_prefetch(vab); + __builtin_prefetch(va + 2); + __builtin_prefetch(vb + 2); + __builtin_prefetch(vab + 2); + __builtin_prefetch(va + 4); + __builtin_prefetch(vb + 4); + __builtin_prefetch(vab + 4); + __builtin_prefetch(va + 6); + __builtin_prefetch(vb + 6); + __builtin_prefetch(vab + 6); +#ifndef __clang__ +#define ZCONVOLVE_USING_INLINE_NEON_ASM +#endif +#endif + + float ar, ai, br, bi, abr, abi; +#ifndef ZCONVOLVE_USING_INLINE_ASM + v4sf vscal = LD_PS1(scaling); + int i; +#endif + + assert(VALIGNED(a) && VALIGNED(b) && VALIGNED(ab)); + ar = ((v4sf_union *) va)[0].f[0]; + ai = ((v4sf_union *) va)[1].f[0]; + br = ((v4sf_union *) vb)[0].f[0]; + bi = ((v4sf_union *) vb)[1].f[0]; + abr = ((v4sf_union *) vab)[0].f[0]; + abi = ((v4sf_union *) vab)[1].f[0]; + +#ifdef ZCONVOLVE_USING_INLINE_ASM // inline asm version, unfortunately miscompiled by clang 3.2, at least on ubuntu.. so this will be restricted to gcc + const float *a_ = a, *b_ = b; + float *ab_ = ab; + int N = Ncvec; + asm volatile ("mov r8, %2 \n" + "vdup.f32 q15, %4 \n" + "1: \n" + "pld [%0,#64] \n" + "pld [%1,#64] \n" + "pld [%2,#64] \n" + "pld [%0,#96] \n" + "pld [%1,#96] \n" + "pld [%2,#96] \n" + "vld1.f32 {q0,q1}, [%0,:128]! \n" + "vld1.f32 {q4,q5}, [%1,:128]! \n" + "vld1.f32 {q2,q3}, [%0,:128]! \n" + "vld1.f32 {q6,q7}, [%1,:128]! \n" + "vld1.f32 {q8,q9}, [r8,:128]! \n" + "vmul.f32 q10, q0, q4 \n" + "vmul.f32 q11, q0, q5 \n" + "vmul.f32 q12, q2, q6 \n" + "vmul.f32 q13, q2, q7 \n" + "vmls.f32 q10, q1, q5 \n" + "vmla.f32 q11, q1, q4 \n" + "vld1.f32 {q0,q1}, [r8,:128]! \n" + "vmls.f32 q12, q3, q7 \n" + "vmla.f32 q13, q3, q6 \n" + "vmla.f32 q8, q10, q15 \n" + "vmla.f32 q9, q11, q15 \n" + "vmla.f32 q0, q12, q15 \n" + "vmla.f32 q1, q13, q15 \n" + "vst1.f32 {q8,q9},[%2,:128]! \n" + "vst1.f32 {q0,q1},[%2,:128]! 
\n" + "subs %3, #2 \n" + "bne 1b \n":"+r" (a_), + "+r"(b_), "+r"(ab_), "+r"(N):"r"(scaling):"r8", "q0", + "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", + "q10", "q11", "q12", "q13", "q15", "memory"); +#else // default routine, works fine for non-arm cpus with current compilers + for (i = 0; i < Ncvec; i += 2) { + v4sf ar, ai, br, bi; + ar = va[2 * i + 0]; + ai = va[2 * i + 1]; + br = vb[2 * i + 0]; + bi = vb[2 * i + 1]; + VCPLXMUL(ar, ai, br, bi); + vab[2 * i + 0] = VMADD(ar, vscal, vab[2 * i + 0]); + vab[2 * i + 1] = VMADD(ai, vscal, vab[2 * i + 1]); + ar = va[2 * i + 2]; + ai = va[2 * i + 3]; + br = vb[2 * i + 2]; + bi = vb[2 * i + 3]; + VCPLXMUL(ar, ai, br, bi); + vab[2 * i + 2] = VMADD(ar, vscal, vab[2 * i + 2]); + vab[2 * i + 3] = VMADD(ai, vscal, vab[2 * i + 3]); + } +#endif + if (s->transform == PFFFT_REAL) { + ((v4sf_union *) vab)[0].f[0] = abr + ar * br * scaling; + ((v4sf_union *) vab)[1].f[0] = abi + ai * bi * scaling; + } +} + +#else // defined(PFFFT_SIMD_DISABLE) + +// standard routine using scalar floats, without SIMD stuff. + +#define pffft_zreorder_nosimd pffft_zreorder +void pffft_zreorder_nosimd(PFFFT_Setup * setup, const float *in, float *out, + pffft_direction_t direction) +{ + int k, N = setup->N; + if (setup->transform == PFFFT_COMPLEX) { + for (k = 0; k < 2 * N; ++k) + out[k] = in[k]; + return; + } else if (direction == PFFFT_FORWARD) { + float x_N = in[N - 1]; + for (k = N - 1; k > 1; --k) + out[k] = in[k - 1]; + out[0] = in[0]; + out[1] = x_N; + } else { + float x_N = in[1]; + for (k = 1; k < N - 1; ++k) + out[k] = in[k + 1]; + out[0] = in[0]; + out[N - 1] = x_N; + } +} + +#define pffft_transform_internal_nosimd pffft_transform_internal +void pffft_transform_internal_nosimd(PFFFT_Setup * setup, const float *input, + float *output, float *scratch, + pffft_direction_t direction, int ordered) +{ + int Ncvec = setup->Ncvec; + int nf_odd = (setup->ifac[1] & 1); + + // temporary buffer is allocated on the stack if the scratch pointer is NULL + int stack_allocate = (scratch == 0 ? Ncvec * 2 : 1); + VLA_ARRAY_ON_STACK(v4sf, scratch_on_stack, stack_allocate); + float *buff[2]; + int ib; + if (scratch == 0) + scratch = scratch_on_stack; + buff[0] = output; + buff[1] = scratch; + + if (setup->transform == PFFFT_COMPLEX) + ordered = 0; // it is always ordered. + ib = (nf_odd ^ ordered ? 1 : 0); + + if (direction == PFFFT_FORWARD) { + if (setup->transform == PFFFT_REAL) { + ib = (rfftf1_ps(Ncvec * 2, input, buff[ib], buff[!ib], + setup->twiddle, + &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], + -1) == buff[0] ? 0 : 1); + } + if (ordered) { + pffft_zreorder(setup, buff[ib], buff[!ib], + PFFFT_FORWARD); + ib = !ib; + } + } else { + if (input == buff[ib]) { + ib = !ib; // may happen when finput == foutput + } + if (ordered) { + pffft_zreorder(setup, input, buff[!ib], PFFFT_BACKWARD); + input = buff[!ib]; + } + if (setup->transform == PFFFT_REAL) { + ib = (rfftb1_ps(Ncvec * 2, input, buff[ib], buff[!ib], + setup->twiddle, + &setup->ifac[0]) == buff[0] ? 0 : 1); + } else { + ib = (cfftf1_ps(Ncvec, input, buff[ib], buff[!ib], + setup->twiddle, &setup->ifac[0], + +1) == buff[0] ? 
0 : 1); + } + } + if (buff[ib] != output) { + int k; + // extra copy required -- this situation should happens only when finput == foutput + assert(input == output); + for (k = 0; k < Ncvec; ++k) { + float a = buff[ib][2 * k], b = buff[ib][2 * k + 1]; + output[2 * k] = a; + output[2 * k + 1] = b; + } + ib = !ib; + } + assert(buff[ib] == output); +} + +#define pffft_zconvolve_accumulate_nosimd pffft_zconvolve_accumulate +void pffft_zconvolve_accumulate_nosimd(PFFFT_Setup * s, const float *a, + const float *b, float *ab, float scaling) +{ + int i, Ncvec = s->Ncvec; + + if (s->transform == PFFFT_REAL) { + // take care of the fftpack ordering + ab[0] += a[0] * b[0] * scaling; + ab[2 * Ncvec - 1] += + a[2 * Ncvec - 1] * b[2 * Ncvec - 1] * scaling; + ++ab; + ++a; + ++b; + --Ncvec; + } + for (i = 0; i < Ncvec; ++i) { + float ar, ai, br, bi; + ar = a[2 * i + 0]; + ai = a[2 * i + 1]; + br = b[2 * i + 0]; + bi = b[2 * i + 1]; + VCPLXMUL(ar, ai, br, bi); + ab[2 * i + 0] += ar * scaling; + ab[2 * i + 1] += ai * scaling; + } +} + +#endif // defined(PFFFT_SIMD_DISABLE) + +void pffft_transform(PFFFT_Setup * setup, const float *input, float *output, + float *work, pffft_direction_t direction) +{ + pffft_transform_internal(setup, input, output, (v4sf *) work, direction, + 0); +} + +void pffft_transform_ordered(PFFFT_Setup * setup, const float *input, + float *output, float *work, + pffft_direction_t direction) +{ + pffft_transform_internal(setup, input, output, (v4sf *) work, direction, + 1); +} diff --git a/src/modules/module-filter-chain/pffft.h b/src/modules/module-filter-chain/pffft.h new file mode 100644 index 000000000..dd554fe93 --- /dev/null +++ b/src/modules/module-filter-chain/pffft.h @@ -0,0 +1,177 @@ +/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com ) + + Based on original fortran 77 code from FFTPACKv4 from NETLIB, + authored by Dr Paul Swarztrauber of NCAR, in 1985. + + As confirmed by the NCAR fftpack software curators, the following + FFTPACKv5 license applies to FFTPACKv4 sources. My changes are + released under the same terms. + + FFTPACK license: + + http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html + + Copyright (c) 2004 the University Corporation for Atmospheric + Research ("UCAR"). All rights reserved. Developed by NCAR's + Computational and Information Systems Laboratory, UCAR, + www.cisl.ucar.edu. + + Redistribution and use of the Software in source and binary forms, + with or without modification, is permitted provided that the + following conditions are met: + + - Neither the names of NCAR's Computational and Information Systems + Laboratory, the University Corporation for Atmospheric Research, + nor the names of its sponsors or contributors may be used to + endorse or promote products derived from this Software without + specific prior written permission. + + - Redistributions of source code must retain the above copyright + notices, this list of conditions, and the disclaimer below. + + - Redistributions in binary form must reproduce the above copyright + notice, this list of conditions, and the disclaimer below in the + documentation and/or other materials provided with the + distribution. + + THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF + MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + NONINFRINGEMENT. 
+  IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
+  HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
+  EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+  ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
+  SOFTWARE.
+*/
+
+/*
+   PFFFT : a Pretty Fast FFT.
+
+   This is basically an adaptation of the single precision fftpack
+   (v4) as found on netlib, taking advantage of the SIMD instructions
+   found on CPUs such as Intel x86 (SSE1), PowerPC (Altivec), and ARM (NEON).
+
+   For architectures where no SIMD instructions are available, the code
+   falls back to a scalar version.
+
+   Restrictions:
+
+   - 1D transforms only, with 32-bit single precision.
+
+   - supports only transforms for inputs of length N of the form
+     N=(2^a)*(3^b)*(5^c), a >= 5, b >= 0, c >= 0 (32, 48, 64, 96, 128,
+     144, 160, etc are all acceptable lengths). Performance is best for
+     128 <= N <= 8192.
+
+   - all (float*) pointers in the functions below are expected to
+     have a "SIMD-compatible" alignment, that is, 16 bytes on x86 and
+     PowerPC CPUs.
+
+   You can allocate such buffers with the functions
+   pffft_aligned_malloc / pffft_aligned_free (or with something like
+   posix_memalign...)
+
+*/
+
+#ifndef PFFFT_H
+#define PFFFT_H
+
+#include <stddef.h>		// for size_t
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+	/* opaque struct holding internal stuff (precomputed twiddle factors)
+	   this struct can be shared by many threads as it contains only
+	   read-only data.
+	 */
+	typedef struct PFFFT_Setup PFFFT_Setup;
+
+	/* direction of the transform */
+	typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
+
+	/* type of transform */
+	typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
+
+	/*
+	   prepare for performing transforms of size N -- the returned
+	   PFFFT_Setup structure is read-only so it can safely be shared by
+	   multiple concurrent threads.
+	 */
+	PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
+	void pffft_destroy_setup(PFFFT_Setup *);
+	/*
+	   Perform a Fourier transform. The z-domain data is stored in the
+	   most efficient order for transforming it back, or using it for
+	   convolution. If you need to have its content sorted in the
+	   "usual" way, that is as an array of interleaved complex numbers,
+	   either use pffft_transform_ordered, or call pffft_zreorder after
+	   the forward fft and before the backward fft.
+
+	   Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
+	   Typically you will want to scale the backward transform by 1/N.
+
+	   The 'work' pointer should point to an area of N (2*N for complex
+	   fft) floats, properly aligned. If 'work' is NULL, the stack will
+	   be used instead (this is probably the best strategy for small
+	   FFTs, say for N < 16384).
+
+	   input and output may alias.
+	 */
+	void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+	/*
+	   Similar to pffft_transform, but makes sure that the output is
+	   ordered as expected (interleaved complex numbers). This is
+	   similar to calling pffft_transform and then pffft_zreorder.
+
+	   input and output may alias.
+	 */
+	void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
+
+	/*
+	   call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
+	   PFFFT_FORWARD) if you want to have the frequency components in
+	   the correct "canonical" order, as interleaved complex numbers.
+
+	   (for real transforms, both the 0-frequency and half-frequency
+	   components, which are real, are assembled in the first entry as
+	   F(0)+i*F(n/2+1). Note that the original fftpack did place
+	   F(n/2+1) at the end of the arrays).
+
+	   input and output should not alias.
+	 */
+	void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
+
+	/*
+	   Perform a multiplication of the frequency components of dft_a and
+	   dft_b and accumulate them into dft_ab. The arrays should have
+	   been obtained with pffft_transform(.., PFFFT_FORWARD) and should
+	   *not* have been reordered with pffft_zreorder (otherwise just
+	   perform the operation yourself as the dft coefs are stored as
+	   interleaved complex numbers).
+
+	   The operation performed is: dft_ab += (dft_a * dft_b) * scaling
+
+	   The dft_a, dft_b and dft_ab pointers may alias.
+	 */
+	void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
+
+	/*
+	   The float buffers must have the correct alignment (16-byte boundary
+	   on Intel and PowerPC). This function may be used to obtain such
+	   correctly aligned buffers.
+	 */
+	void *pffft_aligned_malloc(size_t nb_bytes);
+	void pffft_aligned_free(void *);
+
+	/* returns 4 or 1 depending on whether SSE/Altivec support was enabled when building pffft.c */
+	int pffft_simd_size();
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // PFFFT_H
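
As a reading aid for the API comments in pffft.h, here is a minimal usage sketch of a real forward/inverse round trip with pffft_transform_ordered. It is not part of the patch; the FFT size, buffer names, example function and the plain "pffft.h" include path are illustrative only, and it relies solely on the functions declared above.

/* illustrative sketch: real FFT round trip with pffft */
#include <string.h>
#include "pffft.h"

static void example_roundtrip(void)
{
	int N = 1024;	/* PFFFT_REAL requires N to be a multiple of 32 */
	PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
	float *in   = pffft_aligned_malloc(N * sizeof(float));
	float *freq = pffft_aligned_malloc(N * sizeof(float));
	float *out  = pffft_aligned_malloc(N * sizeof(float));
	float *work = pffft_aligned_malloc(N * sizeof(float));	/* may be NULL for small N */
	int k;

	memset(in, 0, N * sizeof(float));
	in[0] = 1.0f;	/* impulse input */

	/* forward transform, output in "canonical" interleaved complex order */
	pffft_transform_ordered(s, in, freq, work, PFFFT_FORWARD);

	/* inverse transform; pffft does not scale, so divide by N to recover the input */
	pffft_transform_ordered(s, freq, out, work, PFFFT_BACKWARD);
	for (k = 0; k < N; k++)
		out[k] *= 1.0f / N;

	pffft_aligned_free(work);
	pffft_aligned_free(out);
	pffft_aligned_free(freq);
	pffft_aligned_free(in);
	pffft_destroy_setup(s);
}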
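The pffft_zconvolve_accumulate comment describes a multiply-accumulate on the unordered spectra produced by pffft_transform. The sketch below shows one way to build a circular convolution of two real signals from that primitive; the helper name and the X/H/XH buffers are hypothetical, and x, h, y are assumed to be 16-byte aligned with N a valid PFFFT_REAL size.

/* illustrative sketch: circular convolution y = x (*) h via pffft */
#include <string.h>
#include "pffft.h"

static void example_convolve(const float *x, const float *h, float *y, int N)
{
	PFFFT_Setup *s = pffft_new_setup(N, PFFFT_REAL);
	float *X  = pffft_aligned_malloc(N * sizeof(float));
	float *H  = pffft_aligned_malloc(N * sizeof(float));
	float *XH = pffft_aligned_malloc(N * sizeof(float));

	/* unordered forward transforms: this internal layout is exactly what
	   pffft_zconvolve_accumulate expects; NULL lets pffft use stack scratch */
	pffft_transform(s, x, X, NULL, PFFFT_FORWARD);
	pffft_transform(s, h, H, NULL, PFFFT_FORWARD);

	/* XH must start at zero because the operation accumulates; folding the
	   1/N normalisation into 'scaling' saves a separate pass */
	memset(XH, 0, N * sizeof(float));
	pffft_zconvolve_accumulate(s, X, H, XH, 1.0f / N);

	/* back to the time domain; y now holds the circular convolution */
	pffft_transform(s, XH, y, NULL, PFFFT_BACKWARD);

	pffft_aligned_free(XH);
	pffft_aligned_free(H);
	pffft_aligned_free(X);
	pffft_destroy_setup(s);
}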