filter-chain: use pffft for the convolver

It is faster.
This commit is contained in:
Wim Taymans 2021-08-23 21:00:45 +02:00
parent 9dbfa63193
commit 0f5face73f
9 changed files with 2550 additions and 990 deletions

View file

@ -50,8 +50,7 @@ pipewire_module_filter_chain = shared_library('pipewire-module-filter-chain',
'module-filter-chain/biquad.c', 'module-filter-chain/biquad.c',
'module-filter-chain/ladspa_plugin.c', 'module-filter-chain/ladspa_plugin.c',
'module-filter-chain/builtin_plugin.c', 'module-filter-chain/builtin_plugin.c',
'module-filter-chain/kiss_fft_f32.c', 'module-filter-chain/pffft.c',
'module-filter-chain/kiss_fftr_f32.c',
'module-filter-chain/convolver.c' ], 'module-filter-chain/convolver.c' ],
include_directories : [configinc, spa_inc], include_directories : [configinc, spa_inc],
install : true, install : true,

View file

@ -1,173 +0,0 @@
/*
* Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
* This file is part of KISS FFT - https://github.com/mborgerding/kissfft
*
* SPDX-License-Identifier: BSD-3-Clause
* See COPYING file for more information.
*/
/* kiss_fft_f32.h
defines kiss_fft_f32_scalar as either short or a float type
and defines
typedef struct { kiss_fft_f32_scalar r; kiss_fft_f32_scalar i; }kiss_fft_f32_cpx; */
#include "kiss_fft_f32.h"
#include <limits.h>
/* The 2*sizeof(size_t) alignment here is borrowed from
* GNU libc, so it should be good most everywhere.
* It is more conservative than is needed on some 64-bit
* platforms, but ia64 does require a 16-byte alignment.
* The SIMD extensions for x86 and ppc32 would want a
* larger alignment than this, but we don't need to
* do better than malloc.
*
* Borrowed from GLib's gobject/gtype.c
*/
#define STRUCT_ALIGNMENT (2 * sizeof (size_t))
#define ALIGN_STRUCT(offset) \
((offset + (STRUCT_ALIGNMENT - 1)) & -STRUCT_ALIGNMENT)
#define MAXFACTORS 32
/* e.g. an fft of length 128 has 4 factors
as far as kissfft is concerned
4*4*4*2
*/
struct kiss_fft_f32_state{
int nfft;
int inverse;
int factors[2*MAXFACTORS];
kiss_fft_f32_cpx twiddles[1];
};
/*
Explanation of macros dealing with complex math:
C_MUL(m,a,b) : m = a*b
C_FIXDIV( c , div ) : if a fixed point impl., c /= div. noop otherwise
C_SUB( res, a,b) : res = a - b
C_SUBFROM( res , a) : res -= a
C_ADDTO( res , a) : res += a
* */
#ifdef FIXED_POINT
#include <stdint.h>
#if (FIXED_POINT==32)
# define FRACBITS 31
# define SAMPPROD int64_t
#define SAMP_MAX INT32_MAX
#define SAMP_MIN INT32_MIN
#else
# define FRACBITS 15
# define SAMPPROD int32_t
#define SAMP_MAX INT16_MAX
#define SAMP_MIN INT16_MIN
#endif
#if defined(CHECK_OVERFLOW)
# define CHECK_OVERFLOW_OP(a,op,b) \
if ( (SAMPPROD)(a) op (SAMPPROD)(b) > SAMP_MAX || (SAMPPROD)(a) op (SAMPPROD)(b) < SAMP_MIN ) { \
g_critical("overflow @ " __FILE__ "(%d): (%d " #op" %d) = %ld",__LINE__,(a),(b),(SAMPPROD)(a) op (SAMPPROD)(b) ); }
#endif
# define smul(a,b) ( (SAMPPROD)(a)*(b) )
# define sround( x ) (kiss_fft_f32_scalar)( ( (x) + (1<<(FRACBITS-1)) ) >> FRACBITS )
# define S_MUL(a,b) sround( smul(a,b) )
# define C_MUL(m,a,b) \
do{ (m).r = sround( smul((a).r,(b).r) - smul((a).i,(b).i) ); \
(m).i = sround( smul((a).r,(b).i) + smul((a).i,(b).r) ); }while(0)
# define DIVSCALAR(x,k) \
(x) = sround( smul( x, SAMP_MAX/k ) )
# define C_FIXDIV(c,div) \
do { DIVSCALAR( (c).r , div); \
DIVSCALAR( (c).i , div); }while (0)
# define C_MULBYSCALAR( c, s ) \
do{ (c).r = sround( smul( (c).r , s ) ) ;\
(c).i = sround( smul( (c).i , s ) ) ; }while(0)
#else /* not FIXED_POINT*/
# define S_MUL(a,b) ( (a)*(b) )
#define C_MUL(m,a,b) \
do{ (m).r = (a).r*(b).r - (a).i*(b).i;\
(m).i = (a).r*(b).i + (a).i*(b).r; }while(0)
# define C_FIXDIV(c,div) /* NOOP */
# define C_MULBYSCALAR( c, s ) \
do{ (c).r *= (s);\
(c).i *= (s); }while(0)
#endif
#ifndef CHECK_OVERFLOW_OP
# define CHECK_OVERFLOW_OP(a,op,b) /* noop */
#endif
#define C_ADD( res, a,b)\
do { \
CHECK_OVERFLOW_OP((a).r,+,(b).r)\
CHECK_OVERFLOW_OP((a).i,+,(b).i)\
(res).r=(a).r+(b).r; (res).i=(a).i+(b).i; \
}while(0)
#define C_SUB( res, a,b)\
do { \
CHECK_OVERFLOW_OP((a).r,-,(b).r)\
CHECK_OVERFLOW_OP((a).i,-,(b).i)\
(res).r=(a).r-(b).r; (res).i=(a).i-(b).i; \
}while(0)
#define C_ADDTO( res , a)\
do { \
CHECK_OVERFLOW_OP((res).r,+,(a).r)\
CHECK_OVERFLOW_OP((res).i,+,(a).i)\
(res).r += (a).r; (res).i += (a).i;\
}while(0)
#define C_SUBFROM( res , a)\
do {\
CHECK_OVERFLOW_OP((res).r,-,(a).r)\
CHECK_OVERFLOW_OP((res).i,-,(a).i)\
(res).r -= (a).r; (res).i -= (a).i; \
}while(0)
#ifdef FIXED_POINT
# define KISS_FFT_F32_COS(phase) floor(.5+SAMP_MAX * cos (phase))
# define KISS_FFT_F32_SIN(phase) floor(.5+SAMP_MAX * sin (phase))
# define HALF_OF(x) ((x)>>1)
#elif defined(USE_SIMD)
# define KISS_FFT_F32_COS(phase) _mm_set1_ps( cos(phase) )
# define KISS_FFT_F32_SIN(phase) _mm_set1_ps( sin(phase) )
# define HALF_OF(x) ((x)*_mm_set1_ps(.5))
#else
# define KISS_FFT_F32_COS(phase) (kiss_fft_f32_scalar) cos(phase)
# define KISS_FFT_F32_SIN(phase) (kiss_fft_f32_scalar) sin(phase)
# define HALF_OF(x) ((x)*.5)
#endif
#define kf_cexp(x,phase) \
do{ \
(x)->r = KISS_FFT_F32_COS(phase);\
(x)->i = KISS_FFT_F32_SIN(phase);\
}while(0)
/* a debugging function */
#define pcpx(c)\
fprintf(stderr,"%g + %gi\n",(double)((c)->r),(double)((c)->i) )
#ifdef KISS_FFT_F32_USE_ALLOCA
// define this to allow use of alloca instead of malloc for temporary buffers
// Temporary buffers are used in two case:
// 1. FFT sizes that have "bad" factors. i.e. not 2,3 and 5
// 2. "in-place" FFTs. Notice the quotes, since kissfft does not really do an in-place transform.
#include <alloca.h>
#define KISS_FFT_F32_TMP_ALLOC(nbytes) alloca(nbytes)
#define KISS_FFT_F32_TMP_FREE(ptr)
#else
#define KISS_FFT_F32_TMP_ALLOC(nbytes) KISS_FFT_F32_MALLOC(nbytes)
#define KISS_FFT_F32_TMP_FREE(ptr) KISS_FFT_F32_FREE(ptr)
#endif

View file

@ -28,8 +28,14 @@
#include <spa/utils/defs.h> #include <spa/utils/defs.h>
#include "kiss_fft_f32.h" #include <math.h>
#include "kiss_fftr_f32.h" #include <xmmintrin.h>
#include "pffft.h"
struct fft_cpx {
float *v;
};
struct convolver1 { struct convolver1 {
int blockSize; int blockSize;
@ -37,16 +43,16 @@ struct convolver1 {
int segCount; int segCount;
int fftComplexSize; int fftComplexSize;
kiss_fft_f32_cpx **segments; struct fft_cpx *segments;
kiss_fft_f32_cpx **segmentsIr; struct fft_cpx *segmentsIr;
float *fft_buffer; float *fft_buffer;
void *fft; void *fft;
void *ifft; void *ifft;
kiss_fft_f32_cpx *pre_mult; struct fft_cpx pre_mult;
kiss_fft_f32_cpx *conv; struct fft_cpx conv;
float *overlap; float *overlap;
float *inputBuffer; float *inputBuffer;
@ -55,6 +61,38 @@ struct convolver1 {
int current; int current;
}; };
static void *fft_alloc(int size)
{
void *d;
d = pffft_aligned_malloc(size);
memset(d, 0, size);
return d;
}
static void fft_free(void *data)
{
pffft_aligned_free(data);
}
static void fft_cpx_init(struct fft_cpx *cpx, int size)
{
cpx->v = fft_alloc(size * 2 * sizeof(float));
}
static void fft_cpx_free(struct fft_cpx *cpx)
{
fft_free(cpx->v);
}
static void fft_cpx_clear(struct fft_cpx *cpx, int size)
{
memset(cpx->v, 0, sizeof(float) * 2 * size);
}
static void fft_cpx_copy(struct fft_cpx *dst, struct fft_cpx *src, int size)
{
memcpy(dst->v, src->v, sizeof(float) * 2 * size);
}
static int next_power_of_two(int val) static int next_power_of_two(int val)
{ {
int r = 1; int r = 1;
@ -63,6 +101,37 @@ static int next_power_of_two(int val)
return r; return r;
} }
static inline void *fft_new(int size)
{
return pffft_new_setup(size, PFFFT_REAL);
}
static inline void *ifft_new(int size)
{
return pffft_new_setup(size, PFFFT_REAL);
}
static inline void fft_destroy(void *fft)
{
pffft_destroy_setup(fft);
}
static inline void fft_run(void *fft, float *in, struct fft_cpx *out)
{
pffft_transform(fft, in, out->v, NULL, PFFFT_FORWARD);
}
static inline void ifft_run(void *ifft, struct fft_cpx *in, float *out)
{
pffft_transform(ifft, in->v, out, NULL, PFFFT_BACKWARD);
}
static inline void fft_convolve_accum(void *fft, struct fft_cpx *r,
const struct fft_cpx *a, const struct fft_cpx *b, int len, float scale)
{
pffft_zconvolve_accumulate(fft, a->v, b->v, r->v, scale);
}
static struct convolver1 *convolver1_new(int block, const float *ir, int irlen) static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
{ {
struct convolver1 *conv; struct convolver1 *conv;
@ -86,37 +155,37 @@ static struct convolver1 *convolver1_new(int block, const float *ir, int irlen)
conv->segCount = (irlen + conv->blockSize-1) / conv->blockSize; conv->segCount = (irlen + conv->blockSize-1) / conv->blockSize;
conv->fftComplexSize = (conv->segSize / 2) + 1; conv->fftComplexSize = (conv->segSize / 2) + 1;
conv->fft = kiss_fftr_f32_alloc(conv->segSize, 0, NULL, NULL); conv->fft = fft_new(conv->segSize);
if (conv->fft == NULL) if (conv->fft == NULL)
return NULL; return NULL;
conv->ifft = kiss_fftr_f32_alloc(conv->segSize, 1, NULL, NULL); conv->ifft = ifft_new(conv->segSize);
if (conv->ifft == NULL) if (conv->ifft == NULL)
return NULL; return NULL;
conv->fft_buffer = calloc(sizeof(float), conv->segSize); conv->fft_buffer = fft_alloc(sizeof(float) * conv->segSize);
if (conv->fft_buffer == NULL) if (conv->fft_buffer == NULL)
return NULL; return NULL;
conv->segments = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount); conv->segments = calloc(sizeof(struct fft_cpx), conv->segCount);
conv->segmentsIr = calloc(sizeof(kiss_fft_f32_cpx*), conv->segCount); conv->segmentsIr = calloc(sizeof(struct fft_cpx), conv->segCount);
for (i = 0; i < conv->segCount; i++) { for (i = 0; i < conv->segCount; i++) {
int left = irlen - (i * conv->blockSize); int left = irlen - (i * conv->blockSize);
int copy = SPA_MIN(conv->blockSize, left); int copy = SPA_MIN(conv->blockSize, left);
conv->segments[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); fft_cpx_init(&conv->segments[i], conv->fftComplexSize);
conv->segmentsIr[i] = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); fft_cpx_init(&conv->segmentsIr[i], conv->fftComplexSize);
memcpy(conv->fft_buffer, &ir[i * conv->blockSize], copy * sizeof(float)); memcpy(conv->fft_buffer, &ir[i * conv->blockSize], copy * sizeof(float));
if (copy < conv->segSize) if (copy < conv->segSize)
memset(conv->fft_buffer + copy, 0, (conv->segSize - copy) * sizeof(float)); memset(conv->fft_buffer + copy, 0, (conv->segSize - copy) * sizeof(float));
kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segmentsIr[i]); fft_run(conv->fft, conv->fft_buffer, &conv->segmentsIr[i]);
} }
conv->pre_mult = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); fft_cpx_init(&conv->pre_mult, conv->fftComplexSize);
conv->conv = calloc(sizeof(kiss_fft_f32_cpx), conv->fftComplexSize); fft_cpx_init(&conv->conv, conv->fftComplexSize);
conv->overlap = calloc(sizeof(float), conv->blockSize); conv->overlap = fft_alloc(sizeof(float) * conv->blockSize);
conv->inputBuffer = calloc(sizeof(float), conv->blockSize); conv->inputBuffer = fft_alloc(sizeof(float) * conv->blockSize);
conv->inputBufferFill = 0; conv->inputBufferFill = 0;
conv->current = 0; conv->current = 0;
@ -127,35 +196,38 @@ static void convolver1_free(struct convolver1 *conv)
{ {
int i; int i;
for (i = 0; i < conv->segCount; i++) { for (i = 0; i < conv->segCount; i++) {
free(conv->segments[i]); fft_cpx_free(&conv->segments[i]);
free(conv->segmentsIr[i]); fft_cpx_free(&conv->segmentsIr[i]);
} }
free(conv->fft); fft_destroy(conv->fft);
free(conv->ifft); fft_destroy(conv->ifft);
free(conv->fft_buffer); fft_free(conv->fft_buffer);
free(conv->segments); free(conv->segments);
free(conv->segmentsIr); free(conv->segmentsIr);
free(conv->pre_mult); fft_cpx_free(&conv->pre_mult);
free(conv->conv); fft_cpx_free(&conv->conv);
free(conv->overlap); fft_free(conv->overlap);
free(conv->inputBuffer); fft_free(conv->inputBuffer);
free(conv); free(conv);
} }
void Sum(float* result, const float* a, const float* b, int len) void Sum(float* result, const float* a, const float* b, int len)
{ {
int i; int i;
for (i = 0; i < len; i++) #if defined (__SSE__)
const int end4 = 4 * (len / 4);
for (i = 0; i < end4; i += 4) {
const __m128 va = _mm_load_ps(&a[i]);
const __m128 vb = _mm_load_ps(&b[i]);
_mm_store_ps(&result[i], _mm_add_ps(va,vb));
}
for (i = end4; i < len; ++i) {
result[i] = a[i] + b[i]; result[i] = a[i] + b[i];
} }
#else
void ComplexMultiplyAccumulate(kiss_fft_f32_cpx *r, const kiss_fft_f32_cpx *a, const kiss_fft_f32_cpx *b, int len) for (i = 0; i < len; i++)
{ result[i] = a[i] + b[i];
int i; #endif
for (i = 0; i < len; i++) {
r[i].r += a[i].r * b[i].r - a[i].i * b[i].i;
r[i].i += a[i].r * b[i].i + a[i].i * b[i].r;
}
} }
static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len) static int convolver1_run(struct convolver1 *conv, const float *input, float *output, int len)
@ -176,30 +248,27 @@ static int convolver1_run(struct convolver1 *conv, const float *input, float *ou
memcpy(conv->fft_buffer, conv->inputBuffer, conv->blockSize * sizeof(float)); memcpy(conv->fft_buffer, conv->inputBuffer, conv->blockSize * sizeof(float));
memset(conv->fft_buffer + conv->blockSize, 0, (conv->segSize - conv->blockSize) * sizeof(float)); memset(conv->fft_buffer + conv->blockSize, 0, (conv->segSize - conv->blockSize) * sizeof(float));
kiss_fftr_f32(conv->fft, conv->fft_buffer, conv->segments[conv->current]); fft_run(conv->fft, conv->fft_buffer, &conv->segments[conv->current]);
if (conv->inputBufferFill == 0) { if (conv->inputBufferFill == 0) {
memset(conv->pre_mult, 0, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize); fft_cpx_clear(&conv->pre_mult, conv->fftComplexSize);
for (i = 1; i < conv->segCount; i++) { for (i = 1; i < conv->segCount; i++) {
const int indexIr = i; const int indexIr = i;
const int indexAudio = (conv->current + i) % conv->segCount; const int indexAudio = (conv->current + i) % conv->segCount;
ComplexMultiplyAccumulate(conv->pre_mult, fft_convolve_accum(conv->fft, &conv->pre_mult,
conv->segmentsIr[indexIr], &conv->segmentsIr[indexIr],
conv->segments[indexAudio], &conv->segments[indexAudio],
conv->fftComplexSize); conv->fftComplexSize, 1.0f / conv->segSize);
} }
} }
memcpy(conv->conv, conv->pre_mult, sizeof(kiss_fft_f32_cpx) * conv->fftComplexSize); fft_cpx_copy(&conv->conv, &conv->pre_mult, conv->fftComplexSize);
ComplexMultiplyAccumulate(conv->conv, conv->segments[conv->current], conv->segmentsIr[0], fft_convolve_accum(conv->fft, &conv->conv, &conv->segments[conv->current], &conv->segmentsIr[0],
conv->fftComplexSize); conv->fftComplexSize, 1.0f / conv->segSize);
kiss_fftri_f32(conv->ifft, conv->conv, conv->fft_buffer); ifft_run(conv->ifft, &conv->conv, conv->fft_buffer);
for (i = 0; i < conv->segSize; i++)
conv->fft_buffer[i] /= conv->segSize;
Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing); Sum(output + processed, conv->fft_buffer + inputBufferPos, conv->overlap + inputBufferPos, processing);
@ -265,19 +334,19 @@ struct convolver *convolver_new(int head_block, int tail_block, const float *ir,
if (irlen > conv->tailBlockSize) { if (irlen > conv->tailBlockSize) {
int conv1IrLen = SPA_MIN(irlen - conv->tailBlockSize, conv->tailBlockSize); int conv1IrLen = SPA_MIN(irlen - conv->tailBlockSize, conv->tailBlockSize);
conv->tailConvolver0 = convolver1_new(conv->headBlockSize, ir + conv->tailBlockSize, conv1IrLen); conv->tailConvolver0 = convolver1_new(conv->headBlockSize, ir + conv->tailBlockSize, conv1IrLen);
conv->tailOutput0 = calloc(conv->tailBlockSize, sizeof(float)); conv->tailOutput0 = fft_alloc(conv->tailBlockSize * sizeof(float));
conv->tailPrecalculated0 = calloc(conv->tailBlockSize, sizeof(float)); conv->tailPrecalculated0 = fft_alloc(conv->tailBlockSize * sizeof(float));
} }
if (irlen > 2 * conv->tailBlockSize) { if (irlen > 2 * conv->tailBlockSize) {
int tailIrLen = irlen - (2 * conv->tailBlockSize); int tailIrLen = irlen - (2 * conv->tailBlockSize);
conv->tailConvolver = convolver1_new(conv->tailBlockSize, ir + (2 * conv->tailBlockSize), tailIrLen); conv->tailConvolver = convolver1_new(conv->tailBlockSize, ir + (2 * conv->tailBlockSize), tailIrLen);
conv->tailOutput = calloc(conv->tailBlockSize, sizeof(float)); conv->tailOutput = fft_alloc(conv->tailBlockSize * sizeof(float));
conv->tailPrecalculated = calloc(conv->tailBlockSize, sizeof(float)); conv->tailPrecalculated = fft_alloc(conv->tailBlockSize * sizeof(float));
} }
if (conv->tailConvolver0 || conv->tailConvolver) if (conv->tailConvolver0 || conv->tailConvolver)
conv->tailInput = calloc(conv->tailBlockSize, sizeof(float)); conv->tailInput = fft_alloc(conv->tailBlockSize * sizeof(float));
conv->tailInputFill = 0; conv->tailInputFill = 0;
conv->precalculatedPos = 0; conv->precalculatedPos = 0;
@ -293,11 +362,11 @@ void convolver_free(struct convolver *conv)
convolver1_free(conv->tailConvolver0); convolver1_free(conv->tailConvolver0);
if (conv->tailConvolver) if (conv->tailConvolver)
convolver1_free(conv->tailConvolver); convolver1_free(conv->tailConvolver);
free(conv->tailOutput0); fft_free(conv->tailOutput0);
free(conv->tailPrecalculated0); fft_free(conv->tailPrecalculated0);
free(conv->tailOutput); fft_free(conv->tailOutput);
free(conv->tailPrecalculated); fft_free(conv->tailPrecalculated);
free(conv->tailInput); fft_free(conv->tailInput);
free(conv); free(conv);
} }

View file

@ -1,442 +0,0 @@
/*
* Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
* This file is part of KISS FFT - https://github.com/mborgerding/kissfft
*
* SPDX-License-Identifier: BSD-3-Clause
* See COPYING file for more information.
*/
#include "_kiss_fft_guts_f32.h"
/* The guts header contains all the multiplication and addition macros that are defined for
fixed or floating point complex numbers. It also delares the kf_ internal functions.
*/
static void
kf_bfly2 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, int m)
{
kiss_fft_f32_cpx *Fout2;
kiss_fft_f32_cpx *tw1 = st->twiddles;
kiss_fft_f32_cpx t;
Fout2 = Fout + m;
do {
C_FIXDIV (*Fout, 2);
C_FIXDIV (*Fout2, 2);
C_MUL (t, *Fout2, *tw1);
tw1 += fstride;
C_SUB (*Fout2, *Fout, t);
C_ADDTO (*Fout, t);
++Fout2;
++Fout;
} while (--m);
}
static void
kf_bfly4 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, const size_t m)
{
kiss_fft_f32_cpx *tw1, *tw2, *tw3;
kiss_fft_f32_cpx scratch[6];
size_t k = m;
const size_t m2 = 2 * m;
const size_t m3 = 3 * m;
tw3 = tw2 = tw1 = st->twiddles;
do {
C_FIXDIV (*Fout, 4);
C_FIXDIV (Fout[m], 4);
C_FIXDIV (Fout[m2], 4);
C_FIXDIV (Fout[m3], 4);
C_MUL (scratch[0], Fout[m], *tw1);
C_MUL (scratch[1], Fout[m2], *tw2);
C_MUL (scratch[2], Fout[m3], *tw3);
C_SUB (scratch[5], *Fout, scratch[1]);
C_ADDTO (*Fout, scratch[1]);
C_ADD (scratch[3], scratch[0], scratch[2]);
C_SUB (scratch[4], scratch[0], scratch[2]);
C_SUB (Fout[m2], *Fout, scratch[3]);
tw1 += fstride;
tw2 += fstride * 2;
tw3 += fstride * 3;
C_ADDTO (*Fout, scratch[3]);
if (st->inverse) {
Fout[m].r = scratch[5].r - scratch[4].i;
Fout[m].i = scratch[5].i + scratch[4].r;
Fout[m3].r = scratch[5].r + scratch[4].i;
Fout[m3].i = scratch[5].i - scratch[4].r;
} else {
Fout[m].r = scratch[5].r + scratch[4].i;
Fout[m].i = scratch[5].i - scratch[4].r;
Fout[m3].r = scratch[5].r - scratch[4].i;
Fout[m3].i = scratch[5].i + scratch[4].r;
}
++Fout;
} while (--k);
}
static void
kf_bfly3 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, size_t m)
{
size_t k = m;
const size_t m2 = 2 * m;
kiss_fft_f32_cpx *tw1, *tw2;
kiss_fft_f32_cpx scratch[5];
kiss_fft_f32_cpx epi3;
epi3 = st->twiddles[fstride * m];
tw1 = tw2 = st->twiddles;
do {
C_FIXDIV (*Fout, 3);
C_FIXDIV (Fout[m], 3);
C_FIXDIV (Fout[m2], 3);
C_MUL (scratch[1], Fout[m], *tw1);
C_MUL (scratch[2], Fout[m2], *tw2);
C_ADD (scratch[3], scratch[1], scratch[2]);
C_SUB (scratch[0], scratch[1], scratch[2]);
tw1 += fstride;
tw2 += fstride * 2;
Fout[m].r = Fout->r - HALF_OF (scratch[3].r);
Fout[m].i = Fout->i - HALF_OF (scratch[3].i);
C_MULBYSCALAR (scratch[0], epi3.i);
C_ADDTO (*Fout, scratch[3]);
Fout[m2].r = Fout[m].r + scratch[0].i;
Fout[m2].i = Fout[m].i - scratch[0].r;
Fout[m].r -= scratch[0].i;
Fout[m].i += scratch[0].r;
++Fout;
} while (--k);
}
static void
kf_bfly5 (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, int m)
{
kiss_fft_f32_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
int u;
kiss_fft_f32_cpx scratch[13];
kiss_fft_f32_cpx *twiddles = st->twiddles;
kiss_fft_f32_cpx *tw;
kiss_fft_f32_cpx ya, yb;
ya = twiddles[fstride * m];
yb = twiddles[fstride * 2 * m];
Fout0 = Fout;
Fout1 = Fout0 + m;
Fout2 = Fout0 + 2 * m;
Fout3 = Fout0 + 3 * m;
Fout4 = Fout0 + 4 * m;
tw = st->twiddles;
for (u = 0; u < m; ++u) {
C_FIXDIV (*Fout0, 5);
C_FIXDIV (*Fout1, 5);
C_FIXDIV (*Fout2, 5);
C_FIXDIV (*Fout3, 5);
C_FIXDIV (*Fout4, 5);
scratch[0] = *Fout0;
C_MUL (scratch[1], *Fout1, tw[u * fstride]);
C_MUL (scratch[2], *Fout2, tw[2 * u * fstride]);
C_MUL (scratch[3], *Fout3, tw[3 * u * fstride]);
C_MUL (scratch[4], *Fout4, tw[4 * u * fstride]);
C_ADD (scratch[7], scratch[1], scratch[4]);
C_SUB (scratch[10], scratch[1], scratch[4]);
C_ADD (scratch[8], scratch[2], scratch[3]);
C_SUB (scratch[9], scratch[2], scratch[3]);
Fout0->r += scratch[7].r + scratch[8].r;
Fout0->i += scratch[7].i + scratch[8].i;
scratch[5].r =
scratch[0].r + S_MUL (scratch[7].r, ya.r) + S_MUL (scratch[8].r, yb.r);
scratch[5].i =
scratch[0].i + S_MUL (scratch[7].i, ya.r) + S_MUL (scratch[8].i, yb.r);
scratch[6].r = S_MUL (scratch[10].i, ya.i) + S_MUL (scratch[9].i, yb.i);
scratch[6].i = -S_MUL (scratch[10].r, ya.i) - S_MUL (scratch[9].r, yb.i);
C_SUB (*Fout1, scratch[5], scratch[6]);
C_ADD (*Fout4, scratch[5], scratch[6]);
scratch[11].r =
scratch[0].r + S_MUL (scratch[7].r, yb.r) + S_MUL (scratch[8].r, ya.r);
scratch[11].i =
scratch[0].i + S_MUL (scratch[7].i, yb.r) + S_MUL (scratch[8].i, ya.r);
scratch[12].r = -S_MUL (scratch[10].i, yb.i) + S_MUL (scratch[9].i, ya.i);
scratch[12].i = S_MUL (scratch[10].r, yb.i) - S_MUL (scratch[9].r, ya.i);
C_ADD (*Fout2, scratch[11], scratch[12]);
C_SUB (*Fout3, scratch[11], scratch[12]);
++Fout0;
++Fout1;
++Fout2;
++Fout3;
++Fout4;
}
}
/* perform the butterfly for one stage of a mixed radix FFT */
static void
kf_bfly_generic (kiss_fft_f32_cpx * Fout,
const size_t fstride, const kiss_fft_f32_cfg st, int m, int p)
{
int u, k, q1, q;
kiss_fft_f32_cpx *twiddles = st->twiddles;
kiss_fft_f32_cpx t;
int Norig = st->nfft;
kiss_fft_f32_cpx *scratch =
(kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
p);
for (u = 0; u < m; ++u) {
k = u;
for (q1 = 0; q1 < p; ++q1) {
scratch[q1] = Fout[k];
C_FIXDIV (scratch[q1], p);
k += m;
}
k = u;
for (q1 = 0; q1 < p; ++q1) {
int twidx = 0;
Fout[k] = scratch[0];
for (q = 1; q < p; ++q) {
twidx += fstride * k;
if (twidx >= Norig)
twidx -= Norig;
C_MUL (t, scratch[q], twiddles[twidx]);
C_ADDTO (Fout[k], t);
}
k += m;
}
}
KISS_FFT_F32_TMP_FREE (scratch);
}
static void
kf_work (kiss_fft_f32_cpx * Fout,
const kiss_fft_f32_cpx * f,
const size_t fstride, int in_stride, int *factors,
const kiss_fft_f32_cfg st)
{
kiss_fft_f32_cpx *Fout_beg = Fout;
const int p = *factors++; /* the radix */
const int m = *factors++; /* stage's fft length/p */
const kiss_fft_f32_cpx *Fout_end = Fout + p * m;
#ifdef _OPENMP
// use openmp extensions at the
// top-level (not recursive)
if (fstride == 1 && p <= 5 && m != 1) {
int k;
// execute the p different work units in different threads
# pragma omp parallel for
for (k = 0; k < p; ++k)
kf_work (Fout + k * m, f + fstride * in_stride * k, fstride * p,
in_stride, factors, st);
// all threads have joined by this point
switch (p) {
case 2:
kf_bfly2 (Fout, fstride, st, m);
break;
case 3:
kf_bfly3 (Fout, fstride, st, m);
break;
case 4:
kf_bfly4 (Fout, fstride, st, m);
break;
case 5:
kf_bfly5 (Fout, fstride, st, m);
break;
default:
kf_bfly_generic (Fout, fstride, st, m, p);
break;
}
return;
}
#endif
if (m == 1) {
do {
*Fout = *f;
f += fstride * in_stride;
} while (++Fout != Fout_end);
} else {
do {
// recursive call:
// DFT of size m*p performed by doing
// p instances of smaller DFTs of size m,
// each one takes a decimated version of the input
kf_work (Fout, f, fstride * p, in_stride, factors, st);
f += fstride * in_stride;
} while ((Fout += m) != Fout_end);
}
Fout = Fout_beg;
// recombine the p smaller DFTs
switch (p) {
case 2:
kf_bfly2 (Fout, fstride, st, m);
break;
case 3:
kf_bfly3 (Fout, fstride, st, m);
break;
case 4:
kf_bfly4 (Fout, fstride, st, m);
break;
case 5:
kf_bfly5 (Fout, fstride, st, m);
break;
default:
kf_bfly_generic (Fout, fstride, st, m, p);
break;
}
}
/* facbuf is populated by p1,m1,p2,m2, ...
where
p[i] * m[i] = m[i-1]
m0 = n */
static void
kf_factor (int n, int *facbuf)
{
int p = 4;
double floor_sqrt;
floor_sqrt = floor (sqrt ((double) n));
/*factor out powers of 4, powers of 2, then any remaining primes */
do {
while (n % p) {
switch (p) {
case 4:
p = 2;
break;
case 2:
p = 3;
break;
default:
p += 2;
break;
}
if (p > floor_sqrt)
p = n; /* no more factors, skip to end */
}
n /= p;
*facbuf++ = p;
*facbuf++ = n;
} while (n > 1);
}
/*
*
* User-callable function to allocate all necessary storage space for the fft.
*
* The return value is a contiguous block of memory, allocated with malloc. As such,
* It can be freed with free(), rather than a kiss_fft_f32-specific function.
* */
kiss_fft_f32_cfg
kiss_fft_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
{
kiss_fft_f32_cfg st = NULL;
size_t memneeded = sizeof (struct kiss_fft_f32_state)
+ sizeof (kiss_fft_f32_cpx) * (nfft - 1); /* twiddle factors */
if (lenmem == NULL) {
st = (kiss_fft_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
} else {
if (mem != NULL && *lenmem >= memneeded)
st = (kiss_fft_f32_cfg) mem;
*lenmem = memneeded;
}
if (st) {
int i;
st->nfft = nfft;
st->inverse = inverse_fft;
for (i = 0; i < nfft; ++i) {
const double pi =
3.141592653589793238462643383279502884197169399375105820974944;
double phase = -2 * pi * i / nfft;
if (st->inverse)
phase *= -1;
kf_cexp (st->twiddles + i, phase);
}
kf_factor (nfft, st->factors);
}
return st;
}
void
kiss_fft_f32_stride (kiss_fft_f32_cfg st, const kiss_fft_f32_cpx * fin,
kiss_fft_f32_cpx * fout, int in_stride)
{
if (fin == fout) {
//NOTE: this is not really an in-place FFT algorithm.
//It just performs an out-of-place FFT into a temp buffer
kiss_fft_f32_cpx *tmpbuf =
(kiss_fft_f32_cpx *) KISS_FFT_F32_TMP_ALLOC (sizeof (kiss_fft_f32_cpx) *
st->nfft);
kf_work (tmpbuf, fin, 1, in_stride, st->factors, st);
memcpy (fout, tmpbuf, sizeof (kiss_fft_f32_cpx) * st->nfft);
KISS_FFT_F32_TMP_FREE (tmpbuf);
} else {
kf_work (fout, fin, 1, in_stride, st->factors, st);
}
}
void
kiss_fft_f32 (kiss_fft_f32_cfg cfg, const kiss_fft_f32_cpx * fin,
kiss_fft_f32_cpx * fout)
{
kiss_fft_f32_stride (cfg, fin, fout, 1);
}
void
kiss_fft_f32_cleanup (void)
{
// nothing needed any more
}
int
kiss_fft_f32_next_fast_size (int n)
{
while (1) {
int m = n;
while ((m % 2) == 0)
m /= 2;
while ((m % 3) == 0)
m /= 3;
while ((m % 5) == 0)
m /= 5;
if (m <= 1)
break; /* n is completely factorable by twos, threes, and fives */
n++;
}
return n;
}

View file

@ -1,112 +0,0 @@
/*
* Copyright (c) 2003-2010, Mark Borgerding. All rights reserved.
* This file is part of KISS FFT - https://github.com/mborgerding/kissfft
*
* SPDX-License-Identifier: BSD-3-Clause
* See COPYING file for more information.
*/
#ifndef KISS_FFT_F32_H
#define KISS_FFT_F32_H
#include <stdlib.h>
#include <stdio.h>
#include <math.h>
#include <string.h>
#include <stdint.h>
#ifdef __cplusplus
extern "C" {
#endif
/*
ATTENTION!
If you would like a :
-- a utility that will handle the caching of fft objects
-- real-only (no imaginary time component ) FFT
-- a multi-dimensional FFT
-- a command-line utility to perform ffts
-- a command-line utility to perform fast-convolution filtering
Then see kfc.h kiss_fftr_f32.h kiss_fft_f32nd.h fftutil.c kiss_fastfir.c
in the tools/ directory.
*/
#define KISS_FFT_F32_MALLOC malloc
#define KISS_FFT_F32_FREE free
#define kiss_fft_f32_scalar float
typedef struct {
kiss_fft_f32_scalar r;
kiss_fft_f32_scalar i;
}kiss_fft_f32_cpx;
typedef struct kiss_fft_f32_state* kiss_fft_f32_cfg;
/*
* kiss_fft_f32_alloc
*
* Initialize a FFT (or IFFT) algorithm's cfg/state buffer.
*
* typical usage: kiss_fft_f32_cfg mycfg=kiss_fft_f32_alloc(1024,0,NULL,NULL);
*
* The return value from fft_alloc is a cfg buffer used internally
* by the fft routine or NULL.
*
* If lenmem is NULL, then kiss_fft_f32_alloc will allocate a cfg buffer using malloc.
* The returned value should be free()d when done to avoid memory leaks.
*
* The state can be placed in a user supplied buffer 'mem':
* If lenmem is not NULL and mem is not NULL and *lenmem is large enough,
* then the function places the cfg in mem and the size used in *lenmem
* and returns mem.
*
* If lenmem is not NULL and ( mem is NULL or *lenmem is not large enough),
* then the function returns NULL and places the minimum cfg
* buffer size in *lenmem.
* */
kiss_fft_f32_cfg kiss_fft_f32_alloc(int nfft,int inverse_fft,void * mem,size_t * lenmem);
/*
* kiss_fft_f32(cfg,in_out_buf)
*
* Perform an FFT on a complex input buffer.
* for a forward FFT,
* fin should be f[0] , f[1] , ... ,f[nfft-1]
* fout will be F[0] , F[1] , ... ,F[nfft-1]
* Note that each element is complex and can be accessed like
f[k].r and f[k].i
* */
void kiss_fft_f32(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout);
/*
A more generic version of the above function. It reads its input from every Nth sample.
* */
void kiss_fft_f32_stride(kiss_fft_f32_cfg cfg,const kiss_fft_f32_cpx *fin,kiss_fft_f32_cpx *fout,int fin_stride);
/* If kiss_fft_f32_alloc allocated a buffer, it is one contiguous
buffer and can be simply free()d when no longer needed*/
#define kiss_fft_f32_free KISS_FFT_F32_FREE
/*
Cleans up some memory that gets managed internally. Not necessary to call, but it might clean up
your compiler output to call this before you exit.
*/
void kiss_fft_f32_cleanup(void);
/*
* Returns the smallest integer k, such that k>=n and k has only "fast" factors (2,3,5)
*/
int kiss_fft_f32_next_fast_size(int n);
/* for real ffts, we need an even size */
#define kiss_fftr_f32_next_fast_size_real(n) \
(kiss_fft_f32_next_fast_size( ((n)+1)>>1)<<1)
#ifdef __cplusplus
}
#endif
#endif

View file

@ -1,148 +0,0 @@
/*
* Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
* This file is part of KISS FFT - https://github.com/mborgerding/kissfft
*
* SPDX-License-Identifier: BSD-3-Clause
* See COPYING file for more information.
*/
#include "kiss_fftr_f32.h"
#include "_kiss_fft_guts_f32.h"
struct kiss_fftr_f32_state
{
kiss_fft_f32_cfg substate;
kiss_fft_f32_cpx *tmpbuf;
kiss_fft_f32_cpx *super_twiddles;
#ifdef USE_SIMD
void *pad;
#endif
};
kiss_fftr_f32_cfg
kiss_fftr_f32_alloc (int nfft, int inverse_fft, void *mem, size_t * lenmem)
{
int i;
kiss_fftr_f32_cfg st = NULL;
size_t subsize = 0, memneeded;
nfft >>= 1;
kiss_fft_f32_alloc (nfft, inverse_fft, NULL, &subsize);
memneeded =
ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state)) +
ALIGN_STRUCT (subsize) + sizeof (kiss_fft_f32_cpx) * (nfft * 3 / 2);
if (lenmem == NULL) {
st = (kiss_fftr_f32_cfg) KISS_FFT_F32_MALLOC (memneeded);
} else {
if (*lenmem >= memneeded)
st = (kiss_fftr_f32_cfg) mem;
*lenmem = memneeded;
}
if (!st)
return NULL;
st->substate = (kiss_fft_f32_cfg) (((char *) st) + ALIGN_STRUCT (sizeof (struct kiss_fftr_f32_state))); /*just beyond kiss_fftr_f32_state struct */
st->tmpbuf =
(kiss_fft_f32_cpx *) (((char *) st->substate) + ALIGN_STRUCT (subsize));
st->super_twiddles = st->tmpbuf + nfft;
kiss_fft_f32_alloc (nfft, inverse_fft, st->substate, &subsize);
for (i = 0; i < nfft / 2; ++i) {
double phase =
-3.14159265358979323846264338327 * ((double) (i + 1) / nfft + .5);
if (inverse_fft)
phase *= -1;
kf_cexp (st->super_twiddles + i, phase);
}
return st;
}
void
kiss_fftr_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_scalar * timedata,
kiss_fft_f32_cpx * freqdata)
{
/* input buffer timedata is stored row-wise */
int k, ncfft;
kiss_fft_f32_cpx fpnk, fpk, f1k, f2k, tw, tdc;
ncfft = st->substate->nfft;
/*perform the parallel fft of two real signals packed in real,imag */
kiss_fft_f32 (st->substate, (const kiss_fft_f32_cpx *) timedata, st->tmpbuf);
/* The real part of the DC element of the frequency spectrum in st->tmpbuf
* contains the sum of the even-numbered elements of the input time sequence
* The imag part is the sum of the odd-numbered elements
*
* The sum of tdc.r and tdc.i is the sum of the input time sequence.
* yielding DC of input time sequence
* The difference of tdc.r - tdc.i is the sum of the input (dot product) [1,-1,1,-1...
* yielding Nyquist bin of input time sequence
*/
tdc.r = st->tmpbuf[0].r;
tdc.i = st->tmpbuf[0].i;
C_FIXDIV (tdc, 2);
CHECK_OVERFLOW_OP (tdc.r, +, tdc.i);
CHECK_OVERFLOW_OP (tdc.r, -, tdc.i);
freqdata[0].r = tdc.r + tdc.i;
freqdata[ncfft].r = tdc.r - tdc.i;
#ifdef USE_SIMD
freqdata[ncfft].i = freqdata[0].i = _mm_set1_ps (0);
#else
freqdata[ncfft].i = freqdata[0].i = 0;
#endif
for (k = 1; k <= ncfft / 2; ++k) {
fpk = st->tmpbuf[k];
fpnk.r = st->tmpbuf[ncfft - k].r;
fpnk.i = -st->tmpbuf[ncfft - k].i;
C_FIXDIV (fpk, 2);
C_FIXDIV (fpnk, 2);
C_ADD (f1k, fpk, fpnk);
C_SUB (f2k, fpk, fpnk);
C_MUL (tw, f2k, st->super_twiddles[k - 1]);
freqdata[k].r = HALF_OF (f1k.r + tw.r);
freqdata[k].i = HALF_OF (f1k.i + tw.i);
freqdata[ncfft - k].r = HALF_OF (f1k.r - tw.r);
freqdata[ncfft - k].i = HALF_OF (tw.i - f1k.i);
}
}
void
kiss_fftri_f32 (kiss_fftr_f32_cfg st, const kiss_fft_f32_cpx * freqdata,
kiss_fft_f32_scalar * timedata)
{
/* input buffer timedata is stored row-wise */
int k, ncfft;
ncfft = st->substate->nfft;
st->tmpbuf[0].r = freqdata[0].r + freqdata[ncfft].r;
st->tmpbuf[0].i = freqdata[0].r - freqdata[ncfft].r;
C_FIXDIV (st->tmpbuf[0], 2);
for (k = 1; k <= ncfft / 2; ++k) {
kiss_fft_f32_cpx fk, fnkc, fek, fok, tmp;
fk = freqdata[k];
fnkc.r = freqdata[ncfft - k].r;
fnkc.i = -freqdata[ncfft - k].i;
C_FIXDIV (fk, 2);
C_FIXDIV (fnkc, 2);
C_ADD (fek, fk, fnkc);
C_SUB (tmp, fk, fnkc);
C_MUL (fok, tmp, st->super_twiddles[k - 1]);
C_ADD (st->tmpbuf[k], fek, fok);
C_SUB (st->tmpbuf[ncfft - k], fek, fok);
#ifdef USE_SIMD
st->tmpbuf[ncfft - k].i *= _mm_set1_ps (-1.0);
#else
st->tmpbuf[ncfft - k].i *= -1;
#endif
}
kiss_fft_f32 (st->substate, st->tmpbuf, (kiss_fft_f32_cpx *) timedata);
}

View file

@ -1,54 +0,0 @@
/*
* Copyright (c) 2003-2004, Mark Borgerding. All rights reserved.
* This file is part of KISS FFT - https://github.com/mborgerding/kissfft
*
* SPDX-License-Identifier: BSD-3-Clause
* See COPYING file for more information.
*/
#ifndef KISS_FTR_H
#define KISS_FTR_H
#include "kiss_fft_f32.h"
#ifdef __cplusplus
extern "C" {
#endif
/*
Real optimized version can save about 45% cpu time vs. complex fft of a real seq.
*/
typedef struct kiss_fftr_f32_state *kiss_fftr_f32_cfg;
kiss_fftr_f32_cfg kiss_fftr_f32_alloc(int nfft,int inverse_fft,void * mem, size_t * lenmem);
/*
nfft must be even
If you don't care to allocate space, use mem = lenmem = NULL
*/
void kiss_fftr_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_scalar *timedata,kiss_fft_f32_cpx *freqdata);
/*
input timedata has nfft scalar points
output freqdata has nfft/2+1 complex points
*/
void kiss_fftri_f32(kiss_fftr_f32_cfg cfg,const kiss_fft_f32_cpx *freqdata,kiss_fft_f32_scalar *timedata);
/*
input freqdata has nfft/2+1 complex points
output timedata has nfft scalar points
*/
#define kiss_fftr_f32_free KISS_FFT_F32_FREE
#ifdef __cplusplus
}
#endif
#endif

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,177 @@
/* Copyright (c) 2013 Julien Pommier ( pommier@modartt.com )
Based on original fortran 77 code from FFTPACKv4 from NETLIB,
authored by Dr Paul Swarztrauber of NCAR, in 1985.
As confirmed by the NCAR fftpack software curators, the following
FFTPACKv5 license applies to FFTPACKv4 sources. My changes are
released under the same terms.
FFTPACK license:
http://www.cisl.ucar.edu/css/software/fftpack5/ftpk.html
Copyright (c) 2004 the University Corporation for Atmospheric
Research ("UCAR"). All rights reserved. Developed by NCAR's
Computational and Information Systems Laboratory, UCAR,
www.cisl.ucar.edu.
Redistribution and use of the Software in source and binary forms,
with or without modification, is permitted provided that the
following conditions are met:
- Neither the names of NCAR's Computational and Information Systems
Laboratory, the University Corporation for Atmospheric Research,
nor the names of its sponsors or contributors may be used to
endorse or promote products derived from this Software without
specific prior written permission.
- Redistributions of source code must retain the above copyright
notices, this list of conditions, and the disclaimer below.
- Redistributions in binary form must reproduce the above copyright
notice, this list of conditions, and the disclaimer below in the
documentation and/or other materials provided with the
distribution.
THIS SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
EXPRESS OR IMPLIED, INCLUDING, BUT NOT LIMITED TO THE WARRANTIES OF
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
NONINFRINGEMENT. IN NO EVENT SHALL THE CONTRIBUTORS OR COPYRIGHT
HOLDERS BE LIABLE FOR ANY CLAIM, INDIRECT, INCIDENTAL, SPECIAL,
EXEMPLARY, OR CONSEQUENTIAL DAMAGES OR OTHER LIABILITY, WHETHER IN AN
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS WITH THE
SOFTWARE.
*/
/*
PFFFT : a Pretty Fast FFT.
This is basically an adaptation of the single precision fftpack
(v4) as found on netlib taking advantage of SIMD instruction found
on cpus such as intel x86 (SSE1), powerpc (Altivec), and arm (NEON).
For architectures where no SIMD instruction is available, the code
falls back to a scalar version.
Restrictions:
- 1D transforms only, with 32-bit single precision.
- supports only transforms for inputs of length N of the form
N=(2^a)*(3^b)*(5^c), a >= 5, b >=0, c >= 0 (32, 48, 64, 96, 128,
144, 160, etc are all acceptable lengths). Performance is best for
128<=N<=8192.
- all (float*) pointers in the functions below are expected to
have an "simd-compatible" alignment, that is 16 bytes on x86 and
powerpc CPUs.
You can allocate such buffers with the functions
pffft_aligned_malloc / pffft_aligned_free (or with stuff like
posix_memalign..)
*/
#ifndef PFFFT_H
#define PFFFT_H
#include <stddef.h> // for size_t
#ifdef __cplusplus
extern "C" {
#endif
/* opaque struct holding internal stuff (precomputed twiddle factors)
this struct can be shared by many threads as it contains only
read-only data.
*/
typedef struct PFFFT_Setup PFFFT_Setup;
/* direction of the transform */
typedef enum { PFFFT_FORWARD, PFFFT_BACKWARD } pffft_direction_t;
/* type of transform */
typedef enum { PFFFT_REAL, PFFFT_COMPLEX } pffft_transform_t;
/*
prepare for performing transforms of size N -- the returned
PFFFT_Setup structure is read-only so it can safely be shared by
multiple concurrent threads.
*/
PFFFT_Setup *pffft_new_setup(int N, pffft_transform_t transform);
void pffft_destroy_setup(PFFFT_Setup *);
/*
Perform a Fourier transform , The z-domain data is stored in the
most efficient order for transforming it back, or using it for
convolution. If you need to have its content sorted in the
"usual" way, that is as an array of interleaved complex numbers,
either use pffft_transform_ordered , or call pffft_zreorder after
the forward fft, and before the backward fft.
Transforms are not scaled: PFFFT_BACKWARD(PFFFT_FORWARD(x)) = N*x.
Typically you will want to scale the backward transform by 1/N.
The 'work' pointer should point to an area of N (2*N for complex
fft) floats, properly aligned. If 'work' is NULL, then stack will
be used instead (this is probably the best strategy for small
FFTs, say for N < 16384).
input and output may alias.
*/
void pffft_transform(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
Similar to pffft_transform, but makes sure that the output is
ordered as expected (interleaved complex numbers). This is
similar to calling pffft_transform and then pffft_zreorder.
input and output may alias.
*/
void pffft_transform_ordered(PFFFT_Setup *setup, const float *input, float *output, float *work, pffft_direction_t direction);
/*
call pffft_zreorder(.., PFFFT_FORWARD) after pffft_transform(...,
PFFFT_FORWARD) if you want to have the frequency components in
the correct "canonical" order, as interleaved complex numbers.
(for real transforms, both 0-frequency and half frequency
components, which are real, are assembled in the first entry as
F(0)+i*F(n/2+1). Note that the original fftpack did place
F(n/2+1) at the end of the arrays).
input and output should not alias.
*/
void pffft_zreorder(PFFFT_Setup *setup, const float *input, float *output, pffft_direction_t direction);
/*
Perform a multiplication of the frequency components of dft_a and
dft_b and accumulate them into dft_ab. The arrays should have
been obtained with pffft_transform(.., PFFFT_FORWARD) and should
*not* have been reordered with pffft_zreorder (otherwise just
perform the operation yourself as the dft coefs are stored as
interleaved complex numbers).
the operation performed is: dft_ab += (dft_a * fdt_b)*scaling
The dft_a, dft_b and dft_ab pointers may alias.
*/
void pffft_zconvolve_accumulate(PFFFT_Setup *setup, const float *dft_a, const float *dft_b, float *dft_ab, float scaling);
/*
the float buffers must have the correct alignment (16-byte boundary
on intel and powerpc). This function may be used to obtain such
correctly aligned buffers.
*/
void *pffft_aligned_malloc(size_t nb_bytes);
void pffft_aligned_free(void *);
/* return 4 or 1 wether support SSE/Altivec instructions was enable when building pffft.c */
int pffft_simd_size();
#ifdef __cplusplus
}
#endif
#endif // PFFFT_H