[PATCH i-g-t 2/3] lib/vendor: Add the meow_fft library
Peter Senna Tschudin
peter.senna at intel.com
Tue Oct 8 06:06:16 UTC 2024
On 13.09.2024 12:44, Kamil Konieczny wrote:
> From: Ryszard Knop <ryszard.knop at intel.com>
>
> The meow_fft library provides a simple and permissively-licensed Fast
> Fourier Transform implementation. It is an alternative for GSL's FFT.
>
> The library was modified to conform to the IGT's compiler warning flags
> and to use doubles instead of floats by default for higher accuracy.
>
> v2: Replace licence with equivalent SPDX 0BSD (Kamil)
>
Reviewed-by: Peter Senna Tschudin <peter.senna at linux.intel.com>
> Signed-off-by: Ryszard Knop <ryszard.knop at intel.com>
> Signed-off-by: Kamil Konieczny <kamil.konieczny at linux.intel.com>
> ---
> lib/meson.build | 1 +
> lib/vendor/meow_fft/meow_fft.c | 12 +
> lib/vendor/meow_fft/meow_fft.h | 2402 ++++++++++++++++++++++++++++++++
> 3 files changed, 2415 insertions(+)
> create mode 100644 lib/vendor/meow_fft/meow_fft.c
> create mode 100644 lib/vendor/meow_fft/meow_fft.h
>
> diff --git a/lib/meson.build b/lib/meson.build
> index b10410c8a..dff8b4728 100644
> --- a/lib/meson.build
> +++ b/lib/meson.build
> @@ -119,6 +119,7 @@ lib_sources = [
>
> # Vendored libraries:
> 'vendor/uwildmat/uwildmat.c',
> + 'vendor/meow_fft/meow_fft.c',
> ]
>
> lib_deps = [
> diff --git a/lib/vendor/meow_fft/meow_fft.c b/lib/vendor/meow_fft/meow_fft.c
> new file mode 100644
> index 000000000..a1ec504cd
> --- /dev/null
> +++ b/lib/vendor/meow_fft/meow_fft.c
> @@ -0,0 +1,12 @@
> +// SPDX-License-Identifier: 0BSD
> +
> +#define MEOW_FFT_IMPLEMENTATION
> +#define MEOW_FLOAT_TYPE double
> +#include "meow_fft.h"
> +
> +/*
> + * This is a dummy file to build the object file containing
> + * the meow_fft implementation. All code should go into the
> + * header itself. If you want to use it in any other file,
> + * just #include the header without defining MEOW_FFT_IMPL.
> + */
> diff --git a/lib/vendor/meow_fft/meow_fft.h b/lib/vendor/meow_fft/meow_fft.h
> new file mode 100644
> index 000000000..0269211ea
> --- /dev/null
> +++ b/lib/vendor/meow_fft/meow_fft.h
> @@ -0,0 +1,2402 @@
> +/* SPDX-License-Identifier: 0BSD */
> +/*
> + Imported from https://github.com/JodiTheTigger/meow_fft + Pull Request #11
> +
> + Changes for igt-gpu-tools:
> + - Default MEOW_FLOAT_TYPE is now double
> + - Added extra function prototypes to satisfy -Wmissing-prototypes
> + - Hoisted variable declarations to satisfy -Wdeclaration-after-statement
> +*/
> +/*
> + meow_fft. My Easy Oresome Wonderful Fast Fourier Transform.
> + Copyright (C) 2017 Richard Maxwell <jodi.the.tigger at gmail.com>
> +*/
> +
> +#ifndef MEOW_FFT
> +#define MEOW_FFT
> +
> +#include <stdlib.h>
> +// for size_t, abort
> +
> +#ifdef __cplusplus
> +extern "C" {
> +#endif
> +
> +// C-API -----------------------------------------------------------------------
> +
> +// Can be float, double or long double
> +#ifndef MEOW_FLOAT_TYPE
> +#define MEOW_FLOAT_TYPE double
> +#endif
> +
> +typedef MEOW_FLOAT_TYPE Meow_Float;
> +
> +typedef struct Meow_FFT_Complex
> +{
> + Meow_Float r;
> + Meow_Float j;
> +}
> +Meow_FFT_Complex;
> +
> +struct Meow_FFT_Workset;
> +struct Meow_FFT_Workset_Real;
> +
> +size_t meow_fft_generate_workset
> +(
> + int N
> + , struct Meow_FFT_Workset* workset
> +);
> +// returns the size of the workset if null is passed. 0 if N is invalid.
> +
> +size_t meow_fft_generate_workset_real
> +(
> + int N
> + , struct Meow_FFT_Workset_Real* workset
> +);
> +// returns the size of the workset if null is passed. 0 if N is invalid.
> +
> +unsigned meow_fft_is_slow (const struct Meow_FFT_Workset* workset);
> +unsigned meow_fft_is_slow_real(const struct Meow_FFT_Workset_Real* workset);
> +// returns non-zero if the fft has a slow dft in any one of its stages.
> +
> +// C-API (ffts) ----------------------------------------------------------------
> +
> +// NOTES:
> +// countof(out) == countof(in).
> +// In order to do that I have mixed out[0] with out[N/2]. That is:
> +// out[0].r == out[0].r, out[0].j = out[N/2].r
> +
> +
> +void meow_fft_real
> +(
> + const struct Meow_FFT_Workset_Real* workset
> + , const Meow_Float* in
> + , Meow_FFT_Complex* out
> +);
> +
> +void meow_fft_real_i
> +(
> + const struct Meow_FFT_Workset_Real* workset
> + , const Meow_FFT_Complex* in
> + , Meow_FFT_Complex* temp
> + , Meow_Float* out
> +);
> +
> +void meow_fft
> +(
> + const struct Meow_FFT_Workset* data
> + , const Meow_FFT_Complex* in
> + , Meow_FFT_Complex* out
> +);
> +
> +void meow_fft_i
> +(
> + const struct Meow_FFT_Workset* data
> + , const Meow_FFT_Complex* in
> + , Meow_FFT_Complex* out
> +);
> +
> +// -----------------------------------------------------------------------------
> +
> +#ifdef __cplusplus
> +}
> +#endif
> +
> +#endif // MEOW_FFT
> +
> +#ifdef MEOW_FFT_IMPLEMENTATION
> +
> +// Reading List ----------------------------------------------------------------
> +//
> +// It's a circle! -> How FFTs _actually_ work
> +// http://betterexplained.com/articles/an-interactive-guide-to-the-fourier-transform/
> +//
> +// How to get radix-2, 3, 4, and 5 formulas:
> +// http://www.briangough.com/fftalgorithms.pdf pages 18 and 19
> +//
> +// How do make a faster fft when only dealing with real (non-complex) inputs.
> +// (Warning, the maths is confusing due to inconsisten formulas and assumptions)
> +// http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
> +//
> +// Finally, know that ffts are pretty much as fast as you can get, you need
> +// to start making them cache friendly to get any extra speed.
> +// https://math.mit.edu/~stevenj/18.335/FFTW-Alan-2008.pdf
> +//
> +// -----------------------------------------------------------------------------
> +
> +#include <math.h>
> +#include <stdint.h>
> +
> +typedef const Meow_FFT_Complex Complex;
> +
> +#define MEOW_TAU 6.283185307179586476925286766559005768394338798750211641949889
> +
> +// Plumbing --------------------------------------------------------------------
> +
> +typedef struct Meow_Fft_Stages
> +{
> + unsigned count;
> + unsigned* radix;
> + unsigned* remainder;
> + unsigned* offsets;
> +}
> +Meow_Fft_Stages;
> +
> +typedef struct Meow_FFT_Workset
> +{
> + int N;
> +
> + Meow_FFT_Complex* wn;
> + // Non-null only defined if there is a slow-dft as one of the radix stages.
> +
> + Meow_FFT_Complex* wn_ordered;
> + // Sequentially ordered per stage, will be duplicates between stages.
> +
> + Meow_Fft_Stages stages;
> +}
> +Meow_FFT_Workset;
> +
> +typedef struct Meow_FFT_Workset_Real
> +{
> + Meow_FFT_Complex* w_2n;
> + Meow_FFT_Workset half;
> +}
> +Meow_FFT_Workset_Real;
> +
> +typedef struct Meow_Stage_Info
> +{
> + unsigned is_slow;
> + unsigned stage_count;
> + unsigned w_count;
> +}
> +Meow_Stage_Info;
> +
> +// --------------------- IGT prototype changes start here ---------------------
> +
> +unsigned meow_is_codelet(unsigned radix);
> +void meow_make_twiddles(unsigned n, unsigned count, Meow_FFT_Complex* w);
> +unsigned meow_make_twiddles_sequential(unsigned n, Meow_FFT_Complex* w, Meow_Fft_Stages* stages);
> +Meow_Stage_Info meow_calculate_stages(unsigned n, Meow_FFT_Workset* workset);
> +
> +void meow_dft_n_dit(const Meow_FFT_Complex* w_n,
> + Meow_FFT_Complex* out,
> + unsigned count,
> + unsigned w_multiplier,
> + unsigned radix,
> + unsigned N,
> + unsigned reverse);
> +
> +void meow_radix_2_dit(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +void meow_radix_3_dit(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +void meow_radix_4_dit(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +void meow_radix_5_dit(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +
> +void meow_radix_2_dit_i(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +void meow_radix_3_dit_i(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +void meow_radix_4_dit_i(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +void meow_radix_5_dit_i(const Meow_FFT_Complex* w_n, Meow_FFT_Complex* out, unsigned count);
> +
> +void meow_recursive_fft_mixed_meow_radix_dit(const Meow_FFT_Workset* fft,
> + unsigned stage,
> + Complex* in,
> + Meow_FFT_Complex* out,
> + unsigned w_mul);
> +
> +void meow_recursive_fft_mixed_meow_radix_dit_i(const Meow_FFT_Workset* fft,
> + unsigned stage,
> + Complex* in,
> + Meow_FFT_Complex* out,
> + unsigned w_mul);
> +
> +// ---------------------- IGT prototype changes end here ----------------------
> +
> +unsigned meow_is_codelet(unsigned radix)
> +{
> + return ((radix <= 5) || (radix == 8));
> +}
> +
> +void meow_make_twiddles
> +(
> + unsigned n
> + , unsigned count
> + , Meow_FFT_Complex* w
> +)
> +{
> + const double ni = 1.0f / n;
> + for (unsigned i = 0; i < count; ++i)
> + {
> + w[i].r = (Meow_Float) cos(MEOW_TAU * i * ni);
> + w[i].j = (Meow_Float) sin(MEOW_TAU * i * ni);
> + }
> +}
> +
> +unsigned meow_make_twiddles_sequential
> +(
> + unsigned n
> + , Meow_FFT_Complex* w
> + , Meow_Fft_Stages* stages
> +)
> +// Returns number of W constants needed.
> +{
> + // Figure out the tiddle offsets.
> + unsigned w_count = 0;
> + {
> + unsigned offset = 0;
> +
> + for (unsigned s = 0; s < stages->count; s++)
> + {
> + unsigned r = stages->radix[s];
> + unsigned count = stages->remainder[s];
> +
> + unsigned amount = meow_is_codelet(r) ? (r - 1) * (count - 1) : 0u;
> +
> + stages->offsets[s] = amount;
> + offset += amount;
> + }
> +
> + w_count = offset;
> +
> + for (unsigned s = 0; s < stages->count; s++)
> + {
> + unsigned count = stages->offsets[s];
> + offset -= count;
> + stages->offsets[s] = offset;
> + }
> + }
> +
> + // Fill in the twiddles so that they are accessed sequentially in the radix
> + // code for best cacheline use.
> + if (w)
> + {
> + unsigned w_mul = 1;
> + double ni = 1.0 / n;
> +
> + for (unsigned s = 0; s < stages->count; s++)
> + {
> + const unsigned radix = stages->radix[s];
> + const unsigned count = stages->remainder[s];
> + unsigned offset = stages->offsets[s];
> +
> + if (meow_is_codelet(radix))
> + {
> + for (unsigned i = 1 ; i < count; i++)
> + {
> + for (unsigned j = 1; j < radix; j++)
> + {
> + const unsigned w_x = i * j * w_mul;
> +
> + w[offset].r = (Meow_Float) cos(MEOW_TAU * w_x * ni);
> + w[offset].j = (Meow_Float) sin(MEOW_TAU * w_x * ni);
> +
> + offset++;
> + }
> + }
> + }
> +
> + w_mul *= radix;
> + }
> + }
> +
> + return w_count;
> +}
> +
> +Meow_Stage_Info meow_calculate_stages(unsigned n, Meow_FFT_Workset* workset)
> +{
> + unsigned is_slow = 0u;
> + unsigned stage = 0u;
> + unsigned w_count = 0u;
> +
> + while (n > 1)
> + {
> + // premade codelets 2, 3, 4, 5, 8
> + unsigned i = 8;
> + for (; i > 1; i--)
> + {
> + if ((i == 7) || (i == 6))
> + {
> + // don't have radix-7 or radix-6.
> + continue;
> + }
> +
> + if (!(n % i))
> + {
> + w_count += ((i - 1) * (n - 1));
> + break;
> + }
> + }
> +
> + // bah, plain slow dft instead
> + if (i == 1)
> + {
> + is_slow = 1;
> + i = 7;
> + for (; i <= n; i++)
> + {
> + if (!(n % i))
> + {
> + break;
> + }
> + }
> + }
> +
> + n /= i;
> +
> + if (workset)
> + {
> + workset->stages.radix[stage] = i;
> + workset->stages.remainder[stage] = n;
> + }
> +
> + stage++;
> + }
> +
> + {
> + Meow_Stage_Info result =
> + {
> + is_slow
> + , stage
> + , w_count
> + };
> +
> + return result;
> + }
> +}
> +
> +size_t meow_fft_generate_workset(int N, Meow_FFT_Workset* workset)
> +{
> + size_t size_workset, size_radix, size_remainder, size_offsets;
> + size_t size_twiddles, size_twiddles_ordered, size_total;
> + Meow_Stage_Info info;
> +
> + if (N < 2)
> + {
> + // Too small.
> + return 0;
> + }
> +
> + info = meow_calculate_stages(N, NULL);
> +
> + size_workset = sizeof(Meow_FFT_Workset);
> + size_radix = info.stage_count * sizeof(int);
> + size_remainder = info.stage_count * sizeof(int);
> + size_offsets = info.stage_count * sizeof(int);
> +
> + size_twiddles_ordered =
> + info.w_count * sizeof(Meow_FFT_Complex);
> +
> + size_twiddles =
> + N * sizeof(Meow_FFT_Complex) * info.is_slow;
> +
> + size_total =
> + size_workset
> + + size_twiddles
> + + size_radix
> + + size_remainder
> + + size_offsets
> + + size_twiddles_ordered;
> +
> + if (workset)
> + {
> + uint8_t* data = (uint8_t*)(workset);
> +
> + uint8_t* data_wn = data + size_workset;
> + uint8_t* data_radix = data_wn + size_twiddles;
> + uint8_t* data_remainder = data_radix + size_radix;
> + uint8_t* data_offsets = data_remainder + size_offsets;
> + uint8_t* data_wn_ordered = data_offsets + size_remainder;
> +
> + workset->wn =
> + (info.is_slow)
> + ? (Meow_FFT_Complex*)(data_wn)
> + : NULL;
> +
> + workset->stages.radix = (unsigned*)(data_radix);
> + workset->stages.remainder = (unsigned*)(data_remainder);
> + workset->stages.offsets = (unsigned*)(data_offsets);
> + workset->stages.count = info.stage_count;
> + workset->wn_ordered = (Meow_FFT_Complex*)(data_wn_ordered);
> + workset->N = N;
> +
> + if (workset->wn)
> + {
> + meow_make_twiddles(N, N, workset->wn);
> + }
> +
> + meow_calculate_stages(N, workset);
> +
> + meow_make_twiddles_sequential
> + (
> + N
> + , workset->wn_ordered
> + , &workset->stages
> + );
> + }
> +
> + return size_total;
> +}
> +
> +size_t meow_fft_generate_workset_real
> +(
> + const int N
> + , Meow_FFT_Workset_Real* workset
> +)
> +{
> + unsigned N_2, N_4;
> + size_t size_workset, size_w_2n, size_half;
> +
> + if ((N < 4) || (N % 2))
> + {
> + // Too small or not divisible by two.
> + return 0;
> + }
> +
> + N_2 = N / 2;
> + N_4 = N / 4;
> +
> + size_workset = sizeof(Meow_FFT_Workset_Real);
> + size_w_2n = (N_4 + 1) * sizeof(Meow_FFT_Complex);
> + size_half = meow_fft_generate_workset(N_2, NULL);
> +
> + if (workset)
> + {
> + uint8_t *data, *data_w_2n;
> +
> + meow_fft_generate_workset(N_2, &workset->half);
> +
> + data = (uint8_t*)(&workset->half);
> + data_w_2n = data + size_half;
> +
> + workset->w_2n = (Meow_FFT_Complex*)(data_w_2n);
> +
> + meow_make_twiddles(N, N_4 + 1, workset->w_2n);
> + }
> +
> + return size_workset + size_w_2n + size_half;
> +}
> +
> +// -----------------------------------------------------------------------------
> +
> +unsigned meow_fft_is_slow(const Meow_FFT_Workset* workset)
> +{
> + return !!(workset->wn);
> +}
> +
> +unsigned meow_fft_is_slow_real(const Meow_FFT_Workset_Real* workset)
> +{
> + return meow_fft_is_slow(&workset->half);
> +}
> +
> +// -----------------------------------------------------------------------------
> +
> +inline Meow_FFT_Complex meow_add
> +(
> + const Meow_FFT_Complex lhs
> + , const Meow_FFT_Complex rhs
> +)
> +{
> + Meow_FFT_Complex result =
> + {
> + lhs.r + rhs.r
> + , lhs.j + rhs.j
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_sub
> +(
> + const Meow_FFT_Complex lhs
> + , const Meow_FFT_Complex rhs
> +)
> +{
> + Meow_FFT_Complex result =
> + {
> + lhs.r - rhs.r
> + , lhs.j - rhs.j
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_negate(const Meow_FFT_Complex lhs)
> +{
> + Meow_FFT_Complex result =
> + {
> + -lhs.r
> + , -lhs.j
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_conjugate(const Meow_FFT_Complex lhs)
> +{
> + Meow_FFT_Complex result =
> + {
> + lhs.r
> + , -lhs.j
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_mul
> +(
> + const Meow_FFT_Complex lhs
> + , const Meow_FFT_Complex rhs
> +)
> +{
> + Meow_FFT_Complex result =
> + {
> + (lhs.r * rhs.r) - (lhs.j * rhs.j)
> + , (lhs.r * rhs.j) + (lhs.j * rhs.r)
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_mul_by_conjugate
> +(
> + const Meow_FFT_Complex lhs
> + , const Meow_FFT_Complex rhs
> +)
> +{
> + Meow_FFT_Complex result =
> + {
> + (lhs.r * rhs.r) + (lhs.j * rhs.j)
> + , (lhs.j * rhs.r) - (lhs.r * rhs.j)
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_mul_by_j(const Meow_FFT_Complex lhs)
> +{
> + Meow_FFT_Complex result =
> + {
> + -lhs.j
> + , lhs.r
> + };
> +
> + return result;
> +}
> +
> +inline Meow_FFT_Complex meow_mulf
> +(
> + const Meow_FFT_Complex lhs
> + , Meow_Float rhs
> +)
> +{
> + Meow_FFT_Complex result =
> + {
> + lhs.r * rhs
> + , lhs.j * rhs
> + };
> +
> + return result;
> +}
> +
> +// -----------------------------------------------------------------------------
> +
> +// https://developercommunity.visualstudio.com/t/fatal-error-C1001:-Internal-compiler-err/1390698
> +// https://developercommunity.visualstudio.com/t/bug-in-visual-c-2019-and-below-i-think-it-is-relat/1119500
> +// https://developercommunity.visualstudio.com/t/optimized-compiler-bug/846597
> +#if defined(_MSC_VER) && (_MSC_VER < 1930)
> +#define MSVC_BUGFIX volatile
> +#else
> +#define MSVC_BUGFIX
> +#endif
> +
> +void meow_dft_n_dit
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> + , unsigned w_multiplier
> + , unsigned radix
> + , unsigned N
> + , unsigned reverse
> +)
> +{
> + Meow_FFT_Complex scratch[2048];
> + // Can I do something with the knowledge that n is always odd?
> +
> + if (radix > 2048)
> + {
> + abort();
> + // removing VLAs, so set a hard limit we support.
> + }
> +
> + for (unsigned butterfly = 0; butterfly < count; ++butterfly)
> + {
> + for (unsigned i = 0; i < radix; i++)
> + {
> + scratch[i] = out[i * count + butterfly];
> + }
> +
> + for (unsigned i = 0 ; i < radix ; ++i)
> + {
> + MSVC_BUGFIX const unsigned index_out = i * count + butterfly;
> +
> + // W0 is always 1
> + Meow_FFT_Complex sum = scratch[0];
> +
> + for (unsigned j = 1; j < radix; ++j )
> + {
> + const unsigned wi = (j * w_multiplier * index_out) % N;
> + Complex w = w_n[wi];
> + Complex in = scratch[j];
> +
> + Meow_Float rr;
> + Meow_Float jj;
> +
> + if (reverse)
> + {
> + rr = (in.r * w.r) - (in.j * w.j);
> + jj = (in.r * w.j) + (in.j * w.r);
> + }
> + else
> + {
> + rr = (in.r * w.r) + (in.j * w.j);
> + jj = (in.j * w.r) - (in.r * w.j);
> + }
> +
> + sum.r += rr;
> + sum.j += jj;
> + }
> +
> + out[index_out] = sum;
> + }
> + }
> +}
> +
> +// -----------------------------------------------------------------------------
> +
> +// Algorithms taken from
> +// http://www.briangough.com/fftalgorithms.pdf
> +// (equations 135 to 146)
> +// in, out and twiddle indicies taken from kiss_fft
> +// All twiddles are assumed to be ifft calculated. Conjugation is done in the
> +// maths.
> +// All twiddle input arrays are assumed to be sequentiall accessed. Twiddle
> +// indicies are pre-calculated.
> +
> +// -----------------------------------------------------------------------------
> +// Forward
> +// -----------------------------------------------------------------------------
> +
> +void meow_radix_2_dit
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + // butteryfly 0 always has the twiddle factor == 1.0f
> + // so special case that one.
> + {
> + Complex z0 = out[0];
> + Complex z1 = out[count];
> +
> + out[0] = meow_add(z0, z1);
> + out[count] = meow_sub(z0, z1);
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; ++butterfly)
> + {
> + Complex w = w_n[butterfly - 1];
> +
> + const unsigned i0 = butterfly;
> + const unsigned i1 = butterfly + count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul_by_conjugate(out[i1], w);
> +
> + out[i0] = meow_add(z0, z1);
> + out[i1] = meow_sub(z0, z1);
> + // Equation 135
> + }
> +}
> +
> +#define MEOW_SIN_PI_3 -0.866025403784438646763723170752936183471402626905190314f
> +
> +void meow_radix_3_dit
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + unsigned wi = 0;
> +
> + // W[0] is always 1.0f;
> + {
> + const unsigned i0 = 0 * count;
> + const unsigned i1 = 1 * count;
> + const unsigned i2 = 2 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = out[i1];
> + Complex z2 = out[i2];
> +
> + Complex t1 = meow_add(z1, z2);
> + Complex t2 = meow_sub(z0, meow_mulf(t1, 0.5));
> + Complex t3j = meow_mul_by_j(meow_mulf(meow_sub(z1, z2), MEOW_SIN_PI_3));
> +
> + out[i0] = meow_add(z0, t1);
> + out[i1] = meow_add(t2, t3j);
> + out[i2] = meow_sub(t2, t3j);
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; butterfly++, wi+=2)
> + {
> + Complex w1 = w_n[wi + 0];
> + Complex w2 = w_n[wi + 1];
> +
> + const unsigned i0 = butterfly;
> + const unsigned i1 = butterfly + count;
> + const unsigned i2 = butterfly + 2 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul_by_conjugate(out[i1], w1);
> + Complex z2 = meow_mul_by_conjugate(out[i2], w2);
> +
> + Complex t1 = meow_add(z1, z2);
> + Complex t2 = meow_sub(z0, meow_mulf(t1, 0.5));
> + Complex t3j = meow_mul_by_j(meow_mulf(meow_sub(z1, z2), MEOW_SIN_PI_3));
> + // Equation 136
> +
> + out[i0] = meow_add(z0, t1);
> + out[i1] = meow_add(t2, t3j);
> + out[i2] = meow_sub(t2, t3j);
> + // Equation 137
> + }
> +}
> +
> +void meow_radix_4_dit
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + unsigned wi = 0u;
> +
> + // W[0] is always 1.0f;
> + {
> + const unsigned i0 = 0 * count;
> + const unsigned i1 = 1 * count;
> + const unsigned i2 = 2 * count;
> + const unsigned i3 = 3 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = out[i1];
> + Complex z2 = out[i2];
> + Complex z3 = out[i3];
> +
> + Complex t1 = meow_add(z0, z2);
> + Complex t2 = meow_add(z1, z3);
> + Complex t3 = meow_sub(z0, z2);
> + Complex t4j = meow_negate(meow_mul_by_j(meow_sub(z1, z3)));
> +
> + out[i0] = meow_add(t1, t2);
> + out[i1] = meow_add(t3, t4j);
> + out[i2] = meow_sub(t1, t2);
> + out[i3] = meow_sub(t3, t4j);
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; ++butterfly, wi+=3)
> + {
> + Complex w1 = w_n[wi + 0];
> + Complex w2 = w_n[wi + 1];
> + Complex w3 = w_n[wi + 2];
> +
> + const unsigned i0 = butterfly + 0 * count;
> + const unsigned i1 = butterfly + 1 * count;
> + const unsigned i2 = butterfly + 2 * count;
> + const unsigned i3 = butterfly + 3 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul_by_conjugate(out[i1], w1);
> + Complex z2 = meow_mul_by_conjugate(out[i2], w2);
> + Complex z3 = meow_mul_by_conjugate(out[i3], w3);
> +
> + Complex t1 = meow_add(z0, z2);
> + Complex t2 = meow_add(z1, z3);
> + Complex t3 = meow_sub(z0, z2);
> + Complex t4j = meow_negate(meow_mul_by_j(meow_sub(z1, z3)));
> + // Equations 138
> + // Also instead of conjugating the input and multplying with the
> + // twiddles for the ifft, we invert the twiddles instead. This works
> + // fine except here, the mul_by_j is assuming that it's the forward
> + // fft twiddle we are multiplying with, not the conjugated one we
> + // actually have. So we have to conjugate it _back_ if we are doing the
> + // ifft.
> + // Also, had to multiply by -j, not j for reasons I am yet to grasp.
> +
> + out[i0] = meow_add(t1, t2);
> + out[i1] = meow_add(t3, t4j);
> + out[i2] = meow_sub(t1, t2);
> + out[i3] = meow_sub(t3, t4j);
> + // Equations 139
> + }
> +}
> +
> +#define MEOW_SQR_5_DIV_4 0.5590169943749474241022934171828190588601545899028814f
> +#define MEOW_SIN_2PI_5 -0.9510565162951535721164393333793821434056986341257502f
> +#define MEOW_SIN_2PI_10 -0.5877852522924731291687059546390727685976524376431459f
> +
> +void meow_radix_5_dit
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + unsigned wi = 0u;
> +
> + // W[0] is always 1.0f;
> + {
> + const unsigned i0 = 0 * count;
> + const unsigned i1 = 1 * count;
> + const unsigned i2 = 2 * count;
> + const unsigned i3 = 3 * count;
> + const unsigned i4 = 4 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = out[i1];
> + Complex z2 = out[i2];
> + Complex z3 = out[i3];
> + Complex z4 = out[i4];
> +
> + Complex t1 = meow_add(z1, z4);
> + Complex t2 = meow_add(z2, z3);
> + Complex t3 = meow_sub(z1, z4);
> + Complex t4 = meow_sub(z2, z3);
> + // Equations 140
> +
> + Complex t5 = meow_add(t1, t2);
> + Complex t6 = meow_mulf(meow_sub(t1, t2), MEOW_SQR_5_DIV_4);
> + Complex t7 = meow_sub(z0, meow_mulf(t5, 0.25f));
> + // Equation 141
> +
> + Complex t8 = meow_add(t7, t6);
> + Complex t9 = meow_sub(t7, t6);
> + // Equation 142
> +
> + Complex t10j = meow_mul_by_j
> + (
> + meow_add
> + (
> + meow_mulf(t3, MEOW_SIN_2PI_5)
> + , meow_mulf(t4, MEOW_SIN_2PI_10)
> + )
> + );
> +
> + Complex t11j = meow_mul_by_j
> + (
> + meow_sub
> + (
> + meow_mulf(t3, MEOW_SIN_2PI_10)
> + , meow_mulf(t4, MEOW_SIN_2PI_5)
> + )
> + );
> + // Equation 143
> +
> + out[i0] = meow_add(z0, t5);
> + // Equation 144
> +
> + out[i1] = meow_add(t8, t10j);
> + out[i2] = meow_add(t9, t11j);
> + // Equation 145
> +
> + out[i3] = meow_sub(t9, t11j);
> + out[i4] = meow_sub(t8, t10j);
> + // Equation 146
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; ++butterfly, wi+=4)
> + {
> + Complex w1 = w_n[wi + 0];
> + Complex w2 = w_n[wi + 1];
> + Complex w3 = w_n[wi + 2];
> + Complex w4 = w_n[wi + 3];
> +
> + unsigned i0 = butterfly + 0 * count;
> + unsigned i1 = butterfly + 1 * count;
> + unsigned i2 = butterfly + 2 * count;
> + unsigned i3 = butterfly + 3 * count;
> + unsigned i4 = butterfly + 4 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul_by_conjugate(out[i1], w1);
> + Complex z2 = meow_mul_by_conjugate(out[i2], w2);
> + Complex z3 = meow_mul_by_conjugate(out[i3], w3);
> + Complex z4 = meow_mul_by_conjugate(out[i4], w4);
> +
> + Complex t1 = meow_add(z1, z4);
> + Complex t2 = meow_add(z2, z3);
> + Complex t3 = meow_sub(z1, z4);
> + Complex t4 = meow_sub(z2, z3);
> + // Equations 140
> +
> + Complex t5 = meow_add(t1, t2);
> + Complex t6 = meow_mulf(meow_sub(t1, t2), MEOW_SQR_5_DIV_4);
> + Complex t7 = meow_sub(z0, meow_mulf(t5, 0.25f));
> + // Equation 141
> +
> + Complex t8 = meow_add(t7, t6);
> + Complex t9 = meow_sub(t7, t6);
> + // Equation 142
> +
> + Complex t10j = meow_mul_by_j
> + (
> + meow_add
> + (
> + meow_mulf(t3, MEOW_SIN_2PI_5)
> + , meow_mulf(t4, MEOW_SIN_2PI_10)
> + )
> + );
> +
> + Complex t11j = meow_mul_by_j
> + (
> + meow_sub
> + (
> + meow_mulf(t3, MEOW_SIN_2PI_10)
> + , meow_mulf(t4, MEOW_SIN_2PI_5)
> + )
> + );
> + // Equation 143
> +
> + out[i0] = meow_add(z0, t5);
> + // Equation 144
> +
> + out[i1] = meow_add(t8, t10j);
> + out[i2] = meow_add(t9, t11j);
> + // Equation 145
> +
> + out[i3] = meow_sub(t9, t11j);
> + out[i4] = meow_sub(t8, t10j);
> + // Equation 146
> + }
> +}
> +
> +#define MEOW_1_DIV_SQR_2 0.707106781186547524400844362104849039284835938f
> +
> +static void meow_radix_8_dit
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + const Meow_Float* W = &w_n[0].r;
> +
> + {
> + Meow_Float T3;
> + Meow_Float T23;
> + Meow_Float T18;
> + Meow_Float T38;
> + Meow_Float T6;
> + Meow_Float T37;
> + Meow_Float T21;
> + Meow_Float T24;
> + Meow_Float T13;
> + Meow_Float T49;
> + Meow_Float T35;
> + Meow_Float T43;
> + Meow_Float T10;
> + Meow_Float T48;
> + Meow_Float T30;
> + Meow_Float T42;
> + {
> + Meow_Float T1;
> + Meow_Float T2;
> + Meow_Float T19;
> + Meow_Float T20;
> + T1 = out[0].r;
> + T2 = out[count * 4].r;
> + T3 = T1 + T2;
> + T23 = T1 - T2;
> + {
> + Meow_Float T16;
> + Meow_Float T17;
> + Meow_Float T4;
> + Meow_Float T5;
> + T16 = out[0].j;
> + T17 = out[count * 4].j;
> + T18 = T16 + T17;
> + T38 = T16 - T17;
> + T4 = out[count * 2].r;
> + T5 = out[count * 6].r;
> + T6 = T4 + T5;
> + T37 = T4 - T5;
> + }
> + T19 = out[count * 2].j;
> + T20 = out[count * 6].j;
> + T21 = T19 + T20;
> + T24 = T19 - T20;
> + {
> + Meow_Float T11;
> + Meow_Float T12;
> + Meow_Float T31;
> + Meow_Float T32;
> + Meow_Float T33;
> + Meow_Float T34;
> + T11 = out[count * 7].r;
> + T12 = out[count * 3].r;
> + T31 = T11 - T12;
> + T32 = out[count * 7].j;
> + T33 = out[count * 3].j;
> + T34 = T32 - T33;
> + T13 = T11 + T12;
> + T49 = T32 + T33;
> + T35 = T31 - T34;
> + T43 = T31 + T34;
> + }
> + {
> + Meow_Float T8;
> + Meow_Float T9;
> + Meow_Float T26;
> + Meow_Float T27;
> + Meow_Float T28;
> + Meow_Float T29;
> + T8 = out[count * 1].r;
> + T9 = out[count * 5].r;
> + T26 = T8 - T9;
> + T27 = out[count * 1].j;
> + T28 = out[count * 5].j;
> + T29 = T27 - T28;
> + T10 = T8 + T9;
> + T48 = T27 + T28;
> + T30 = T26 + T29;
> + T42 = T29 - T26;
> + }
> + }
> + {
> + Meow_Float T7;
> + Meow_Float T14;
> + Meow_Float T51;
> + Meow_Float T52;
> + T7 = T3 + T6;
> + T14 = T10 + T13;
> + out[count * 4].r = T7 - T14;
> + out[0].r = T7 + T14;
> + T51 = T18 + T21;
> + T52 = T48 + T49;
> + out[count * 4].j = T51 - T52;
> + out[0].j = T51 + T52;
> + }
> + {
> + Meow_Float T15;
> + Meow_Float T22;
> + Meow_Float T47;
> + Meow_Float T50;
> + T15 = T13 - T10;
> + T22 = T18 - T21;
> + out[count * 2].j = T15 + T22;
> + out[count * 6].j = T22 - T15;
> + T47 = T3 - T6;
> + T50 = T48 - T49;
> + out[count * 6].r = T47 - T50;
> + out[count * 2].r = T47 + T50;
> + }
> + {
> + Meow_Float T25;
> + Meow_Float T36;
> + Meow_Float T45;
> + Meow_Float T46;
> + T25 = T23 + T24;
> + T36 = MEOW_1_DIV_SQR_2 * (T30 + T35);
> + out[count * 5].r = T25 - T36;
> + out[count * 1].r = T25 + T36;
> + T45 = T38 - T37;
> + T46 = MEOW_1_DIV_SQR_2 * (T42 + T43);
> + out[count * 5].j = T45 - T46;
> + out[count * 1].j = T45 + T46;
> + }
> + {
> + Meow_Float T39;
> + Meow_Float T40;
> + Meow_Float T41;
> + Meow_Float T44;
> + T39 = T37 + T38;
> + T40 = MEOW_1_DIV_SQR_2 * (T35 - T30);
> + out[count * 7].j = T39 - T40;
> + out[count * 3].j = T39 + T40;
> + T41 = T23 - T24;
> + T44 = MEOW_1_DIV_SQR_2 * (T42 - T43);
> + out[count * 7].r = T41 - T44;
> + out[count * 3].r = T41 + T44;
> + }
> + }
> +
> + out = out + 1;
> +
> + {
> + unsigned m;
> + for (m = 1; m < count; m = m + 1, out = out + 1, W = W + 14)
> + {
> + Meow_Float T7;
> + Meow_Float T76;
> + Meow_Float T43;
> + Meow_Float T71;
> + Meow_Float T41;
> + Meow_Float T65;
> + Meow_Float T53;
> + Meow_Float T56;
> + Meow_Float T18;
> + Meow_Float T77;
> + Meow_Float T46;
> + Meow_Float T68;
> + Meow_Float T30;
> + Meow_Float T64;
> + Meow_Float T48;
> + Meow_Float T51;
> + {
> + Meow_Float T1;
> + Meow_Float T70;
> + Meow_Float T6;
> + Meow_Float T69;
> + T1 = out[0].r;
> + T70 = out[0].j;
> + {
> + Meow_Float T3;
> + Meow_Float T5;
> + Meow_Float T2;
> + Meow_Float T4;
> + T3 = out[count * 4].r;
> + T5 = out[count * 4].j;
> + T2 = W[6];
> + T4 = W[7];
> + T6 = (T2 * T3) + (T4 * T5);
> + T69 = (T2 * T5) - (T4 * T3);
> + }
> + T7 = T1 + T6;
> + T76 = T70 - T69;
> + T43 = T1 - T6;
> + T71 = T69 + T70;
> + }
> + {
> + Meow_Float T35;
> + Meow_Float T54;
> + Meow_Float T40;
> + Meow_Float T55;
> + {
> + Meow_Float T32;
> + Meow_Float T34;
> + Meow_Float T31;
> + Meow_Float T33;
> + T32 = out[count * 7].r;
> + T34 = out[count * 7].j;
> + T31 = W[12];
> + T33 = W[13];
> + T35 = (T31 * T32) + (T33 * T34);
> + T54 = (T31 * T34) - (T33 * T32);
> + }
> + {
> + Meow_Float T37;
> + Meow_Float T39;
> + Meow_Float T36;
> + Meow_Float T38;
> + T37 = out[count * 3].r;
> + T39 = out[count * 3].j;
> + T36 = W[4];
> + T38 = W[5];
> + T40 = (T36 * T37) + (T38 * T39);
> + T55 = (T36 * T39) - (T38 * T37);
> + }
> + T41 = T35 + T40;
> + T65 = T54 + T55;
> + T53 = T35 - T40;
> + T56 = T54 - T55;
> + }
> + {
> + Meow_Float T12;
> + Meow_Float T44;
> + Meow_Float T17;
> + Meow_Float T45;
> + {
> + Meow_Float T9;
> + Meow_Float T11;
> + Meow_Float T8;
> + Meow_Float T10;
> + T9 = out[count * 2].r;
> + T11 = out[count * 2].j;
> + T8 = W[2];
> + T10 = W[3];
> + T12 = (T8 * T9) + (T10 * T11);
> + T44 = (T8 * T11) - (T10 * T9);
> + }
> + {
> + Meow_Float T14;
> + Meow_Float T16;
> + Meow_Float T13;
> + Meow_Float T15;
> + T14 = out[count * 6].r;
> + T16 = out[count * 6].j;
> + T13 = W[10];
> + T15 = W[11];
> + T17 = (T13 * T14) + (T15 * T16);
> + T45 = (T13 * T16) - (T15 * T14);
> + }
> + T18 = T12 + T17;
> + T77 = T12 - T17;
> + T46 = T44 - T45;
> + T68 = T44 + T45;
> + }
> + {
> + Meow_Float T24;
> + Meow_Float T49;
> + Meow_Float T29;
> + Meow_Float T50;
> + {
> + Meow_Float T21;
> + Meow_Float T23;
> + Meow_Float T20;
> + Meow_Float T22;
> + T21 = out[count * 1].r;
> + T23 = out[count * 1].j;
> + T20 = W[0];
> + T22 = W[1];
> + T24 = (T20 * T21) + (T22 * T23);
> + T49 = (T20 * T23) - (T22 * T21);
> + }
> + {
> + Meow_Float T26;
> + Meow_Float T28;
> + Meow_Float T25;
> + Meow_Float T27;
> + T26 = out[count * 5].r;
> + T28 = out[count * 5].j;
> + T25 = W[8];
> + T27 = W[9];
> + T29 = (T25 * T26) + (T27 * T28);
> + T50 = (T25 * T28) - (T27 * T26);
> + }
> + T30 = T24 + T29;
> + T64 = T49 + T50;
> + T48 = T24 - T29;
> + T51 = T49 - T50;
> + }
> + {
> + Meow_Float T19;
> + Meow_Float T42;
> + Meow_Float T73;
> + Meow_Float T74;
> + T19 = T7 + T18;
> + T42 = T30 + T41;
> + out[count * 4].r = T19 - T42;
> + out[0].r = T19 + T42;
> + {
> + Meow_Float T67;
> + Meow_Float T72;
> + Meow_Float T63;
> + Meow_Float T66;
> + T67 = T64 + T65;
> + T72 = T68 + T71;
> + out[0].j = T67 + T72;
> + out[count * 4].j = T72 - T67;
> + T63 = T7 - T18;
> + T66 = T64 - T65;
> + out[count * 6].r = T63 - T66;
> + out[count * 2].r = T63 + T66;
> + }
> + T73 = T41 - T30;
> + T74 = T71 - T68;
> + out[count * 2].j = T73 + T74;
> + out[count * 6].j = T74 - T73;
> + {
> + Meow_Float T59;
> + Meow_Float T78;
> + Meow_Float T62;
> + Meow_Float T75;
> + Meow_Float T60;
> + Meow_Float T61;
> + T59 = T43 - T46;
> + T78 = T76 - T77;
> + T60 = T51 - T48;
> + T61 = T53 + T56;
> + T62 = MEOW_1_DIV_SQR_2 * (T60 - T61);
> + T75 = MEOW_1_DIV_SQR_2 * (T60 + T61);
> + out[count * 7].r = T59 - T62;
> + out[count * 5].j = T78 - T75;
> + out[count * 3].r = T59 + T62;
> + out[count * 1].j = T75 + T78;
> + }
> + {
> + Meow_Float T47;
> + Meow_Float T80;
> + Meow_Float T58;
> + Meow_Float T79;
> + Meow_Float T52;
> + Meow_Float T57;
> + T47 = T43 + T46;
> + T80 = T77 + T76;
> + T52 = T48 + T51;
> + T57 = T53 - T56;
> + T58 = MEOW_1_DIV_SQR_2 * (T52 + T57);
> + T79 = MEOW_1_DIV_SQR_2 * (T57 - T52);
> + out[count * 5].r = T47 - T58;
> + out[count * 7].j = T80 - T79;
> + out[count * 1].r = T47 + T58;
> + out[count * 3].j = T79 + T80;
> + }
> + }
> + }
> + }
> +}
> +
> +// -----------------------------------------------------------------------------
> +// Inverse
> +// -----------------------------------------------------------------------------
> +
> +void meow_radix_2_dit_i
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + // butteryfly 0 always has the twiddle factor == 1.0f
> + // so special case that one.
> + {
> + Complex z0 = out[0];
> + Complex z1 = out[count];
> +
> + out[0] = meow_add(z0, z1);
> + out[count] = meow_sub(z0, z1);
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; ++butterfly)
> + {
> + Complex w = w_n[butterfly - 1];
> +
> + const unsigned i0 = butterfly;
> + const unsigned i1 = butterfly + count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul(out[i1], w);
> +
> + out[i0] = meow_add(z0, z1);
> + out[i1] = meow_sub(z0, z1);
> + // Equation 135
> + }
> +}
> +
> +void meow_radix_3_dit_i
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + unsigned wi = 0u;
> +
> + // W[0] is always 1.0f;
> + {
> + const unsigned i0 = 0 * count;
> + const unsigned i1 = 1 * count;
> + const unsigned i2 = 2 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = out[i1];
> + Complex z2 = out[i2];
> +
> + Complex t1 = meow_add(z1, z2);
> + Complex t2 = meow_sub(z0, meow_mulf(t1, 0.5));
> + Complex t3j = meow_mul_by_j
> + (
> + meow_mulf(meow_sub(z1, z2), -MEOW_SIN_PI_3)
> + );
> +
> + out[i0] = meow_add(z0, t1);
> + out[i1] = meow_add(t2, t3j);
> + out[i2] = meow_sub(t2, t3j);
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; butterfly++, wi+=2)
> + {
> + Complex w1 = w_n[wi + 0];
> + Complex w2 = w_n[wi + 1];
> +
> + const unsigned i0 = butterfly;
> + const unsigned i1 = butterfly + count;
> + const unsigned i2 = butterfly + 2 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul(w1, out[i1]);
> + Complex z2 = meow_mul(w2, out[i2]);
> +
> + Complex t1 = meow_add(z1, z2);
> + Complex t2 = meow_sub(z0, meow_mulf(t1, 0.5));
> + Complex t3j = meow_mul_by_j
> + (
> + meow_mulf(meow_sub(z1, z2), -MEOW_SIN_PI_3)
> + );
> + // Equation 136
> +
> + out[i0] = meow_add(z0, t1);
> + out[i1] = meow_add(t2, t3j);
> + out[i2] = meow_sub(t2, t3j);
> + // Equation 137
> + }
> +}
> +
> +void meow_radix_4_dit_i
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + unsigned wi = 0u;
> +
> + // W[0] is always 1.0f;
> + {
> + const unsigned i0 = 0 * count;
> + const unsigned i1 = 1 * count;
> + const unsigned i2 = 2 * count;
> + const unsigned i3 = 3 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = out[i1];
> + Complex z2 = out[i2];
> + Complex z3 = out[i3];
> +
> + Complex t1 = meow_add(z0, z2);
> + Complex t2 = meow_add(z1, z3);
> + Complex t3 = meow_sub(z0, z2);
> + Complex t4j = meow_mul_by_j(meow_sub(z1, z3));
> +
> + out[i0] = meow_add(t1, t2);
> + out[i1] = meow_add(t3, t4j);
> + out[i2] = meow_sub(t1, t2);
> + out[i3] = meow_sub(t3, t4j);
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; ++butterfly, wi+=3)
> + {
> + Complex w1 = w_n[wi + 0];
> + Complex w2 = w_n[wi + 1];
> + Complex w3 = w_n[wi + 2];
> +
> + const unsigned i0 = butterfly + 0 * count;
> + const unsigned i1 = butterfly + 1 * count;
> + const unsigned i2 = butterfly + 2 * count;
> + const unsigned i3 = butterfly + 3 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul(w1, out[i1]);
> + Complex z2 = meow_mul(w2, out[i2]);
> + Complex z3 = meow_mul(w3, out[i3]);
> +
> + Complex t1 = meow_add(z0, z2);
> + Complex t2 = meow_add(z1, z3);
> + Complex t3 = meow_sub(z0, z2);
> +
> + Complex t4j = meow_mul_by_j(meow_sub(z1, z3));
> + // Equations 138
> + // Also instead of conjugating the input and multplying with the
> + // twiddles for the ifft, we invert the twiddles instead. This works
> + // fine except here, the mul_by_j is assuming that it's the forward
> + // fft twiddle we are multiplying with, not the conjugated one we
> + // actually have. So we have to conjugate it _back_ if we are doing the
> + // ifft.
> + // Also, had to multiply by -j, not j for reasons I am yet to grasp.
> +
> + out[i0] = meow_add(t1, t2);
> + out[i1] = meow_add(t3, t4j);
> + out[i2] = meow_sub(t1, t2);
> + out[i3] = meow_sub(t3, t4j);
> + // Equations 139
> + }
> +}
> +
> +void meow_radix_5_dit_i
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + unsigned wi = 0u;
> +
> + // W[0] is always 1.0f;
> + {
> + const unsigned i0 = 0 * count;
> + const unsigned i1 = 1 * count;
> + const unsigned i2 = 2 * count;
> + const unsigned i3 = 3 * count;
> + const unsigned i4 = 4 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = out[i1];
> + Complex z2 = out[i2];
> + Complex z3 = out[i3];
> + Complex z4 = out[i4];
> +
> + Complex t1 = meow_add(z1, z4);
> + Complex t2 = meow_add(z2, z3);
> + Complex t3 = meow_sub(z1, z4);
> + Complex t4 = meow_sub(z2, z3);
> + // Equations 140
> +
> + Complex t5 = meow_add(t1, t2);
> + Complex t6 = meow_mulf(meow_sub(t1, t2), MEOW_SQR_5_DIV_4);
> + Complex t7 = meow_sub(z0, meow_mulf(t5, 0.25f));
> + // Equation 141
> +
> + Complex t8 = meow_add(t7, t6);
> + Complex t9 = meow_sub(t7, t6);
> + // Equation 142
> +
> + Complex t10j = meow_mul_by_j
> + (
> + meow_add
> + (
> + meow_mulf(t3, -MEOW_SIN_2PI_5)
> + , meow_mulf(t4, -MEOW_SIN_2PI_10)
> + )
> + );
> +
> + Complex t11j = meow_mul_by_j
> + (
> + meow_sub
> + (
> + meow_mulf(t3, -MEOW_SIN_2PI_10)
> + , meow_mulf(t4, -MEOW_SIN_2PI_5)
> + )
> + );
> + // Equation 143
> +
> + out[i0] = meow_add(z0, t5);
> + // Equation 144
> +
> + out[i1] = meow_add(t8, t10j);
> + out[i2] = meow_add(t9, t11j);
> + // Equation 145
> +
> + out[i3] = meow_sub(t9, t11j);
> + out[i4] = meow_sub(t8, t10j);
> + // Equation 146
> + }
> +
> + for (unsigned butterfly = 1; butterfly < count; ++butterfly, wi+=4)
> + {
> + Complex w1 = w_n[wi + 0];
> + Complex w2 = w_n[wi + 1];
> + Complex w3 = w_n[wi + 2];
> + Complex w4 = w_n[wi + 3];
> +
> + const unsigned i0 = butterfly + 0 * count;
> + const unsigned i1 = butterfly + 1 * count;
> + const unsigned i2 = butterfly + 2 * count;
> + const unsigned i3 = butterfly + 3 * count;
> + const unsigned i4 = butterfly + 4 * count;
> +
> + Complex z0 = out[i0];
> + Complex z1 = meow_mul(w1, out[i1]);
> + Complex z2 = meow_mul(w2, out[i2]);
> + Complex z3 = meow_mul(w3, out[i3]);
> + Complex z4 = meow_mul(w4, out[i4]);
> +
> + Complex t1 = meow_add(z1, z4);
> + Complex t2 = meow_add(z2, z3);
> + Complex t3 = meow_sub(z1, z4);
> + Complex t4 = meow_sub(z2, z3);
> + // Equations 140
> +
> + Complex t5 = meow_add(t1, t2);
> + Complex t6 = meow_mulf(meow_sub(t1, t2), MEOW_SQR_5_DIV_4);
> + Complex t7 = meow_sub(z0, meow_mulf(t5, 0.25f));
> + // Equation 141
> +
> + Complex t8 = meow_add(t7, t6);
> + Complex t9 = meow_sub(t7, t6);
> + // Equation 142
> +
> + Complex t10j = meow_mul_by_j
> + (
> + meow_add
> + (
> + meow_mulf(t3, -MEOW_SIN_2PI_5)
> + , meow_mulf(t4, -MEOW_SIN_2PI_10)
> + )
> + );
> +
> + Complex t11j = meow_mul_by_j
> + (
> + meow_sub
> + (
> + meow_mulf(t3, -MEOW_SIN_2PI_10)
> + , meow_mulf(t4, -MEOW_SIN_2PI_5)
> + )
> + );
> + // Equation 143
> +
> + out[i0] = meow_add(z0, t5);
> + // Equation 144
> +
> + out[i1] = meow_add(t8, t10j);
> + out[i2] = meow_add(t9, t11j);
> + // Equation 145
> +
> + out[i3] = meow_sub(t9, t11j);
> + out[i4] = meow_sub(t8, t10j);
> + // Equation 146
> + }
> +}
> +
> +static void meow_radix_8_dit_i
> +(
> + const Meow_FFT_Complex* w_n
> + , Meow_FFT_Complex* out
> + , unsigned count
> +)
> +{
> + const Meow_Float* W = &w_n[0].r;
> + {
> + Meow_Float T3;
> + Meow_Float T37;
> + Meow_Float T18;
> + Meow_Float T23;
> + Meow_Float T6;
> + Meow_Float T24;
> + Meow_Float T21;
> + Meow_Float T38;
> + Meow_Float T13;
> + Meow_Float T49;
> + Meow_Float T35;
> + Meow_Float T43;
> + Meow_Float T10;
> + Meow_Float T48;
> + Meow_Float T30;
> + Meow_Float T42;
> + {
> + Meow_Float T1;
> + Meow_Float T2;
> + Meow_Float T19;
> + Meow_Float T20;
> + T1 = out[0].r;
> + T2 = out[count * 4].r;
> + T3 = T1 + T2;
> + T37 = T1 - T2;
> + {
> + Meow_Float T16;
> + Meow_Float T17;
> + Meow_Float T4;
> + Meow_Float T5;
> + T16 = out[0].j;
> + T17 = out[count * 4].j;
> + T18 = T16 + T17;
> + T23 = T16 - T17;
> + T4 = out[count * 2].r;
> + T5 = out[count * 6].r;
> + T6 = T4 + T5;
> + T24 = T4 - T5;
> + }
> + T19 = out[count * 2].j;
> + T20 = out[count * 6].j;
> + T21 = T19 + T20;
> + T38 = T19 - T20;
> + {
> + Meow_Float T11;
> + Meow_Float T12;
> + Meow_Float T31;
> + Meow_Float T32;
> + Meow_Float T33;
> + Meow_Float T34;
> + T11 = out[count * 7].r;
> + T12 = out[count * 3].r;
> + T31 = T11 - T12;
> + T32 = out[count * 7].j;
> + T33 = out[count * 3].j;
> + T34 = T32 - T33;
> + T13 = T11 + T12;
> + T49 = T32 + T33;
> + T35 = T31 + T34;
> + T43 = T34 - T31;
> + }
> + {
> + Meow_Float T8;
> + Meow_Float T9;
> + Meow_Float T26;
> + Meow_Float T27;
> + Meow_Float T28;
> + Meow_Float T29;
> + T8 = out[count * 1].r;
> + T9 = out[count * 5].r;
> + T26 = T8 - T9;
> + T27 = out[count * 1].j;
> + T28 = out[count * 5].j;
> + T29 = T27 - T28;
> + T10 = T8 + T9;
> + T48 = T27 + T28;
> + T30 = T26 - T29;
> + T42 = T26 + T29;
> + }
> + }
> + {
> + Meow_Float T7;
> + Meow_Float T14;
> + Meow_Float T47;
> + Meow_Float T50;
> + T7 = T3 + T6;
> + T14 = T10 + T13;
> + out[count * 4].r = T7 - T14;
> + out[0].r = T7 + T14;
> + T47 = T18 + T21;
> + T50 = T48 + T49;
> + out[count * 4].j = T47 - T50;
> + out[0].j = T47 + T50;
> + }
> + {
> + Meow_Float T15;
> + Meow_Float T22;
> + Meow_Float T51;
> + Meow_Float T52;
> + T15 = T10 - T13;
> + T22 = T18 - T21;
> + out[count * 2].j = T15 + T22;
> + out[count * 6].j = T22 - T15;
> + T51 = T3 - T6;
> + T52 = T49 - T48;
> + out[count * 6].r = T51 - T52;
> + out[count * 2].r = T51 + T52;
> + }
> + {
> + Meow_Float T25;
> + Meow_Float T36;
> + Meow_Float T45;
> + Meow_Float T46;
> + T25 = T23 - T24;
> + T36 = MEOW_1_DIV_SQR_2 * (T30 - T35);
> + out[count * 7].j = T25 - T36;
> + out[count * 3].j = T25 + T36;
> + T45 = T37 + T38;
> + T46 = MEOW_1_DIV_SQR_2 * (T43 - T42);
> + out[count * 7].r = T45 - T46;
> + out[count * 3].r = T45 + T46;
> + }
> + {
> + Meow_Float T39;
> + Meow_Float T40;
> + Meow_Float T41;
> + Meow_Float T44;
> + T39 = T37 - T38;
> + T40 = MEOW_1_DIV_SQR_2 * (T30 + T35);
> + out[count * 5].r = T39 - T40;
> + out[count * 1].r = T39 + T40;
> + T41 = T24 + T23;
> + T44 = MEOW_1_DIV_SQR_2 * (T42 + T43);
> + out[count * 5].j = T41 - T44;
> + out[count * 1].j = T41 + T44;
> + }
> + }
> +
> + out = out + 1;
> +
> + {
> + unsigned m;
> + for (m = 1; m < count; m = m + 1, out = out + 1, W = W + 14)
> + {
> + Meow_Float T7;
> + Meow_Float T77;
> + Meow_Float T43;
> + Meow_Float T71;
> + Meow_Float T41;
> + Meow_Float T64;
> + Meow_Float T53;
> + Meow_Float T56;
> + Meow_Float T18;
> + Meow_Float T76;
> + Meow_Float T46;
> + Meow_Float T68;
> + Meow_Float T30;
> + Meow_Float T65;
> + Meow_Float T48;
> + Meow_Float T51;
> + {
> + Meow_Float T1;
> + Meow_Float T70;
> + Meow_Float T6;
> + Meow_Float T69;
> + T1 = out[0].r;
> + T70 = out[0].j;
> + {
> + Meow_Float T3;
> + Meow_Float T5;
> + Meow_Float T2;
> + Meow_Float T4;
> + T3 = out[count * 4].r;
> + T5 = out[count * 4].j;
> + T2 = W[6];
> + T4 = W[7];
> + T6 = (T2 * T3) - (T4 * T5);
> + T69 = (T4 * T3) + (T2 * T5);
> + }
> + T7 = T1 + T6;
> + T77 = T70 - T69;
> + T43 = T1 - T6;
> + T71 = T69 + T70;
> + }
> + {
> + Meow_Float T35;
> + Meow_Float T54;
> + Meow_Float T40;
> + Meow_Float T55;
> + {
> + Meow_Float T32;
> + Meow_Float T34;
> + Meow_Float T31;
> + Meow_Float T33;
> + T32 = out[count * 7].r;
> + T34 = out[count * 7].j;
> + T31 = W[12];
> + T33 = W[13];
> + T35 = (T31 * T32) - (T33 * T34);
> + T54 = (T33 * T32) + (T31 * T34);
> + }
> + {
> + Meow_Float T37;
> + Meow_Float T39;
> + Meow_Float T36;
> + Meow_Float T38;
> + T37 = out[count * 3].r;
> + T39 = out[count * 3].j;
> + T36 = W[4];
> + T38 = W[5];
> + T40 = (T36 * T37) - (T38 * T39);
> + T55 = (T38 * T37) + (T36 * T39);
> + }
> + T41 = T35 + T40;
> + T64 = T54 + T55;
> + T53 = T35 - T40;
> + T56 = T54 - T55;
> + }
> + {
> + Meow_Float T12;
> + Meow_Float T44;
> + Meow_Float T17;
> + Meow_Float T45;
> + {
> + Meow_Float T9;
> + Meow_Float T11;
> + Meow_Float T8;
> + Meow_Float T10;
> + T9 = out[count * 2].r;
> + T11 = out[count * 2].j;
> + T8 = W[2];
> + T10 = W[3];
> + T12 = (T8 * T9) - (T10 * T11);
> + T44 = (T10 * T9) + (T8 * T11);
> + }
> + {
> + Meow_Float T14;
> + Meow_Float T16;
> + Meow_Float T13;
> + Meow_Float T15;
> + T14 = out[count * 6].r;
> + T16 = out[count * 6].j;
> + T13 = W[10];
> + T15 = W[11];
> + T17 = (T13 * T14) - (T15 * T16);
> + T45 = (T15 * T14) + (T13 * T16);
> + }
> + T18 = T12 + T17;
> + T76 = T12 - T17;
> + T46 = T44 - T45;
> + T68 = T44 + T45;
> + }
> + {
> + Meow_Float T24;
> + Meow_Float T49;
> + Meow_Float T29;
> + Meow_Float T50;
> + {
> + Meow_Float T21;
> + Meow_Float T23;
> + Meow_Float T20;
> + Meow_Float T22;
> + T21 = out[count * 1].r;
> + T23 = out[count * 1].j;
> + T20 = W[0];
> + T22 = W[1];
> + T24 = (T20 * T21) - (T22 * T23);
> + T49 = (T22 * T21) + (T20 * T23);
> + }
> + {
> + Meow_Float T26;
> + Meow_Float T28;
> + Meow_Float T25;
> + Meow_Float T27;
> + T26 = out[count * 5].r;
> + T28 = out[count * 5].j;
> + T25 = W[8];
> + T27 = W[9];
> + T29 = (T25 * T26) - (T27 * T28);
> + T50 = (T27 * T26) + (T25 * T28);
> + }
> + T30 = T24 + T29;
> + T65 = T49 + T50;
> + T48 = T24 - T29;
> + T51 = T49 - T50;
> + }
> + {
> + Meow_Float T19;
> + Meow_Float T42;
> + Meow_Float T73;
> + Meow_Float T74;
> + T19 = T7 + T18;
> + T42 = T30 + T41;
> + out[count * 4].r = T19 - T42;
> + out[0].r = T19 + T42;
> + {
> + Meow_Float T67;
> + Meow_Float T72;
> + Meow_Float T63;
> + Meow_Float T66;
> + T67 = T65 + T64;
> + T72 = T68 + T71;
> + out[0].j = T67 + T72;
> + out[count * 4].j = T72 - T67;
> + T63 = T7 - T18;
> + T66 = T64 - T65;
> + out[count * 6].r = T63 - T66;
> + out[count * 2].r = T63 + T66;
> + }
> + T73 = T30 - T41;
> + T74 = T71 - T68;
> + out[count * 2].j = T73 + T74;
> + out[count * 6].j = T74 - T73;
> + {
> + Meow_Float T59;
> + Meow_Float T78;
> + Meow_Float T62;
> + Meow_Float T75;
> + Meow_Float T60;
> + Meow_Float T61;
> + T59 = T43 + T46;
> + T78 = T76 + T77;
> + T60 = T56 - T53;
> + T61 = T48 + T51;
> + T62 = MEOW_1_DIV_SQR_2 * (T60 - T61);
> + T75 = MEOW_1_DIV_SQR_2 * (T61 + T60);
> + out[count * 7].r = T59 - T62;
> + out[count * 5].j = T78 - T75;
> + out[count * 3].r = T59 + T62;
> + out[count * 1].j = T75 + T78;
> + }
> + {
> + Meow_Float T47;
> + Meow_Float T80;
> + Meow_Float T58;
> + Meow_Float T79;
> + Meow_Float T52;
> + Meow_Float T57;
> + T47 = T43 - T46;
> + T80 = T77 - T76;
> + T52 = T48 - T51;
> + T57 = T53 + T56;
> + T58 = MEOW_1_DIV_SQR_2 * (T52 + T57);
> + T79 = MEOW_1_DIV_SQR_2 * (T52 - T57);
> + out[count * 5].r = T47 - T58;
> + out[count * 7].j = T80 - T79;
> + out[count * 1].r = T47 + T58;
> + out[count * 3].j = T79 + T80;
> + }
> + }
> + }
> + }
> +}
> +
> +// -----------------------------------------------------------------------------
> +
> +// Paraphrased from kiss-fft
> +void meow_recursive_fft_mixed_meow_radix_dit
> +(
> + const Meow_FFT_Workset* fft
> + , unsigned stage
> + , Complex* in
> + , Meow_FFT_Complex* out
> + , unsigned w_mul
> +)
> +{
> + const unsigned radix = fft->stages.radix[stage];
> + const unsigned count = fft->stages.remainder[stage];
> +
> + Complex* w = fft->wn;
> + const unsigned w_offset = fft->stages.offsets[stage];
> + Complex* w_sequential = &fft->wn_ordered[w_offset];
> +
> + if (count == 1)
> + {
> + for (unsigned i = 0; i < radix; i++)
> + {
> + out[i] = in[i * w_mul];
> + }
> + }
> + else
> + {
> + const unsigned new_w_multiplier = w_mul * radix;
> +
> + for (unsigned i = 0; i < radix; ++i)
> + {
> + meow_recursive_fft_mixed_meow_radix_dit
> + (
> + fft
> + , stage + 1
> + , &in[w_mul * i]
> + , &out[count * i]
> + , new_w_multiplier
> + );
> + }
> + }
> +
> + switch (radix)
> + {
> + case 2: meow_radix_2_dit(w_sequential, out, count); break;
> + case 3: meow_radix_3_dit(w_sequential, out, count); break;
> + case 4: meow_radix_4_dit(w_sequential, out, count); break;
> + case 5: meow_radix_5_dit(w_sequential, out, count); break;
> + case 8: meow_radix_8_dit(w_sequential, out, count); break;
> +
> + default: meow_dft_n_dit(w, out, count, w_mul, radix, fft->N, 0); break;
> + }
> +}
> +
> +void meow_recursive_fft_mixed_meow_radix_dit_i
> +(
> + const Meow_FFT_Workset* fft
> + , unsigned stage
> + , Complex* in
> + , Meow_FFT_Complex* out
> + , unsigned w_mul
> +)
> +{
> + const unsigned radix = fft->stages.radix[stage];
> + const unsigned count = fft->stages.remainder[stage];
> +
> + Complex* w = fft->wn;
> + const unsigned w_offset = fft->stages.offsets[stage];
> + Complex* w_sequential = &fft->wn_ordered[w_offset];
> +
> + if (count == 1)
> + {
> + for (unsigned i = 0; i < radix; i++)
> + {
> + out[i] = in[i * w_mul];
> + }
> + }
> + else
> + {
> + const unsigned new_w_multiplier = w_mul * radix;
> +
> + for (unsigned i = 0; i < radix; ++i)
> + {
> + meow_recursive_fft_mixed_meow_radix_dit_i
> + (
> + fft
> + , stage + 1
> + , &in[w_mul * i]
> + , &out[count * i]
> + , new_w_multiplier
> + );
> + }
> + }
> +
> + switch (radix)
> + {
> + case 2: meow_radix_2_dit_i(w_sequential, out, count); break;
> + case 3: meow_radix_3_dit_i(w_sequential, out, count); break;
> + case 4: meow_radix_4_dit_i(w_sequential, out, count); break;
> + case 5: meow_radix_5_dit_i(w_sequential, out, count); break;
> + case 8: meow_radix_8_dit_i(w_sequential, out, count); break;
> +
> + default: meow_dft_n_dit(w, out, count, w_mul, radix, fft->N, 1); break;
> + }
> +}
> +
> +// -----------------------------------------------------------------------------
> +//
> +// http://www.engineeringproductivitytools.com/stuff/T0001/PT10.HTM
> +// I finally figured out why this page confused me so much. It wasn't
> +// _consistent_ with formular chunks. Sometimes with would include j, othertimes
> +// it wouldn't. Sometimes it would include Wk, othertimes not. Ugh. No wonder
> +// my maths was wrong by the wrong twiddle or incorrect sign.
> +//
> +// So the magic forumlar is:
> +//
> +// Z() == two-for-one-fft result waiting for final tiwddle
> +// F() == result fft, what we want
> +// Wn() == twiddle factor for N
> +// Wn2() == twiddle factor for N/2
> +//
> +// Note for fft Wn() == e^(-j*2*pi*k/N)
> +// Note for ifft Wn() == e^( j*2*pi*k/N) // <--- NOTICE MISSING '-'!
> +
> +// For FFT
> +// k = 0
> +
> +// F(0) = Z(0).real + Z(0).imag
> +// F(N/2) = Z(0).real - Z(0).imag
> +
> +// k > 0
> +// Feven_a(k) = Z(k) + Z(N/2 - k)*
> +// Fodd_a(k) = Z(k) - Z(N/2 - k)*
> +//
> +// Feven = Feven_a(k)
> +// Fodd = -j * Fodd_a(k)
> +//
> +// F(k) = 0.5 * (Feven + (Wn(k) * Fodd))
> +
> +// -----------------------------------------------------------------------------
> +
> +// For IFFT (think the twiddle is DIF hence not * 0.5)
> +// k = 0
> +
> +// Z(0).real = F(0).real + F(N/2).real
> +// Z(0).imag = F(0).real - F(N/2).real
> +
> +// k > 0
> +// Zeven_a(k) = F(k) + F(N/2 - k)*
> +// Zodd_a(k) = F(k) - F(N/2 - k)*
> +//
> +// Feven = Feven_a(k)
> +// Fodd = j * Fodd_a(k) // <---- NOTICE missing '-'!
> +//
> +// F(k) = Feven + (Wn(k) * Fodd) // <---- Remember ifft twiddles!, no * 0.5.
> +
> +inline void meow_mixer
> +(
> + unsigned N_2
> + , Complex* w_2n
> + , Complex* in
> + , Meow_FFT_Complex* out
> +)
> +{
> + // looking at kiss_fft they use double symmetery, so we only need to
> + // actaully do N/4 twiddles. engineeringproductivitytools mentions something
> + // similar as well.
> +
> + const unsigned N_4 = N_2 / 2u;
> +
> + for (unsigned k = 1; k <= N_4; k++)
> + {
> + Complex x_k = in[k];
> + Complex x_n_2_minus_k = meow_conjugate(in[N_2 - k]);
> + Complex wk = meow_conjugate(w_2n[k]);
> +
> + Complex za = meow_add(x_k, x_n_2_minus_k);
> + Complex zb = meow_sub(x_k, x_n_2_minus_k);
> +
> + Complex x_even = za;
> +
> + Complex x_odd = meow_negate(meow_mul_by_j(zb));
> +
> + Complex wk_x_odd = meow_mul(x_odd, wk);
> + Complex x_k_sum = meow_add(x_even, wk_x_odd);
> + Complex x_n_2_minus_k_sum = meow_conjugate(meow_sub(x_even, wk_x_odd));
> +
> + Complex x_k_out = meow_mulf(x_k_sum , 0.5f);
> + Complex x_n_2_minus_k_out = meow_mulf(x_n_2_minus_k_sum, 0.5f);
> +
> + out[k] = x_k_out;
> + out[N_2 - k] = x_n_2_minus_k_out;
> + }
> +}
> +
> +inline void meow_mixer_i
> +(
> + unsigned N_2
> + , Complex* w_2n
> + , Complex* in
> + , Meow_FFT_Complex* out
> +)
> +{
> + const unsigned N_4 = N_2 / 2u;
> +
> + for (unsigned k = 1; k <= N_4; k++)
> + {
> + Complex x_k = in[k];
> + Complex x_n_2_minus_k = meow_conjugate(in[N_2 - k]);
> + Complex wk = w_2n[k];
> +
> + Complex za = meow_add(x_k, x_n_2_minus_k);
> + Complex zb = meow_sub(x_k, x_n_2_minus_k);
> +
> + Complex x_even = za;
> +
> + Complex x_odd = meow_mul_by_j(zb);
> +
> + Complex wk_x_odd = meow_mul(x_odd, wk);
> + Complex x_k_sum = meow_add(x_even, wk_x_odd);
> + Complex x_n_2_minus_k_sum = meow_conjugate(meow_sub(x_even, wk_x_odd));
> +
> + Complex x_k_out = x_k_sum;
> + Complex x_n_2_minus_k_out = x_n_2_minus_k_sum;
> +
> + out[k] = x_k_out;
> + out[N_2 - k] = x_n_2_minus_k_out;
> + }
> +}
> +
> +// -----------------------------------------------------------------------------
> +
> +void meow_fft_real
> +(
> + const Meow_FFT_Workset_Real* fft
> + , const Meow_Float* in
> + , Meow_FFT_Complex* out
> +)
> +{
> + // Need two sets of twiddles. One set for N/2, and one set for N.
> + // However for the N set, we only need N/4 twiddles due to symmetry * 2.
> +
> + const unsigned N_2 = fft->half.N;
> + Complex* magic = fft->w_2n;
> + Meow_FFT_Complex z0;
> +
> + meow_recursive_fft_mixed_meow_radix_dit
> + (
> + &fft->half
> + , 0
> + , (Complex*)(in)
> + , out
> + , 1
> + );
> +
> + z0 = out[0];
> +
> + out[0].r = z0.r + z0.j;
> + out[0].j = z0.r - z0.j;
> + // combine 0, and N_2 into the same number in order
> + // to keep the array size N/2
> +
> + meow_mixer(N_2, magic, out, out);
> +}
> +
> +void meow_fft_real_i
> +(
> + const Meow_FFT_Workset_Real* ifft
> + , const Meow_FFT_Complex* in
> + , Meow_FFT_Complex* temp
> + , Meow_Float* out
> +)
> +{
> + const unsigned N_2 = ifft->half.N;
> + Complex* magic = ifft->w_2n;
> +
> + temp[0].r = in[0].r + in[0].j;
> + temp[0].j = in[0].r - in[0].j;
> +
> + meow_mixer_i(N_2, magic, in, temp);
> +
> + meow_recursive_fft_mixed_meow_radix_dit_i
> + (
> + &ifft->half
> + , 0
> + , temp
> + , (Meow_FFT_Complex*)(out)
> + , 1
> + );
> +}
> +
> +void meow_fft
> +(
> + const Meow_FFT_Workset* data
> + , const Meow_FFT_Complex* in
> + , Meow_FFT_Complex* out
> +)
> +{
> + meow_recursive_fft_mixed_meow_radix_dit
> + (
> + data
> + , 0
> + , in
> + , out
> + , 1
> + );
> +}
> +
> +void meow_fft_i
> +(
> + const Meow_FFT_Workset* data
> + , const Meow_FFT_Complex* in
> + , Meow_FFT_Complex* out
> +)
> +{
> + meow_recursive_fft_mixed_meow_radix_dit_i
> + (
> + data
> + , 0
> + , in
> + , out
> + , 1
> + );
> +}
> +
> +#endif
> +
> +// bash script used to generate radix N codelets from fftw that suit this code.
> +// since my tests run slower with > 8 radix, I hand modified the radix 8
> +// generated (function signature) instead of updating this code.
> +#if 0
> +#!/bin/sh
> +# <command> N [-1|1] [|_i]
> +
> +cat << EOF
> +
> +static void meow_radix_${1}_dit${3}
> +(
> + meow_fft_complex* out
> + , const Meow_Float* W
> + , unsigned count
> +)
> +{
> +EOF
> +
> +#// First loop doesn't use twiddles
> +./gen_notw.native -n ${1} -standalone -sign ${2} \
> + | sed 's/E /Meow_Float /g' \
> + | sed '/INT.*/d' \
> + | sed -r 's/r[io]\[(.+)]/out\[\1\].r/g' \
> + | sed -r 's/i[io]\[(.+)]/out\[\1\].j/g' \
> + | sed '/for (.*).*/d' \
> + | sed -r 's/WS\(.., (.*)\)/count * \1/g' \
> + | sed -r 's/FMA\((.+), (.+), (.+)\)/\(\1 * \2\) + \(\3\)/g' \
> + | sed -r 's/FMS\((.+), (.+), (.+)\)/\(\1 * \2\) - \(\3\)/g' \
> + | sed -r 's/FNMA\((.+), (.+), (.+)\)/-\(\1 * \2\) + \(\3\)/g' \
> + | sed -r 's/FNMS\((.+), (.+), (.+)\)/\(\3\) - \(\1 * \2\)/g' \
> + | sed -r 's/DK\((.+), (.+)\);/static const Meow_Float \1 = \2;/g' \
> + | head -n -3 \
> + | tail -n +9
> +
> +echo ""
> +echo "out = out + 1;"
> +echo ""
> +
> +./gen_twiddle.native -n ${1} -standalone -sign ${2} -dit -with-ms 1 \
> + | sed 's/E /Meow_Float /g' \
> + | sed 's/INT /unsigned /g' \
> + | sed -r 's/r[io]\[(.+)]/out\[\1\].r/g' \
> + | sed -r 's/i[io]\[(.+)]/out\[\1\].j/g' \
> + | sed 's/, MAKE_VOLATILE_STRIDE(.*))/)/g' \
> + | sed 's/, MAKE_VOLATILE_STRIDE(.*)//g' \
> + | sed -r 's/WS\(.., (.*)\)/count * \1/g' \
> + | sed -r 's/FMA\((.+), (.+), (.+)\)/\(\1 * \2\) + \(\3\)/g' \
> + | sed -r 's/FMS\((.+), (.+), (.+)\)/\(\1 * \2\) - \(\3\)/g' \
> + | sed -r 's/FNMA\((.+), (.+), (.+)\)/-\(\1 * \2\) + \(\3\)/g' \
> + | sed -r 's/FNMS\((.+), (.+), (.+)\)/\(\3\) - \(\1 * \2\)/g' \
> + | sed '/DK(.*);/d' \
> + | sed 's/ri = ri + 1, ii = ii + 1,/out = out + 1,/g' \
> + | sed 's/m = mb, W = W + (mb * .*);/m = 1;/g' \
> + | sed -r 's/(for \(.*\)).*/\1\n{/g' \
> + | sed 's/me/count/g' \
> + | head -n -2 \
> + | tail -n +10
> +
> +
> +echo "}"
> +#endif
Intel Semiconductor AG
Registered No. 020.30.913.786-7
Registered Office: Dufourstrasse 101 , 8008 Zurich, Switzerland
More information about the igt-dev
mailing list