File indexing completed on 2024-05-12 04:53:39

0001 /*
0002     SPDX-FileCopyrightText: 2003-2010 Mark Borgerding <Mark@Borgerding.net>
0003     SPDX-License-Identifier: BSD-3-Clause
0004 */
0005 
0006 #include "_kiss_fft_guts.h"
0007 /* The guts header contains all the multiplication and addition macros that are defined for
0008  fixed or floating point complex numbers.  It also delares the kf_ internal functions.
0009  */
0010 
0011 static void kf_bfly2(kiss_fft_cpx *Fout, const size_t fstride, const kiss_fft_cfg st, int m)
0012 {
0013     kiss_fft_cpx *Fout2;
0014     kiss_fft_cpx *tw1 = st->twiddles;
0015     kiss_fft_cpx t;
0016     Fout2 = Fout + m;
0017     do {
0018         C_FIXDIV(*Fout, 2);
0019         C_FIXDIV(*Fout2, 2);
0020 
0021         C_MUL(t, *Fout2, *tw1);
0022         tw1 += fstride;
0023         C_SUB(*Fout2, *Fout, t);
0024         C_ADDTO(*Fout, t);
0025         ++Fout2;
0026         ++Fout;
0027     } while (--m);
0028 }
0029 
0030 static void kf_bfly4(kiss_fft_cpx *Fout, const size_t fstride, const kiss_fft_cfg st, const size_t m)
0031 {
0032     kiss_fft_cpx *tw1, *tw2, *tw3;
0033     kiss_fft_cpx scratch[6];
0034     size_t k = m;
0035     const size_t m2 = 2 * m;
0036     const size_t m3 = 3 * m;
0037 
0038     tw3 = tw2 = tw1 = st->twiddles;
0039 
0040     do {
0041         C_FIXDIV(*Fout, 4);
0042         C_FIXDIV(Fout[m], 4);
0043         C_FIXDIV(Fout[m2], 4);
0044         C_FIXDIV(Fout[m3], 4);
0045 
0046         C_MUL(scratch[0], Fout[m], *tw1);
0047         C_MUL(scratch[1], Fout[m2], *tw2);
0048         C_MUL(scratch[2], Fout[m3], *tw3);
0049 
0050         C_SUB(scratch[5], *Fout, scratch[1]);
0051         C_ADDTO(*Fout, scratch[1]);
0052         C_ADD(scratch[3], scratch[0], scratch[2]);
0053         C_SUB(scratch[4], scratch[0], scratch[2]);
0054         C_SUB(Fout[m2], *Fout, scratch[3]);
0055         tw1 += fstride;
0056         tw2 += fstride * 2;
0057         tw3 += fstride * 3;
0058         C_ADDTO(*Fout, scratch[3]);
0059 
0060         if (st->inverse) {
0061             Fout[m].r = scratch[5].r - scratch[4].i;
0062             Fout[m].i = scratch[5].i + scratch[4].r;
0063             Fout[m3].r = scratch[5].r + scratch[4].i;
0064             Fout[m3].i = scratch[5].i - scratch[4].r;
0065         } else {
0066             Fout[m].r = scratch[5].r + scratch[4].i;
0067             Fout[m].i = scratch[5].i - scratch[4].r;
0068             Fout[m3].r = scratch[5].r - scratch[4].i;
0069             Fout[m3].i = scratch[5].i + scratch[4].r;
0070         }
0071         ++Fout;
0072     } while (--k);
0073 }
0074 
0075 static void kf_bfly3(kiss_fft_cpx *Fout, const size_t fstride, const kiss_fft_cfg st, size_t m)
0076 {
0077     size_t k = m;
0078     const size_t m2 = 2 * m;
0079     kiss_fft_cpx *tw1, *tw2;
0080     kiss_fft_cpx scratch[5];
0081     kiss_fft_cpx epi3;
0082     epi3 = st->twiddles[fstride * m];
0083 
0084     tw1 = tw2 = st->twiddles;
0085 
0086     do {
0087         C_FIXDIV(*Fout, 3);
0088         C_FIXDIV(Fout[m], 3);
0089         C_FIXDIV(Fout[m2], 3);
0090 
0091         C_MUL(scratch[1], Fout[m], *tw1);
0092         C_MUL(scratch[2], Fout[m2], *tw2);
0093 
0094         C_ADD(scratch[3], scratch[1], scratch[2]);
0095         C_SUB(scratch[0], scratch[1], scratch[2]);
0096         tw1 += fstride;
0097         tw2 += fstride * 2;
0098 
0099         Fout[m].r = Fout->r - HALF_OF(scratch[3].r);
0100         Fout[m].i = Fout->i - HALF_OF(scratch[3].i);
0101 
0102         C_MULBYSCALAR(scratch[0], epi3.i);
0103 
0104         C_ADDTO(*Fout, scratch[3]);
0105 
0106         Fout[m2].r = Fout[m].r + scratch[0].i;
0107         Fout[m2].i = Fout[m].i - scratch[0].r;
0108 
0109         Fout[m].r -= scratch[0].i;
0110         Fout[m].i += scratch[0].r;
0111 
0112         ++Fout;
0113     } while (--k);
0114 }
0115 
0116 static void kf_bfly5(kiss_fft_cpx *Fout, const size_t fstride, const kiss_fft_cfg st, int m)
0117 {
0118     kiss_fft_cpx *Fout0, *Fout1, *Fout2, *Fout3, *Fout4;
0119     int u;
0120     kiss_fft_cpx scratch[13];
0121     kiss_fft_cpx *twiddles = st->twiddles;
0122     kiss_fft_cpx *tw;
0123     kiss_fft_cpx ya, yb;
0124     ya = twiddles[fstride * m];
0125     yb = twiddles[fstride * 2 * m];
0126 
0127     Fout0 = Fout;
0128     Fout1 = Fout0 + m;
0129     Fout2 = Fout0 + 2 * m;
0130     Fout3 = Fout0 + 3 * m;
0131     Fout4 = Fout0 + 4 * m;
0132 
0133     tw = st->twiddles;
0134     for (u = 0; u < m; ++u) {
0135         C_FIXDIV(*Fout0, 5);
0136         C_FIXDIV(*Fout1, 5);
0137         C_FIXDIV(*Fout2, 5);
0138         C_FIXDIV(*Fout3, 5);
0139         C_FIXDIV(*Fout4, 5);
0140         scratch[0] = *Fout0;
0141 
0142         C_MUL(scratch[1], *Fout1, tw[u * fstride]);
0143         C_MUL(scratch[2], *Fout2, tw[2 * u * fstride]);
0144         C_MUL(scratch[3], *Fout3, tw[3 * u * fstride]);
0145         C_MUL(scratch[4], *Fout4, tw[4 * u * fstride]);
0146 
0147         C_ADD(scratch[7], scratch[1], scratch[4]);
0148         C_SUB(scratch[10], scratch[1], scratch[4]);
0149         C_ADD(scratch[8], scratch[2], scratch[3]);
0150         C_SUB(scratch[9], scratch[2], scratch[3]);
0151 
0152         Fout0->r += scratch[7].r + scratch[8].r;
0153         Fout0->i += scratch[7].i + scratch[8].i;
0154 
0155         scratch[5].r = scratch[0].r + S_MUL(scratch[7].r, ya.r) + S_MUL(scratch[8].r, yb.r);
0156         scratch[5].i = scratch[0].i + S_MUL(scratch[7].i, ya.r) + S_MUL(scratch[8].i, yb.r);
0157 
0158         scratch[6].r = S_MUL(scratch[10].i, ya.i) + S_MUL(scratch[9].i, yb.i);
0159         scratch[6].i = -S_MUL(scratch[10].r, ya.i) - S_MUL(scratch[9].r, yb.i);
0160 
0161         C_SUB(*Fout1, scratch[5], scratch[6]);
0162         C_ADD(*Fout4, scratch[5], scratch[6]);
0163 
0164         scratch[11].r = scratch[0].r + S_MUL(scratch[7].r, yb.r) + S_MUL(scratch[8].r, ya.r);
0165         scratch[11].i = scratch[0].i + S_MUL(scratch[7].i, yb.r) + S_MUL(scratch[8].i, ya.r);
0166         scratch[12].r = -S_MUL(scratch[10].i, yb.i) + S_MUL(scratch[9].i, ya.i);
0167         scratch[12].i = S_MUL(scratch[10].r, yb.i) - S_MUL(scratch[9].r, ya.i);
0168 
0169         C_ADD(*Fout2, scratch[11], scratch[12]);
0170         C_SUB(*Fout3, scratch[11], scratch[12]);
0171 
0172         ++Fout0;
0173         ++Fout1;
0174         ++Fout2;
0175         ++Fout3;
0176         ++Fout4;
0177     }
0178 }
0179 
0180 /* perform the butterfly for one stage of a mixed radix FFT */
0181 static void kf_bfly_generic(kiss_fft_cpx *Fout, const size_t fstride, const kiss_fft_cfg st, int m, int p)
0182 {
0183     int u, q1, q;
0184     kiss_fft_cpx *twiddles = st->twiddles;
0185     kiss_fft_cpx t;
0186     int Norig = st->nfft;
0187 
0188     kiss_fft_cpx scratch[p];
0189 
0190     for (u = 0; u < m; ++u) {
0191         int k = u;
0192         for (q1 = 0; q1 < p; ++q1) {
0193             scratch[q1] = Fout[k];
0194             C_FIXDIV(scratch[q1], p);
0195             k += m;
0196         }
0197 
0198         k = u;
0199         for (q1 = 0; q1 < p; ++q1) {
0200             int twidx = 0;
0201             Fout[k] = scratch[0];
0202             for (q = 1; q < p; ++q) {
0203                 twidx += fstride * k;
0204                 if (twidx >= Norig) twidx -= Norig;
0205                 C_MUL(t, scratch[q], twiddles[twidx]);
0206                 C_ADDTO(Fout[k], t);
0207             }
0208             k += m;
0209         }
0210     }
0211 }
0212 
0213 static void kf_work(kiss_fft_cpx *Fout, const kiss_fft_cpx *f, const size_t fstride, int in_stride, int *factors, const kiss_fft_cfg st)
0214 {
0215     kiss_fft_cpx *Fout_beg = Fout;
0216     const int p = *factors++; /* the radix  */
0217     const int m = *factors++; /* stage's fft length/p */
0218     const kiss_fft_cpx *Fout_end = Fout + p * m;
0219 
0220 #ifdef _OPENMP
0221     // use openmp extensions at the
0222     // top-level (not recursive)
0223     if (fstride == 1 && p <= 5) {
0224         int k;
0225 
0226 // execute the p different work units in different threads
0227 #pragma omp parallel for
0228         for (k = 0; k < p; ++k)
0229             kf_work(Fout + k * m, f + fstride * in_stride * k, fstride * p, in_stride, factors, st);
0230         // all threads have joined by this point
0231 
0232         switch (p) {
0233         case 2:
0234             kf_bfly2(Fout, fstride, st, m);
0235             break;
0236         case 3:
0237             kf_bfly3(Fout, fstride, st, m);
0238             break;
0239         case 4:
0240             kf_bfly4(Fout, fstride, st, m);
0241             break;
0242         case 5:
0243             kf_bfly5(Fout, fstride, st, m);
0244             break;
0245         default:
0246             kf_bfly_generic(Fout, fstride, st, m, p);
0247             break;
0248         }
0249         return;
0250     }
0251 #endif
0252 
0253     if (m == 1) {
0254         do {
0255             *Fout = *f;
0256             f += fstride * in_stride;
0257         } while (++Fout != Fout_end);
0258     } else {
0259         do {
0260             // recursive call:
0261             // DFT of size m*p performed by doing
0262             // p instances of smaller DFTs of size m,
0263             // each one takes a decimated version of the input
0264             kf_work(Fout, f, fstride * p, in_stride, factors, st);
0265             f += fstride * in_stride;
0266         } while ((Fout += m) != Fout_end);
0267     }
0268 
0269     Fout = Fout_beg;
0270 
0271     // recombine the p smaller DFTs
0272     switch (p) {
0273     case 2:
0274         kf_bfly2(Fout, fstride, st, m);
0275         break;
0276     case 3:
0277         kf_bfly3(Fout, fstride, st, m);
0278         break;
0279     case 4:
0280         kf_bfly4(Fout, fstride, st, m);
0281         break;
0282     case 5:
0283         kf_bfly5(Fout, fstride, st, m);
0284         break;
0285     default:
0286         kf_bfly_generic(Fout, fstride, st, m, p);
0287         break;
0288     }
0289 }
0290 
0291 /*  facbuf is populated by p1,m1,p2,m2, ...
0292     where
0293     p[i] * m[i] = m[i-1]
0294     m0 = n                  */
0295 static void kf_factor(int n, int *facbuf)
0296 {
0297     int p = 4;
0298     double floor_sqrt;
0299     floor_sqrt = floor(sqrt((double)n));
0300 
0301     /*factor out powers of 4, powers of 2, then any remaining primes */
0302     do {
0303         while (n % p) {
0304             switch (p) {
0305             case 4:
0306                 p = 2;
0307                 break;
0308             case 2:
0309                 p = 3;
0310                 break;
0311             default:
0312                 p += 2;
0313                 break;
0314             }
0315             if (p > floor_sqrt) p = n; /* no more factors, skip to end */
0316         }
0317         n /= p;
0318         *facbuf++ = p;
0319         *facbuf++ = n;
0320     } while (n > 1);
0321 }
0322 
0323 /*
0324  *
0325  * User-callable function to allocate all necessary storage space for the fft.
0326  *
0327  * The return value is a contiguous block of memory, allocated with malloc.  As such,
0328  * It can be freed with free(), rather than a kiss_fft-specific function.
0329  * */
0330 kiss_fft_cfg kiss_fft_alloc(int nfft, int inverse_fft, void *mem, size_t *lenmem)
0331 {
0332     kiss_fft_cfg st = NULL;
0333     size_t memneeded = sizeof(struct kiss_fft_state) + sizeof(kiss_fft_cpx) * (nfft - 1); /* twiddle factors*/
0334 
0335     if (lenmem == NULL) {
0336         st = (kiss_fft_cfg)KISS_FFT_MALLOC(memneeded);
0337     } else {
0338         if (mem != NULL && *lenmem >= memneeded) st = (kiss_fft_cfg)mem;
0339         *lenmem = memneeded;
0340     }
0341     if (st) {
0342         int i;
0343         st->nfft = nfft;
0344         st->inverse = inverse_fft;
0345 
0346         for (i = 0; i < nfft; ++i) {
0347             const double pi = 3.141592653589793238462643383279502884197169399375105820974944;
0348             double phase = -2 * pi * i / nfft;
0349             if (st->inverse) phase *= -1;
0350             kf_cexp(st->twiddles + i, phase);
0351         }
0352 
0353         kf_factor(nfft, st->factors);
0354     }
0355     return st;
0356 }
0357 
0358 void kiss_fft_stride(kiss_fft_cfg st, const kiss_fft_cpx *fin, kiss_fft_cpx *fout, int in_stride)
0359 {
0360     if (fin == fout) {
0361         // NOTE: this is not really an in-place FFT algorithm.
0362         // It just performs an out-of-place FFT into a temp buffer
0363         kiss_fft_cpx tmpbuf[st->nfft];
0364         kf_work(tmpbuf, fin, 1, in_stride, st->factors, st);
0365         memcpy(fout, tmpbuf, sizeof(kiss_fft_cpx) * st->nfft);
0366     } else {
0367         kf_work(fout, fin, 1, in_stride, st->factors, st);
0368     }
0369 }
0370 
0371 void kiss_fft(kiss_fft_cfg cfg, const kiss_fft_cpx *fin, kiss_fft_cpx *fout)
0372 {
0373     kiss_fft_stride(cfg, fin, fout, 1);
0374 }