arm_rfft_fast_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_rfft_fast_f32.c 4 * Description: RFFT & RIFFT Floating point process function 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/transform_functions.h" 30 31 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 32 void stage_rfft_f32( 33 const arm_rfft_fast_instance_f32 * S, 34 float32_t * p, 35 float32_t * pOut) 36 { 37 int32_t k; /* Loop Counter */ 38 float32_t twR, twI; /* RFFT Twiddle coefficients */ 39 const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ 40 float32_t *pA = p; /* increasing pointer */ 41 float32_t *pB = p; /* decreasing pointer */ 42 float32_t xAR, xAI, xBR, xBI; /* temporary variables */ 43 float32_t t1a, t1b; /* temporary variables */ 44 float32_t p0, p1, p2, p3; /* temporary variables */ 45 46 float32x4x2_t tw,xA,xB; 47 float32x4x2_t tmp1, tmp2, res; 48 49 uint32x4_t vecStridesFwd, vecStridesBkwd; 50 51 vecStridesFwd = vidupq_u32((uint32_t)0, 2); 52 vecStridesBkwd = -vecStridesFwd; 53 54 int blockCnt; 55 56 57 k = (S->Sint).fftLen - 1; 58 59 /* Pack first and last sample of the frequency domain together */ 60 61 xBR = pB[0]; 62 xBI = pB[1]; 63 xAR = pA[0]; 64 xAI = pA[1]; 65 66 twR = *pCoeff++ ; 67 twI = *pCoeff++ ; 68 69 // U1 = XA(1) + XB(1); % It is real 70 t1a = xBR + xAR ; 71 72 // U2 = XB(1) - XA(1); % It is imaginary 73 t1b = xBI + xAI ; 74 75 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); 76 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); 77 *pOut++ = 0.5f * ( t1a + t1b ); 78 *pOut++ = 0.5f * ( t1a - t1b ); 79 80 // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); 81 pB = p + 2*k; 82 pA += 2; 83 84 blockCnt = k >> 2; 85 while (blockCnt > 0) 86 { 87 /* 88 function X = my_split_rfft(X, ifftFlag) 89 % X is a series of real numbers 90 L = length(X); 91 XC = X(1:2:end) +i*X(2:2:end); 92 XA = fft(XC); 93 XB = conj(XA([1 end:-1:2])); 94 TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; 95 for l = 2:L/2 96 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); 97 end 98 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); 99 X = XA; 100 */ 101 102 103 xA = vld2q_f32(pA); 104 pA += 8; 105 106 xB = vld2q_f32(pB); 107 108 xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd); 109 xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd); 110 111 xB.val[1] = vnegq_f32(xB.val[1]); 112 pB -= 8; 113 114 115 tw = vld2q_f32(pCoeff); 116 pCoeff += 8; 117 118 119 tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]); 120 tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]); 121 122 tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]); 123 tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]); 124 125 res.val[0] = vmulq(tw.val[0], tmp2.val[0]); 126 res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); 127 128 res.val[1] = vmulq(tw.val[0], tmp2.val[1]); 129 res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); 130 131 res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] ); 132 res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] ); 133 134 res.val[0] = vmulq_n_f32(res.val[0], 0.5f); 135 res.val[1] = vmulq_n_f32(res.val[1], 0.5f); 136 137 138 vst2q_f32(pOut, res); 139 pOut += 8; 140 141 142 blockCnt--; 143 } 144 145 blockCnt = k & 3; 146 while (blockCnt > 0) 147 { 148 /* 149 function X = my_split_rfft(X, ifftFlag) 150 % X is a series of real numbers 151 L = length(X); 152 XC = X(1:2:end) +i*X(2:2:end); 153 XA = fft(XC); 154 XB = conj(XA([1 end:-1:2])); 155 TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; 156 for l = 2:L/2 157 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); 158 end 159 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); 160 X = XA; 161 */ 162 163 xBI = pB[1]; 164 xBR = pB[0]; 165 xAR = pA[0]; 166 xAI = pA[1]; 167 168 twR = *pCoeff++; 169 twI = *pCoeff++; 170 171 t1a = xBR - xAR ; 172 t1b = xBI + xAI ; 173 174 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); 175 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); 176 p0 = twR * t1a; 177 p1 = twI * t1a; 178 p2 = twR * t1b; 179 p3 = twI * t1b; 180 181 *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR 182 *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI 183 184 pA += 2; 185 pB -= 2; 186 blockCnt--; 187 } 188 } 189 190 /* Prepares data for inverse cfft */ 191 void merge_rfft_f32( 192 const arm_rfft_fast_instance_f32 * S, 193 float32_t * p, 194 float32_t * pOut) 195 { 196 int32_t k; /* Loop Counter */ 197 float32_t twR, twI; /* RFFT Twiddle coefficients */ 198 const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ 199 float32_t *pA = p; /* increasing pointer */ 200 float32_t *pB = p; /* decreasing pointer */ 201 float32_t xAR, xAI, xBR, xBI; /* temporary variables */ 202 float32_t t1a, t1b, r, s, t, u; /* temporary variables */ 203 204 float32x4x2_t tw,xA,xB; 205 float32x4x2_t tmp1, tmp2, res; 206 uint32x4_t vecStridesFwd, vecStridesBkwd; 207 208 vecStridesFwd = vidupq_u32((uint32_t)0, 2); 209 vecStridesBkwd = -vecStridesFwd; 210 211 int blockCnt; 212 213 214 k = (S->Sint).fftLen - 1; 215 216 xAR = pA[0]; 217 xAI = pA[1]; 218 219 pCoeff += 2 ; 220 221 *pOut++ = 0.5f * ( xAR + xAI ); 222 *pOut++ = 0.5f * ( xAR - xAI ); 223 224 pB = p + 2*k ; 225 pA += 2 ; 226 227 blockCnt = k >> 2; 228 while (blockCnt > 0) 229 { 230 /* G is half of the frequency complex spectrum */ 231 //for k = 2:N 232 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); 233 xA = vld2q_f32(pA); 234 pA += 8; 235 236 xB = vld2q_f32(pB); 237 238 xB.val[0] = vldrwq_gather_shifted_offset_f32(pB, vecStridesBkwd); 239 xB.val[1] = vldrwq_gather_shifted_offset_f32(&pB[1], vecStridesBkwd); 240 241 xB.val[1] = vnegq_f32(xB.val[1]); 242 pB -= 8; 243 244 245 tw = vld2q_f32(pCoeff); 246 tw.val[1] = vnegq_f32(tw.val[1]); 247 pCoeff += 8; 248 249 250 tmp1.val[0] = vaddq_f32(xA.val[0],xB.val[0]); 251 tmp1.val[1] = vaddq_f32(xA.val[1],xB.val[1]); 252 253 tmp2.val[0] = vsubq_f32(xB.val[0],xA.val[0]); 254 tmp2.val[1] = vsubq_f32(xB.val[1],xA.val[1]); 255 256 res.val[0] = vmulq(tw.val[0], tmp2.val[0]); 257 res.val[0] = vfmsq(res.val[0],tw.val[1], tmp2.val[1]); 258 259 res.val[1] = vmulq(tw.val[0], tmp2.val[1]); 260 res.val[1] = vfmaq(res.val[1], tw.val[1], tmp2.val[0]); 261 262 res.val[0] = vaddq_f32(res.val[0],tmp1.val[0] ); 263 res.val[1] = vaddq_f32(res.val[1],tmp1.val[1] ); 264 265 res.val[0] = vmulq_n_f32(res.val[0], 0.5f); 266 res.val[1] = vmulq_n_f32(res.val[1], 0.5f); 267 268 269 vst2q_f32(pOut, res); 270 pOut += 8; 271 272 273 blockCnt--; 274 } 275 276 blockCnt = k & 3; 277 while (blockCnt > 0) 278 { 279 /* G is half of the frequency complex spectrum */ 280 //for k = 2:N 281 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); 282 xBI = pB[1] ; 283 xBR = pB[0] ; 284 xAR = pA[0]; 285 xAI = pA[1]; 286 287 twR = *pCoeff++; 288 twI = *pCoeff++; 289 290 t1a = xAR - xBR ; 291 t1b = xAI + xBI ; 292 293 r = twR * t1a; 294 s = twI * t1b; 295 t = twI * t1a; 296 u = twR * t1b; 297 298 // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); 299 // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); 300 *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR 301 *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI 302 303 pA += 2; 304 pB -= 2; 305 blockCnt--; 306 } 307 308 } 309 #else 310 void stage_rfft_f32( 311 const arm_rfft_fast_instance_f32 * S, 312 float32_t * p, 313 float32_t * pOut) 314 { 315 int32_t k; /* Loop Counter */ 316 float32_t twR, twI; /* RFFT Twiddle coefficients */ 317 const float32_t * pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ 318 float32_t *pA = p; /* increasing pointer */ 319 float32_t *pB = p; /* decreasing pointer */ 320 float32_t xAR, xAI, xBR, xBI; /* temporary variables */ 321 float32_t t1a, t1b; /* temporary variables */ 322 float32_t p0, p1, p2, p3; /* temporary variables */ 323 324 325 k = (S->Sint).fftLen - 1; 326 327 /* Pack first and last sample of the frequency domain together */ 328 329 xBR = pB[0]; 330 xBI = pB[1]; 331 xAR = pA[0]; 332 xAI = pA[1]; 333 334 twR = *pCoeff++ ; 335 twI = *pCoeff++ ; 336 337 338 // U1 = XA(1) + XB(1); % It is real 339 t1a = xBR + xAR ; 340 341 // U2 = XB(1) - XA(1); % It is imaginary 342 t1b = xBI + xAI ; 343 344 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); 345 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); 346 *pOut++ = 0.5f * ( t1a + t1b ); 347 *pOut++ = 0.5f * ( t1a - t1b ); 348 349 // XA(1) = 1/2*( U1 - imag(U2) + i*( U1 +imag(U2) )); 350 pB = p + 2*k; 351 pA += 2; 352 353 do 354 { 355 /* 356 function X = my_split_rfft(X, ifftFlag) 357 % X is a series of real numbers 358 L = length(X); 359 XC = X(1:2:end) +i*X(2:2:end); 360 XA = fft(XC); 361 XB = conj(XA([1 end:-1:2])); 362 TW = i*exp(-2*pi*i*[0:L/2-1]/L).'; 363 for l = 2:L/2 364 XA(l) = 1/2 * (XA(l) + XB(l) + TW(l) * (XB(l) - XA(l))); 365 end 366 XA(1) = 1/2* (XA(1) + XB(1) + TW(1) * (XB(1) - XA(1))) + i*( 1/2*( XA(1) + XB(1) + i*( XA(1) - XB(1)))); 367 X = XA; 368 */ 369 370 xBI = pB[1]; 371 xBR = pB[0]; 372 xAR = pA[0]; 373 xAI = pA[1]; 374 375 twR = *pCoeff++; 376 twI = *pCoeff++; 377 378 t1a = xBR - xAR ; 379 t1b = xBI + xAI ; 380 381 // real(tw * (xB - xA)) = twR * (xBR - xAR) - twI * (xBI - xAI); 382 // imag(tw * (xB - xA)) = twI * (xBR - xAR) + twR * (xBI - xAI); 383 p0 = twR * t1a; 384 p1 = twI * t1a; 385 p2 = twR * t1b; 386 p3 = twI * t1b; 387 388 *pOut++ = 0.5f * (xAR + xBR + p0 + p3 ); //xAR 389 *pOut++ = 0.5f * (xAI - xBI + p1 - p2 ); //xAI 390 391 392 pA += 2; 393 pB -= 2; 394 k--; 395 } while (k > 0); 396 } 397 398 /* Prepares data for inverse cfft */ 399 void merge_rfft_f32( 400 const arm_rfft_fast_instance_f32 * S, 401 float32_t * p, 402 float32_t * pOut) 403 { 404 int32_t k; /* Loop Counter */ 405 float32_t twR, twI; /* RFFT Twiddle coefficients */ 406 const float32_t *pCoeff = S->pTwiddleRFFT; /* Points to RFFT Twiddle factors */ 407 float32_t *pA = p; /* increasing pointer */ 408 float32_t *pB = p; /* decreasing pointer */ 409 float32_t xAR, xAI, xBR, xBI; /* temporary variables */ 410 float32_t t1a, t1b, r, s, t, u; /* temporary variables */ 411 412 k = (S->Sint).fftLen - 1; 413 414 xAR = pA[0]; 415 xAI = pA[1]; 416 417 pCoeff += 2 ; 418 419 *pOut++ = 0.5f * ( xAR + xAI ); 420 *pOut++ = 0.5f * ( xAR - xAI ); 421 422 pB = p + 2*k ; 423 pA += 2 ; 424 425 while (k > 0) 426 { 427 /* G is half of the frequency complex spectrum */ 428 //for k = 2:N 429 // Xk(k) = 1/2 * (G(k) + conj(G(N-k+2)) + Tw(k)*( G(k) - conj(G(N-k+2)))); 430 xBI = pB[1] ; 431 xBR = pB[0] ; 432 xAR = pA[0]; 433 xAI = pA[1]; 434 435 twR = *pCoeff++; 436 twI = *pCoeff++; 437 438 t1a = xAR - xBR ; 439 t1b = xAI + xBI ; 440 441 r = twR * t1a; 442 s = twI * t1b; 443 t = twI * t1a; 444 u = twR * t1b; 445 446 // real(tw * (xA - xB)) = twR * (xAR - xBR) - twI * (xAI - xBI); 447 // imag(tw * (xA - xB)) = twI * (xAR - xBR) + twR * (xAI - xBI); 448 *pOut++ = 0.5f * (xAR + xBR - r - s ); //xAR 449 *pOut++ = 0.5f * (xAI - xBI + t - u ); //xAI 450 451 pA += 2; 452 pB -= 2; 453 k--; 454 } 455 456 } 457 458 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 459 460 /** 461 @ingroup groupTransforms 462 */ 463 464 /** 465 @defgroup RealFFT Real FFT Functions 466 467 @par 468 The CMSIS DSP library includes specialized algorithms for computing the 469 FFT of real data sequences. The FFT is defined over complex data but 470 in many applications the input is real. Real FFT algorithms take advantage 471 of the symmetry properties of the FFT and have a speed advantage over complex 472 algorithms of the same length. 473 @par 474 The Fast RFFT algorithm relays on the mixed radix CFFT that save processor usage. 475 @par 476 The real length N forward FFT of a sequence is computed using the steps shown below. 477 @par 478 \image html RFFT.gif "Real Fast Fourier Transform" 479 @par 480 The real sequence is initially treated as if it were complex to perform a CFFT. 481 Later, a processing stage reshapes the data to obtain half of the frequency spectrum 482 in complex format. Except the first complex number that contains the two real numbers 483 X[0] and X[N/2] all the data is complex. In other words, the first complex sample 484 contains two real values packed. 485 @par 486 The input for the inverse RFFT should keep the same format as the output of the 487 forward RFFT. A first processing stage pre-process the data to later perform an 488 inverse CFFT. 489 @par 490 \image html RIFFT.gif "Real Inverse Fast Fourier Transform" 491 @par 492 The algorithms for floating-point, Q15, and Q31 data are slightly different 493 and we describe each algorithm in turn. 494 @par Floating-point 495 The main functions are \ref arm_rfft_fast_f32() and \ref arm_rfft_fast_init_f32(). 496 The older functions \ref arm_rfft_f32() and \ref arm_rfft_init_f32() have been deprecated 497 but are still documented. 498 @par 499 The FFT of a real N-point sequence has even symmetry in the frequency domain. 500 The second half of the data equals the conjugate of the first half flipped in frequency. 501 Looking at the data, we see that we can uniquely represent the FFT using only N/2 complex numbers. 502 These are packed into the output array in alternating real and imaginary components: 503 @par 504 X = { real[0], imag[0], real[1], imag[1], real[2], imag[2] ... 505 real[(N/2)-1], imag[(N/2)-1 } 506 @par 507 It happens that the first complex number (real[0], imag[0]) is actually 508 all real. real[0] represents the DC offset, and imag[0] should be 0. 509 (real[1], imag[1]) is the fundamental frequency, (real[2], imag[2]) is 510 the first harmonic and so on. 511 @par 512 The real FFT functions pack the frequency domain data in this fashion. 513 The forward transform outputs the data in this form and the inverse 514 transform expects input data in this form. The function always performs 515 the needed bitreversal so that the input and output data is always in 516 normal order. The functions support lengths of [32, 64, 128, ..., 4096] 517 samples. 518 @par Q15 and Q31 519 The real algorithms are defined in a similar manner and utilize N/2 complex 520 transforms behind the scenes. 521 @par 522 The complex transforms used internally include scaling to prevent fixed-point 523 overflows. The overall scaling equals 1/(fftLen/2). 524 Due to the use of complex transform internally, the source buffer is 525 modified by the rfft. 526 @par 527 A separate instance structure must be defined for each transform used but 528 twiddle factor and bit reversal tables can be reused. 529 @par 530 There is also an associated initialization function for each data type. 531 The initialization function performs the following operations: 532 - Sets the values of the internal structure fields. 533 - Initializes twiddle factor table and bit reversal table pointers. 534 - Initializes the internal complex FFT data structure. 535 @par 536 Use of the initialization function is optional **except for MVE versions where it is mandatory**. 537 If you don't use the initialization functions, then the structures should be initialized with code 538 similar to the one below: 539 <pre> 540 arm_rfft_instance_q31 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft}; 541 arm_rfft_instance_q15 S = {fftLenReal, fftLenBy2, ifftFlagR, bitReverseFlagR, twidCoefRModifier, pTwiddleAReal, pTwiddleBReal, pCfft}; 542 </pre> 543 where <code>fftLenReal</code> is the length of the real transform; 544 <code>fftLenBy2</code> length of the internal complex transform (fftLenReal/2). 545 <code>ifftFlagR</code> Selects forward (=0) or inverse (=1) transform. 546 <code>bitReverseFlagR</code> Selects bit reversed output (=0) or normal order 547 output (=1). 548 <code>twidCoefRModifier</code> stride modifier for the twiddle factor table. 549 The value is based on the FFT length; 550 <code>pTwiddleAReal</code>points to the A array of twiddle coefficients; 551 <code>pTwiddleBReal</code>points to the B array of twiddle coefficients; 552 <code>pCfft</code> points to the CFFT Instance structure. The CFFT structure 553 must also be initialized. 554 @par 555 Note that with MVE versions you can't initialize instance structures directly and **must 556 use the initialization function**. 557 */ 558 559 /** 560 @addtogroup RealFFT 561 @{ 562 */ 563 564 /** 565 @brief Processing function for the floating-point real FFT. 566 @param[in] S points to an arm_rfft_fast_instance_f32 structure 567 @param[in] p points to input buffer (Source buffer is modified by this function.) 568 @param[in] pOut points to output buffer 569 @param[in] ifftFlag 570 - value = 0: RFFT 571 - value = 1: RIFFT 572 @return none 573 */ 574 575 void arm_rfft_fast_f32( 576 const arm_rfft_fast_instance_f32 * S, 577 float32_t * p, 578 float32_t * pOut, 579 uint8_t ifftFlag) 580 { 581 const arm_cfft_instance_f32 * Sint = &(S->Sint); 582 583 /* Calculation of Real FFT */ 584 if (ifftFlag) 585 { 586 /* Real FFT compression */ 587 merge_rfft_f32(S, p, pOut); 588 /* Complex radix-4 IFFT process */ 589 arm_cfft_f32( Sint, pOut, ifftFlag, 1); 590 } 591 else 592 { 593 /* Calculation of RFFT of input */ 594 arm_cfft_f32( Sint, p, ifftFlag, 1); 595 596 /* Real FFT extraction */ 597 stage_rfft_f32(S, p, pOut); 598 } 599 } 600 601 /** 602 * @} end of RealFFT group 603 */