/ Drivers / CMSIS / DSP / Source / TransformFunctions / arm_cfft_radix8_f16.c
arm_cfft_radix8_f16.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_cfft_radix8_f16.c
  4   * Description:  Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/transform_functions_f16.h"
 30  
 31  #if defined(ARM_FLOAT16_SUPPORTED)
 32  
 33  
 34  /* ----------------------------------------------------------------------
 35   * Internal helper function used by the FFTs
 36   * -------------------------------------------------------------------- */
 37  
 38  /**
 39    brief         Core function for the floating-point CFFT butterfly process.
 40    param[in,out] pSrc             points to the in-place buffer of floating-point data type.
 41    param[in]     fftLen           length of the FFT.
 42    param[in]     pCoef            points to the twiddle coefficient buffer.
 43    param[in]     twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table.
 44    return        none
 45  */
 46  
 47  void arm_radix8_butterfly_f16(
 48    float16_t * pSrc,
 49    uint16_t fftLen,
 50    const float16_t * pCoef,
 51    uint16_t twidCoefModifier)
 52  {
 53     uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7;
 54     uint32_t i1, i2, i3, i4, i5, i6, i7, i8;
 55     uint32_t id;
 56     uint32_t n1, n2, j;
 57  
 58     float16_t r1, r2, r3, r4, r5, r6, r7, r8;
 59     float16_t t1, t2;
 60     float16_t s1, s2, s3, s4, s5, s6, s7, s8;
 61     float16_t p1, p2, p3, p4;
 62     float16_t co2, co3, co4, co5, co6, co7, co8;
 63     float16_t si2, si3, si4, si5, si6, si7, si8;
 64     const float16_t C81 = 0.70710678118f16;
 65  
 66     n2 = fftLen;
 67  
 68     do
 69     {
 70        n1 = n2;
 71        n2 = n2 >> 3;
 72        i1 = 0;
 73  
 74        do
 75        {
 76           i2 = i1 + n2;
 77           i3 = i2 + n2;
 78           i4 = i3 + n2;
 79           i5 = i4 + n2;
 80           i6 = i5 + n2;
 81           i7 = i6 + n2;
 82           i8 = i7 + n2;
 83           r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5];
 84           r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5];
 85           r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6];
 86           r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6];
 87           r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7];
 88           r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7];
 89           r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8];
 90           r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8];
 91           t1 = (_Float16)r1 - (_Float16)r3;
 92           r1 = (_Float16)r1 + (_Float16)r3;
 93           r3 = (_Float16)r2 - (_Float16)r4;
 94           r2 = (_Float16)r2 + (_Float16)r4;
 95           pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2;
 96           pSrc[2 * i5] = (_Float16)r1 - (_Float16)r2;
 97           r1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1];
 98           s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1];
 99           r2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1];
100           s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1];
101           s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1];
102           s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1];
103           r4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1];
104           s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1];
105           t2 = (_Float16)r1 - (_Float16)s3;
106           r1 = (_Float16)r1 + (_Float16)s3;
107           s3 = (_Float16)r2 - (_Float16)r4;
108           r2 = (_Float16)r2 + (_Float16)r4;
109           pSrc[2 * i1 + 1] = (_Float16)r1 + (_Float16)r2;
110           pSrc[2 * i5 + 1] = (_Float16)r1 - (_Float16)r2;
111           pSrc[2 * i3]     = (_Float16)t1 + (_Float16)s3;
112           pSrc[2 * i7]     = (_Float16)t1 - (_Float16)s3;
113           pSrc[2 * i3 + 1] = (_Float16)t2 - (_Float16)r3;
114           pSrc[2 * i7 + 1] = (_Float16)t2 + (_Float16)r3;
115           r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81;
116           r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81;
117           r2 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81;
118           s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81;
119           t1 = (_Float16)r5 - (_Float16)r1;
120           r5 = (_Float16)r5 + (_Float16)r1;
121           r8 = (_Float16)r7 - (_Float16)r6;
122           r7 = (_Float16)r7 + (_Float16)r6;
123           t2 = (_Float16)s5 - (_Float16)r2;
124           s5 = (_Float16)s5 + (_Float16)r2;
125           s8 = (_Float16)s7 - (_Float16)s6;
126           s7 = (_Float16)s7 + (_Float16)s6;
127           pSrc[2 * i2]     = (_Float16)r5 + (_Float16)s7;
128           pSrc[2 * i8]     = (_Float16)r5 - (_Float16)s7;
129           pSrc[2 * i6]     = (_Float16)t1 + (_Float16)s8;
130           pSrc[2 * i4]     = (_Float16)t1 - (_Float16)s8;
131           pSrc[2 * i2 + 1] = (_Float16)s5 - (_Float16)r7;
132           pSrc[2 * i8 + 1] = (_Float16)s5 + (_Float16)r7;
133           pSrc[2 * i6 + 1] = (_Float16)t2 - (_Float16)r8;
134           pSrc[2 * i4 + 1] = (_Float16)t2 + (_Float16)r8;
135  
136           i1 += n1;
137        } while (i1 < fftLen);
138  
139        if (n2 < 8)
140           break;
141  
142        ia1 = 0;
143        j = 1;
144  
145        do
146        {
147           /*  index calculation for the coefficients */
148           id  = ia1 + twidCoefModifier;
149           ia1 = id;
150           ia2 = ia1 + id;
151           ia3 = ia2 + id;
152           ia4 = ia3 + id;
153           ia5 = ia4 + id;
154           ia6 = ia5 + id;
155           ia7 = ia6 + id;
156  
157           co2 = pCoef[2 * ia1];
158           co3 = pCoef[2 * ia2];
159           co4 = pCoef[2 * ia3];
160           co5 = pCoef[2 * ia4];
161           co6 = pCoef[2 * ia5];
162           co7 = pCoef[2 * ia6];
163           co8 = pCoef[2 * ia7];
164           si2 = pCoef[2 * ia1 + 1];
165           si3 = pCoef[2 * ia2 + 1];
166           si4 = pCoef[2 * ia3 + 1];
167           si5 = pCoef[2 * ia4 + 1];
168           si6 = pCoef[2 * ia5 + 1];
169           si7 = pCoef[2 * ia6 + 1];
170           si8 = pCoef[2 * ia7 + 1];
171  
172           i1 = j;
173  
174           do
175           {
176              /*  index calculation for the input */
177              i2 = i1 + n2;
178              i3 = i2 + n2;
179              i4 = i3 + n2;
180              i5 = i4 + n2;
181              i6 = i5 + n2;
182              i7 = i6 + n2;
183              i8 = i7 + n2;
184              r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5];
185              r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5];
186              r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6];
187              r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6];
188              r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7];
189              r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7];
190              r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8];
191              r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8];
192              t1 = (_Float16)r1 - (_Float16)r3;
193              r1 = (_Float16)r1 + (_Float16)r3;
194              r3 = (_Float16)r2 - (_Float16)r4;
195              r2 = (_Float16)r2 + (_Float16)r4;
196              pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2;
197              r2 = (_Float16)r1 - (_Float16)r2;
198              s1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1];
199              s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1];
200              s2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1];
201              s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1];
202              s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1];
203              s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1];
204              s4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1];
205              s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1];
206              t2 = (_Float16)s1 - (_Float16)s3;
207              s1 = (_Float16)s1 + (_Float16)s3;
208              s3 = (_Float16)s2 - (_Float16)s4;
209              s2 = (_Float16)s2 + (_Float16)s4;
210              r1 = (_Float16)t1 + (_Float16)s3;
211              t1 = (_Float16)t1 - (_Float16)s3;
212              pSrc[2 * i1 + 1] = (_Float16)s1 + (_Float16)s2;
213              s2 = (_Float16)s1 - (_Float16)s2;
214              s1 = (_Float16)t2 - (_Float16)r3;
215              t2 = (_Float16)t2 + (_Float16)r3;
216              p1 = (_Float16)co5 * (_Float16)r2;
217              p2 = (_Float16)si5 * (_Float16)s2;
218              p3 = (_Float16)co5 * (_Float16)s2;
219              p4 = (_Float16)si5 * (_Float16)r2;
220              pSrc[2 * i5]     = (_Float16)p1 + (_Float16)p2;
221              pSrc[2 * i5 + 1] = (_Float16)p3 - (_Float16)p4;
222              p1 = (_Float16)co3 * (_Float16)r1;
223              p2 = (_Float16)si3 * (_Float16)s1;
224              p3 = (_Float16)co3 * (_Float16)s1;
225              p4 = (_Float16)si3 * (_Float16)r1;
226              pSrc[2 * i3]     = (_Float16)p1 + (_Float16)p2;
227              pSrc[2 * i3 + 1] = (_Float16)p3 - (_Float16)p4;
228              p1 = (_Float16)co7 * (_Float16)t1;
229              p2 = (_Float16)si7 * (_Float16)t2;
230              p3 = (_Float16)co7 * (_Float16)t2;
231              p4 = (_Float16)si7 * (_Float16)t1;
232              pSrc[2 * i7]     = (_Float16)p1 + (_Float16)p2;
233              pSrc[2 * i7 + 1] = (_Float16)p3 - (_Float16)p4;
234              r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81;
235              r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81;
236              s1 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81;
237              s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81;
238              t1 = (_Float16)r5 - (_Float16)r1;
239              r5 = (_Float16)r5 + (_Float16)r1;
240              r8 = (_Float16)r7 - (_Float16)r6;
241              r7 = (_Float16)r7 + (_Float16)r6;
242              t2 = (_Float16)s5 - (_Float16)s1;
243              s5 = (_Float16)s5 + (_Float16)s1;
244              s8 = (_Float16)s7 - (_Float16)s6;
245              s7 = (_Float16)s7 + (_Float16)s6;
246              r1 = (_Float16)r5 + (_Float16)s7;
247              r5 = (_Float16)r5 - (_Float16)s7;
248              r6 = (_Float16)t1 + (_Float16)s8;
249              t1 = (_Float16)t1 - (_Float16)s8;
250              s1 = (_Float16)s5 - (_Float16)r7;
251              s5 = (_Float16)s5 + (_Float16)r7;
252              s6 = (_Float16)t2 - (_Float16)r8;
253              t2 = (_Float16)t2 + (_Float16)r8;
254              p1 = (_Float16)co2 * (_Float16)r1;
255              p2 = (_Float16)si2 * (_Float16)s1;
256              p3 = (_Float16)co2 * (_Float16)s1;
257              p4 = (_Float16)si2 * (_Float16)r1;
258              pSrc[2 * i2]     = (_Float16)p1 + (_Float16)p2;
259              pSrc[2 * i2 + 1] = (_Float16)p3 - (_Float16)p4;
260              p1 = (_Float16)co8 * (_Float16)r5;
261              p2 = (_Float16)si8 * (_Float16)s5;
262              p3 = (_Float16)co8 * (_Float16)s5;
263              p4 = (_Float16)si8 * (_Float16)r5;
264              pSrc[2 * i8]     = (_Float16)p1 + (_Float16)p2;
265              pSrc[2 * i8 + 1] = (_Float16)p3 - (_Float16)p4;
266              p1 = (_Float16)co6 * (_Float16)r6;
267              p2 = (_Float16)si6 * (_Float16)s6;
268              p3 = (_Float16)co6 * (_Float16)s6;
269              p4 = (_Float16)si6 * (_Float16)r6;
270              pSrc[2 * i6]     = (_Float16)p1 + (_Float16)p2;
271              pSrc[2 * i6 + 1] = (_Float16)p3 - (_Float16)p4;
272              p1 = (_Float16)co4 * (_Float16)t1;
273              p2 = (_Float16)si4 * (_Float16)t2;
274              p3 = (_Float16)co4 * (_Float16)t2;
275              p4 = (_Float16)si4 * (_Float16)t1;
276              pSrc[2 * i4]     = (_Float16)p1 + (_Float16)p2;
277              pSrc[2 * i4 + 1] = (_Float16)p3 - (_Float16)p4;
278  
279              i1 += n1;
280           } while (i1 < fftLen);
281  
282           j++;
283        } while (j < n2);
284  
285        twidCoefModifier <<= 3;
286     } while (n2 > 7);
287  }
288  
289  #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */