/* arm_cfft_radix8_f16.c */
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_cfft_radix8_f16.c 4 * Description: Radix-8 Decimation in Frequency CFFT & CIFFT Floating point processing function 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/transform_functions_f16.h" 30 31 #if defined(ARM_FLOAT16_SUPPORTED) 32 33 34 /* ---------------------------------------------------------------------- 35 * Internal helper function used by the FFTs 36 * -------------------------------------------------------------------- */ 37 38 /** 39 brief Core function for the floating-point CFFT butterfly process. 40 param[in,out] pSrc points to the in-place buffer of floating-point data type. 41 param[in] fftLen length of the FFT. 42 param[in] pCoef points to the twiddle coefficient buffer. 43 param[in] twidCoefModifier twiddle coefficient modifier that supports different size FFTs with the same twiddle factor table. 
44 return none 45 */ 46 47 void arm_radix8_butterfly_f16( 48 float16_t * pSrc, 49 uint16_t fftLen, 50 const float16_t * pCoef, 51 uint16_t twidCoefModifier) 52 { 53 uint32_t ia1, ia2, ia3, ia4, ia5, ia6, ia7; 54 uint32_t i1, i2, i3, i4, i5, i6, i7, i8; 55 uint32_t id; 56 uint32_t n1, n2, j; 57 58 float16_t r1, r2, r3, r4, r5, r6, r7, r8; 59 float16_t t1, t2; 60 float16_t s1, s2, s3, s4, s5, s6, s7, s8; 61 float16_t p1, p2, p3, p4; 62 float16_t co2, co3, co4, co5, co6, co7, co8; 63 float16_t si2, si3, si4, si5, si6, si7, si8; 64 const float16_t C81 = 0.70710678118f16; 65 66 n2 = fftLen; 67 68 do 69 { 70 n1 = n2; 71 n2 = n2 >> 3; 72 i1 = 0; 73 74 do 75 { 76 i2 = i1 + n2; 77 i3 = i2 + n2; 78 i4 = i3 + n2; 79 i5 = i4 + n2; 80 i6 = i5 + n2; 81 i7 = i6 + n2; 82 i8 = i7 + n2; 83 r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5]; 84 r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5]; 85 r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6]; 86 r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6]; 87 r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7]; 88 r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7]; 89 r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8]; 90 r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8]; 91 t1 = (_Float16)r1 - (_Float16)r3; 92 r1 = (_Float16)r1 + (_Float16)r3; 93 r3 = (_Float16)r2 - (_Float16)r4; 94 r2 = (_Float16)r2 + (_Float16)r4; 95 pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2; 96 pSrc[2 * i5] = (_Float16)r1 - (_Float16)r2; 97 r1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1]; 98 s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1]; 99 r2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1]; 100 s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1]; 101 s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1]; 102 s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1]; 103 r4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1]; 104 s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 
1]; 105 t2 = (_Float16)r1 - (_Float16)s3; 106 r1 = (_Float16)r1 + (_Float16)s3; 107 s3 = (_Float16)r2 - (_Float16)r4; 108 r2 = (_Float16)r2 + (_Float16)r4; 109 pSrc[2 * i1 + 1] = (_Float16)r1 + (_Float16)r2; 110 pSrc[2 * i5 + 1] = (_Float16)r1 - (_Float16)r2; 111 pSrc[2 * i3] = (_Float16)t1 + (_Float16)s3; 112 pSrc[2 * i7] = (_Float16)t1 - (_Float16)s3; 113 pSrc[2 * i3 + 1] = (_Float16)t2 - (_Float16)r3; 114 pSrc[2 * i7 + 1] = (_Float16)t2 + (_Float16)r3; 115 r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81; 116 r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81; 117 r2 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81; 118 s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81; 119 t1 = (_Float16)r5 - (_Float16)r1; 120 r5 = (_Float16)r5 + (_Float16)r1; 121 r8 = (_Float16)r7 - (_Float16)r6; 122 r7 = (_Float16)r7 + (_Float16)r6; 123 t2 = (_Float16)s5 - (_Float16)r2; 124 s5 = (_Float16)s5 + (_Float16)r2; 125 s8 = (_Float16)s7 - (_Float16)s6; 126 s7 = (_Float16)s7 + (_Float16)s6; 127 pSrc[2 * i2] = (_Float16)r5 + (_Float16)s7; 128 pSrc[2 * i8] = (_Float16)r5 - (_Float16)s7; 129 pSrc[2 * i6] = (_Float16)t1 + (_Float16)s8; 130 pSrc[2 * i4] = (_Float16)t1 - (_Float16)s8; 131 pSrc[2 * i2 + 1] = (_Float16)s5 - (_Float16)r7; 132 pSrc[2 * i8 + 1] = (_Float16)s5 + (_Float16)r7; 133 pSrc[2 * i6 + 1] = (_Float16)t2 - (_Float16)r8; 134 pSrc[2 * i4 + 1] = (_Float16)t2 + (_Float16)r8; 135 136 i1 += n1; 137 } while (i1 < fftLen); 138 139 if (n2 < 8) 140 break; 141 142 ia1 = 0; 143 j = 1; 144 145 do 146 { 147 /* index calculation for the coefficients */ 148 id = ia1 + twidCoefModifier; 149 ia1 = id; 150 ia2 = ia1 + id; 151 ia3 = ia2 + id; 152 ia4 = ia3 + id; 153 ia5 = ia4 + id; 154 ia6 = ia5 + id; 155 ia7 = ia6 + id; 156 157 co2 = pCoef[2 * ia1]; 158 co3 = pCoef[2 * ia2]; 159 co4 = pCoef[2 * ia3]; 160 co5 = pCoef[2 * ia4]; 161 co6 = pCoef[2 * ia5]; 162 co7 = pCoef[2 * ia6]; 163 co8 = pCoef[2 * ia7]; 164 si2 = pCoef[2 * ia1 + 1]; 165 si3 = pCoef[2 * ia2 + 1]; 166 si4 = pCoef[2 * ia3 + 
1]; 167 si5 = pCoef[2 * ia4 + 1]; 168 si6 = pCoef[2 * ia5 + 1]; 169 si7 = pCoef[2 * ia6 + 1]; 170 si8 = pCoef[2 * ia7 + 1]; 171 172 i1 = j; 173 174 do 175 { 176 /* index calculation for the input */ 177 i2 = i1 + n2; 178 i3 = i2 + n2; 179 i4 = i3 + n2; 180 i5 = i4 + n2; 181 i6 = i5 + n2; 182 i7 = i6 + n2; 183 i8 = i7 + n2; 184 r1 = (_Float16)pSrc[2 * i1] + (_Float16)pSrc[2 * i5]; 185 r5 = (_Float16)pSrc[2 * i1] - (_Float16)pSrc[2 * i5]; 186 r2 = (_Float16)pSrc[2 * i2] + (_Float16)pSrc[2 * i6]; 187 r6 = (_Float16)pSrc[2 * i2] - (_Float16)pSrc[2 * i6]; 188 r3 = (_Float16)pSrc[2 * i3] + (_Float16)pSrc[2 * i7]; 189 r7 = (_Float16)pSrc[2 * i3] - (_Float16)pSrc[2 * i7]; 190 r4 = (_Float16)pSrc[2 * i4] + (_Float16)pSrc[2 * i8]; 191 r8 = (_Float16)pSrc[2 * i4] - (_Float16)pSrc[2 * i8]; 192 t1 = (_Float16)r1 - (_Float16)r3; 193 r1 = (_Float16)r1 + (_Float16)r3; 194 r3 = (_Float16)r2 - (_Float16)r4; 195 r2 = (_Float16)r2 + (_Float16)r4; 196 pSrc[2 * i1] = (_Float16)r1 + (_Float16)r2; 197 r2 = (_Float16)r1 - (_Float16)r2; 198 s1 = (_Float16)pSrc[2 * i1 + 1] + (_Float16)pSrc[2 * i5 + 1]; 199 s5 = (_Float16)pSrc[2 * i1 + 1] - (_Float16)pSrc[2 * i5 + 1]; 200 s2 = (_Float16)pSrc[2 * i2 + 1] + (_Float16)pSrc[2 * i6 + 1]; 201 s6 = (_Float16)pSrc[2 * i2 + 1] - (_Float16)pSrc[2 * i6 + 1]; 202 s3 = (_Float16)pSrc[2 * i3 + 1] + (_Float16)pSrc[2 * i7 + 1]; 203 s7 = (_Float16)pSrc[2 * i3 + 1] - (_Float16)pSrc[2 * i7 + 1]; 204 s4 = (_Float16)pSrc[2 * i4 + 1] + (_Float16)pSrc[2 * i8 + 1]; 205 s8 = (_Float16)pSrc[2 * i4 + 1] - (_Float16)pSrc[2 * i8 + 1]; 206 t2 = (_Float16)s1 - (_Float16)s3; 207 s1 = (_Float16)s1 + (_Float16)s3; 208 s3 = (_Float16)s2 - (_Float16)s4; 209 s2 = (_Float16)s2 + (_Float16)s4; 210 r1 = (_Float16)t1 + (_Float16)s3; 211 t1 = (_Float16)t1 - (_Float16)s3; 212 pSrc[2 * i1 + 1] = (_Float16)s1 + (_Float16)s2; 213 s2 = (_Float16)s1 - (_Float16)s2; 214 s1 = (_Float16)t2 - (_Float16)r3; 215 t2 = (_Float16)t2 + (_Float16)r3; 216 p1 = (_Float16)co5 * (_Float16)r2; 217 p2 = 
(_Float16)si5 * (_Float16)s2; 218 p3 = (_Float16)co5 * (_Float16)s2; 219 p4 = (_Float16)si5 * (_Float16)r2; 220 pSrc[2 * i5] = (_Float16)p1 + (_Float16)p2; 221 pSrc[2 * i5 + 1] = (_Float16)p3 - (_Float16)p4; 222 p1 = (_Float16)co3 * (_Float16)r1; 223 p2 = (_Float16)si3 * (_Float16)s1; 224 p3 = (_Float16)co3 * (_Float16)s1; 225 p4 = (_Float16)si3 * (_Float16)r1; 226 pSrc[2 * i3] = (_Float16)p1 + (_Float16)p2; 227 pSrc[2 * i3 + 1] = (_Float16)p3 - (_Float16)p4; 228 p1 = (_Float16)co7 * (_Float16)t1; 229 p2 = (_Float16)si7 * (_Float16)t2; 230 p3 = (_Float16)co7 * (_Float16)t2; 231 p4 = (_Float16)si7 * (_Float16)t1; 232 pSrc[2 * i7] = (_Float16)p1 + (_Float16)p2; 233 pSrc[2 * i7 + 1] = (_Float16)p3 - (_Float16)p4; 234 r1 = ((_Float16)r6 - (_Float16)r8) * (_Float16)C81; 235 r6 = ((_Float16)r6 + (_Float16)r8) * (_Float16)C81; 236 s1 = ((_Float16)s6 - (_Float16)s8) * (_Float16)C81; 237 s6 = ((_Float16)s6 + (_Float16)s8) * (_Float16)C81; 238 t1 = (_Float16)r5 - (_Float16)r1; 239 r5 = (_Float16)r5 + (_Float16)r1; 240 r8 = (_Float16)r7 - (_Float16)r6; 241 r7 = (_Float16)r7 + (_Float16)r6; 242 t2 = (_Float16)s5 - (_Float16)s1; 243 s5 = (_Float16)s5 + (_Float16)s1; 244 s8 = (_Float16)s7 - (_Float16)s6; 245 s7 = (_Float16)s7 + (_Float16)s6; 246 r1 = (_Float16)r5 + (_Float16)s7; 247 r5 = (_Float16)r5 - (_Float16)s7; 248 r6 = (_Float16)t1 + (_Float16)s8; 249 t1 = (_Float16)t1 - (_Float16)s8; 250 s1 = (_Float16)s5 - (_Float16)r7; 251 s5 = (_Float16)s5 + (_Float16)r7; 252 s6 = (_Float16)t2 - (_Float16)r8; 253 t2 = (_Float16)t2 + (_Float16)r8; 254 p1 = (_Float16)co2 * (_Float16)r1; 255 p2 = (_Float16)si2 * (_Float16)s1; 256 p3 = (_Float16)co2 * (_Float16)s1; 257 p4 = (_Float16)si2 * (_Float16)r1; 258 pSrc[2 * i2] = (_Float16)p1 + (_Float16)p2; 259 pSrc[2 * i2 + 1] = (_Float16)p3 - (_Float16)p4; 260 p1 = (_Float16)co8 * (_Float16)r5; 261 p2 = (_Float16)si8 * (_Float16)s5; 262 p3 = (_Float16)co8 * (_Float16)s5; 263 p4 = (_Float16)si8 * (_Float16)r5; 264 pSrc[2 * i8] = (_Float16)p1 + 
(_Float16)p2; 265 pSrc[2 * i8 + 1] = (_Float16)p3 - (_Float16)p4; 266 p1 = (_Float16)co6 * (_Float16)r6; 267 p2 = (_Float16)si6 * (_Float16)s6; 268 p3 = (_Float16)co6 * (_Float16)s6; 269 p4 = (_Float16)si6 * (_Float16)r6; 270 pSrc[2 * i6] = (_Float16)p1 + (_Float16)p2; 271 pSrc[2 * i6 + 1] = (_Float16)p3 - (_Float16)p4; 272 p1 = (_Float16)co4 * (_Float16)t1; 273 p2 = (_Float16)si4 * (_Float16)t2; 274 p3 = (_Float16)co4 * (_Float16)t2; 275 p4 = (_Float16)si4 * (_Float16)t1; 276 pSrc[2 * i4] = (_Float16)p1 + (_Float16)p2; 277 pSrc[2 * i4 + 1] = (_Float16)p3 - (_Float16)p4; 278 279 i1 += n1; 280 } while (i1 < fftLen); 281 282 j++; 283 } while (j < n2); 284 285 twidCoefModifier <<= 3; 286 } while (n2 > 7); 287 } 288 289 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */