arm_cmplx_mult_cmplx_f32.c
/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_f32.c
 * Description:  Floating-point complex-by-complex multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the "License"); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication

  Multiplies a complex vector by another complex vector and generates a complex result.
  The data in the complex arrays is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The parameter <code>numSamples</code> represents the number of complex
  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
  real values.

  The underlying algorithm is:

  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>

  There are separate functions for floating-point, Q15, and Q31 data types.
 */

/**
  @addtogroup CmplxByCmplxMult
  @{
 */

/**
  @brief         Floating-point complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */

#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

/*
 * Helium (MVE) implementation.
 *
 * One f32x4_t vector holds 2 interleaved complex samples (re, im, re, im).
 * Each vcmulq + vcmlaq_rot90 pair computes the full complex product for
 * those 2 samples: (re*re - im*im, re*im + im*re), i.e. the algorithm
 * documented above.
 *
 * The main path is software-pipelined: the loads feeding the next stage are
 * issued between the two multiply steps of the current stage, and the loop
 * body is unrolled 2x (A/B then C/D vector pairs), so each iteration
 * produces 4 complex outputs.
 */
void arm_cmplx_mult_cmplx_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
          float32_t * pDst,
          uint32_t numSamples)
{
    int32_t blkCnt;
    f32x4_t vecSrcA, vecSrcB;
    f32x4_t vecSrcC, vecSrcD;
    f32x4_t vec_acc;

    /* Number of 4-complex groups, minus one group kept for the epilogue.
       blkCnt > 0 therefore selects the pipelined path only when
       numSamples >= 8; shorter inputs take the predicated branch below. */
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* Prologue: pre-load the first vector pair.
           Should give more freedom to generate stall free code. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            /* Stage 1: real parts of the product for the A/B pair,
               while fetching the next C operand. */
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            /* Complete the A/B complex product and store 2 outputs. */
            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;

            /* Stage 2: same pattern for the C/D pair, refilling A/B. */
            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* Epilogue: process the last pre-loaded elements outside the loop to
           avoid armclang breaking the SW pipeline.  Note these final vld1q
           loads do NOT advance pSrcA/pSrcB. */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 4;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 4;

        /*
         * tail: remaining (numSamples & 3) complex samples, i.e.
         * CMPLX_DIM * (numSamples & 3) float lanes, handled with
         * tail predication.
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            /* vctp32q enables only the remaining lanes. */
            mve_pred16_t p = vctp32q(blkCnt);
            /* Skip the 4 floats consumed by the epilogue's non-advancing
               vld1q loads before reading the tail data. */
            pSrcA += 4;
            pSrcB += 4;

            /* Zeroing loads / predicated store: inactive lanes are never
               read from or written to memory, so vuninitializedq_f32() is
               safe as the inactive-lane source of vcmulq_m. */
            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            blkCnt -= 4;
        }
    } else {
        /* small vector: numSamples < 8; fully predicated loop, 2 complex
           samples per iteration.  For numSamples == 0 the predicate is
           all-false, so the single do-while pass touches no memory. */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f32(), vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

}

#else
/*
 * Generic implementation, with optional NEON or loop-unrolled fast paths.
 */
void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
    uint32_t blkCnt;                               /* Loop counter */
    float32_t a, b, c, d;                          /* Temporary variables to store real and imaginary values */

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    float32x4x2_t va, vb;
    float32x4x2_t outCplx;

    /* Compute 4 complex outputs at a time.  vld2q_f32 de-interleaves 8
       floats into a real plane (val[0]) and an imaginary plane (val[1]). */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
    {
        va = vld2q_f32(pSrcA);    /* load & separate real/imag pSrcA (de-interleave 2) */
        vb = vld2q_f32(pSrcB);    /* load & separate real/imag pSrcB */

        /* Increment pointers: 8 floats = 4 complex samples */
        pSrcA += 8;
        pSrcB += 8;

        /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
        outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
        outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);

        /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
        outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
        outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);

        /* vst2q_f32 re-interleaves the two planes back to (re, im, ...) */
        vst2q_f32(pDst, outCplx);

        /* Increment pointer */
        pDst += 8;

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Tail: remaining 0..3 complex samples handled by the scalar loop */
    blkCnt = numSamples & 3;

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
    {
        /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        /* store result in destination buffer. */
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs */
    blkCnt = numSamples % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */

    /* Scalar loop: full computation, or the tail of a fast path above. */
    while (blkCnt > 0U)
    {
        /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
        /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */

        a = *pSrcA++;
        b = *pSrcA++;
        c = *pSrcB++;
        d = *pSrcB++;

        /* store result in destination buffer. */
        *pDst++ = (a * c) - (b * d);
        *pDst++ = (a * d) + (b * c);

        /* Decrement loop counter */
        blkCnt--;
    }

}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of CmplxByCmplxMult group
 */