/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_dot_prod_f32.c
 * Description:  Floating-point complex dot product
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @defgroup cmplx_dot_prod Complex Dot Product

  Computes the dot product of two complex vectors.
  The vectors are multiplied element-by-element and then summed.

  The <code>pSrcA</code> points to the first complex input vector and
  <code>pSrcB</code> points to the second complex input vector.
  <code>numSamples</code> specifies the number of complex samples
  and the data in each array is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  Each array has a total of <code>2*numSamples</code> values.

  The underlying algorithm is used:

  <pre>
  realResult = 0;
  imagResult = 0;
  for (n = 0; n < numSamples; n++) {
      realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>

  There are separate functions for floating-point, Q15, and Q31 data types.
 */

/**
  @addtogroup cmplx_dot_prod
  @{
 */

/**
  @brief         Floating-point complex dot product.
  @param[in]     pSrcA       points to the first input vector
  @param[in]     pSrcB       points to the second input vector
  @param[in]     numSamples  number of samples in each vector
  @param[out]    realResult  real part of the result returned here
  @param[out]    imagResult  imaginary part of the result returned here
  @return        none
 */

#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

void arm_cmplx_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t numSamples,
    float32_t * realResult,
    float32_t * imagResult)
{
    int32_t blkCnt;
    float32_t real_sum, imag_sum;
    f32x4_t vecSrcA, vecSrcB;
    /* Interleaved complex accumulator: even lanes hold real partial sums,
       odd lanes hold imaginary partial sums (vcmlaq/vcmlaq_rot90 layout). */
    f32x4_t vec_acc = vdupq_n_f32(0.0f);
    f32x4_t vecSrcC, vecSrcD;

    /* Each f32x4_t vector holds 2 complex samples; the pipelined loop body
       consumes 2 vectors (4 complex samples) per iteration.  One iteration
       is peeled off into the epilogue below, hence the "-= 1". */
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        /* Software-pipelined main loop: each multiply-accumulate is
           interleaved with the load feeding the NEXT stage so loads and
           MACs can overlap without stalls. */
        while (blkCnt > 0) {
            vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);
            pSrcB += 4;

            vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the
           armclang breaking the SW pipeline */
        vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);

        vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);

        /*
         * tail
         */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            mve_pred16_t p = vctp32q(blkCnt);
            /* NOTE: the epilogue above consumed vecSrcC/vecSrcD WITHOUT
               advancing the pointers, so they still point at already-used
               data; advance BEFORE the predicated loads to reach the
               remaining elements. */
            pSrcA += 4;
            pSrcB += 4;
            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            blkCnt -= 4;
        }
    } else {
        /* small vector: fewer than 8 complex samples, process entirely
           with tail-predicated iterations. */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t p = vctp32q(blkCnt);

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);

            vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

    /* Horizontal reduction: real parts live in lanes 0 and 2,
       imaginary parts in lanes 1 and 3. */
    real_sum = vgetq_lane(vec_acc, 0) + vgetq_lane(vec_acc, 2);
    imag_sum = vgetq_lane(vec_acc, 1) + vgetq_lane(vec_acc, 3);

    /*
     * Store the real and imaginary results in the destination buffers
     */
    *realResult = real_sum;
    *imagResult = imag_sum;
}

#else
void arm_cmplx_dot_prod_f32(
    const float32_t * pSrcA,
    const float32_t * pSrcB,
    uint32_t numSamples,
    float32_t * realResult,
    float32_t * imagResult)
{
    uint32_t blkCnt;                               /* Loop counter */
    float32_t real_sum = 0.0f, imag_sum = 0.0f;    /* Temporary result variables */
    float32_t a0, b0, c0, d0;                      /* a0+jb0 = A sample, c0+jd0 = B sample */

#if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
    /* vld2q_f32 de-interleaves: val[0] = 4 real parts, val[1] = 4 imag parts. */
    float32x4x2_t vec1, vec2, vec3, vec4;
    float32x4_t accR, accI;
    float32x2_t accum = vdup_n_f32(0);

    accR = vdupq_n_f32(0.0f);
    accI = vdupq_n_f32(0.0f);

    /* Loop unrolling: Compute 8 outputs at a time */
    blkCnt = numSamples >> 3U;

    while (blkCnt > 0U)
    {
        /* C = (A[0]+jA[1])*(B[0]+jB[1]) + ... */
        /* Calculate dot product and then store the result in a temporary buffer. */

        vec1 = vld2q_f32(pSrcA);
        vec2 = vld2q_f32(pSrcB);

        /* Increment pointers */
        pSrcA += 8;
        pSrcB += 8;

        /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
        accR = vmlaq_f32(accR, vec1.val[0], vec2.val[0]);
        accR = vmlsq_f32(accR, vec1.val[1], vec2.val[1]);

        /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
        accI = vmlaq_f32(accI, vec1.val[1], vec2.val[0]);
        accI = vmlaq_f32(accI, vec1.val[0], vec2.val[1]);

        vec3 = vld2q_f32(pSrcA);
        vec4 = vld2q_f32(pSrcB);

        /* Increment pointers */
        pSrcA += 8;
        pSrcB += 8;

        /* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
        accR = vmlaq_f32(accR, vec3.val[0], vec4.val[0]);
        accR = vmlsq_f32(accR, vec3.val[1], vec4.val[1]);

        /* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
        accI = vmlaq_f32(accI, vec3.val[1], vec4.val[0]);
        accI = vmlaq_f32(accI, vec3.val[0], vec4.val[1]);

        /* Decrement the loop counter */
        blkCnt--;
    }

    /* Horizontal reduction of the 4-lane real/imag accumulators. */
    accum = vpadd_f32(vget_low_f32(accR), vget_high_f32(accR));
    real_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

    accum = vpadd_f32(vget_low_f32(accI), vget_high_f32(accI));
    imag_sum += vget_lane_f32(accum, 0) + vget_lane_f32(accum, 1);

    /* Tail */
    blkCnt = numSamples & 0x7;

#else
#if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)

    /* Loop unrolling: Compute 4 outputs at a time */
    blkCnt = numSamples >> 2U;

    while (blkCnt > 0U)
    {
        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += a0 * c0;
        imag_sum += a0 * d0;
        real_sum -= b0 * d0;
        imag_sum += b0 * c0;

        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += a0 * c0;
        imag_sum += a0 * d0;
        real_sum -= b0 * d0;
        imag_sum += b0 * c0;

        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += a0 * c0;
        imag_sum += a0 * d0;
        real_sum -= b0 * d0;
        imag_sum += b0 * c0;

        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += a0 * c0;
        imag_sum += a0 * d0;
        real_sum -= b0 * d0;
        imag_sum += b0 * c0;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Loop unrolling: Compute remaining outputs */
    blkCnt = numSamples % 0x4U;

#else

    /* Initialize blkCnt with number of samples */
    blkCnt = numSamples;

#endif /* #if defined (ARM_MATH_LOOPUNROLL) */
#endif /* #if defined(ARM_MATH_NEON) */

    /* Scalar residual loop: one complex sample (a0+jb0)*(c0+jd0) per pass. */
    while (blkCnt > 0U)
    {
        a0 = *pSrcA++;
        b0 = *pSrcA++;
        c0 = *pSrcB++;
        d0 = *pSrcB++;

        real_sum += a0 * c0;
        imag_sum += a0 * d0;
        real_sum -= b0 * d0;
        imag_sum += b0 * c0;

        /* Decrement loop counter */
        blkCnt--;
    }

    /* Store real and imaginary result in destination buffer. */
    *realResult = real_sum;
    *imagResult = imag_sum;
}
#endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of cmplx_dot_prod group
 */