arm_cmplx_dot_prod_f16.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_cmplx_dot_prod_f16.c 4 * Description: Floating-point complex dot product 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/complex_math_functions_f16.h" 30 31 #if defined(ARM_FLOAT16_SUPPORTED) 32 33 34 /** 35 @ingroup groupCmplxMath 36 */ 37 38 /** 39 @defgroup cmplx_dot_prod Complex Dot Product 40 41 Computes the dot product of two complex vectors. 42 The vectors are multiplied element-by-element and then summed. 43 44 The <code>pSrcA</code> points to the first complex input vector and 45 <code>pSrcB</code> points to the second complex input vector. 46 <code>numSamples</code> specifies the number of complex samples 47 and the data in each array is stored in an interleaved fashion 48 (real, imag, real, imag, ...). 49 Each array has a total of <code>2*numSamples</code> values. 50 51 The underlying algorithm is used: 52 53 <pre> 54 realResult = 0; 55 imagResult = 0; 56 for (n = 0; n < numSamples; n++) { 57 realResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1]; 58 imagResult += pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0]; 59 } 60 </pre> 61 62 There are separate functions for floating-point, Q15, and Q31 data types. 63 */ 64 65 /** 66 @addtogroup cmplx_dot_prod 67 @{ 68 */ 69 70 /** 71 @brief Floating-point complex dot product. 72 @param[in] pSrcA points to the first input vector 73 @param[in] pSrcB points to the second input vector 74 @param[in] numSamples number of samples in each vector 75 @param[out] realResult real part of the result returned here 76 @param[out] imagResult imaginary part of the result returned here 77 @return none 78 */ 79 80 #if defined(ARM_MATH_MVE_FLOAT16) && !defined(ARM_MATH_AUTOVECTORIZE) 81 82 #include "arm_helium_utils.h" 83 84 void arm_cmplx_dot_prod_f16( 85 const float16_t * pSrcA, 86 const float16_t * pSrcB, 87 uint32_t numSamples, 88 float16_t * realResult, 89 float16_t * imagResult) 90 { 91 int32_t blkCnt; 92 float16_t real_sum, imag_sum; 93 f16x8_t vecSrcA, vecSrcB; 94 f16x8_t vec_acc = vdupq_n_f16(0.0f16); 95 f16x8_t vecSrcC, vecSrcD; 96 97 blkCnt = (numSamples >> 3); 98 blkCnt -= 1; 99 if (blkCnt > 0) { 100 /* should give more freedom to generate stall free code */ 101 vecSrcA = vld1q( pSrcA); 102 vecSrcB = vld1q( pSrcB); 103 pSrcA += 8; 104 pSrcB += 8; 105 106 while (blkCnt > 0) { 107 vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB); 108 vecSrcC = vld1q(pSrcA); 109 pSrcA += 8; 110 111 vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); 112 vecSrcD = vld1q(pSrcB); 113 pSrcB += 8; 114 115 vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD); 116 vecSrcA = vld1q(pSrcA); 117 pSrcA += 8; 118 119 vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); 120 vecSrcB = vld1q(pSrcB); 121 pSrcB += 8; 122 /* 123 * Decrement the blockSize loop counter 124 */ 125 blkCnt--; 126 } 127 128 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ 129 vec_acc = vcmlaq(vec_acc, vecSrcA, vecSrcB); 130 vecSrcC = vld1q(pSrcA); 131 132 vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB); 133 vecSrcD = vld1q(pSrcB); 134 135 vec_acc = vcmlaq(vec_acc, vecSrcC, vecSrcD); 136 vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD); 137 138 /* 139 * tail 140 */ 141 blkCnt = CMPLX_DIM * (numSamples & 7); 142 while (blkCnt > 0) { 143 mve_pred16_t p = vctp16q(blkCnt); 144 pSrcA += 8; 145 pSrcB += 8; 146 147 vecSrcA = vldrhq_z_f16(pSrcA, p); 148 vecSrcB = vldrhq_z_f16(pSrcB, p); 149 vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p); 150 vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); 151 152 blkCnt -= 8; 153 } 154 } else { 155 /* small vector */ 156 blkCnt = numSamples * CMPLX_DIM; 157 vec_acc = vdupq_n_f16(0.0f16); 158 159 do { 160 mve_pred16_t p = vctp16q(blkCnt); 161 162 vecSrcA = vldrhq_z_f16(pSrcA, p); 163 vecSrcB = vldrhq_z_f16(pSrcB, p); 164 165 vec_acc = vcmlaq_m(vec_acc, vecSrcA, vecSrcB, p); 166 vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p); 167 168 /* 169 * Decrement the blkCnt loop counter 170 * Advance vector source and destination pointers 171 */ 172 pSrcA += 8; 173 pSrcB += 8; 174 blkCnt -= 8; 175 } 176 while (blkCnt > 0); 177 } 178 179 /* Sum the partial parts */ 180 mve_cmplx_sum_intra_r_i_f16(vec_acc, real_sum, imag_sum); 181 182 /* 183 * Store the real and imaginary results in the destination buffers 184 */ 185 *realResult = real_sum; 186 *imagResult = imag_sum; 187 } 188 189 #else 190 void arm_cmplx_dot_prod_f16( 191 const float16_t * pSrcA, 192 const float16_t * pSrcB, 193 uint32_t numSamples, 194 float16_t * realResult, 195 float16_t * imagResult) 196 { 197 uint32_t blkCnt; /* Loop counter */ 198 _Float16 real_sum = 0.0f, imag_sum = 0.0f; /* Temporary result variables */ 199 _Float16 a0,b0,c0,d0; 200 201 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) 202 203 /* Loop unrolling: Compute 4 outputs at a time */ 204 blkCnt = numSamples >> 2U; 205 206 while (blkCnt > 0U) 207 { 208 a0 = *pSrcA++; 209 b0 = *pSrcA++; 210 c0 = *pSrcB++; 211 d0 = *pSrcB++; 212 213 real_sum += a0 * c0; 214 imag_sum += a0 * d0; 215 real_sum -= b0 * d0; 216 imag_sum += b0 * c0; 217 218 a0 = *pSrcA++; 219 b0 = *pSrcA++; 220 c0 = *pSrcB++; 221 d0 = *pSrcB++; 222 223 real_sum += a0 * c0; 224 imag_sum += a0 * d0; 225 real_sum -= b0 * d0; 226 imag_sum += b0 * c0; 227 228 a0 = *pSrcA++; 229 b0 = *pSrcA++; 230 c0 = *pSrcB++; 231 d0 = *pSrcB++; 232 233 real_sum += a0 * c0; 234 imag_sum += a0 * d0; 235 real_sum -= b0 * d0; 236 imag_sum += b0 * c0; 237 238 a0 = *pSrcA++; 239 b0 = *pSrcA++; 240 c0 = *pSrcB++; 241 d0 = *pSrcB++; 242 243 real_sum += a0 * c0; 244 imag_sum += a0 * d0; 245 real_sum -= b0 * d0; 246 imag_sum += b0 * c0; 247 248 /* Decrement loop counter */ 249 blkCnt--; 250 } 251 252 /* Loop unrolling: Compute remaining outputs */ 253 blkCnt = numSamples % 0x4U; 254 255 #else 256 257 /* Initialize blkCnt with number of samples */ 258 blkCnt = numSamples; 259 260 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 261 262 while (blkCnt > 0U) 263 { 264 a0 = *pSrcA++; 265 b0 = *pSrcA++; 266 c0 = *pSrcB++; 267 d0 = *pSrcB++; 268 269 real_sum += a0 * c0; 270 imag_sum += a0 * d0; 271 real_sum -= b0 * d0; 272 imag_sum += b0 * c0; 273 274 /* Decrement loop counter */ 275 blkCnt--; 276 } 277 278 /* Store real and imaginary result in destination buffer. */ 279 *realResult = real_sum; 280 *imagResult = imag_sum; 281 } 282 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 283 284 /** 285 @} end of cmplx_dot_prod group 286 */ 287 288 #endif /* #if defined(ARM_FLOAT16_SUPPORTED) */