arm_cmplx_dot_prod_q31.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_cmplx_dot_prod_q31.c 4 * Description: Q31 complex dot product 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/complex_math_functions.h" 30 31 /** 32 @ingroup groupCmplxMath 33 */ 34 35 /** 36 @addtogroup cmplx_dot_prod 37 @{ 38 */ 39 40 /** 41 @brief Q31 complex dot product. 42 @param[in] pSrcA points to the first input vector 43 @param[in] pSrcB points to the second input vector 44 @param[in] numSamples number of samples in each vector 45 @param[out] realResult real part of the result returned here 46 @param[out] imagResult imaginary part of the result returned here 47 @return none 48 49 @par Scaling and Overflow Behavior 50 The function is implemented using an internal 64-bit accumulator. 51 The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format. 52 The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits. 53 Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768. 54 The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format. 55 Input down scaling is not required. 56 */ 57 58 #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE) 59 60 void arm_cmplx_dot_prod_q31( 61 const q31_t * pSrcA, 62 const q31_t * pSrcB, 63 uint32_t numSamples, 64 q63_t * realResult, 65 q63_t * imagResult) 66 { 67 int32_t blkCnt; 68 q63_t accReal = 0LL; 69 q63_t accImag = 0LL; 70 q31x4_t vecSrcA, vecSrcB; 71 q31x4_t vecSrcC, vecSrcD; 72 73 blkCnt = numSamples >> 2; 74 blkCnt -= 1; 75 if (blkCnt > 0) { 76 /* should give more freedom to generate stall free code */ 77 vecSrcA = vld1q(pSrcA); 78 vecSrcB = vld1q(pSrcB); 79 pSrcA += 4; 80 pSrcB += 4; 81 82 while (blkCnt > 0) { 83 84 accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB); 85 vecSrcC = vld1q(pSrcA); 86 pSrcA += 4; 87 88 accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB); 89 vecSrcD = vld1q(pSrcB); 90 pSrcB += 4; 91 92 accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD); 93 vecSrcA = vld1q(pSrcA); 94 pSrcA += 4; 95 96 accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD); 97 vecSrcB = vld1q(pSrcB); 98 pSrcB += 4; 99 /* 100 * Decrement the blockSize loop counter 101 */ 102 blkCnt--; 103 } 104 105 /* process last elements out of the loop avoid the armclang breaking the SW pipeline */ 106 accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB); 107 vecSrcC = vld1q(pSrcA); 108 109 accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB); 110 vecSrcD = vld1q(pSrcB); 111 112 accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD); 113 vecSrcA = vld1q(pSrcA); 114 115 accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD); 116 vecSrcB = vld1q(pSrcB); 117 118 /* 119 * tail 120 */ 121 blkCnt = CMPLX_DIM * (numSamples & 3); 122 do { 123 mve_pred16_t p = vctp32q(blkCnt); 124 125 pSrcA += 4; 126 pSrcB += 4; 127 128 vecSrcA = vldrwq_z_s32(pSrcA, p); 129 vecSrcB = vldrwq_z_s32(pSrcB, p); 130 131 accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p); 132 accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p); 133 134 blkCnt -= 4; 135 } 136 while ((int32_t) blkCnt > 0); 137 } else { 138 blkCnt = numSamples * CMPLX_DIM; 139 while (blkCnt > 0) { 140 mve_pred16_t p = vctp32q(blkCnt); 141 142 vecSrcA = vldrwq_z_s32(pSrcA, p); 143 vecSrcB = vldrwq_z_s32(pSrcB, p); 144 145 accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p); 146 accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p); 147 148 /* 149 * Decrement the blkCnt loop counter 150 * Advance vector source and destination pointers 151 */ 152 pSrcA += 4; 153 pSrcB += 4; 154 blkCnt -= 4; 155 } 156 } 157 *realResult = asrl(accReal, (14 - 8)); 158 *imagResult = asrl(accImag, (14 - 8)); 159 160 } 161 162 #else 163 void arm_cmplx_dot_prod_q31( 164 const q31_t * pSrcA, 165 const q31_t * pSrcB, 166 uint32_t numSamples, 167 q63_t * realResult, 168 q63_t * imagResult) 169 { 170 uint32_t blkCnt; /* Loop counter */ 171 q63_t real_sum = 0, imag_sum = 0; /* Temporary result variables */ 172 q31_t a0,b0,c0,d0; 173 174 #if defined (ARM_MATH_LOOPUNROLL) 175 176 /* Loop unrolling: Compute 4 outputs at a time */ 177 blkCnt = numSamples >> 2U; 178 179 while (blkCnt > 0U) 180 { 181 a0 = *pSrcA++; 182 b0 = *pSrcA++; 183 c0 = *pSrcB++; 184 d0 = *pSrcB++; 185 186 real_sum += ((q63_t)a0 * c0) >> 14; 187 imag_sum += ((q63_t)a0 * d0) >> 14; 188 real_sum -= ((q63_t)b0 * d0) >> 14; 189 imag_sum += ((q63_t)b0 * c0) >> 14; 190 191 a0 = *pSrcA++; 192 b0 = *pSrcA++; 193 c0 = *pSrcB++; 194 d0 = *pSrcB++; 195 196 real_sum += ((q63_t)a0 * c0) >> 14; 197 imag_sum += ((q63_t)a0 * d0) >> 14; 198 real_sum -= ((q63_t)b0 * d0) >> 14; 199 imag_sum += ((q63_t)b0 * c0) >> 14; 200 201 a0 = *pSrcA++; 202 b0 = *pSrcA++; 203 c0 = *pSrcB++; 204 d0 = *pSrcB++; 205 206 real_sum += ((q63_t)a0 * c0) >> 14; 207 imag_sum += ((q63_t)a0 * d0) >> 14; 208 real_sum -= ((q63_t)b0 * d0) >> 14; 209 imag_sum += ((q63_t)b0 * c0) >> 14; 210 211 a0 = *pSrcA++; 212 b0 = *pSrcA++; 213 c0 = *pSrcB++; 214 d0 = *pSrcB++; 215 216 real_sum += ((q63_t)a0 * c0) >> 14; 217 imag_sum += ((q63_t)a0 * d0) >> 14; 218 real_sum -= ((q63_t)b0 * d0) >> 14; 219 imag_sum += ((q63_t)b0 * c0) >> 14; 220 221 /* Decrement loop counter */ 222 blkCnt--; 223 } 224 225 /* Loop unrolling: Compute remaining outputs */ 226 blkCnt = numSamples % 0x4U; 227 228 #else 229 230 /* Initialize blkCnt with number of samples */ 231 blkCnt = numSamples; 232 233 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 234 235 while (blkCnt > 0U) 236 { 237 a0 = *pSrcA++; 238 b0 = *pSrcA++; 239 c0 = *pSrcB++; 240 d0 = *pSrcB++; 241 242 real_sum += ((q63_t)a0 * c0) >> 14; 243 imag_sum += ((q63_t)a0 * d0) >> 14; 244 real_sum -= ((q63_t)b0 * d0) >> 14; 245 imag_sum += ((q63_t)b0 * c0) >> 14; 246 247 /* Decrement loop counter */ 248 blkCnt--; 249 } 250 251 /* Store real and imaginary result in 16.48 format */ 252 *realResult = real_sum; 253 *imagResult = imag_sum; 254 } 255 #endif /* defined(ARM_MATH_MVEI) */ 256 257 /** 258 @} end of cmplx_dot_prod group 259 */