/ Drivers / CMSIS / DSP / Source / ComplexMathFunctions / arm_cmplx_dot_prod_q31.c
arm_cmplx_dot_prod_q31.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_cmplx_dot_prod_q31.c
  4   * Description:  Q31 complex dot product
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/complex_math_functions.h"
 30  
 31  /**
 32    @ingroup groupCmplxMath
 33   */
 34  
 35  /**
 36    @addtogroup cmplx_dot_prod
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Q31 complex dot product.
 42    @param[in]     pSrcA       points to the first input vector
 43    @param[in]     pSrcB       points to the second input vector
 44    @param[in]     numSamples  number of samples in each vector
 45    @param[out]    realResult  real part of the result returned here
 46    @param[out]    imagResult  imaginary part of the result returned here
 47    @return        none
 48  
 49    @par           Scaling and Overflow Behavior
 50                     The function is implemented using an internal 64-bit accumulator.
 51                     The intermediate 1.31 by 1.31 multiplications are performed with 64-bit precision and then shifted to 16.48 format.
 52                     The internal real and imaginary accumulators are in 16.48 format and provide 15 guard bits.
 53                     Additions are nonsaturating and no overflow will occur as long as <code>numSamples</code> is less than 32768.
 54                     The return results <code>realResult</code> and <code>imagResult</code> are in 16.48 format.
 55                     Input down scaling is not required.
 56   */
 57  
 58  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 59  
 60  void arm_cmplx_dot_prod_q31(
 61    const q31_t * pSrcA,
 62    const q31_t * pSrcB,
 63          uint32_t numSamples,
 64          q63_t * realResult,
 65          q63_t * imagResult)
 66  {
 67      int32_t         blkCnt;
 68      q63_t           accReal = 0LL;
 69      q63_t           accImag = 0LL;
 70      q31x4_t         vecSrcA, vecSrcB;
 71      q31x4_t         vecSrcC, vecSrcD;
 72  
 73      blkCnt = numSamples >> 2;
 74      blkCnt -= 1;
 75      if (blkCnt > 0) {
 76          /* should give more freedom to generate stall free code */
 77          vecSrcA = vld1q(pSrcA);
 78          vecSrcB = vld1q(pSrcB);
 79          pSrcA += 4;
 80          pSrcB += 4;
 81  
 82          while (blkCnt > 0) {
 83  
 84              accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
 85              vecSrcC = vld1q(pSrcA);
 86              pSrcA += 4;
 87  
 88              accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
 89              vecSrcD = vld1q(pSrcB);
 90              pSrcB += 4;
 91  
 92              accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
 93              vecSrcA = vld1q(pSrcA);
 94              pSrcA += 4;
 95  
 96              accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
 97              vecSrcB = vld1q(pSrcB);
 98              pSrcB += 4;
 99              /*
100               * Decrement the blockSize loop counter
101               */
102              blkCnt--;
103          }
104  
105          /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
106          accReal = vrmlsldavhaq(accReal, vecSrcA, vecSrcB);
107          vecSrcC = vld1q(pSrcA);
108  
109          accImag = vrmlaldavhaxq(accImag, vecSrcA, vecSrcB);
110          vecSrcD = vld1q(pSrcB);
111  
112          accReal = vrmlsldavhaq(accReal, vecSrcC, vecSrcD);
113          vecSrcA = vld1q(pSrcA);
114  
115          accImag = vrmlaldavhaxq(accImag, vecSrcC, vecSrcD);
116          vecSrcB = vld1q(pSrcB);
117  
118          /*
119           * tail
120           */
121          blkCnt = CMPLX_DIM * (numSamples & 3);
122          do {
123              mve_pred16_t    p = vctp32q(blkCnt);
124  
125              pSrcA += 4;
126              pSrcB += 4;
127  
128              vecSrcA = vldrwq_z_s32(pSrcA, p);
129              vecSrcB = vldrwq_z_s32(pSrcB, p);
130  
131              accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
132              accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
133  
134              blkCnt -= 4;
135          }
136          while ((int32_t) blkCnt > 0);
137      } else {
138          blkCnt = numSamples * CMPLX_DIM;
139          while (blkCnt > 0) {
140              mve_pred16_t    p = vctp32q(blkCnt);
141  
142              vecSrcA = vldrwq_z_s32(pSrcA, p);
143              vecSrcB = vldrwq_z_s32(pSrcB, p);
144  
145              accReal = vrmlsldavhaq_p(accReal, vecSrcA, vecSrcB, p);
146              accImag = vrmlaldavhaxq_p(accImag, vecSrcA, vecSrcB, p);
147  
148              /*
149               * Decrement the blkCnt loop counter
150               * Advance vector source and destination pointers
151               */
152              pSrcA += 4;
153              pSrcB += 4;
154              blkCnt -= 4;
155          }
156      }
157      *realResult = asrl(accReal, (14 - 8));
158      *imagResult = asrl(accImag, (14 - 8));
159  
160  }
161  
162  #else
163  void arm_cmplx_dot_prod_q31(
164    const q31_t * pSrcA,
165    const q31_t * pSrcB,
166          uint32_t numSamples,
167          q63_t * realResult,
168          q63_t * imagResult)
169  {
170          uint32_t blkCnt;                               /* Loop counter */
171          q63_t real_sum = 0, imag_sum = 0;              /* Temporary result variables */
172          q31_t a0,b0,c0,d0;
173  
174  #if defined (ARM_MATH_LOOPUNROLL)
175  
176    /* Loop unrolling: Compute 4 outputs at a time */
177    blkCnt = numSamples >> 2U;
178  
179    while (blkCnt > 0U)
180    {
181      a0 = *pSrcA++;
182      b0 = *pSrcA++;
183      c0 = *pSrcB++;
184      d0 = *pSrcB++;
185  
186      real_sum += ((q63_t)a0 * c0) >> 14;
187      imag_sum += ((q63_t)a0 * d0) >> 14;
188      real_sum -= ((q63_t)b0 * d0) >> 14;
189      imag_sum += ((q63_t)b0 * c0) >> 14;
190  
191      a0 = *pSrcA++;
192      b0 = *pSrcA++;
193      c0 = *pSrcB++;
194      d0 = *pSrcB++;
195  
196      real_sum += ((q63_t)a0 * c0) >> 14;
197      imag_sum += ((q63_t)a0 * d0) >> 14;
198      real_sum -= ((q63_t)b0 * d0) >> 14;
199      imag_sum += ((q63_t)b0 * c0) >> 14;
200  
201      a0 = *pSrcA++;
202      b0 = *pSrcA++;
203      c0 = *pSrcB++;
204      d0 = *pSrcB++;
205  
206      real_sum += ((q63_t)a0 * c0) >> 14;
207      imag_sum += ((q63_t)a0 * d0) >> 14;
208      real_sum -= ((q63_t)b0 * d0) >> 14;
209      imag_sum += ((q63_t)b0 * c0) >> 14;
210  
211      a0 = *pSrcA++;
212      b0 = *pSrcA++;
213      c0 = *pSrcB++;
214      d0 = *pSrcB++;
215  
216      real_sum += ((q63_t)a0 * c0) >> 14;
217      imag_sum += ((q63_t)a0 * d0) >> 14;
218      real_sum -= ((q63_t)b0 * d0) >> 14;
219      imag_sum += ((q63_t)b0 * c0) >> 14;
220  
221      /* Decrement loop counter */
222      blkCnt--;
223    }
224  
225    /* Loop unrolling: Compute remaining outputs */
226    blkCnt = numSamples % 0x4U;
227  
228  #else
229  
230    /* Initialize blkCnt with number of samples */
231    blkCnt = numSamples;
232  
233  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
234  
235    while (blkCnt > 0U)
236    {
237      a0 = *pSrcA++;
238      b0 = *pSrcA++;
239      c0 = *pSrcB++;
240      d0 = *pSrcB++;
241  
242      real_sum += ((q63_t)a0 * c0) >> 14;
243      imag_sum += ((q63_t)a0 * d0) >> 14;
244      real_sum -= ((q63_t)b0 * d0) >> 14;
245      imag_sum += ((q63_t)b0 * c0) >> 14;
246  
247      /* Decrement loop counter */
248      blkCnt--;
249    }
250  
251    /* Store real and imaginary result in 16.48 format  */
252    *realResult = real_sum;
253    *imagResult = imag_sum;
254  }
255  #endif /* defined(ARM_MATH_MVEI) */
256  
257  /**
258    @} end of cmplx_dot_prod group
259   */