/ Drivers / CMSIS / DSP / Source / BasicMathFunctions / arm_dot_prod_q15.c
arm_dot_prod_q15.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_dot_prod_q15.c
  4   * Description:  Q15 dot product
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/basic_math_functions.h"
 30  
 31  /**
 32    @ingroup groupMath
 33   */
 34  
 35  /**
 36    @addtogroup BasicDotProd
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Dot product of Q15 vectors.
 42    @param[in]     pSrcA      points to the first input vector
 43    @param[in]     pSrcB      points to the second input vector
 44    @param[in]     blockSize  number of samples in each vector
 45    @param[out]    result     output result returned here
 46    @return        none
 47  
 48    @par           Scaling and Overflow Behavior
 49                     The intermediate multiplications are in 1.15 x 1.15 = 2.30 format and these
 50                     results are added to a 64-bit accumulator in 34.30 format.
 51                     Nonsaturating additions are used and given that there are 33 guard bits in the accumulator
 52                     there is no risk of overflow.
 53                     The return result is in 34.30 format.
 54   */
 55  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 56  
 57  #include "arm_helium_utils.h"
 58  
 59  void arm_dot_prod_q15(
 60      const q15_t * pSrcA,
 61      const q15_t * pSrcB,
 62      uint32_t blockSize,
 63      q63_t * result)
 64  {
 65      uint32_t  blkCnt;           /* loop counters */
 66      q15x8_t vecA;
 67      q15x8_t vecB;
 68      q63_t     sum = 0LL;
 69  
 70      /* Compute 8 outputs at a time */
 71      blkCnt = blockSize >> 3;
 72      while (blkCnt > 0U)
 73      {
 74          /*
 75           * C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1]
 76           * Calculate dot product and then store the result in a temporary buffer.
 77           */
 78          vecA = vld1q(pSrcA);
 79          vecB = vld1q(pSrcB);
 80          sum = vmlaldavaq(sum, vecA, vecB);
 81          /*
 82           * Decrement the blockSize loop counter
 83           */
 84          blkCnt--;
 85          /*
 86           * advance vector source and destination pointers
 87           */
 88          pSrcA += 8;
 89          pSrcB += 8;
 90      }
 91      /*
 92       * tail
 93       */
 94      blkCnt = blockSize & 7;
 95      if (blkCnt > 0U)
 96      {
 97          mve_pred16_t p0 = vctp16q(blkCnt);
 98          vecA = vld1q(pSrcA);
 99          vecB = vld1q(pSrcB);
100          sum = vmlaldavaq_p(sum, vecA, vecB, p0);
101      }
102  
103      *result = sum;
104  }
105  
106  #else
107  void arm_dot_prod_q15(
108    const q15_t * pSrcA,
109    const q15_t * pSrcB,
110          uint32_t blockSize,
111          q63_t * result)
112  {
113          uint32_t blkCnt;                               /* Loop counter */
114          q63_t sum = 0;                                 /* Temporary return variable */
115  
116  #if defined (ARM_MATH_LOOPUNROLL)
117  
118    /* Loop unrolling: Compute 4 outputs at a time */
119    blkCnt = blockSize >> 2U;
120  
121    while (blkCnt > 0U)
122    {
123      /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
124  
125  #if defined (ARM_MATH_DSP)
126      /* Calculate dot product and store result in a temporary buffer. */
127      sum = __SMLALD(read_q15x2_ia (&pSrcA), read_q15x2_ia (&pSrcB), sum);
128      sum = __SMLALD(read_q15x2_ia (&pSrcA), read_q15x2_ia (&pSrcB), sum);
129  #else
130      sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
131      sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
132      sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
133      sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
134  #endif
135  
136      /* Decrement loop counter */
137      blkCnt--;
138    }
139  
140    /* Loop unrolling: Compute remaining outputs */
141    blkCnt = blockSize % 0x4U;
142  
143  #else
144  
145    /* Initialize blkCnt with number of samples */
146    blkCnt = blockSize;
147  
148  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
149  
150    while (blkCnt > 0U)
151    {
152      /* C = A[0]* B[0] + A[1]* B[1] + A[2]* B[2] + .....+ A[blockSize-1]* B[blockSize-1] */
153  
154      /* Calculate dot product and store result in a temporary buffer. */
155  //#if defined (ARM_MATH_DSP)
156  //    sum  = __SMLALD(*pSrcA++, *pSrcB++, sum);
157  //#else
158      sum += (q63_t)((q31_t) *pSrcA++ * *pSrcB++);
159  //#endif
160  
161      /* Decrement loop counter */
162      blkCnt--;
163    }
164  
165    /* Store result in destination buffer in 34.30 format */
166    *result = sum;
167  }
168  #endif /* defined(ARM_MATH_MVEI) */
169  
170  /**
171    @} end of BasicDotProd group
172   */