/ Drivers / CMSIS / DSP / Source / StatisticsFunctions / arm_power_q31.c
arm_power_q31.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_power_q31.c
  4   * Description:  Sum of the squares of the elements of a Q31 vector
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/statistics_functions.h"
 30  
 31  /**
 32    @ingroup groupStats
 33   */
 34  
 35  /**
 36    @addtogroup power
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Sum of the squares of the elements of a Q31 vector.
 42    @param[in]     pSrc       points to the input vector
 43    @param[in]     blockSize  number of samples in input vector
 44    @param[out]    pResult    sum of the squares value returned here
 45    @return        none
 46  
 47    @par           Scaling and Overflow Behavior
 48                     The function is implemented using a 64-bit internal accumulator.
 49                     The input is represented in 1.31 format.
 50                     Intermediate multiplication yields a 2.62 format, and this
 51                     result is truncated to 2.48 format by discarding the lower 14 bits.
 52                     The 2.48 result is then added without saturation to a 64-bit accumulator in 16.48 format.
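                   With 15 guard bits in the 16.48 accumulator, the scalar implementation cannot overflow
                   while blockSize is less than 32768. Because the lower 14 bits of each product are
                   discarded, the full precision of the intermediate multiplication is not preserved.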
 55                     Finally, the return result is in 16.48 format.
 56   */
 57  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 58  void arm_power_q31(
 59    const q31_t * pSrc,
 60          uint32_t blockSize,
 61          q63_t * pResult)
 62  {
 63      uint32_t     blkCnt;           /* loop counters */
 64      q31x4_t     vecSrc;
 65      q63_t       sum = 0LL;
 66      q31_t       in;
 67  
 68      /* Compute 4 outputs at a time */
 69      blkCnt = blockSize >> 2U;
 70      while (blkCnt > 0U)
 71      {
 72          vecSrc = vldrwq_s32(pSrc);
 73          /*
 74           * sum lanes
 75           */
 76          sum = vrmlaldavhaq(sum, vecSrc, vecSrc);
 77  
 78          blkCnt --;
 79          pSrc += 4;
 80      }
 81  
 82      /*
 83       * tail
 84       */
 85      blkCnt = blockSize & 0x3;
 86      while (blkCnt > 0U)
 87      {
 88         /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
 89  
 90         /* Compute Power and store result in a temporary variable, sum. */
 91         in = *pSrc++;
 92         sum += ((q63_t) in * in) >> 8;
 93  
 94         /* Decrement loop counter */
 95         blkCnt--;
 96      }
 97  
 98      *pResult = asrl(sum, 6);
 99  }
100  #else
101  void arm_power_q31(
102    const q31_t * pSrc,
103          uint32_t blockSize,
104          q63_t * pResult)
105  {
106          uint32_t blkCnt;                               /* Loop counter */
107          q63_t sum = 0;                                 /* Temporary result storage */
108          q31_t in;                                      /* Temporary variable to store input value */
109  
110  #if defined (ARM_MATH_LOOPUNROLL)
111  
112    /* Loop unrolling: Compute 4 outputs at a time */
113    blkCnt = blockSize >> 2U;
114  
115    while (blkCnt > 0U)
116    {
117      /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
118  
119      /* Compute Power then shift intermediate results by 14 bits to maintain 16.48 format and store result in a temporary variable sum, providing 15 guard bits. */
120      in = *pSrc++;
121      sum += ((q63_t) in * in) >> 14U;
122  
123      in = *pSrc++;
124      sum += ((q63_t) in * in) >> 14U;
125  
126      in = *pSrc++;
127      sum += ((q63_t) in * in) >> 14U;
128  
129      in = *pSrc++;
130      sum += ((q63_t) in * in) >> 14U;
131  
132      /* Decrement loop counter */
133      blkCnt--;
134    }
135  
136    /* Loop unrolling: Compute remaining outputs */
137    blkCnt = blockSize % 0x4U;
138  
139  #else
140  
141    /* Initialize blkCnt with number of samples */
142    blkCnt = blockSize;
143  
144  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
145  
146    while (blkCnt > 0U)
147    {
148      /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
149  
150      /* Compute Power and store result in a temporary variable, sum. */
151      in = *pSrc++;
152      sum += ((q63_t) in * in) >> 14U;
153  
154      /* Decrement loop counter */
155      blkCnt--;
156    }
157  
158    /* Store results in 16.48 format */
159    *pResult = sum;
160  }
161  #endif /* defined(ARM_MATH_MVEI) */
162  
163  /**
164    @} end of power group
165   */