/ Drivers / CMSIS / DSP / Source / StatisticsFunctions / arm_power_q7.c
arm_power_q7.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_power_q7.c
  4   * Description:  Sum of the squares of the elements of a Q7 vector
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/statistics_functions.h"
 30  
 31  /**
 32    @ingroup groupStats
 33   */
 34  
 35  /**
 36    @addtogroup power
 37    @{
 38   */
 39  
 40  /**
 41    @brief         Sum of the squares of the elements of a Q7 vector.
 42    @param[in]     pSrc       points to the input vector
 43    @param[in]     blockSize  number of samples in input vector
 44    @param[out]    pResult    sum of the squares value returned here
 45    @return        none
 46  
 47    @par           Scaling and Overflow Behavior
 48                     The function is implemented using a 32-bit internal accumulator.
 49                     The input is represented in 1.7 format.
 50                     Intermediate multiplication yields a 2.14 format, and this
 51                     result is added without saturation to an accumulator in 18.14 format.
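                   With 17 guard bits in the accumulator, there is no risk of overflow as long as
                   blockSize is below 2^17 (131072) samples: each squared input is at most 2^14,
                   so up to 2^17 - 1 of them fit in the signed 32-bit accumulator. The
                   full precision of the intermediate multiplication is preserved.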
 54                     Finally, the return result is in 18.14 format.
 55   */
 56  #if defined(ARM_MATH_MVEI) && !defined(ARM_MATH_AUTOVECTORIZE)
 57  void arm_power_q7(
 58    const q7_t * pSrc,
 59          uint32_t blockSize,
 60          q31_t * pResult)
 61  {
 62      uint32_t  blkCnt;           /* loop counters */
 63      q7x16_t vecSrc;
 64      q31_t   sum = 0LL;
 65      q7_t in;
 66  
 67     /* Compute 16 outputs at a time */
 68      blkCnt = blockSize >> 4U;
 69      while (blkCnt > 0U)
 70      {
 71          vecSrc = vldrbq_s8(pSrc);
 72          /*
 73           * sum lanes
 74           */
 75          sum = vmladavaq(sum, vecSrc, vecSrc);
 76  
 77          blkCnt--;
 78          pSrc += 16;
 79      }
 80  
 81      /*
 82       * tail
 83       */
 84      blkCnt = blockSize & 0xF;
 85      while (blkCnt > 0U)
 86      {
 87         /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
 88  
 89         /* Compute Power and store result in a temporary variable, sum. */
 90         in = *pSrc++;
 91         sum += ((q15_t) in * in);
 92  
 93         /* Decrement loop counter */
 94         blkCnt--;
 95      }
 96  
 97      *pResult = sum;
 98  }
 99  #else
100  void arm_power_q7(
101    const q7_t * pSrc,
102          uint32_t blockSize,
103          q31_t * pResult)
104  {
105          uint32_t blkCnt;                               /* Loop counter */
106          q31_t sum = 0;                                 /* Temporary result storage */
107          q7_t in;                                       /* Temporary variable to store input value */
108  
109  #if defined (ARM_MATH_LOOPUNROLL) && defined (ARM_MATH_DSP)
110          q31_t in32;                                    /* Temporary variable to store packed input value */
111          q31_t in1, in2;                                /* Temporary variables to store input value */
112  #endif
113  
114  #if defined (ARM_MATH_LOOPUNROLL)
115  
116    /* Loop unrolling: Compute 4 outputs at a time */
117    blkCnt = blockSize >> 2U;
118  
119    while (blkCnt > 0U)
120    {
121      /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
122  
123      /* Compute Power and store result in a temporary variable, sum. */
124  #if defined (ARM_MATH_DSP)
125      in32 = read_q7x4_ia (&pSrc);
126  
127      in1 = __SXTB16(__ROR(in32, 8));
128      in2 = __SXTB16(in32);
129  
130      /* calculate power and accumulate to accumulator */
131      sum = __SMLAD(in1, in1, sum);
132      sum = __SMLAD(in2, in2, sum);
133  #else
134      in = *pSrc++;
135      sum += ((q15_t) in * in);
136  
137      in = *pSrc++;
138      sum += ((q15_t) in * in);
139  
140      in = *pSrc++;
141      sum += ((q15_t) in * in);
142  
143      in = *pSrc++;
144      sum += ((q15_t) in * in);
145  #endif /* #if defined (ARM_MATH_DSP) */
146  
147      /* Decrement loop counter */
148      blkCnt--;
149    }
150  
151    /* Loop unrolling: Compute remaining outputs */
152    blkCnt = blockSize % 0x4U;
153  
154  #else
155  
156    /* Initialize blkCnt with number of samples */
157    blkCnt = blockSize;
158  
159  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
160  
161    while (blkCnt > 0U)
162    {
163      /* C = A[0] * A[0] + A[1] * A[1] + ... + A[blockSize-1] * A[blockSize-1] */
164  
165      /* Compute Power and store result in a temporary variable, sum. */
166      in = *pSrc++;
167      sum += ((q15_t) in * in);
168  
169      /* Decrement loop counter */
170      blkCnt--;
171    }
172  
173    /* Store result in 18.14 format */
174    *pResult = sum;
175  }
176  #endif /* defined(ARM_MATH_MVEI) */
177  
178  /**
179    @} end of power group
180   */