/ Drivers / CMSIS / DSP / Source / BasicMathFunctions / arm_mult_f32.c
arm_mult_f32.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_mult_f32.c
  4   * Description:  Floating-point vector multiplication
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/basic_math_functions.h"
 30  
 31  /**
 32    @ingroup groupMath
 33   */
 34  
 35  /**
 36    @defgroup BasicMult Vector Multiplication
 37  
 38    Element-by-element multiplication of two vectors.
 39  
 40    <pre>
 41        pDst[n] = pSrcA[n] * pSrcB[n],   0 <= n < blockSize.
 42    </pre>
 43  
 44    There are separate functions for floating-point, Q7, Q15, and Q31 data types.
 45   */
 46  
 47  /**
 48    @addtogroup BasicMult
 49    @{
 50   */
 51  
 52  /**
 53    @brief         Floating-point vector multiplication.
 54    @param[in]     pSrcA      points to the first input vector.
 55    @param[in]     pSrcB      points to the second input vector.
 56    @param[out]    pDst       points to the output vector.
 57    @param[in]     blockSize  number of samples in each vector.
 58    @return        none
 59   */
 60  
 61  #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 62  
 63  #include "arm_helium_utils.h"
 64  
 65  void arm_mult_f32(
 66    const float32_t * pSrcA,
 67    const float32_t * pSrcB,
 68          float32_t * pDst,
 69          uint32_t blockSize)
 70  {
 71      uint32_t blkCnt;                               /* Loop counter */
 72  
 73      f32x4_t vec1;
 74      f32x4_t vec2;
 75      f32x4_t res;
 76  
 77      /* Compute 4 outputs at a time */
 78      blkCnt = blockSize >> 2U;
 79      while (blkCnt > 0U)
 80      {
 81          /* C = A + B */
 82  
 83        /* Add and then store the results in the destination buffer. */
 84          vec1 = vld1q(pSrcA);
 85          vec2 = vld1q(pSrcB);
 86          res = vmulq(vec1, vec2);
 87          vst1q(pDst, res);
 88  
 89          /* Increment pointers */
 90          pSrcA += 4;
 91          pSrcB += 4; 
 92          pDst += 4;
 93          
 94          /* Decrement the loop counter */
 95          blkCnt--;
 96      }
 97  
 98      /* Tail */
 99      blkCnt = blockSize & 0x3;
100      if (blkCnt > 0U)
101      {
102        /* C = A + B */
103        mve_pred16_t p0 = vctp32q(blkCnt);
104        vec1 = vld1q(pSrcA);
105        vec2 = vld1q(pSrcB);
106        vstrwq_p(pDst, vmulq(vec1,vec2), p0);
107      }
108  
109  }
110  
111  #else
112  void arm_mult_f32(
113    const float32_t * pSrcA,
114    const float32_t * pSrcB,
115          float32_t * pDst,
116          uint32_t blockSize)
117  {
118      uint32_t blkCnt;                               /* Loop counter */
119  
120  #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
121      f32x4_t vec1;
122      f32x4_t vec2;
123      f32x4_t res;
124  
125      /* Compute 4 outputs at a time */
126      blkCnt = blockSize >> 2U;
127  
128      while (blkCnt > 0U)
129      {
130          /* C = A * B */
131  
132      	/* Multiply the inputs and then store the results in the destination buffer. */
133          vec1 = vld1q_f32(pSrcA);
134          vec2 = vld1q_f32(pSrcB);
135          res = vmulq_f32(vec1, vec2);
136          vst1q_f32(pDst, res);
137  
138          /* Increment pointers */
139          pSrcA += 4;
140          pSrcB += 4; 
141          pDst += 4;
142          
143          /* Decrement the loop counter */
144          blkCnt--;
145      }
146  
147      /* Tail */
148      blkCnt = blockSize & 0x3;
149  
150  #else
151  #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
152  
153    /* Loop unrolling: Compute 4 outputs at a time */
154    blkCnt = blockSize >> 2U;
155  
156    while (blkCnt > 0U)
157    {
158      /* C = A * B */
159  
160      /* Multiply inputs and store result in destination buffer. */
161      *pDst++ = (*pSrcA++) * (*pSrcB++);
162  
163      *pDst++ = (*pSrcA++) * (*pSrcB++);
164  
165      *pDst++ = (*pSrcA++) * (*pSrcB++);
166  
167      *pDst++ = (*pSrcA++) * (*pSrcB++);
168  
169      /* Decrement loop counter */
170      blkCnt--;
171    }
172  
173    /* Loop unrolling: Compute remaining outputs */
174    blkCnt = blockSize % 0x4U;
175  
176  #else
177  
178    /* Initialize blkCnt with number of samples */
179    blkCnt = blockSize;
180  
181  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
182  #endif /* #if defined(ARM_MATH_NEON) */
183  
184    while (blkCnt > 0U)
185    {
186      /* C = A * B */
187  
188      /* Multiply input and store result in destination buffer. */
189      *pDst++ = (*pSrcA++) * (*pSrcB++);
190  
191      /* Decrement loop counter */
192      blkCnt--;
193    }
194  
195  }
196  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
197  
198  /**
199    @} end of BasicMult group
200   */