/ Drivers / CMSIS / DSP / Source / MatrixFunctions / arm_mat_add_f32.c
arm_mat_add_f32.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_mat_add_f32.c
  4   * Description:  Floating-point matrix addition
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/matrix_functions.h"
 30  
 31  /**
 32    @ingroup groupMatrix
 33   */
 34  
 35  /**
 36    @defgroup MatrixAdd Matrix Addition
 37  
 38    Adds two matrices.
 39    \image html MatrixAddition.gif "Addition of two 3 x 3 matrices"
 40  
 41    When <code>ARM_MATH_MATRIX_CHECK</code> is defined, the functions check that
 42    <code>pSrcA</code>, <code>pSrcB</code>, and <code>pDst</code> have the same
 43    number of rows and columns; without that macro no size check is performed.
 44   */
 45  
 46  /**
 47    @addtogroup MatrixAdd
 48    @{
 49   */
 50  
 51  
 52  /**
 53    @brief         Floating-point matrix addition.
 54    @param[in]     pSrcA      points to first input matrix structure
 55    @param[in]     pSrcB      points to second input matrix structure
 56    @param[out]    pDst       points to output matrix structure
 57    @return        execution status
 58                     - \ref ARM_MATH_SUCCESS       : Operation successful
 59                     - \ref ARM_MATH_SIZE_MISMATCH : Matrix size check failed
 60   */
 61  
 62  #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)
 63  arm_status arm_mat_add_f32(
 64    const arm_matrix_instance_f32 * pSrcA,
 65    const arm_matrix_instance_f32 * pSrcB,
 66    arm_matrix_instance_f32 * pDst)
 67  {
 68      arm_status status;  
 69      uint32_t  numSamples;       /* total number of elements in the matrix  */
 70      float32_t *pDataA, *pDataB, *pDataDst;
 71      f32x4_t vecA, vecB, vecDst;
 72      float32_t const *pSrcAVec;
 73      float32_t const *pSrcBVec;
 74      uint32_t  blkCnt;           /* loop counters */
 75  
 76      pDataA = pSrcA->pData;
 77      pDataB = pSrcB->pData;
 78      pDataDst = pDst->pData;
 79      pSrcAVec = (float32_t const *) pDataA;
 80      pSrcBVec = (float32_t const *) pDataB;
 81  
 82  #ifdef ARM_MATH_MATRIX_CHECK
 83    /* Check for matrix mismatch condition */
 84    if ((pSrcA->numRows != pSrcB->numRows) ||
 85       (pSrcA->numCols != pSrcB->numCols) ||
 86       (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
 87    {
 88      /* Set status as ARM_MATH_SIZE_MISMATCH */
 89      status = ARM_MATH_SIZE_MISMATCH;
 90    }
 91    else
 92  #endif
 93   {
 94      /*
 95       * Total number of samples in the input matrix
 96       */
 97      numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
 98      blkCnt = numSamples >> 2;
 99      while (blkCnt > 0U)
100      {
101          /* C(m,n) = A(m,n) + B(m,n) */
102          /* Add and then store the results in the destination buffer. */
103          vecA = vld1q(pSrcAVec); 
104          pSrcAVec += 4;
105          vecB = vld1q(pSrcBVec); 
106          pSrcBVec += 4;
107          vecDst = vaddq(vecA, vecB);
108          vst1q(pDataDst, vecDst);  
109          pDataDst += 4;
110          /*
111           * Decrement the blockSize loop counter
112           */
113          blkCnt--;
114      }
115      /*
116       * tail
117       */
118      blkCnt = numSamples & 3;
119      if (blkCnt > 0U)
120      {
121          mve_pred16_t p0 = vctp32q(blkCnt);
122          vecA = vld1q(pSrcAVec); 
123          vecB = vld1q(pSrcBVec); 
124          vecDst = vaddq_m(vecDst, vecA, vecB, p0);
125          vstrwq_p(pDataDst, vecDst, p0);
126      }
127      /* set status as ARM_MATH_SUCCESS */
128      status = ARM_MATH_SUCCESS;
129    }
130    return (status);
131  }
132  #else
133  #if defined(ARM_MATH_NEON)
134  /*
135  
136  Neon version is assuming the matrix is small enough.
137  So no blocking is used for taking into account cache effects.
138  For big matrix, there exist better libraries for Neon.
139  
140  */
141  arm_status arm_mat_add_f32(
142    const arm_matrix_instance_f32 * pSrcA,
143    const arm_matrix_instance_f32 * pSrcB,
144    arm_matrix_instance_f32 * pDst)
145  {
146    float32_t *pIn1 = pSrcA->pData;                /* input data matrix pointer A  */
147    float32_t *pIn2 = pSrcB->pData;                /* input data matrix pointer B  */
148    float32_t *pOut = pDst->pData;                 /* output data matrix pointer   */
149  
150  
151    uint32_t numSamples;                           /* total number of elements in the matrix  */
152    uint32_t blkCnt;                               /* loop counters */
153    arm_status status;                             /* status of matrix addition */
154  
155  #ifdef ARM_MATH_MATRIX_CHECK
156    /* Check for matrix mismatch condition */
157    if ((pSrcA->numRows != pSrcB->numRows) ||
158       (pSrcA->numCols != pSrcB->numCols) ||
159       (pSrcA->numRows != pDst->numRows) || (pSrcA->numCols != pDst->numCols))
160    {
161      /* Set status as ARM_MATH_SIZE_MISMATCH */
162      status = ARM_MATH_SIZE_MISMATCH;
163    }
164    else
165  #endif
166    {
167      float32x4_t vec1;
168      float32x4_t vec2;
169      float32x4_t res;
170  
171      /* Total number of samples in the input matrix */
172      numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
173  
174      blkCnt = numSamples >> 2U;
175  
176      /* Compute 4 outputs at a time.
177       ** a second loop below computes the remaining 1 to 3 samples. */
178      while (blkCnt > 0U)
179      {
180        /* C(m,n) = A(m,n) + B(m,n) */
181        /* Add and then store the results in the destination buffer. */
182        vec1 = vld1q_f32(pIn1);
183        vec2 = vld1q_f32(pIn2);
184        res = vaddq_f32(vec1, vec2);
185        vst1q_f32(pOut, res);
186  
187        /* update pointers to process next samples */
188        pIn1 += 4U;
189        pIn2 += 4U;
190        pOut += 4U;
191        /* Decrement the loop counter */
192        blkCnt--;
193      }
194  
195      /* If the numSamples is not a multiple of 4, compute any remaining output samples here.
196       ** No loop unrolling is used. */
197      blkCnt = numSamples % 0x4U;
198  
199      while (blkCnt > 0U)
200      {
201        /* C(m,n) = A(m,n) + B(m,n) */
202        /* Add and then store the results in the destination buffer. */
203        *pOut++ = (*pIn1++) + (*pIn2++);
204  
205        /* Decrement the loop counter */
206        blkCnt--;
207      }
208  
209      /* set status as ARM_MATH_SUCCESS */
210      status = ARM_MATH_SUCCESS;
211    }
212  
213    /* Return to application */
214    return (status);
215  }
216  #else
217  arm_status arm_mat_add_f32(
218    const arm_matrix_instance_f32 * pSrcA,
219    const arm_matrix_instance_f32 * pSrcB,
220          arm_matrix_instance_f32 * pDst)
221  {
222    float32_t *pInA = pSrcA->pData;                /* input data matrix pointer A */
223    float32_t *pInB = pSrcB->pData;                /* input data matrix pointer B */
224    float32_t *pOut = pDst->pData;                 /* output data matrix pointer */
225  
226    uint32_t numSamples;                           /* total number of elements in the matrix */
227    uint32_t blkCnt;                               /* loop counters */
228    arm_status status;                             /* status of matrix addition */
229  
230  #ifdef ARM_MATH_MATRIX_CHECK
231  
232    /* Check for matrix mismatch condition */
233    if ((pSrcA->numRows != pSrcB->numRows) ||
234        (pSrcA->numCols != pSrcB->numCols) ||
235        (pSrcA->numRows != pDst->numRows)  ||
236        (pSrcA->numCols != pDst->numCols)    )
237    {
238      /* Set status as ARM_MATH_SIZE_MISMATCH */
239      status = ARM_MATH_SIZE_MISMATCH;
240    }
241    else
242  
243  #endif /* #ifdef ARM_MATH_MATRIX_CHECK */
244  
245    {
246      /* Total number of samples in input matrix */
247      numSamples = (uint32_t) pSrcA->numRows * pSrcA->numCols;
248  
249  #if defined (ARM_MATH_LOOPUNROLL)
250  
251      /* Loop unrolling: Compute 4 outputs at a time */
252      blkCnt = numSamples >> 2U;
253  
254      while (blkCnt > 0U)
255      {
256        /* C(m,n) = A(m,n) + B(m,n) */
257  
258        /* Add and store result in destination buffer. */
259        *pOut++ = *pInA++ + *pInB++;
260  
261        *pOut++ = *pInA++ + *pInB++;
262  
263        *pOut++ = *pInA++ + *pInB++;
264  
265        *pOut++ = *pInA++ + *pInB++;
266  
267        /* Decrement loop counter */
268        blkCnt--;
269      }
270  
271      /* Loop unrolling: Compute remaining outputs */
272      blkCnt = numSamples % 0x4U;
273  
274  #else
275  
276      /* Initialize blkCnt with number of samples */
277      blkCnt = numSamples;
278  
279  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
280  
281      while (blkCnt > 0U)
282      {
283        /* C(m,n) = A(m,n) + B(m,n) */
284  
285        /* Add and store result in destination buffer. */
286        *pOut++ = *pInA++ + *pInB++;
287  
288        /* Decrement loop counter */
289        blkCnt--;
290      }
291  
292      /* Set status as ARM_MATH_SUCCESS */
293      status = ARM_MATH_SUCCESS;
294    }
295  
296    /* Return to application */
297    return (status);
298  }
299  #endif /* #if defined(ARM_MATH_NEON) */
300  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */
301  
302  /**
303    @} end of MatrixAdd group
304   */