/* Drivers/CMSIS/DSP/Source/ComplexMathFunctions/arm_cmplx_mult_cmplx_f32.c */
/* ----------------------------------------------------------------------
 * Project:      CMSIS DSP Library
 * Title:        arm_cmplx_mult_cmplx_f32.c
 * Description:  Floating-point complex-by-complex multiplication
 *
 * $Date:        23 April 2021
 * $Revision:    V1.9.0
 *
 * Target Processor: Cortex-M and Cortex-A cores
 * -------------------------------------------------------------------- */
/*
 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 *
 * SPDX-License-Identifier: Apache-2.0
 *
 * Licensed under the Apache License, Version 2.0 (the License); you may
 * not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include "dsp/complex_math_functions.h"

/**
  @ingroup groupCmplxMath
 */

/**
  @defgroup CmplxByCmplxMult Complex-by-Complex Multiplication

  Multiplies a complex vector by another complex vector and generates a complex result.
  The data in the complex arrays is stored in an interleaved fashion
  (real, imag, real, imag, ...).
  The parameter <code>numSamples</code> represents the number of complex
  samples processed.  The complex arrays have a total of <code>2*numSamples</code>
  real values.

  The underlying algorithm is used:

  <pre>
  for (n = 0; n < numSamples; n++) {
      pDst[(2*n)+0] = pSrcA[(2*n)+0] * pSrcB[(2*n)+0] - pSrcA[(2*n)+1] * pSrcB[(2*n)+1];
      pDst[(2*n)+1] = pSrcA[(2*n)+0] * pSrcB[(2*n)+1] + pSrcA[(2*n)+1] * pSrcB[(2*n)+0];
  }
  </pre>

  There are separate functions for floating-point, Q15, and Q31 data types.
 */

/**
  @addtogroup CmplxByCmplxMult
  @{
 */

/**
  @brief         Floating-point complex-by-complex multiplication.
  @param[in]     pSrcA       points to first input vector
  @param[in]     pSrcB       points to second input vector
  @param[out]    pDst        points to output vector
  @param[in]     numSamples  number of samples in each vector
  @return        none
 */

#if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE)

/* Helium/MVE implementation: vcmulq computes the (re*re - im*im ... partial)
   products on interleaved complex data and vcmlaq_rot90 adds the cross
   terms, so the pair together yields the full complex product:
     re = reA*reB - imA*imB,  im = reA*imB + imA*reB
   Each f32x4_t vector holds 2 interleaved complex samples; the main loop
   consumes 2 vectors (4 complex samples) per iteration. */
void arm_cmplx_mult_cmplx_f32(
  const float32_t * pSrcA,
  const float32_t * pSrcB,
        float32_t * pDst,
        uint32_t numSamples)
{
    int32_t         blkCnt;
    f32x4_t         vecSrcA, vecSrcB;
    f32x4_t         vecSrcC, vecSrcD;
    f32x4_t         vec_acc;

    /* Number of 4-complex-sample groups; one group is peeled off and
       processed in the epilogue after the main loop. */
    blkCnt = numSamples >> 2;
    blkCnt -= 1;
    if (blkCnt > 0) {
        /* should give more freedom to generate stall free code */
        /* Software-pipelined prologue: pre-load the first A/B vectors so
           each loop iteration can overlap loads with arithmetic. */
        vecSrcA = vld1q(pSrcA);
        vecSrcB = vld1q(pSrcB);
        pSrcA += 4;
        pSrcB += 4;

        while (blkCnt > 0) {
            vec_acc = vcmulq(vecSrcA, vecSrcB);
            vecSrcC = vld1q(pSrcA);          /* pre-load next A vector */
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
            vecSrcD = vld1q(pSrcB);          /* pre-load next B vector */
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;

            vec_acc = vcmulq(vecSrcC, vecSrcD);
            vecSrcA = vld1q(pSrcA);          /* pre-load for next iteration */
            pSrcA += 4;

            vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
            vecSrcB = vld1q(pSrcB);
            pSrcB += 4;
            vst1q(pDst, vec_acc);
            pDst += 4;
            /*
             * Decrement the blockSize loop counter
             */
            blkCnt--;
        }

        /* process last elements out of the loop avoid the armclang breaking the SW pipeline */
        /* Epilogue: finish the vectors pre-loaded by the last loop
           iteration. Note the loads below do NOT advance pSrcA/pSrcB;
           the tail loop compensates by incrementing before loading. */
        vec_acc = vcmulq(vecSrcA, vecSrcB);
        vecSrcC = vld1q(pSrcA);

        vec_acc = vcmlaq_rot90(vec_acc, vecSrcA, vecSrcB);
        vecSrcD = vld1q(pSrcB);
        vst1q(pDst, vec_acc);
        pDst += 4;

        vec_acc = vcmulq(vecSrcC, vecSrcD);
        vec_acc = vcmlaq_rot90(vec_acc, vecSrcC, vecSrcD);
        vst1q(pDst, vec_acc);
        pDst += 4;

        /*
         * tail
         */
        /* Remaining 0..3 complex samples = CMPLX_DIM*(numSamples & 3)
           float elements, handled with predicated (masked) vector ops. */
        blkCnt = CMPLX_DIM * (numSamples & 3);
        while (blkCnt > 0) {
            mve_pred16_t    p = vctp32q(blkCnt);   /* mask lanes past the end */
            /* Step over the data already consumed by the epilogue loads
               above (which left the pointers un-advanced). */
            pSrcA += 4;
            pSrcB += 4;

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);
            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);

            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            blkCnt -= 4;
        }
    } else {
        /* small vector */
        /* Fewer than 8 complex samples: process everything with
           tail-predicated vector iterations only. */
        blkCnt = numSamples * CMPLX_DIM;
        vec_acc = vdupq_n_f32(0.0f);

        do {
            mve_pred16_t    p = vctp32q(blkCnt);   /* mask lanes past the end */

            vecSrcA = vldrwq_z_f32(pSrcA, p);
            vecSrcB = vldrwq_z_f32(pSrcB, p);

            vec_acc = vcmulq_m(vuninitializedq_f32(),vecSrcA, vecSrcB, p);
            vec_acc = vcmlaq_rot90_m(vec_acc, vecSrcA, vecSrcB, p);
            vstrwq_p_f32(pDst, vec_acc, p);
            pDst += 4;

            /*
             * Decrement the blkCnt loop counter
             * Advance vector source and destination pointers
             */
            pSrcA += 4;
            pSrcB += 4;
            blkCnt -= 4;
        }
        while (blkCnt > 0);
    }

}

181  #else
182  void arm_cmplx_mult_cmplx_f32(
183    const float32_t * pSrcA,
184    const float32_t * pSrcB,
185          float32_t * pDst,
186          uint32_t numSamples)
187  {
188      uint32_t blkCnt;                               /* Loop counter */
189      float32_t a, b, c, d;  /* Temporary variables to store real and imaginary values */
190  
191  #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE)
192      float32x4x2_t va, vb;
193      float32x4x2_t outCplx;
194  
195      /* Compute 4 outputs at a time */
196      blkCnt = numSamples >> 2U;
197  
198      while (blkCnt > 0U)
199      {
200          va = vld2q_f32(pSrcA);  // load & separate real/imag pSrcA (de-interleave 2)
201          vb = vld2q_f32(pSrcB);  // load & separate real/imag pSrcB
202  
203  	/* Increment pointers */
204          pSrcA += 8;
205          pSrcB += 8;
206  	
207  	/* Re{C} = Re{A}*Re{B} - Im{A}*Im{B} */
208          outCplx.val[0] = vmulq_f32(va.val[0], vb.val[0]);
209          outCplx.val[0] = vmlsq_f32(outCplx.val[0], va.val[1], vb.val[1]);
210  
211  	/* Im{C} = Re{A}*Im{B} + Im{A}*Re{B} */
212          outCplx.val[1] = vmulq_f32(va.val[0], vb.val[1]);
213          outCplx.val[1] = vmlaq_f32(outCplx.val[1], va.val[1], vb.val[0]);
214  
215          vst2q_f32(pDst, outCplx);
216  
217  	/* Increment pointer */
218          pDst += 8;
219  
220  	/* Decrement the loop counter */
221          blkCnt--;
222      }
223  
224      /* Tail */
225      blkCnt = numSamples & 3;
226  
227  #else
228  #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE)
229  
230    /* Loop unrolling: Compute 4 outputs at a time */
231    blkCnt = numSamples >> 2U;
232  
233    while (blkCnt > 0U)
234    {
235      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
236      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
237  
238      a = *pSrcA++;
239      b = *pSrcA++;
240      c = *pSrcB++;
241      d = *pSrcB++;
242      /* store result in destination buffer. */
243      *pDst++ = (a * c) - (b * d);
244      *pDst++ = (a * d) + (b * c);
245  
246      a = *pSrcA++;
247      b = *pSrcA++;
248      c = *pSrcB++;
249      d = *pSrcB++;
250      *pDst++ = (a * c) - (b * d);
251      *pDst++ = (a * d) + (b * c);
252  
253      a = *pSrcA++;
254      b = *pSrcA++;
255      c = *pSrcB++;
256      d = *pSrcB++;
257      *pDst++ = (a * c) - (b * d);
258      *pDst++ = (a * d) + (b * c);
259  
260      a = *pSrcA++;
261      b = *pSrcA++;
262      c = *pSrcB++;
263      d = *pSrcB++;
264      *pDst++ = (a * c) - (b * d);
265      *pDst++ = (a * d) + (b * c);
266  
267      /* Decrement loop counter */
268      blkCnt--;
269    }
270  
271    /* Loop unrolling: Compute remaining outputs */
272    blkCnt = numSamples % 0x4U;
273  
274  #else
275  
276    /* Initialize blkCnt with number of samples */
277    blkCnt = numSamples;
278  
279  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
280  #endif /* #if defined(ARM_MATH_NEON) */
281  
282    while (blkCnt > 0U)
283    {
284      /* C[2 * i    ] = A[2 * i] * B[2 * i    ] - A[2 * i + 1] * B[2 * i + 1]. */
285      /* C[2 * i + 1] = A[2 * i] * B[2 * i + 1] + A[2 * i + 1] * B[2 * i    ]. */
286  
287      a = *pSrcA++;
288      b = *pSrcA++;
289      c = *pSrcB++;
290      d = *pSrcB++;
291  
292      /* store result in destination buffer. */
293      *pDst++ = (a * c) - (b * d);
294      *pDst++ = (a * d) + (b * c);
295  
296      /* Decrement loop counter */
297      blkCnt--;
298    }
299  
300  }
301  #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */

/**
  @} end of CmplxByCmplxMult group
 */