arm_cmplx_mag_squared_f32.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_cmplx_mag_squared_f32.c 4 * Description: Floating-point complex magnitude squared 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/complex_math_functions.h" 30 31 /** 32 @ingroup groupCmplxMath 33 */ 34 35 /** 36 @defgroup cmplx_mag_squared Complex Magnitude Squared 37 38 Computes the magnitude squared of the elements of a complex data vector. 39 40 The <code>pSrc</code> points to the source data and 41 <code>pDst</code> points to the where the result should be written. 42 <code>numSamples</code> specifies the number of complex samples 43 in the input array and the data is stored in an interleaved fashion 44 (real, imag, real, imag, ...). 45 The input array has a total of <code>2*numSamples</code> values; 46 the output array has a total of <code>numSamples</code> values. 47 48 The underlying algorithm is used: 49 50 <pre> 51 for (n = 0; n < numSamples; n++) { 52 pDst[n] = pSrc[(2*n)+0]^2 + pSrc[(2*n)+1]^2; 53 } 54 </pre> 55 56 There are separate functions for floating-point, Q15, and Q31 data types. 57 */ 58 59 /** 60 @addtogroup cmplx_mag_squared 61 @{ 62 */ 63 64 /** 65 @brief Floating-point complex magnitude squared. 66 @param[in] pSrc points to input vector 67 @param[out] pDst points to output vector 68 @param[in] numSamples number of samples in each vector 69 @return none 70 */ 71 72 #if defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) 73 74 void arm_cmplx_mag_squared_f32( 75 const float32_t * pSrc, 76 float32_t * pDst, 77 uint32_t numSamples) 78 { 79 int32_t blockSize = numSamples; /* loop counters */ 80 uint32_t blkCnt; /* loop counters */ 81 f32x4x2_t vecSrc; 82 f32x4_t sum; 83 float32_t real, imag; /* Temporary input variables */ 84 85 /* Compute 4 complex samples at a time */ 86 blkCnt = blockSize >> 2; 87 while (blkCnt > 0U) 88 { 89 vecSrc = vld2q(pSrc); 90 sum = vmulq(vecSrc.val[0], vecSrc.val[0]); 91 sum = vfmaq(sum, vecSrc.val[1], vecSrc.val[1]); 92 vst1q(pDst, sum); 93 94 pSrc += 8; 95 pDst += 4; 96 97 /* 98 * Decrement the blockSize loop counter 99 */ 100 blkCnt--; 101 } 102 103 /* Tail */ 104 blkCnt = blockSize & 3; 105 while (blkCnt > 0U) 106 { 107 /* C[0] = (A[0] * A[0] + A[1] * A[1]) */ 108 109 real = *pSrc++; 110 imag = *pSrc++; 111 112 /* store result in destination buffer. */ 113 *pDst++ = (real * real) + (imag * imag); 114 115 /* Decrement loop counter */ 116 blkCnt--; 117 } 118 119 } 120 121 #else 122 void arm_cmplx_mag_squared_f32( 123 const float32_t * pSrc, 124 float32_t * pDst, 125 uint32_t numSamples) 126 { 127 uint32_t blkCnt; /* Loop counter */ 128 float32_t real, imag; /* Temporary input variables */ 129 130 #if defined(ARM_MATH_NEON) && !defined(ARM_MATH_AUTOVECTORIZE) 131 float32x4x2_t vecA; 132 float32x4_t vRealA; 133 float32x4_t vImagA; 134 float32x4_t vMagSqA; 135 136 float32x4x2_t vecB; 137 float32x4_t vRealB; 138 float32x4_t vImagB; 139 float32x4_t vMagSqB; 140 141 /* Loop unrolling: Compute 8 outputs at a time */ 142 blkCnt = numSamples >> 3; 143 144 while (blkCnt > 0U) 145 { 146 /* out = sqrt((real * real) + (imag * imag)) */ 147 148 vecA = vld2q_f32(pSrc); 149 pSrc += 8; 150 151 vRealA = vmulq_f32(vecA.val[0], vecA.val[0]); 152 vImagA = vmulq_f32(vecA.val[1], vecA.val[1]); 153 vMagSqA = vaddq_f32(vRealA, vImagA); 154 155 vecB = vld2q_f32(pSrc); 156 pSrc += 8; 157 158 vRealB = vmulq_f32(vecB.val[0], vecB.val[0]); 159 vImagB = vmulq_f32(vecB.val[1], vecB.val[1]); 160 vMagSqB = vaddq_f32(vRealB, vImagB); 161 162 /* Store the result in the destination buffer. */ 163 vst1q_f32(pDst, vMagSqA); 164 pDst += 4; 165 166 vst1q_f32(pDst, vMagSqB); 167 pDst += 4; 168 169 /* Decrement the loop counter */ 170 blkCnt--; 171 } 172 173 blkCnt = numSamples & 7; 174 175 #else 176 #if defined (ARM_MATH_LOOPUNROLL) && !defined(ARM_MATH_AUTOVECTORIZE) 177 178 /* Loop unrolling: Compute 4 outputs at a time */ 179 blkCnt = numSamples >> 2U; 180 181 while (blkCnt > 0U) 182 { 183 /* C[0] = (A[0] * A[0] + A[1] * A[1]) */ 184 185 real = *pSrc++; 186 imag = *pSrc++; 187 *pDst++ = (real * real) + (imag * imag); 188 189 real = *pSrc++; 190 imag = *pSrc++; 191 *pDst++ = (real * real) + (imag * imag); 192 193 real = *pSrc++; 194 imag = *pSrc++; 195 *pDst++ = (real * real) + (imag * imag); 196 197 real = *pSrc++; 198 imag = *pSrc++; 199 *pDst++ = (real * real) + (imag * imag); 200 201 /* Decrement loop counter */ 202 blkCnt--; 203 } 204 205 /* Loop unrolling: Compute remaining outputs */ 206 blkCnt = numSamples % 0x4U; 207 208 #else 209 210 /* Initialize blkCnt with number of samples */ 211 blkCnt = numSamples; 212 213 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 214 #endif /* #if defined(ARM_MATH_NEON) */ 215 216 while (blkCnt > 0U) 217 { 218 /* C[0] = (A[0] * A[0] + A[1] * A[1]) */ 219 220 real = *pSrc++; 221 imag = *pSrc++; 222 223 /* store result in destination buffer. */ 224 *pDst++ = (real * real) + (imag * imag); 225 226 /* Decrement loop counter */ 227 blkCnt--; 228 } 229 230 } 231 #endif /* defined(ARM_MATH_MVEF) && !defined(ARM_MATH_AUTOVECTORIZE) */ 232 233 /** 234 @} end of cmplx_mag_squared group 235 */