arm_dct4_q31.c
1 /* ---------------------------------------------------------------------- 2 * Project: CMSIS DSP Library 3 * Title: arm_dct4_q31.c 4 * Description: Processing function of DCT4 & IDCT4 Q31 5 * 6 * $Date: 23 April 2021 7 * $Revision: V1.9.0 8 * 9 * Target Processor: Cortex-M and Cortex-A cores 10 * -------------------------------------------------------------------- */ 11 /* 12 * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved. 13 * 14 * SPDX-License-Identifier: Apache-2.0 15 * 16 * Licensed under the Apache License, Version 2.0 (the License); you may 17 * not use this file except in compliance with the License. 18 * You may obtain a copy of the License at 19 * 20 * www.apache.org/licenses/LICENSE-2.0 21 * 22 * Unless required by applicable law or agreed to in writing, software 23 * distributed under the License is distributed on an AS IS BASIS, WITHOUT 24 * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 25 * See the License for the specific language governing permissions and 26 * limitations under the License. 27 */ 28 29 #include "dsp/transform_functions.h" 30 31 /** 32 @addtogroup DCT4_IDCT4 33 @{ 34 */ 35 36 /** 37 @brief Processing function for the Q31 DCT4/IDCT4. 38 @param[in] S points to an instance of the Q31 DCT4 structure. 39 @param[in] pState points to state buffer. 40 @param[in,out] pInlineBuffer points to the in-place input and output buffer. 41 @return none 42 43 @par Input an output formats 44 Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process, 45 as the conversion from DCT2 to DCT4 involves one subtraction. 46 Internally inputs are downscaled in the RFFT process function to avoid overflows. 47 Number of bits downscaled, depends on the size of the transform. 48 The input and output formats for different DCT sizes and number of bits to upscale are 49 mentioned in the table below: 50 51 \image html dct4FormatsQ31Table.gif 52 */ 53 54 void arm_dct4_q31( 55 const arm_dct4_instance_q31 * S, 56 q31_t * pState, 57 q31_t * pInlineBuffer) 58 { 59 const q31_t *weights = S->pTwiddle; /* Pointer to the Weights table */ 60 const q31_t *cosFact = S->pCosFactor; /* Pointer to the cos factors table */ 61 q31_t *pS1, *pS2, *pbuff; /* Temporary pointers for input buffer and pState buffer */ 62 q31_t in; /* Temporary variable */ 63 uint32_t i; /* Loop counter */ 64 65 66 /* DCT4 computation involves DCT2 (which is calculated using RFFT) 67 * along with some pre-processing and post-processing. 68 * Computational procedure is explained as follows: 69 * (a) Pre-processing involves multiplying input with cos factor, 70 * r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n)) 71 * where, 72 * r(n) -- output of preprocessing 73 * u(n) -- input to preprocessing(actual Source buffer) 74 * (b) Calculation of DCT2 using FFT is divided into three steps: 75 * Step1: Re-ordering of even and odd elements of input. 76 * Step2: Calculating FFT of the re-ordered input. 77 * Step3: Taking the real part of the product of FFT output and weights. 78 * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation: 79 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 80 * where, 81 * Y4 -- DCT4 output, Y2 -- DCT2 output 82 * (d) Multiplying the output with the normalizing factor sqrt(2/N). 83 */ 84 85 /*-------- Pre-processing ------------*/ 86 /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */ 87 arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N); 88 arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N); 89 90 /* ---------------------------------------------------------------- 91 * Step1: Re-ordering of even and odd elements as 92 * pState[i] = pInlineBuffer[2*i] and 93 * pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2 94 ---------------------------------------------------------------------*/ 95 96 /* pS1 initialized to pState */ 97 pS1 = pState; 98 99 /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */ 100 pS2 = pState + (S->N - 1U); 101 102 /* pbuff initialized to input buffer */ 103 pbuff = pInlineBuffer; 104 105 106 #if defined (ARM_MATH_LOOPUNROLL) 107 108 /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */ 109 i = S->Nby2 >> 2U; 110 111 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 112 ** a second loop below computes the remaining 1 to 3 samples. */ 113 do 114 { 115 /* Re-ordering of even and odd elements */ 116 /* pState[i] = pInlineBuffer[2*i] */ 117 *pS1++ = *pbuff++; 118 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 119 *pS2-- = *pbuff++; 120 121 *pS1++ = *pbuff++; 122 *pS2-- = *pbuff++; 123 124 *pS1++ = *pbuff++; 125 *pS2-- = *pbuff++; 126 127 *pS1++ = *pbuff++; 128 *pS2-- = *pbuff++; 129 130 /* Decrement loop counter */ 131 i--; 132 } while (i > 0U); 133 134 /* pbuff initialized to input buffer */ 135 pbuff = pInlineBuffer; 136 137 /* pS1 initialized to pState */ 138 pS1 = pState; 139 140 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 141 i = S->N >> 2U; 142 143 /* Processing with loop unrolling 4 times as N is always multiple of 4. 144 * Compute 4 outputs at a time */ 145 do 146 { 147 /* Writing the re-ordered output back to inplace input buffer */ 148 *pbuff++ = *pS1++; 149 *pbuff++ = *pS1++; 150 *pbuff++ = *pS1++; 151 *pbuff++ = *pS1++; 152 153 /* Decrement the loop counter */ 154 i--; 155 } while (i > 0U); 156 157 158 /* --------------------------------------------------------- 159 * Step2: Calculate RFFT for N-point input 160 * ---------------------------------------------------------- */ 161 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 162 arm_rfft_q31 (S->pRfft, pInlineBuffer, pState); 163 164 /*---------------------------------------------------------------------- 165 * Step3: Multiply the FFT output with the weights. 166 *----------------------------------------------------------------------*/ 167 arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N); 168 169 /* The output of complex multiplication is in 3.29 format. 170 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ 171 arm_shift_q31 (pState, 2, pState, S->N * 2); 172 173 /* ----------- Post-processing ---------- */ 174 /* DCT-IV can be obtained from DCT-II by the equation, 175 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 176 * Hence, Y4(0) = Y2(0)/2 */ 177 /* Getting only real part from the output and Converting to DCT-IV */ 178 179 /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */ 180 i = (S->N - 1U) >> 2U; 181 182 /* pbuff initialized to input buffer. */ 183 pbuff = pInlineBuffer; 184 185 /* pS1 initialized to pState */ 186 pS1 = pState; 187 188 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 189 in = *pS1++ >> 1U; 190 /* input buffer acts as inplace, so output values are stored in the input itself. */ 191 *pbuff++ = in; 192 193 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 194 pS1++; 195 196 /* First part of the processing with loop unrolling. Compute 4 outputs at a time. 197 ** a second loop below computes the remaining 1 to 3 samples. */ 198 do 199 { 200 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 201 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 202 in = *pS1++ - in; 203 *pbuff++ = in; 204 /* points to the next real value */ 205 pS1++; 206 207 in = *pS1++ - in; 208 *pbuff++ = in; 209 pS1++; 210 211 in = *pS1++ - in; 212 *pbuff++ = in; 213 pS1++; 214 215 in = *pS1++ - in; 216 *pbuff++ = in; 217 pS1++; 218 219 /* Decrement the loop counter */ 220 i--; 221 } while (i > 0U); 222 223 /* If the blockSize is not a multiple of 4, compute any remaining output samples here. 224 ** No loop unrolling is used. */ 225 i = (S->N - 1U) % 0x4U; 226 227 while (i > 0U) 228 { 229 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 230 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 231 in = *pS1++ - in; 232 *pbuff++ = in; 233 234 /* points to the next real value */ 235 pS1++; 236 237 /* Decrement loop counter */ 238 i--; 239 } 240 241 242 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 243 244 /* Initializing the loop counter to N/4 instead of N for loop unrolling */ 245 i = S->N >> 2U; 246 247 /* pbuff initialized to the pInlineBuffer(now contains the output values) */ 248 pbuff = pInlineBuffer; 249 250 /* Processing with loop unrolling 4 times as N is always multiple of 4. Compute 4 outputs at a time */ 251 do 252 { 253 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 254 in = *pbuff; 255 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 256 257 in = *pbuff; 258 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 259 260 in = *pbuff; 261 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 262 263 in = *pbuff; 264 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 265 266 /* Decrement loop counter */ 267 i--; 268 } while (i > 0U); 269 270 271 #else 272 273 /* Initializing the loop counter to N/2 */ 274 i = S->Nby2; 275 276 do 277 { 278 /* Re-ordering of even and odd elements */ 279 /* pState[i] = pInlineBuffer[2*i] */ 280 *pS1++ = *pbuff++; 281 /* pState[N-i-1] = pInlineBuffer[2*i+1] */ 282 *pS2-- = *pbuff++; 283 284 /* Decrement the loop counter */ 285 i--; 286 } while (i > 0U); 287 288 /* pbuff initialized to input buffer */ 289 pbuff = pInlineBuffer; 290 291 /* pS1 initialized to pState */ 292 pS1 = pState; 293 294 /* Initializing the loop counter */ 295 i = S->N; 296 297 do 298 { 299 /* Writing the re-ordered output back to inplace input buffer */ 300 *pbuff++ = *pS1++; 301 302 /* Decrement the loop counter */ 303 i--; 304 } while (i > 0U); 305 306 307 /* --------------------------------------------------------- 308 * Step2: Calculate RFFT for N-point input 309 * ---------------------------------------------------------- */ 310 /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */ 311 arm_rfft_q31 (S->pRfft, pInlineBuffer, pState); 312 313 /*---------------------------------------------------------------------- 314 * Step3: Multiply the FFT output with the weights. 315 *----------------------------------------------------------------------*/ 316 arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N); 317 318 /* The output of complex multiplication is in 3.29 format. 319 * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */ 320 arm_shift_q31(pState, 2, pState, S->N * 2); 321 322 /* ----------- Post-processing ---------- */ 323 /* DCT-IV can be obtained from DCT-II by the equation, 324 * Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0) 325 * Hence, Y4(0) = Y2(0)/2 */ 326 /* Getting only real part from the output and Converting to DCT-IV */ 327 328 /* pbuff initialized to input buffer. */ 329 pbuff = pInlineBuffer; 330 331 /* pS1 initialized to pState */ 332 pS1 = pState; 333 334 /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */ 335 in = *pS1++ >> 1U; 336 /* input buffer acts as inplace, so output values are stored in the input itself. */ 337 *pbuff++ = in; 338 339 /* pState pointer is incremented twice as the real values are located alternatively in the array */ 340 pS1++; 341 342 /* Initializing the loop counter */ 343 i = (S->N - 1U); 344 345 while (i > 0U) 346 { 347 /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */ 348 /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */ 349 in = *pS1++ - in; 350 *pbuff++ = in; 351 352 /* points to the next real value */ 353 pS1++; 354 355 /* Decrement loop counter */ 356 i--; 357 } 358 359 /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/ 360 361 /* Initializing loop counter */ 362 i = S->N; 363 364 /* pbuff initialized to the pInlineBuffer (now contains the output values) */ 365 pbuff = pInlineBuffer; 366 367 do 368 { 369 /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */ 370 in = *pbuff; 371 *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31)); 372 373 /* Decrement loop counter */ 374 i--; 375 } while (i > 0U); 376 377 #endif /* #if defined (ARM_MATH_LOOPUNROLL) */ 378 379 } 380 381 /** 382 @} end of DCT4_IDCT4 group 383 */