/ Drivers / CMSIS / DSP / Source / TransformFunctions / arm_dct4_q31.c
arm_dct4_q31.c
  1  /* ----------------------------------------------------------------------
  2   * Project:      CMSIS DSP Library
  3   * Title:        arm_dct4_q31.c
  4   * Description:  Processing function of DCT4 & IDCT4 Q31
  5   *
  6   * $Date:        23 April 2021
  7   * $Revision:    V1.9.0
  8   *
  9   * Target Processor: Cortex-M and Cortex-A cores
 10   * -------------------------------------------------------------------- */
 11  /*
 12   * Copyright (C) 2010-2021 ARM Limited or its affiliates. All rights reserved.
 13   *
 14   * SPDX-License-Identifier: Apache-2.0
 15   *
 16   * Licensed under the Apache License, Version 2.0 (the License); you may
 17   * not use this file except in compliance with the License.
 18   * You may obtain a copy of the License at
 19   *
 20   * www.apache.org/licenses/LICENSE-2.0
 21   *
 22   * Unless required by applicable law or agreed to in writing, software
 23   * distributed under the License is distributed on an AS IS BASIS, WITHOUT
 24   * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 25   * See the License for the specific language governing permissions and
 26   * limitations under the License.
 27   */
 28  
 29  #include "dsp/transform_functions.h"
 30  
 31  /**
 32    @addtogroup DCT4_IDCT4
 33    @{
 34   */
 35  
 36  /**
 37    @brief         Processing function for the Q31 DCT4/IDCT4.
 38    @param[in]     S             points to an instance of the Q31 DCT4 structure.
 39    @param[in]     pState        points to state buffer.
 40    @param[in,out] pInlineBuffer points to the in-place input and output buffer.
 41    @return        none
 42  
 43    @par           Input an output formats
 44                     Input samples need to be downscaled by 1 bit to avoid saturations in the Q31 DCT process,
 45                     as the conversion from DCT2 to DCT4 involves one subtraction.
 46                     Internally inputs are downscaled in the RFFT process function to avoid overflows.
 47                     Number of bits downscaled, depends on the size of the transform.
 48                     The input and output formats for different DCT sizes and number of bits to upscale are
 49                     mentioned in the table below:
 50  
 51                     \image html dct4FormatsQ31Table.gif
 52   */
 53  
 54  void arm_dct4_q31(
 55    const arm_dct4_instance_q31 * S,
 56          q31_t * pState,
 57          q31_t * pInlineBuffer)
 58  {
 59    const q31_t *weights = S->pTwiddle;                  /* Pointer to the Weights table */
 60    const q31_t *cosFact = S->pCosFactor;                /* Pointer to the cos factors table */
 61          q31_t *pS1, *pS2, *pbuff;                      /* Temporary pointers for input buffer and pState buffer */
 62          q31_t in;                                      /* Temporary variable */
 63          uint32_t i;                                    /* Loop counter */
 64  
 65  
 66    /* DCT4 computation involves DCT2 (which is calculated using RFFT)
 67     * along with some pre-processing and post-processing.
 68     * Computational procedure is explained as follows:
 69     * (a) Pre-processing involves multiplying input with cos factor,
 70     *     r(n) = 2 * u(n) * cos(pi*(2*n+1)/(4*n))
 71     *              where,
 72     *                 r(n) -- output of preprocessing
 73     *                 u(n) -- input to preprocessing(actual Source buffer)
 74     * (b) Calculation of DCT2 using FFT is divided into three steps:
 75     *                  Step1: Re-ordering of even and odd elements of input.
 76     *                  Step2: Calculating FFT of the re-ordered input.
 77     *                  Step3: Taking the real part of the product of FFT output and weights.
 78     * (c) Post-processing - DCT4 can be obtained from DCT2 output using the following equation:
 79     *                   Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
 80     *                        where,
 81     *                           Y4 -- DCT4 output,   Y2 -- DCT2 output
 82     * (d) Multiplying the output with the normalizing factor sqrt(2/N).
 83     */
 84  
 85    /*-------- Pre-processing ------------*/
 86    /* Multiplying input with cos factor i.e. r(n) = 2 * x(n) * cos(pi*(2*n+1)/(4*n)) */
 87    arm_mult_q31 (pInlineBuffer, cosFact, pInlineBuffer, S->N);
 88    arm_shift_q31 (pInlineBuffer, 1, pInlineBuffer, S->N);
 89  
 90    /* ----------------------------------------------------------------
 91     * Step1: Re-ordering of even and odd elements as
 92     *             pState[i] =  pInlineBuffer[2*i] and
 93     *             pState[N-i-1] = pInlineBuffer[2*i+1] where i = 0 to N/2
 94     ---------------------------------------------------------------------*/
 95  
 96    /* pS1 initialized to pState */
 97    pS1 = pState;
 98  
 99    /* pS2 initialized to pState+N-1, so that it points to the end of the state buffer */
100    pS2 = pState + (S->N - 1U);
101  
102    /* pbuff initialized to input buffer */
103    pbuff = pInlineBuffer;
104  
105  
106  #if defined (ARM_MATH_LOOPUNROLL)
107  
108    /* Initializing the loop counter to N/2 >> 2 for loop unrolling by 4 */
109    i = S->Nby2 >> 2U;
110  
111    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
112     ** a second loop below computes the remaining 1 to 3 samples. */
113    do
114    {
115      /* Re-ordering of even and odd elements */
116      /* pState[i] =  pInlineBuffer[2*i] */
117      *pS1++ = *pbuff++;
118      /* pState[N-i-1] = pInlineBuffer[2*i+1] */
119      *pS2-- = *pbuff++;
120  
121      *pS1++ = *pbuff++;
122      *pS2-- = *pbuff++;
123  
124      *pS1++ = *pbuff++;
125      *pS2-- = *pbuff++;
126  
127      *pS1++ = *pbuff++;
128      *pS2-- = *pbuff++;
129  
130      /* Decrement loop counter */
131      i--;
132    } while (i > 0U);
133  
134    /* pbuff initialized to input buffer */
135    pbuff = pInlineBuffer;
136  
137    /* pS1 initialized to pState */
138    pS1 = pState;
139  
140    /* Initializing the loop counter to N/4 instead of N for loop unrolling */
141    i = S->N >> 2U;
142  
143    /* Processing with loop unrolling 4 times as N is always multiple of 4.
144     * Compute 4 outputs at a time */
145    do
146    {
147      /* Writing the re-ordered output back to inplace input buffer */
148      *pbuff++ = *pS1++;
149      *pbuff++ = *pS1++;
150      *pbuff++ = *pS1++;
151      *pbuff++ = *pS1++;
152  
153      /* Decrement the loop counter */
154      i--;
155    } while (i > 0U);
156  
157  
158    /* ---------------------------------------------------------
159     *     Step2: Calculate RFFT for N-point input
160     * ---------------------------------------------------------- */
161    /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
162    arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
163  
164    /*----------------------------------------------------------------------
165     *  Step3: Multiply the FFT output with the weights.
166     *----------------------------------------------------------------------*/
167    arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
168  
169    /* The output of complex multiplication is in 3.29 format.
170     * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
171    arm_shift_q31 (pState, 2, pState, S->N * 2);
172  
173    /* ----------- Post-processing ---------- */
174    /* DCT-IV can be obtained from DCT-II by the equation,
175     *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
176     *       Hence, Y4(0) = Y2(0)/2  */
177    /* Getting only real part from the output and Converting to DCT-IV */
178  
179    /* Initializing the loop counter to N >> 2 for loop unrolling by 4 */
180    i = (S->N - 1U) >> 2U;
181  
182    /* pbuff initialized to input buffer. */
183    pbuff = pInlineBuffer;
184  
185    /* pS1 initialized to pState */
186    pS1 = pState;
187  
188    /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
189    in = *pS1++ >> 1U;
190    /* input buffer acts as inplace, so output values are stored in the input itself. */
191    *pbuff++ = in;
192  
193    /* pState pointer is incremented twice as the real values are located alternatively in the array */
194    pS1++;
195  
196    /* First part of the processing with loop unrolling.  Compute 4 outputs at a time.
197     ** a second loop below computes the remaining 1 to 3 samples. */
198    do
199    {
200      /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
201      /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
202      in = *pS1++ - in;
203      *pbuff++ = in;
204      /* points to the next real value */
205      pS1++;
206  
207      in = *pS1++ - in;
208      *pbuff++ = in;
209      pS1++;
210  
211      in = *pS1++ - in;
212      *pbuff++ = in;
213      pS1++;
214  
215      in = *pS1++ - in;
216      *pbuff++ = in;
217      pS1++;
218  
219      /* Decrement the loop counter */
220      i--;
221    } while (i > 0U);
222  
223    /* If the blockSize is not a multiple of 4, compute any remaining output samples here.
224     ** No loop unrolling is used. */
225    i = (S->N - 1U) % 0x4U;
226  
227    while (i > 0U)
228    {
229      /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
230      /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
231      in = *pS1++ - in;
232      *pbuff++ = in;
233  
234      /* points to the next real value */
235      pS1++;
236  
237      /* Decrement loop counter */
238      i--;
239    }
240  
241  
242    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
243  
244    /* Initializing the loop counter to N/4 instead of N for loop unrolling */
245    i = S->N >> 2U;
246  
247    /* pbuff initialized to the pInlineBuffer(now contains the output values) */
248    pbuff = pInlineBuffer;
249  
250    /* Processing with loop unrolling 4 times as N is always multiple of 4.  Compute 4 outputs at a time */
251    do
252    {
253      /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
254      in = *pbuff;
255      *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
256  
257      in = *pbuff;
258      *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
259  
260      in = *pbuff;
261      *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
262  
263      in = *pbuff;
264      *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
265  
266      /* Decrement loop counter */
267      i--;
268    } while (i > 0U);
269  
270  
271  #else
272  
273    /* Initializing the loop counter to N/2 */
274    i = S->Nby2;
275  
276    do
277    {
278      /* Re-ordering of even and odd elements */
279      /* pState[i] =  pInlineBuffer[2*i] */
280      *pS1++ = *pbuff++;
281      /* pState[N-i-1] = pInlineBuffer[2*i+1] */
282      *pS2-- = *pbuff++;
283  
284      /* Decrement the loop counter */
285      i--;
286    } while (i > 0U);
287  
288    /* pbuff initialized to input buffer */
289    pbuff = pInlineBuffer;
290  
291    /* pS1 initialized to pState */
292    pS1 = pState;
293  
294    /* Initializing the loop counter */
295    i = S->N;
296  
297    do
298    {
299      /* Writing the re-ordered output back to inplace input buffer */
300      *pbuff++ = *pS1++;
301  
302      /* Decrement the loop counter */
303      i--;
304    } while (i > 0U);
305  
306  
307    /* ---------------------------------------------------------
308     *     Step2: Calculate RFFT for N-point input
309     * ---------------------------------------------------------- */
310    /* pInlineBuffer is real input of length N , pState is the complex output of length 2N */
311    arm_rfft_q31 (S->pRfft, pInlineBuffer, pState);
312  
313    /*----------------------------------------------------------------------
314     *  Step3: Multiply the FFT output with the weights.
315     *----------------------------------------------------------------------*/
316    arm_cmplx_mult_cmplx_q31 (pState, weights, pState, S->N);
317  
318    /* The output of complex multiplication is in 3.29 format.
319     * Hence changing the format of N (i.e. 2*N elements) complex numbers to 1.31 format by shifting left by 2 bits. */
320    arm_shift_q31(pState, 2, pState, S->N * 2);
321  
322    /* ----------- Post-processing ---------- */
323    /* DCT-IV can be obtained from DCT-II by the equation,
324     *       Y4(k) = Y2(k) - Y4(k-1) and Y4(-1) = Y4(0)
325     *       Hence, Y4(0) = Y2(0)/2  */
326    /* Getting only real part from the output and Converting to DCT-IV */
327  
328    /* pbuff initialized to input buffer. */
329    pbuff = pInlineBuffer;
330  
331    /* pS1 initialized to pState */
332    pS1 = pState;
333  
334    /* Calculating Y4(0) from Y2(0) using Y4(0) = Y2(0)/2 */
335    in = *pS1++ >> 1U;
336    /* input buffer acts as inplace, so output values are stored in the input itself. */
337    *pbuff++ = in;
338  
339    /* pState pointer is incremented twice as the real values are located alternatively in the array */
340    pS1++;
341  
342    /* Initializing the loop counter */
343    i = (S->N - 1U);
344  
345    while (i > 0U)
346    {
347      /* Calculating Y4(1) to Y4(N-1) from Y2 using equation Y4(k) = Y2(k) - Y4(k-1) */
348      /* pState pointer (pS1) is incremented twice as the real values are located alternatively in the array */
349      in = *pS1++ - in;
350      *pbuff++ = in;
351  
352      /* points to the next real value */
353      pS1++;
354  
355      /* Decrement loop counter */
356      i--;
357    }
358  
359    /*------------ Normalizing the output by multiplying with the normalizing factor ----------*/
360  
361    /* Initializing loop counter */
362    i = S->N;
363  
364    /* pbuff initialized to the pInlineBuffer (now contains the output values) */
365    pbuff = pInlineBuffer;
366  
367    do
368    {
369      /* Multiplying pInlineBuffer with the normalizing factor sqrt(2/N) */
370      in = *pbuff;
371      *pbuff++ = ((q31_t) (((q63_t) in * S->normalize) >> 31));
372  
373      /* Decrement loop counter */
374      i--;
375    } while (i > 0U);
376  
377  #endif /* #if defined (ARM_MATH_LOOPUNROLL) */
378  
379  }
380  
381  /**
382    @} end of DCT4_IDCT4 group
383   */