/ src / imdct.cpp
imdct.cpp
  1  /* ***** BEGIN LICENSE BLOCK ***** 
  2   * Version: RCSL 1.0/RPSL 1.0 
  3   *  
  4   * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved. 
  5   *      
  6   * The contents of this file, and the files included with this file, are 
  7   * subject to the current version of the RealNetworks Public Source License 
  8   * Version 1.0 (the "RPSL") available at 
  9   * http://www.helixcommunity.org/content/rpsl unless you have licensed 
 10   * the file under the RealNetworks Community Source License Version 1.0 
 11   * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl, 
 12   * in which case the RCSL will apply. You may also obtain the license terms 
 13   * directly from RealNetworks.  You may not use this file except in 
 14   * compliance with the RPSL or, if you have a valid RCSL with RealNetworks 
 15   * applicable to this file, the RCSL.  Please see the applicable RPSL or 
 16   * RCSL for the rights, obligations and limitations governing use of the 
 17   * contents of the file.  
 18   *  
 19   * This file is part of the Helix DNA Technology. RealNetworks is the 
 20   * developer of the Original Code and owns the copyrights in the portions 
 21   * it created. 
 22   *  
 23   * This file, and the files included with this file, is distributed and made 
 24   * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
 25   * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
 26   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS 
 27   * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
 28   * 
 29   * Technology Compatibility Kit Test Suite(s) Location: 
 30   *    http://www.helixcommunity.org/content/tck 
 31   * 
 32   * Contributor(s): 
 33   *  
 34   * ***** END LICENSE BLOCK ***** */ 
 35  
 36  /**************************************************************************************
 37   * Fixed-point MP3 decoder
 38   * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
 39   * June 2003
 40   *
 41   * imdct.c - antialias, inverse transform (short/long/mixed), windowing, 
 42   *             overlap-add, frequency inversion
 43   **************************************************************************************/
 44  
 45  #include "coder.h"
 46  #include "assembly.h"
 47  
 48  /**************************************************************************************
 49   * Function:    AntiAlias
 50   *
 51   * Description: smooth transition across DCT block boundaries (every 18 coefficients)
 52   *
 53   * Inputs:      vector of dequantized coefficients, length = (nBfly+1) * 18
 54   *              number of "butterflies" to perform (one butterfly means one
 55   *                inter-block smoothing operation)
 56   *
 57   * Outputs:     updated coefficient vector x
 58   *
 59   * Return:      none
 60   *
 61   * Notes:       weighted average of opposite bands (pairwise) from the 8 samples 
 62   *                before and after each block boundary
 63   *              nBlocks = (nonZeroBound + 7) / 18, since nZB is the first ZERO sample 
 64   *                above which all other samples are also zero
 65   *              max gain per sample = 1.372
 66   *                MAX(i) (abs(csa[i][0]) + abs(csa[i][1]))
 67   *              bits gained = 0
 68   *              assume at least 1 guard bit in x[] to avoid overflow
 69   *                (should be guaranteed from dequant, and max gain from stproc * max 
 70   *                 gain from AntiAlias < 2.0)
 71   **************************************************************************************/
 72  // a little bit faster in RAM (< 1 ms per block)
 73  /* __attribute__ ((section (".data"))) */ static void AntiAlias(int *x, int nBfly)
 74  {
 75  	int k, a0, b0, c0, c1;
 76  	const int *c;
 77  
 78  	/* csa = Q31 */
 79  	for (k = nBfly; k > 0; k--) {
 80  		c = csa[0];
 81  		x += 18;
 82  
 83  		a0 = x[-1];			c0 = *c;	c++;	b0 = x[0];		c1 = *c;	c++;
 84  		x[-1] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
 85  		x[0] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
 86  
 87  		a0 = x[-2];			c0 = *c;	c++;	b0 = x[1];		c1 = *c;	c++;
 88  		x[-2] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
 89  		x[1] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
 90  		
 91  		a0 = x[-3];			c0 = *c;	c++;	b0 = x[2];		c1 = *c;	c++;
 92  		x[-3] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
 93  		x[2] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
 94  
 95  		a0 = x[-4];			c0 = *c;	c++;	b0 = x[3];		c1 = *c;	c++;
 96  		x[-4] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
 97  		x[3] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
 98  
 99  		a0 = x[-5];			c0 = *c;	c++;	b0 = x[4];		c1 = *c;	c++;
100  		x[-5] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
101  		x[4] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
102  
103  		a0 = x[-6];			c0 = *c;	c++;	b0 = x[5];		c1 = *c;	c++;
104  		x[-6] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
105  		x[5] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
106  
107  		a0 = x[-7];			c0 = *c;	c++;	b0 = x[6];		c1 = *c;	c++;
108  		x[-7] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
109  		x[6] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
110  
111  		a0 = x[-8];			c0 = *c;	c++;	b0 = x[7];		c1 = *c;	c++;
112  		x[-8] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1;	
113  		x[7] =  (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1;
114  	}
115  }
116  
117  /**************************************************************************************
118   * Function:    WinPrevious
119   *
120   * Description: apply specified window to second half of previous IMDCT (overlap part)
121   *
122   * Inputs:      vector of 9 coefficients (xPrev)
123   *
124   * Outputs:     18 windowed output coefficients (gain 1 integer bit)
125   *              window type (0, 1, 2, 3)
126   *
127   * Return:      none
128   * 
129   * Notes:       produces 9 output samples from 18 input samples via symmetry
130   *              all blocks gain at least 1 guard bit via window (long blocks get extra
131   *                sign bit, short blocks can have one addition but max gain < 1.0)
132   **************************************************************************************/
133  static void WinPrevious(int *xPrev, int *xPrevWin, int btPrev)
134  {
135  	int i, x, *xp, *xpwLo, *xpwHi, wLo, wHi;
136  	const int *wpLo, *wpHi;
137  
138  	xp = xPrev;
139  	/* mapping (see IMDCT12x3): xPrev[0-2] = sum[6-8], xPrev[3-8] = sum[12-17] */
140  	if (btPrev == 2) {
141  		/* this could be reordered for minimum loads/stores */
142  		wpLo = imdctWin[btPrev];
143  		xPrevWin[ 0] = MULSHIFT32(wpLo[ 6], xPrev[2]) + MULSHIFT32(wpLo[0], xPrev[6]);
144  		xPrevWin[ 1] = MULSHIFT32(wpLo[ 7], xPrev[1]) + MULSHIFT32(wpLo[1], xPrev[7]);
145  		xPrevWin[ 2] = MULSHIFT32(wpLo[ 8], xPrev[0]) + MULSHIFT32(wpLo[2], xPrev[8]);
146  		xPrevWin[ 3] = MULSHIFT32(wpLo[ 9], xPrev[0]) + MULSHIFT32(wpLo[3], xPrev[8]);
147  		xPrevWin[ 4] = MULSHIFT32(wpLo[10], xPrev[1]) + MULSHIFT32(wpLo[4], xPrev[7]);
148  		xPrevWin[ 5] = MULSHIFT32(wpLo[11], xPrev[2]) + MULSHIFT32(wpLo[5], xPrev[6]);
149  		xPrevWin[ 6] = MULSHIFT32(wpLo[ 6], xPrev[5]);
150  		xPrevWin[ 7] = MULSHIFT32(wpLo[ 7], xPrev[4]);
151  		xPrevWin[ 8] = MULSHIFT32(wpLo[ 8], xPrev[3]);
152  		xPrevWin[ 9] = MULSHIFT32(wpLo[ 9], xPrev[3]);
153  		xPrevWin[10] = MULSHIFT32(wpLo[10], xPrev[4]);
154  		xPrevWin[11] = MULSHIFT32(wpLo[11], xPrev[5]);
155  		xPrevWin[12] = xPrevWin[13] = xPrevWin[14] = xPrevWin[15] = xPrevWin[16] = xPrevWin[17] = 0;
156  	} else {
157  		/* use ARM-style pointers (*ptr++) so that ADS compiles well */
158  		wpLo = imdctWin[btPrev] + 18;
159  		wpHi = wpLo + 17;
160  		xpwLo = xPrevWin;
161  		xpwHi = xPrevWin + 17;
162  		for (i = 9; i > 0; i--) {
163  			x = *xp++;	wLo = *wpLo++;	wHi = *wpHi--;
164  			*xpwLo++ = MULSHIFT32(wLo, x);
165  			*xpwHi-- = MULSHIFT32(wHi, x);
166  		}
167  	}
168  }
169  
170  /**************************************************************************************
171   * Function:    FreqInvertRescale
172   *
173   * Description: do frequency inversion (odd samples of odd blocks) and rescale 
174   *                if necessary (extra guard bits added before IMDCT)
175   *
176   * Inputs:      output vector y (18 new samples, spaced NBANDS apart)
177   *              previous sample vector xPrev (9 samples)
178   *              index of current block
179   *              number of extra shifts added before IMDCT (usually 0)
180   *
181   * Outputs:     inverted and rescaled (as necessary) outputs
182   *              rescaled (as necessary) previous samples
183   *
184   * Return:      updated mOut (from new outputs y)
185   **************************************************************************************/
186  static int FreqInvertRescale(int *y, int *xPrev, int blockIdx, int es)
187  {
188  	int i, d, mOut;
189  	int y0, y1, y2, y3, y4, y5, y6, y7, y8;
190  
191  	if (es == 0) {
192  		/* fast case - frequency invert only (no rescaling) - can fuse into overlap-add for speed, if desired */
193  		if (blockIdx & 0x01) {
194  			y += NBANDS;
195  			y0 = *y;	y += 2*NBANDS;
196  			y1 = *y;	y += 2*NBANDS;
197  			y2 = *y;	y += 2*NBANDS;
198  			y3 = *y;	y += 2*NBANDS;
199  			y4 = *y;	y += 2*NBANDS;
200  			y5 = *y;	y += 2*NBANDS;
201  			y6 = *y;	y += 2*NBANDS;
202  			y7 = *y;	y += 2*NBANDS;
203  			y8 = *y;	y += 2*NBANDS;
204  
205  			y -= 18*NBANDS;
206  			*y = -y0;	y += 2*NBANDS;
207  			*y = -y1;	y += 2*NBANDS;
208  			*y = -y2;	y += 2*NBANDS;
209  			*y = -y3;	y += 2*NBANDS;
210  			*y = -y4;	y += 2*NBANDS;
211  			*y = -y5;	y += 2*NBANDS;
212  			*y = -y6;	y += 2*NBANDS;
213  			*y = -y7;	y += 2*NBANDS;
214  			*y = -y8;	y += 2*NBANDS;
215  		}
216  		return 0;
217  	} else {
218  		/* undo pre-IMDCT scaling, clipping if necessary */
219  		mOut = 0;
220  		if (blockIdx & 0x01) {
221  			/* frequency invert */
222  			for (i = 0; i < 18; i+=2) {
223  				d = *y;		CLIP_2N(d, 31 - es);	*y = d << es;	mOut |= FASTABS(*y);	y += NBANDS;
224  				d = -*y;	CLIP_2N(d, 31 - es);	*y = d << es;	mOut |= FASTABS(*y);	y += NBANDS;
225  				d = *xPrev;	CLIP_2N(d, 31 - es);	*xPrev++ = d << es;
226  			}
227  		} else {
228  			for (i = 0; i < 18; i+=2) {
229  				d = *y;		CLIP_2N(d, 31 - es);	*y = d << es;	mOut |= FASTABS(*y);	y += NBANDS;
230  				d = *y;		CLIP_2N(d, 31 - es);	*y = d << es;	mOut |= FASTABS(*y);	y += NBANDS;
231  				d = *xPrev;	CLIP_2N(d, 31 - es);	*xPrev++ = d << es;
232  			}
233  		}
234  		return mOut;
235  	}
236  }
237  
238  /* format = Q31
239   * #define M_PI 3.14159265358979323846
240   * double u = 2.0 * M_PI / 9.0;
241   * float c0 = sqrt(3.0) / 2.0; 
242   * float c1 = cos(u);          
243   * float c2 = cos(2*u);        
244   * float c3 = sin(u);          
245   * float c4 = sin(2*u);
246   */
247  static const int c9_0 = 0x6ed9eba1;
248  static const int c9_1 = 0x620dbe8b;
249  static const int c9_2 = 0x163a1a7e;
250  static const int c9_3 = 0x5246dd49;
251  static const int c9_4 = 0x7e0e2e32;
252  
253  /* format = Q31
254   * cos(((0:8) + 0.5) * (pi/18)) 
255   */
256  static const int c18[9] = {
257  	0x7f834ed0, 0x7ba3751d, 0x7401e4c1, 0x68d9f964, 0x5a82799a, 0x496af3e2, 0x36185aee, 0x2120fb83, 0x0b27eb5c, 
258  };
259  
260  /* require at least 3 guard bits in x[] to ensure no overflow */
261  static __inline void idct9(int *x)
262  {
263  	int a1, a2, a3, a4, a5, a6, a7, a8, a9;
264  	int a10, a11, a12, a13, a14, a15, a16, a17, a18;
265  	int a19, a20, a21, a22, a23, a24, a25, a26, a27;
266  	int m1, m3, m5, m6, m7, m8, m9, m10, m11, m12;
267  	int x0, x1, x2, x3, x4, x5, x6, x7, x8;
268  
269  	x0 = x[0]; x1 = x[1]; x2 = x[2]; x3 = x[3]; x4 = x[4];
270  	x5 = x[5]; x6 = x[6]; x7 = x[7]; x8 = x[8];
271  
272  	a1 = x0 - x6;
273  	a2 = x1 - x5;
274  	a3 = x1 + x5;
275  	a4 = x2 - x4;
276  	a5 = x2 + x4;
277  	a6 = x2 + x8;
278  	a7 = x1 + x7;
279  
280  	a8 = a6 - a5;		/* ie x[8] - x[4] */
281  	a9 = a3 - a7;		/* ie x[5] - x[7] */
282  	a10 = a2 - x7;		/* ie x[1] - x[5] - x[7] */
283  	a11 = a4 - x8;		/* ie x[2] - x[4] - x[8] */
284  
285  	/* do the << 1 as constant shifts where mX is actually used (free, no stall or extra inst.) */
286  	m1 =  MULSHIFT32(c9_0, x3);
287  	m3 =  MULSHIFT32(c9_0, a10);
288  	m5 =  MULSHIFT32(c9_1, a5);
289  	m6 =  MULSHIFT32(c9_2, a6);
290  	m7 =  MULSHIFT32(c9_1, a8);
291  	m8 =  MULSHIFT32(c9_2, a5);
292  	m9 =  MULSHIFT32(c9_3, a9);
293  	m10 = MULSHIFT32(c9_4, a7);
294  	m11 = MULSHIFT32(c9_3, a3);
295  	m12 = MULSHIFT32(c9_4, a9);
296  
297  	a12 = x[0] +  (x[6] >> 1);
298  	a13 = a12  +  (  m1 << 1);
299  	a14 = a12  -  (  m1 << 1);
300  	a15 = a1   +  ( a11 >> 1);
301  	a16 = ( m5 << 1) + (m6 << 1);
302  	a17 = ( m7 << 1) - (m8 << 1);
303  	a18 = a16 + a17;
304  	a19 = ( m9 << 1) + (m10 << 1);
305  	a20 = (m11 << 1) - (m12 << 1);
306  
307  	a21 = a20 - a19;
308  	a22 = a13 + a16;
309  	a23 = a14 + a16;
310  	a24 = a14 + a17;
311  	a25 = a13 + a17;
312  	a26 = a14 - a18;
313  	a27 = a13 - a18;
314  
315  	x0 = a22 + a19;			x[0] = x0;
316  	x1 = a15 + (m3 << 1);	x[1] = x1;
317  	x2 = a24 + a20;			x[2] = x2;
318  	x3 = a26 - a21;			x[3] = x3;
319  	x4 = a1 - a11;			x[4] = x4;
320  	x5 = a27 + a21;			x[5] = x5;
321  	x6 = a25 - a20;			x[6] = x6;
322  	x7 = a15 - (m3 << 1);	x[7] = x7;
323  	x8 = a23 - a19;			x[8] = x8;
324  }
325  
326  /* let c(j) = cos(M_PI/36 * ((j)+0.5)), s(j) = sin(M_PI/36 * ((j)+0.5))
327   * then fastWin[2*j+0] = c(j)*(s(j) + c(j)), j = [0, 8]
328   *      fastWin[2*j+1] = c(j)*(s(j) - c(j))
329   * format = Q30
330   */
331  static const int fastWin36[18] = {
332  	(int32_t)0x42aace8b, (int32_t)0xc2e92724, (int32_t)0x47311c28, (int32_t)0xc95f619a, (int32_t)0x4a868feb, (int32_t)0xd0859d8c,
333  	(int32_t)0x4c913b51, (int32_t)0xd8243ea0, (int32_t)0x4d413ccc, (int32_t)0xe0000000, (int32_t)0x4c913b51, (int32_t)0xe7dbc161,
334  	(int32_t)0x4a868feb, (int32_t)0xef7a6275, (int32_t)0x47311c28, (int32_t)0xf6a09e67, (int32_t)0x42aace8b, (int32_t)0xfd16d8dd,
335  };
336  
337  /**************************************************************************************
338   * Function:    IMDCT36
339   *
340   * Description: 36-point modified DCT, with windowing and overlap-add (50% overlap)
341   *
342   * Inputs:      vector of 18 coefficients (N/2 inputs produces N outputs, by symmetry)
343   *              overlap part of last IMDCT (9 samples - see output comments)
344   *              window type (0,1,2,3) of current and previous block
345   *              current block index (for deciding whether to do frequency inversion)
346   *              number of guard bits in input vector
347   *
348   * Outputs:     18 output samples, after windowing and overlap-add with last frame
349   *              second half of (unwindowed) 36-point IMDCT - save for next time
350   *                only save 9 xPrev samples, using symmetry (see WinPrevious())
351   *
352   * Notes:       this is Ken's hyper-fast algorithm, including symmetric sin window
353   *                optimization, if applicable
354   *              total number of multiplies, general case: 
355   *                2*10 (idct9) + 9 (last stage imdct) + 36 (for windowing) = 65
356   *              total number of multiplies, btCurr == 0 && btPrev == 0:
357   *                2*10 (idct9) + 9 (last stage imdct) + 18 (for windowing) = 47
358   *
359   *              blockType == 0 is by far the most common case, so it should be
360   *                possible to use the fast path most of the time
361   *              this is the fastest known algorithm for performing 
362   *                long IMDCT + windowing + overlap-add in MP3
363   *
364   * Return:      mOut (OR of abs(y) for all y calculated here)
365   *
366   * TODO:        optimize for ARM (reorder window coefs, ARM-style pointers in C, 
367   *                inline asm may or may not be helpful)
368   **************************************************************************************/
369  // barely faster in RAM
370  /*__attribute__ ((section (".data")))*/ static int IMDCT36(int *xCurr, int *xPrev, int *y, int btCurr, int btPrev, int blockIdx, int gb)
371  {
372  	int i, es, xBuf[18], xPrevWin[18];
373  	int acc1, acc2, s, d, t, mOut;
374  	int xo, xe, c, *xp, yLo, yHi;
375  	const int *cp, *wp;
376  
377  	acc1 = acc2 = 0;
378  	xCurr += 17;
379  
380  	/* 7 gb is always adequate for antialias + accumulator loop + idct9 */
381  	if (gb < 7) {
382  		/* rarely triggered - 5% to 10% of the time on normal clips (with Q25 input) */
383  		es = 7 - gb;
384  		for (i = 8; i >= 0; i--) {	
385  			acc1 = ((*xCurr--) >> es) - acc1;
386  			acc2 = acc1 - acc2;
387  			acc1 = ((*xCurr--) >> es) - acc1;
388  			xBuf[i+9] = acc2;	/* odd */
389  			xBuf[i+0] = acc1;	/* even */
390  			xPrev[i] >>= es;
391  		}
392  	} else {
393  		es = 0;
394  		/* max gain = 18, assume adequate guard bits */
395  		for (i = 8; i >= 0; i--) {	
396  			acc1 = (*xCurr--) - acc1;
397  			acc2 = acc1 - acc2;
398  			acc1 = (*xCurr--) - acc1;
399  			xBuf[i+9] = acc2;	/* odd */
400  			xBuf[i+0] = acc1;	/* even */
401  		}
402  	}
403  	/* xEven[0] and xOdd[0] scaled by 0.5 */
404  	xBuf[9] >>= 1;
405  	xBuf[0] >>= 1;
406  
407  	/* do 9-point IDCT on even and odd */
408  	idct9(xBuf+0);	/* even */
409  	idct9(xBuf+9);	/* odd */
410  
411  	xp = xBuf + 8;
412  	cp = c18 + 8;
413  	mOut = 0;
414  	if (btPrev == 0 && btCurr == 0) {
415  		/* fast path - use symmetry of sin window to reduce windowing multiplies to 18 (N/2) */
416  		wp = fastWin36;
417  		for (i = 0; i < 9; i++) {
418  			/* do ARM-style pointer arithmetic (i still needed for y[] indexing - compiler spills if 2 y pointers) */
419  			c = *cp--;	xo = *(xp + 9);		xe = *xp--;
420  			/* gain 2 int bits here */
421  			xo = MULSHIFT32(c, xo);			/* 2*c18*xOdd (mul by 2 implicit in scaling)  */
422  			xe >>= 2;
423  
424  			s = -(*xPrev);		/* sum from last block (always at least 2 guard bits) */
425  			d = -(xe - xo);		/* gain 2 int bits, don't shift xo (effective << 1 to eat sign bit, << 1 for mul by 2) */
426  			(*xPrev++) = xe + xo;			/* symmetry - xPrev[i] = xPrev[17-i] for long blocks */
427  			t = s - d;
428  
429  			yLo = (d + (MULSHIFT32(t, *wp++) << 2));
430  			yHi = (s + (MULSHIFT32(t, *wp++) << 2));
431  			y[(i)*NBANDS]    = 	yLo;
432  			y[(17-i)*NBANDS] =  yHi;
433  			mOut |= FASTABS(yLo);
434  			mOut |= FASTABS(yHi);
435  		}
436  	} else {
437  		/* slower method - either prev or curr is using window type != 0 so do full 36-point window 
438  		 * output xPrevWin has at least 3 guard bits (xPrev has 2, gain 1 in WinPrevious)
439  		 */
440  		WinPrevious(xPrev, xPrevWin, btPrev);
441  
442  		wp = imdctWin[btCurr];
443  		for (i = 0; i < 9; i++) {
444  			c = *cp--;	xo = *(xp + 9);		xe = *xp--;
445  			/* gain 2 int bits here */
446  			xo = MULSHIFT32(c, xo);			/* 2*c18*xOdd (mul by 2 implicit in scaling)  */
447  			xe >>= 2;
448  
449  			d = xe - xo;
450  			(*xPrev++) = xe + xo;	/* symmetry - xPrev[i] = xPrev[17-i] for long blocks */
451  			
452  			yLo = (xPrevWin[i]    + MULSHIFT32(d, wp[i])) << 2;
453  			yHi = (xPrevWin[17-i] + MULSHIFT32(d, wp[17-i])) << 2;
454  			y[(i)*NBANDS]    = yLo;
455  			y[(17-i)*NBANDS] = yHi;
456  			mOut |= FASTABS(yLo);
457  			mOut |= FASTABS(yHi);
458  		}
459  	}
460  
461  	xPrev -= 9;
462  	mOut |= FreqInvertRescale(y, xPrev, blockIdx, es);
463  
464  	return mOut;
465  }
466  
467  static const int c3_0 = (int32_t)0x6ed9eba1;	/* format = Q31, cos(pi/6) */
468  static const int c6[3] = { (int32_t)0x7ba3751d, (int32_t)0x5a82799a, (int32_t)0x2120fb83 };	/* format = Q31, cos(((0:2) + 0.5) * (pi/6)) */
469  
470  /* 12-point inverse DCT, used in IMDCT12x3() 
471   * 4 input guard bits will ensure no overflow
472   */
473  static __inline void imdct12 (int *x, int *out)
474  {
475  	int a0, a1, a2;
476  	int x0, x1, x2, x3, x4, x5;
477  
478  	x0 = *x;	x+=3;	x1 = *x;	x+=3;
479  	x2 = *x;	x+=3;	x3 = *x;	x+=3;
480  	x4 = *x;	x+=3;	x5 = *x;	x+=3;
481  
482  	x4 -= x5;
483  	x3 -= x4;
484  	x2 -= x3;
485  	x3 -= x5;
486  	x1 -= x2;
487  	x0 -= x1;
488  	x1 -= x3;
489  
490  	x0 >>= 1;
491  	x1 >>= 1;
492  
493  	a0 = MULSHIFT32(c3_0, x2) << 1;
494  	a1 = x0 + (x4 >> 1);
495  	a2 = x0 - x4;
496  	x0 = a1 + a0;
497  	x2 = a2;
498  	x4 = a1 - a0;
499  
500  	a0 = MULSHIFT32(c3_0, x3) << 1;
501  	a1 = x1 + (x5 >> 1);
502  	a2 = x1 - x5;
503  
504  	/* cos window odd samples, mul by 2, eat sign bit */
505  	x1 = MULSHIFT32(c6[0], a1 + a0) << 2;			
506  	x3 = MULSHIFT32(c6[1], a2) << 2;
507  	x5 = MULSHIFT32(c6[2], a1 - a0) << 2;
508  
509  	*out = x0 + x1;	out++;
510  	*out = x2 + x3;	out++;
511  	*out = x4 + x5;	out++;
512  	*out = x4 - x5;	out++;
513  	*out = x2 - x3;	out++;
514  	*out = x0 - x1;
515  }
516  
517  /**************************************************************************************
518   * Function:    IMDCT12x3
519   *
520   * Description: three 12-point modified DCT's for short blocks, with windowing,
521   *                short block concatenation, and overlap-add
522   *
523   * Inputs:      3 interleaved vectors of 6 samples each 
524   *                (block0[0], block1[0], block2[0], block0[1], block1[1]....)
525   *              overlap part of last IMDCT (9 samples - see output comments)
526   *              window type (0,1,2,3) of previous block
527   *              current block index (for deciding whether to do frequency inversion)
528   *              number of guard bits in input vector
529   *
530   * Outputs:     updated sample vector x, net gain of 1 integer bit
531   *              second half of (unwindowed) IMDCT's - save for next time
532   *                only save 9 xPrev samples, using symmetry (see WinPrevious())
533   *
534   * Return:      mOut (OR of abs(y) for all y calculated here)
535   *
536   * TODO:        optimize for ARM
537   **************************************************************************************/
538   // barely faster in RAM
539  /*__attribute__ ((section (".data")))*/ static int IMDCT12x3(int *xCurr, int *xPrev, int *y, int btPrev, int blockIdx, int gb)
540  {
541  	int i, es, mOut, yLo, xBuf[18], xPrevWin[18];	/* need temp buffer for reordering short blocks */
542  	const int *wp;
543  
544  	es = 0;
545  	/* 7 gb is always adequate for accumulator loop + idct12 + window + overlap */
546  	if (gb < 7) {
547  		es = 7 - gb;
548  		for (i = 0; i < 18; i+=2) {
549  			xCurr[i+0] >>= es;
550  			xCurr[i+1] >>= es;
551  			*xPrev++ >>= es;
552  		}
553  		xPrev -= 9;
554  	}
555  
556  	/* requires 4 input guard bits for each imdct12 */
557  	imdct12(xCurr + 0, xBuf + 0);
558  	imdct12(xCurr + 1, xBuf + 6);
559  	imdct12(xCurr + 2, xBuf + 12);
560  
561  	/* window previous from last time */
562  	WinPrevious(xPrev, xPrevWin, btPrev);
563  
564  	/* could unroll this for speed, minimum loads (short blocks usually rare, so doesn't make much overall difference) 
565  	 * xPrevWin[i] << 2 still has 1 gb always, max gain of windowed xBuf stuff also < 1.0 and gain the sign bit
566  	 * so y calculations won't overflow
567  	 */
568  	wp = imdctWin[2];
569  	mOut = 0;
570  	for (i = 0; i < 3; i++) {
571  		yLo = (xPrevWin[ 0+i] << 2);
572  		mOut |= FASTABS(yLo);	y[( 0+i)*NBANDS] = yLo;
573  		yLo = (xPrevWin[ 3+i] << 2);
574  		mOut |= FASTABS(yLo);	y[( 3+i)*NBANDS] = yLo;
575  		yLo = (xPrevWin[ 6+i] << 2) + (MULSHIFT32(wp[0+i], xBuf[3+i]));	
576  		mOut |= FASTABS(yLo);	y[( 6+i)*NBANDS] = yLo;
577  		yLo = (xPrevWin[ 9+i] << 2) + (MULSHIFT32(wp[3+i], xBuf[5-i]));	
578  		mOut |= FASTABS(yLo);	y[( 9+i)*NBANDS] = yLo;
579  		yLo = (xPrevWin[12+i] << 2) + (MULSHIFT32(wp[6+i], xBuf[2-i]) + MULSHIFT32(wp[0+i], xBuf[(6+3)+i]));	
580  		mOut |= FASTABS(yLo);	y[(12+i)*NBANDS] = yLo;
581  		yLo = (xPrevWin[15+i] << 2) + (MULSHIFT32(wp[9+i], xBuf[0+i]) + MULSHIFT32(wp[3+i], xBuf[(6+5)-i]));	
582  		mOut |= FASTABS(yLo);	y[(15+i)*NBANDS] = yLo;
583  	}
584  
585  	/* save previous (unwindowed) for overlap - only need samples 6-8, 12-17 */
586  	for (i = 6; i < 9; i++)
587  		*xPrev++ = xBuf[i] >> 2;
588  	for (i = 12; i < 18; i++)
589  		*xPrev++ = xBuf[i] >> 2;
590  
591  	xPrev -= 9;
592  	mOut |= FreqInvertRescale(y, xPrev, blockIdx, es);
593  
594  	return mOut;
595  }
596  
597  /**************************************************************************************
598   * Function:    HybridTransform
599   *
600   * Description: IMDCT's, windowing, and overlap-add on long/short/mixed blocks
601   *
602   * Inputs:      vector of input coefficients, length = nBlocksTotal * 18)
603   *              vector of overlap samples from last time, length = nBlocksPrev * 9)
604   *              buffer for output samples, length = MAXNSAMP
605   *              SideInfoSub struct for this granule/channel
606   *              BlockCount struct with necessary info
607   *                number of non-zero input and overlap blocks
608   *                number of long blocks in input vector (rest assumed to be short blocks)
609   *                number of blocks which use long window (type) 0 in case of mixed block
610   *                  (bc->currWinSwitch, 0 for non-mixed blocks)
611   *
612   * Outputs:     transformed, windowed, and overlapped sample buffer
613   *              does frequency inversion on odd blocks
614   *              updated buffer of samples for overlap
615   *
616   * Return:      number of non-zero IMDCT blocks calculated in this call
617   *                (including overlap-add)
618   *
619   * TODO:        examine mixedBlock/winSwitch logic carefully (test he_mode.bit)
620   **************************************************************************************/
621  static int HybridTransform(int *xCurr, int *xPrev, int y[BLOCK_SIZE][NBANDS], SideInfoSub *sis, BlockCount *bc)
622  {
623  	int xPrevWin[18], currWinIdx, prevWinIdx;
624  	int i, j, nBlocksOut, nonZero, mOut;
625  	int fiBit, xp;
626  
627  	ASSERT(bc->nBlocksLong  <= NBANDS);
628  	ASSERT(bc->nBlocksTotal <= NBANDS);
629  	ASSERT(bc->nBlocksPrev  <= NBANDS);
630  
631  	mOut = 0;
632  
633  	/* do long blocks, if any */
634  	for(i = 0; i < bc->nBlocksLong; i++) {
635  		/* currWinIdx picks the right window for long blocks (if mixed, long blocks use window type 0) */
636  		currWinIdx = sis->blockType;
637  		if (sis->mixedBlock && i < bc->currWinSwitch) 
638  			currWinIdx = 0;
639  
640  		prevWinIdx = bc->prevType;
641  		if (i < bc->prevWinSwitch)
642  			 prevWinIdx = 0;
643  
644  		/* do 36-point IMDCT, including windowing and overlap-add */
645  		mOut |= IMDCT36(xCurr, xPrev, &(y[0][i]), currWinIdx, prevWinIdx, i, bc->gbIn);
646  		xCurr += 18;
647  		xPrev += 9;
648  	}
649  
650  	/* do short blocks (if any) */
651  	for (   ; i < bc->nBlocksTotal; i++) {
652  		ASSERT(sis->blockType == 2);
653  
654  		prevWinIdx = bc->prevType;
655  		if (i < bc->prevWinSwitch)
656  			 prevWinIdx = 0;
657  		
658  		mOut |= IMDCT12x3(xCurr, xPrev, &(y[0][i]), prevWinIdx, i, bc->gbIn);
659  		xCurr += 18;
660  		xPrev += 9;
661  	}
662  	nBlocksOut = i;
663  	
664  	/* window and overlap prev if prev longer that current */
665  	for (   ; i < bc->nBlocksPrev; i++) {
666  		prevWinIdx = bc->prevType;
667  		if (i < bc->prevWinSwitch)
668  			 prevWinIdx = 0;
669  		WinPrevious(xPrev, xPrevWin, prevWinIdx);
670  
671  		nonZero = 0;
672  		fiBit = i << 31;
673  		for (j = 0; j < 9; j++) {
674  			xp = xPrevWin[2*j+0] << 2;	/* << 2 temp for scaling */
675  			nonZero |= xp;
676  			y[2*j+0][i] = xp;
677  			mOut |= FASTABS(xp);
678  
679  			/* frequency inversion on odd blocks/odd samples (flip sign if i odd, j odd) */
680  			xp = xPrevWin[2*j+1] << 2;
681  			xp = (xp ^ (fiBit >> 31)) + (i & 0x01);	
682  			nonZero |= xp;
683  			y[2*j+1][i] = xp;
684  			mOut |= FASTABS(xp);
685  
686  			xPrev[j] = 0;
687  		}
688  		xPrev += 9;
689  		if (nonZero)
690  			nBlocksOut = i;
691  	}
692  	
693  	/* clear rest of blocks */
694  	for (   ; i < 32; i++) {
695  		for (j = 0; j < 18; j++) 
696  			y[j][i] = 0;
697  	}
698  
699  	bc->gbOut = CLZ(mOut) - 1;
700  
701  	return nBlocksOut;
702  }
703  
704  /**************************************************************************************
705   * Function:    IMDCT
706   *
707   * Description: do alias reduction, inverse MDCT, overlap-add, and frequency inversion
708   *
709   * Inputs:      MP3DecInfo structure filled by UnpackFrameHeader(), UnpackSideInfo(),
710   *                UnpackScaleFactors(), and DecodeHuffman() (for this granule, channel)
711   *                includes PCM samples in overBuf (from last call to IMDCT) for OLA
712   *              index of current granule and channel
713   *
714   * Outputs:     PCM samples in outBuf, for input to subband transform
715   *              PCM samples in overBuf, for OLA next time
716   *              updated hi->nonZeroBound index for this channel
717   *
718   * Return:      0 on success,  -1 if null input pointers
719   **************************************************************************************/
720   // a bit faster in RAM
721  /*__attribute__ ((section (".data")))*/ int IMDCT(MP3DecInfo *mp3DecInfo, int gr, int ch)
722  {
723  	int nBfly, blockCutoff;
724  	FrameHeader *fh;
725  	SideInfo *si;
726  	HuffmanInfo *hi;
727  	IMDCTInfo *mi;
728  	BlockCount bc;
729  
730  	/* validate pointers */
731  	if (!mp3DecInfo || !mp3DecInfo->FrameHeaderPS || !mp3DecInfo->SideInfoPS || 
732  		!mp3DecInfo->HuffmanInfoPS || !mp3DecInfo->IMDCTInfoPS)
733  		return -1;
734  
735  	/* si is an array of up to 4 structs, stored as gr0ch0, gr0ch1, gr1ch0, gr1ch1 */
736  	fh = (FrameHeader *)(mp3DecInfo->FrameHeaderPS);
737  	si = (SideInfo *)(mp3DecInfo->SideInfoPS);
738  	hi = (HuffmanInfo*)(mp3DecInfo->HuffmanInfoPS);
739  	mi = (IMDCTInfo *)(mp3DecInfo->IMDCTInfoPS);
740  
741  	/* anti-aliasing done on whole long blocks only
742  	 * for mixed blocks, nBfly always 1, except 3 for 8 kHz MPEG 2.5 (see sfBandTab) 
743       *   nLongBlocks = number of blocks with (possibly) non-zero power 
744  	 *   nBfly = number of butterflies to do (nLongBlocks - 1, unless no long blocks)
745  	 */
746  	blockCutoff = fh->sfBand->l[(fh->ver == MPEG1 ? 8 : 6)] / 18;	/* same as 3* num short sfb's in spec */
747  	if (si->sis[gr][ch].blockType != 2) {
748  		/* all long transforms */
749  		bc.nBlocksLong = MIN((hi->nonZeroBound[ch] + 7) / 18 + 1, 32);	
750  		nBfly = bc.nBlocksLong - 1;
751  	} else if (si->sis[gr][ch].blockType == 2 && si->sis[gr][ch].mixedBlock) {
752  		/* mixed block - long transforms until cutoff, then short transforms */
753  		bc.nBlocksLong = blockCutoff;	
754  		nBfly = bc.nBlocksLong - 1;
755  	} else {
756  		/* all short transforms */
757  		bc.nBlocksLong = 0;
758  		nBfly = 0;
759  	}
760   
761  	AntiAlias(hi->huffDecBuf[ch], nBfly);
762  	hi->nonZeroBound[ch] = MAX(hi->nonZeroBound[ch], (nBfly * 18) + 8);
763  
764  	ASSERT(hi->nonZeroBound[ch] <= MAX_NSAMP);
765  
766  	/* for readability, use a struct instead of passing a million parameters to HybridTransform() */
767  	bc.nBlocksTotal = (hi->nonZeroBound[ch] + 17) / 18;
768  	bc.nBlocksPrev = mi->numPrevIMDCT[ch];
769  	bc.prevType = mi->prevType[ch];
770  	bc.prevWinSwitch = mi->prevWinSwitch[ch];
771  	bc.currWinSwitch = (si->sis[gr][ch].mixedBlock ? blockCutoff : 0);	/* where WINDOW switches (not nec. transform) */
772  	bc.gbIn = hi->gb[ch];
773  
774  	mi->numPrevIMDCT[ch] = HybridTransform(hi->huffDecBuf[ch], mi->overBuf[ch], mi->outBuf[ch], &si->sis[gr][ch], &bc);
775  	mi->prevType[ch] = si->sis[gr][ch].blockType;
776  	mi->prevWinSwitch[ch] = bc.currWinSwitch;		/* 0 means not a mixed block (either all short or all long) */
777  	mi->gb[ch] = bc.gbOut;
778  
779  	ASSERT(mi->numPrevIMDCT[ch] <= NBANDS);
780  
781  	/* output has gained 2 int bits */
782  	return 0;
783  }