imdct.cpp
1 /* ***** BEGIN LICENSE BLOCK ***** 2 * Version: RCSL 1.0/RPSL 1.0 3 * 4 * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved. 5 * 6 * The contents of this file, and the files included with this file, are 7 * subject to the current version of the RealNetworks Public Source License 8 * Version 1.0 (the "RPSL") available at 9 * http://www.helixcommunity.org/content/rpsl unless you have licensed 10 * the file under the RealNetworks Community Source License Version 1.0 11 * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl, 12 * in which case the RCSL will apply. You may also obtain the license terms 13 * directly from RealNetworks. You may not use this file except in 14 * compliance with the RPSL or, if you have a valid RCSL with RealNetworks 15 * applicable to this file, the RCSL. Please see the applicable RPSL or 16 * RCSL for the rights, obligations and limitations governing use of the 17 * contents of the file. 18 * 19 * This file is part of the Helix DNA Technology. RealNetworks is the 20 * developer of the Original Code and owns the copyrights in the portions 21 * it created. 22 * 23 * This file, and the files included with this file, is distributed and made 24 * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 25 * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES, 26 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS 27 * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 28 * 29 * Technology Compatibility Kit Test Suite(s) Location: 30 * http://www.helixcommunity.org/content/tck 31 * 32 * Contributor(s): 33 * 34 * ***** END LICENSE BLOCK ***** */ 35 36 /************************************************************************************** 37 * Fixed-point MP3 decoder 38 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) 39 * June 2003 40 * 41 * imdct.c - antialias, inverse transform (short/long/mixed), windowing, 42 * overlap-add, frequency inversion 43 **************************************************************************************/ 44 45 #include "coder.h" 46 #include "assembly.h" 47 48 /************************************************************************************** 49 * Function: AntiAlias 50 * 51 * Description: smooth transition across DCT block boundaries (every 18 coefficients) 52 * 53 * Inputs: vector of dequantized coefficients, length = (nBfly+1) * 18 54 * number of "butterflies" to perform (one butterfly means one 55 * inter-block smoothing operation) 56 * 57 * Outputs: updated coefficient vector x 58 * 59 * Return: none 60 * 61 * Notes: weighted average of opposite bands (pairwise) from the 8 samples 62 * before and after each block boundary 63 * nBlocks = (nonZeroBound + 7) / 18, since nZB is the first ZERO sample 64 * above which all other samples are also zero 65 * max gain per sample = 1.372 66 * MAX(i) (abs(csa[i][0]) + abs(csa[i][1])) 67 * bits gained = 0 68 * assume at least 1 guard bit in x[] to avoid overflow 69 * (should be guaranteed from dequant, and max gain from stproc * max 70 * gain from AntiAlias < 2.0) 71 **************************************************************************************/ 72 // a little bit faster in RAM (< 1 ms per block) 73 /* __attribute__ ((section (".data"))) */ static void AntiAlias(int *x, int nBfly) 74 { 75 int k, a0, b0, c0, c1; 76 const int *c; 77 78 /* csa = Q31 */ 79 for (k = nBfly; k > 0; k--) { 80 c = csa[0]; 81 x += 18; 82 83 a0 = x[-1]; c0 = *c; c++; b0 = x[0]; c1 = *c; c++; 84 x[-1] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 85 x[0] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 86 87 a0 = x[-2]; c0 = *c; c++; b0 = x[1]; c1 = *c; c++; 88 x[-2] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 89 x[1] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 90 91 a0 = x[-3]; c0 = *c; c++; b0 = x[2]; c1 = *c; c++; 92 x[-3] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 93 x[2] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 94 95 a0 = x[-4]; c0 = *c; c++; b0 = x[3]; c1 = *c; c++; 96 x[-4] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 97 x[3] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 98 99 a0 = x[-5]; c0 = *c; c++; b0 = x[4]; c1 = *c; c++; 100 x[-5] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 101 x[4] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 102 103 a0 = x[-6]; c0 = *c; c++; b0 = x[5]; c1 = *c; c++; 104 x[-6] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 105 x[5] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 106 107 a0 = x[-7]; c0 = *c; c++; b0 = x[6]; c1 = *c; c++; 108 x[-7] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 109 x[6] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 110 111 a0 = x[-8]; c0 = *c; c++; b0 = x[7]; c1 = *c; c++; 112 x[-8] = (MULSHIFT32(c0, a0) - MULSHIFT32(c1, b0)) << 1; 113 x[7] = (MULSHIFT32(c0, b0) + MULSHIFT32(c1, a0)) << 1; 114 } 115 } 116 117 /************************************************************************************** 118 * Function: WinPrevious 119 * 120 * Description: apply specified window to second half of previous IMDCT (overlap part) 121 * 122 * Inputs: vector of 9 coefficients (xPrev) 123 * 124 * Outputs: 18 windowed output coefficients (gain 1 integer bit) 125 * window type (0, 1, 2, 3) 126 * 127 * Return: none 128 * 129 * Notes: produces 9 output samples from 18 input samples via symmetry 130 * all blocks gain at least 1 guard bit via window (long blocks get extra 131 * sign bit, short blocks can have one addition but max gain < 1.0) 132 **************************************************************************************/ 133 static void WinPrevious(int *xPrev, int *xPrevWin, int btPrev) 134 { 135 int i, x, *xp, *xpwLo, *xpwHi, wLo, wHi; 136 const int *wpLo, *wpHi; 137 138 xp = xPrev; 139 /* mapping (see IMDCT12x3): xPrev[0-2] = sum[6-8], xPrev[3-8] = sum[12-17] */ 140 if (btPrev == 2) { 141 /* this could be reordered for minimum loads/stores */ 142 wpLo = imdctWin[btPrev]; 143 xPrevWin[ 0] = MULSHIFT32(wpLo[ 6], xPrev[2]) + MULSHIFT32(wpLo[0], xPrev[6]); 144 xPrevWin[ 1] = MULSHIFT32(wpLo[ 7], xPrev[1]) + MULSHIFT32(wpLo[1], xPrev[7]); 145 xPrevWin[ 2] = MULSHIFT32(wpLo[ 8], xPrev[0]) + MULSHIFT32(wpLo[2], xPrev[8]); 146 xPrevWin[ 3] = MULSHIFT32(wpLo[ 9], xPrev[0]) + MULSHIFT32(wpLo[3], xPrev[8]); 147 xPrevWin[ 4] = MULSHIFT32(wpLo[10], xPrev[1]) + MULSHIFT32(wpLo[4], xPrev[7]); 148 xPrevWin[ 5] = MULSHIFT32(wpLo[11], xPrev[2]) + MULSHIFT32(wpLo[5], xPrev[6]); 149 xPrevWin[ 6] = MULSHIFT32(wpLo[ 6], xPrev[5]); 150 xPrevWin[ 7] = MULSHIFT32(wpLo[ 7], xPrev[4]); 151 xPrevWin[ 8] = MULSHIFT32(wpLo[ 8], xPrev[3]); 152 xPrevWin[ 9] = MULSHIFT32(wpLo[ 9], xPrev[3]); 153 xPrevWin[10] = MULSHIFT32(wpLo[10], xPrev[4]); 154 xPrevWin[11] = MULSHIFT32(wpLo[11], xPrev[5]); 155 xPrevWin[12] = xPrevWin[13] = xPrevWin[14] = xPrevWin[15] = xPrevWin[16] = xPrevWin[17] = 0; 156 } else { 157 /* use ARM-style pointers (*ptr++) so that ADS compiles well */ 158 wpLo = imdctWin[btPrev] + 18; 159 wpHi = wpLo + 17; 160 xpwLo = xPrevWin; 161 xpwHi = xPrevWin + 17; 162 for (i = 9; i > 0; i--) { 163 x = *xp++; wLo = *wpLo++; wHi = *wpHi--; 164 *xpwLo++ = MULSHIFT32(wLo, x); 165 *xpwHi-- = MULSHIFT32(wHi, x); 166 } 167 } 168 } 169 170 /************************************************************************************** 171 * Function: FreqInvertRescale 172 * 173 * Description: do frequency inversion (odd samples of odd blocks) and rescale 174 * if necessary (extra guard bits added before IMDCT) 175 * 176 * Inputs: output vector y (18 new samples, spaced NBANDS apart) 177 * previous sample vector xPrev (9 samples) 178 * index of current block 179 * number of extra shifts added before IMDCT (usually 0) 180 * 181 * Outputs: inverted and rescaled (as necessary) outputs 182 * rescaled (as necessary) previous samples 183 * 184 * Return: updated mOut (from new outputs y) 185 **************************************************************************************/ 186 static int FreqInvertRescale(int *y, int *xPrev, int blockIdx, int es) 187 { 188 int i, d, mOut; 189 int y0, y1, y2, y3, y4, y5, y6, y7, y8; 190 191 if (es == 0) { 192 /* fast case - frequency invert only (no rescaling) - can fuse into overlap-add for speed, if desired */ 193 if (blockIdx & 0x01) { 194 y += NBANDS; 195 y0 = *y; y += 2*NBANDS; 196 y1 = *y; y += 2*NBANDS; 197 y2 = *y; y += 2*NBANDS; 198 y3 = *y; y += 2*NBANDS; 199 y4 = *y; y += 2*NBANDS; 200 y5 = *y; y += 2*NBANDS; 201 y6 = *y; y += 2*NBANDS; 202 y7 = *y; y += 2*NBANDS; 203 y8 = *y; y += 2*NBANDS; 204 205 y -= 18*NBANDS; 206 *y = -y0; y += 2*NBANDS; 207 *y = -y1; y += 2*NBANDS; 208 *y = -y2; y += 2*NBANDS; 209 *y = -y3; y += 2*NBANDS; 210 *y = -y4; y += 2*NBANDS; 211 *y = -y5; y += 2*NBANDS; 212 *y = -y6; y += 2*NBANDS; 213 *y = -y7; y += 2*NBANDS; 214 *y = -y8; y += 2*NBANDS; 215 } 216 return 0; 217 } else { 218 /* undo pre-IMDCT scaling, clipping if necessary */ 219 mOut = 0; 220 if (blockIdx & 0x01) { 221 /* frequency invert */ 222 for (i = 0; i < 18; i+=2) { 223 d = *y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += NBANDS; 224 d = -*y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += NBANDS; 225 d = *xPrev; CLIP_2N(d, 31 - es); *xPrev++ = d << es; 226 } 227 } else { 228 for (i = 0; i < 18; i+=2) { 229 d = *y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += NBANDS; 230 d = *y; CLIP_2N(d, 31 - es); *y = d << es; mOut |= FASTABS(*y); y += NBANDS; 231 d = *xPrev; CLIP_2N(d, 31 - es); *xPrev++ = d << es; 232 } 233 } 234 return mOut; 235 } 236 } 237 238 /* format = Q31 239 * #define M_PI 3.14159265358979323846 240 * double u = 2.0 * M_PI / 9.0; 241 * float c0 = sqrt(3.0) / 2.0; 242 * float c1 = cos(u); 243 * float c2 = cos(2*u); 244 * float c3 = sin(u); 245 * float c4 = sin(2*u); 246 */ 247 static const int c9_0 = 0x6ed9eba1; 248 static const int c9_1 = 0x620dbe8b; 249 static const int c9_2 = 0x163a1a7e; 250 static const int c9_3 = 0x5246dd49; 251 static const int c9_4 = 0x7e0e2e32; 252 253 /* format = Q31 254 * cos(((0:8) + 0.5) * (pi/18)) 255 */ 256 static const int c18[9] = { 257 0x7f834ed0, 0x7ba3751d, 0x7401e4c1, 0x68d9f964, 0x5a82799a, 0x496af3e2, 0x36185aee, 0x2120fb83, 0x0b27eb5c, 258 }; 259 260 /* require at least 3 guard bits in x[] to ensure no overflow */ 261 static __inline void idct9(int *x) 262 { 263 int a1, a2, a3, a4, a5, a6, a7, a8, a9; 264 int a10, a11, a12, a13, a14, a15, a16, a17, a18; 265 int a19, a20, a21, a22, a23, a24, a25, a26, a27; 266 int m1, m3, m5, m6, m7, m8, m9, m10, m11, m12; 267 int x0, x1, x2, x3, x4, x5, x6, x7, x8; 268 269 x0 = x[0]; x1 = x[1]; x2 = x[2]; x3 = x[3]; x4 = x[4]; 270 x5 = x[5]; x6 = x[6]; x7 = x[7]; x8 = x[8]; 271 272 a1 = x0 - x6; 273 a2 = x1 - x5; 274 a3 = x1 + x5; 275 a4 = x2 - x4; 276 a5 = x2 + x4; 277 a6 = x2 + x8; 278 a7 = x1 + x7; 279 280 a8 = a6 - a5; /* ie x[8] - x[4] */ 281 a9 = a3 - a7; /* ie x[5] - x[7] */ 282 a10 = a2 - x7; /* ie x[1] - x[5] - x[7] */ 283 a11 = a4 - x8; /* ie x[2] - x[4] - x[8] */ 284 285 /* do the << 1 as constant shifts where mX is actually used (free, no stall or extra inst.) */ 286 m1 = MULSHIFT32(c9_0, x3); 287 m3 = MULSHIFT32(c9_0, a10); 288 m5 = MULSHIFT32(c9_1, a5); 289 m6 = MULSHIFT32(c9_2, a6); 290 m7 = MULSHIFT32(c9_1, a8); 291 m8 = MULSHIFT32(c9_2, a5); 292 m9 = MULSHIFT32(c9_3, a9); 293 m10 = MULSHIFT32(c9_4, a7); 294 m11 = MULSHIFT32(c9_3, a3); 295 m12 = MULSHIFT32(c9_4, a9); 296 297 a12 = x[0] + (x[6] >> 1); 298 a13 = a12 + ( m1 << 1); 299 a14 = a12 - ( m1 << 1); 300 a15 = a1 + ( a11 >> 1); 301 a16 = ( m5 << 1) + (m6 << 1); 302 a17 = ( m7 << 1) - (m8 << 1); 303 a18 = a16 + a17; 304 a19 = ( m9 << 1) + (m10 << 1); 305 a20 = (m11 << 1) - (m12 << 1); 306 307 a21 = a20 - a19; 308 a22 = a13 + a16; 309 a23 = a14 + a16; 310 a24 = a14 + a17; 311 a25 = a13 + a17; 312 a26 = a14 - a18; 313 a27 = a13 - a18; 314 315 x0 = a22 + a19; x[0] = x0; 316 x1 = a15 + (m3 << 1); x[1] = x1; 317 x2 = a24 + a20; x[2] = x2; 318 x3 = a26 - a21; x[3] = x3; 319 x4 = a1 - a11; x[4] = x4; 320 x5 = a27 + a21; x[5] = x5; 321 x6 = a25 - a20; x[6] = x6; 322 x7 = a15 - (m3 << 1); x[7] = x7; 323 x8 = a23 - a19; x[8] = x8; 324 } 325 326 /* let c(j) = cos(M_PI/36 * ((j)+0.5)), s(j) = sin(M_PI/36 * ((j)+0.5)) 327 * then fastWin[2*j+0] = c(j)*(s(j) + c(j)), j = [0, 8] 328 * fastWin[2*j+1] = c(j)*(s(j) - c(j)) 329 * format = Q30 330 */ 331 static const int fastWin36[18] = { 332 (int32_t)0x42aace8b, (int32_t)0xc2e92724, (int32_t)0x47311c28, (int32_t)0xc95f619a, (int32_t)0x4a868feb, (int32_t)0xd0859d8c, 333 (int32_t)0x4c913b51, (int32_t)0xd8243ea0, (int32_t)0x4d413ccc, (int32_t)0xe0000000, (int32_t)0x4c913b51, (int32_t)0xe7dbc161, 334 (int32_t)0x4a868feb, (int32_t)0xef7a6275, (int32_t)0x47311c28, (int32_t)0xf6a09e67, (int32_t)0x42aace8b, (int32_t)0xfd16d8dd, 335 }; 336 337 /************************************************************************************** 338 * Function: IMDCT36 339 * 340 * Description: 36-point modified DCT, with windowing and overlap-add (50% overlap) 341 * 342 * Inputs: vector of 18 coefficients (N/2 inputs produces N outputs, by symmetry) 343 * overlap part of last IMDCT (9 samples - see output comments) 344 * window type (0,1,2,3) of current and previous block 345 * current block index (for deciding whether to do frequency inversion) 346 * number of guard bits in input vector 347 * 348 * Outputs: 18 output samples, after windowing and overlap-add with last frame 349 * second half of (unwindowed) 36-point IMDCT - save for next time 350 * only save 9 xPrev samples, using symmetry (see WinPrevious()) 351 * 352 * Notes: this is Ken's hyper-fast algorithm, including symmetric sin window 353 * optimization, if applicable 354 * total number of multiplies, general case: 355 * 2*10 (idct9) + 9 (last stage imdct) + 36 (for windowing) = 65 356 * total number of multiplies, btCurr == 0 && btPrev == 0: 357 * 2*10 (idct9) + 9 (last stage imdct) + 18 (for windowing) = 47 358 * 359 * blockType == 0 is by far the most common case, so it should be 360 * possible to use the fast path most of the time 361 * this is the fastest known algorithm for performing 362 * long IMDCT + windowing + overlap-add in MP3 363 * 364 * Return: mOut (OR of abs(y) for all y calculated here) 365 * 366 * TODO: optimize for ARM (reorder window coefs, ARM-style pointers in C, 367 * inline asm may or may not be helpful) 368 **************************************************************************************/ 369 // barely faster in RAM 370 /*__attribute__ ((section (".data")))*/ static int IMDCT36(int *xCurr, int *xPrev, int *y, int btCurr, int btPrev, int blockIdx, int gb) 371 { 372 int i, es, xBuf[18], xPrevWin[18]; 373 int acc1, acc2, s, d, t, mOut; 374 int xo, xe, c, *xp, yLo, yHi; 375 const int *cp, *wp; 376 377 acc1 = acc2 = 0; 378 xCurr += 17; 379 380 /* 7 gb is always adequate for antialias + accumulator loop + idct9 */ 381 if (gb < 7) { 382 /* rarely triggered - 5% to 10% of the time on normal clips (with Q25 input) */ 383 es = 7 - gb; 384 for (i = 8; i >= 0; i--) { 385 acc1 = ((*xCurr--) >> es) - acc1; 386 acc2 = acc1 - acc2; 387 acc1 = ((*xCurr--) >> es) - acc1; 388 xBuf[i+9] = acc2; /* odd */ 389 xBuf[i+0] = acc1; /* even */ 390 xPrev[i] >>= es; 391 } 392 } else { 393 es = 0; 394 /* max gain = 18, assume adequate guard bits */ 395 for (i = 8; i >= 0; i--) { 396 acc1 = (*xCurr--) - acc1; 397 acc2 = acc1 - acc2; 398 acc1 = (*xCurr--) - acc1; 399 xBuf[i+9] = acc2; /* odd */ 400 xBuf[i+0] = acc1; /* even */ 401 } 402 } 403 /* xEven[0] and xOdd[0] scaled by 0.5 */ 404 xBuf[9] >>= 1; 405 xBuf[0] >>= 1; 406 407 /* do 9-point IDCT on even and odd */ 408 idct9(xBuf+0); /* even */ 409 idct9(xBuf+9); /* odd */ 410 411 xp = xBuf + 8; 412 cp = c18 + 8; 413 mOut = 0; 414 if (btPrev == 0 && btCurr == 0) { 415 /* fast path - use symmetry of sin window to reduce windowing multiplies to 18 (N/2) */ 416 wp = fastWin36; 417 for (i = 0; i < 9; i++) { 418 /* do ARM-style pointer arithmetic (i still needed for y[] indexing - compiler spills if 2 y pointers) */ 419 c = *cp--; xo = *(xp + 9); xe = *xp--; 420 /* gain 2 int bits here */ 421 xo = MULSHIFT32(c, xo); /* 2*c18*xOdd (mul by 2 implicit in scaling) */ 422 xe >>= 2; 423 424 s = -(*xPrev); /* sum from last block (always at least 2 guard bits) */ 425 d = -(xe - xo); /* gain 2 int bits, don't shift xo (effective << 1 to eat sign bit, << 1 for mul by 2) */ 426 (*xPrev++) = xe + xo; /* symmetry - xPrev[i] = xPrev[17-i] for long blocks */ 427 t = s - d; 428 429 yLo = (d + (MULSHIFT32(t, *wp++) << 2)); 430 yHi = (s + (MULSHIFT32(t, *wp++) << 2)); 431 y[(i)*NBANDS] = yLo; 432 y[(17-i)*NBANDS] = yHi; 433 mOut |= FASTABS(yLo); 434 mOut |= FASTABS(yHi); 435 } 436 } else { 437 /* slower method - either prev or curr is using window type != 0 so do full 36-point window 438 * output xPrevWin has at least 3 guard bits (xPrev has 2, gain 1 in WinPrevious) 439 */ 440 WinPrevious(xPrev, xPrevWin, btPrev); 441 442 wp = imdctWin[btCurr]; 443 for (i = 0; i < 9; i++) { 444 c = *cp--; xo = *(xp + 9); xe = *xp--; 445 /* gain 2 int bits here */ 446 xo = MULSHIFT32(c, xo); /* 2*c18*xOdd (mul by 2 implicit in scaling) */ 447 xe >>= 2; 448 449 d = xe - xo; 450 (*xPrev++) = xe + xo; /* symmetry - xPrev[i] = xPrev[17-i] for long blocks */ 451 452 yLo = (xPrevWin[i] + MULSHIFT32(d, wp[i])) << 2; 453 yHi = (xPrevWin[17-i] + MULSHIFT32(d, wp[17-i])) << 2; 454 y[(i)*NBANDS] = yLo; 455 y[(17-i)*NBANDS] = yHi; 456 mOut |= FASTABS(yLo); 457 mOut |= FASTABS(yHi); 458 } 459 } 460 461 xPrev -= 9; 462 mOut |= FreqInvertRescale(y, xPrev, blockIdx, es); 463 464 return mOut; 465 } 466 467 static const int c3_0 = (int32_t)0x6ed9eba1; /* format = Q31, cos(pi/6) */ 468 static const int c6[3] = { (int32_t)0x7ba3751d, (int32_t)0x5a82799a, (int32_t)0x2120fb83 }; /* format = Q31, cos(((0:2) + 0.5) * (pi/6)) */ 469 470 /* 12-point inverse DCT, used in IMDCT12x3() 471 * 4 input guard bits will ensure no overflow 472 */ 473 static __inline void imdct12 (int *x, int *out) 474 { 475 int a0, a1, a2; 476 int x0, x1, x2, x3, x4, x5; 477 478 x0 = *x; x+=3; x1 = *x; x+=3; 479 x2 = *x; x+=3; x3 = *x; x+=3; 480 x4 = *x; x+=3; x5 = *x; x+=3; 481 482 x4 -= x5; 483 x3 -= x4; 484 x2 -= x3; 485 x3 -= x5; 486 x1 -= x2; 487 x0 -= x1; 488 x1 -= x3; 489 490 x0 >>= 1; 491 x1 >>= 1; 492 493 a0 = MULSHIFT32(c3_0, x2) << 1; 494 a1 = x0 + (x4 >> 1); 495 a2 = x0 - x4; 496 x0 = a1 + a0; 497 x2 = a2; 498 x4 = a1 - a0; 499 500 a0 = MULSHIFT32(c3_0, x3) << 1; 501 a1 = x1 + (x5 >> 1); 502 a2 = x1 - x5; 503 504 /* cos window odd samples, mul by 2, eat sign bit */ 505 x1 = MULSHIFT32(c6[0], a1 + a0) << 2; 506 x3 = MULSHIFT32(c6[1], a2) << 2; 507 x5 = MULSHIFT32(c6[2], a1 - a0) << 2; 508 509 *out = x0 + x1; out++; 510 *out = x2 + x3; out++; 511 *out = x4 + x5; out++; 512 *out = x4 - x5; out++; 513 *out = x2 - x3; out++; 514 *out = x0 - x1; 515 } 516 517 /************************************************************************************** 518 * Function: IMDCT12x3 519 * 520 * Description: three 12-point modified DCT's for short blocks, with windowing, 521 * short block concatenation, and overlap-add 522 * 523 * Inputs: 3 interleaved vectors of 6 samples each 524 * (block0[0], block1[0], block2[0], block0[1], block1[1]....) 525 * overlap part of last IMDCT (9 samples - see output comments) 526 * window type (0,1,2,3) of previous block 527 * current block index (for deciding whether to do frequency inversion) 528 * number of guard bits in input vector 529 * 530 * Outputs: updated sample vector x, net gain of 1 integer bit 531 * second half of (unwindowed) IMDCT's - save for next time 532 * only save 9 xPrev samples, using symmetry (see WinPrevious()) 533 * 534 * Return: mOut (OR of abs(y) for all y calculated here) 535 * 536 * TODO: optimize for ARM 537 **************************************************************************************/ 538 // barely faster in RAM 539 /*__attribute__ ((section (".data")))*/ static int IMDCT12x3(int *xCurr, int *xPrev, int *y, int btPrev, int blockIdx, int gb) 540 { 541 int i, es, mOut, yLo, xBuf[18], xPrevWin[18]; /* need temp buffer for reordering short blocks */ 542 const int *wp; 543 544 es = 0; 545 /* 7 gb is always adequate for accumulator loop + idct12 + window + overlap */ 546 if (gb < 7) { 547 es = 7 - gb; 548 for (i = 0; i < 18; i+=2) { 549 xCurr[i+0] >>= es; 550 xCurr[i+1] >>= es; 551 *xPrev++ >>= es; 552 } 553 xPrev -= 9; 554 } 555 556 /* requires 4 input guard bits for each imdct12 */ 557 imdct12(xCurr + 0, xBuf + 0); 558 imdct12(xCurr + 1, xBuf + 6); 559 imdct12(xCurr + 2, xBuf + 12); 560 561 /* window previous from last time */ 562 WinPrevious(xPrev, xPrevWin, btPrev); 563 564 /* could unroll this for speed, minimum loads (short blocks usually rare, so doesn't make much overall difference) 565 * xPrevWin[i] << 2 still has 1 gb always, max gain of windowed xBuf stuff also < 1.0 and gain the sign bit 566 * so y calculations won't overflow 567 */ 568 wp = imdctWin[2]; 569 mOut = 0; 570 for (i = 0; i < 3; i++) { 571 yLo = (xPrevWin[ 0+i] << 2); 572 mOut |= FASTABS(yLo); y[( 0+i)*NBANDS] = yLo; 573 yLo = (xPrevWin[ 3+i] << 2); 574 mOut |= FASTABS(yLo); y[( 3+i)*NBANDS] = yLo; 575 yLo = (xPrevWin[ 6+i] << 2) + (MULSHIFT32(wp[0+i], xBuf[3+i])); 576 mOut |= FASTABS(yLo); y[( 6+i)*NBANDS] = yLo; 577 yLo = (xPrevWin[ 9+i] << 2) + (MULSHIFT32(wp[3+i], xBuf[5-i])); 578 mOut |= FASTABS(yLo); y[( 9+i)*NBANDS] = yLo; 579 yLo = (xPrevWin[12+i] << 2) + (MULSHIFT32(wp[6+i], xBuf[2-i]) + MULSHIFT32(wp[0+i], xBuf[(6+3)+i])); 580 mOut |= FASTABS(yLo); y[(12+i)*NBANDS] = yLo; 581 yLo = (xPrevWin[15+i] << 2) + (MULSHIFT32(wp[9+i], xBuf[0+i]) + MULSHIFT32(wp[3+i], xBuf[(6+5)-i])); 582 mOut |= FASTABS(yLo); y[(15+i)*NBANDS] = yLo; 583 } 584 585 /* save previous (unwindowed) for overlap - only need samples 6-8, 12-17 */ 586 for (i = 6; i < 9; i++) 587 *xPrev++ = xBuf[i] >> 2; 588 for (i = 12; i < 18; i++) 589 *xPrev++ = xBuf[i] >> 2; 590 591 xPrev -= 9; 592 mOut |= FreqInvertRescale(y, xPrev, blockIdx, es); 593 594 return mOut; 595 } 596 597 /************************************************************************************** 598 * Function: HybridTransform 599 * 600 * Description: IMDCT's, windowing, and overlap-add on long/short/mixed blocks 601 * 602 * Inputs: vector of input coefficients, length = nBlocksTotal * 18) 603 * vector of overlap samples from last time, length = nBlocksPrev * 9) 604 * buffer for output samples, length = MAXNSAMP 605 * SideInfoSub struct for this granule/channel 606 * BlockCount struct with necessary info 607 * number of non-zero input and overlap blocks 608 * number of long blocks in input vector (rest assumed to be short blocks) 609 * number of blocks which use long window (type) 0 in case of mixed block 610 * (bc->currWinSwitch, 0 for non-mixed blocks) 611 * 612 * Outputs: transformed, windowed, and overlapped sample buffer 613 * does frequency inversion on odd blocks 614 * updated buffer of samples for overlap 615 * 616 * Return: number of non-zero IMDCT blocks calculated in this call 617 * (including overlap-add) 618 * 619 * TODO: examine mixedBlock/winSwitch logic carefully (test he_mode.bit) 620 **************************************************************************************/ 621 static int HybridTransform(int *xCurr, int *xPrev, int y[BLOCK_SIZE][NBANDS], SideInfoSub *sis, BlockCount *bc) 622 { 623 int xPrevWin[18], currWinIdx, prevWinIdx; 624 int i, j, nBlocksOut, nonZero, mOut; 625 int fiBit, xp; 626 627 ASSERT(bc->nBlocksLong <= NBANDS); 628 ASSERT(bc->nBlocksTotal <= NBANDS); 629 ASSERT(bc->nBlocksPrev <= NBANDS); 630 631 mOut = 0; 632 633 /* do long blocks, if any */ 634 for(i = 0; i < bc->nBlocksLong; i++) { 635 /* currWinIdx picks the right window for long blocks (if mixed, long blocks use window type 0) */ 636 currWinIdx = sis->blockType; 637 if (sis->mixedBlock && i < bc->currWinSwitch) 638 currWinIdx = 0; 639 640 prevWinIdx = bc->prevType; 641 if (i < bc->prevWinSwitch) 642 prevWinIdx = 0; 643 644 /* do 36-point IMDCT, including windowing and overlap-add */ 645 mOut |= IMDCT36(xCurr, xPrev, &(y[0][i]), currWinIdx, prevWinIdx, i, bc->gbIn); 646 xCurr += 18; 647 xPrev += 9; 648 } 649 650 /* do short blocks (if any) */ 651 for ( ; i < bc->nBlocksTotal; i++) { 652 ASSERT(sis->blockType == 2); 653 654 prevWinIdx = bc->prevType; 655 if (i < bc->prevWinSwitch) 656 prevWinIdx = 0; 657 658 mOut |= IMDCT12x3(xCurr, xPrev, &(y[0][i]), prevWinIdx, i, bc->gbIn); 659 xCurr += 18; 660 xPrev += 9; 661 } 662 nBlocksOut = i; 663 664 /* window and overlap prev if prev longer that current */ 665 for ( ; i < bc->nBlocksPrev; i++) { 666 prevWinIdx = bc->prevType; 667 if (i < bc->prevWinSwitch) 668 prevWinIdx = 0; 669 WinPrevious(xPrev, xPrevWin, prevWinIdx); 670 671 nonZero = 0; 672 fiBit = i << 31; 673 for (j = 0; j < 9; j++) { 674 xp = xPrevWin[2*j+0] << 2; /* << 2 temp for scaling */ 675 nonZero |= xp; 676 y[2*j+0][i] = xp; 677 mOut |= FASTABS(xp); 678 679 /* frequency inversion on odd blocks/odd samples (flip sign if i odd, j odd) */ 680 xp = xPrevWin[2*j+1] << 2; 681 xp = (xp ^ (fiBit >> 31)) + (i & 0x01); 682 nonZero |= xp; 683 y[2*j+1][i] = xp; 684 mOut |= FASTABS(xp); 685 686 xPrev[j] = 0; 687 } 688 xPrev += 9; 689 if (nonZero) 690 nBlocksOut = i; 691 } 692 693 /* clear rest of blocks */ 694 for ( ; i < 32; i++) { 695 for (j = 0; j < 18; j++) 696 y[j][i] = 0; 697 } 698 699 bc->gbOut = CLZ(mOut) - 1; 700 701 return nBlocksOut; 702 } 703 704 /************************************************************************************** 705 * Function: IMDCT 706 * 707 * Description: do alias reduction, inverse MDCT, overlap-add, and frequency inversion 708 * 709 * Inputs: MP3DecInfo structure filled by UnpackFrameHeader(), UnpackSideInfo(), 710 * UnpackScaleFactors(), and DecodeHuffman() (for this granule, channel) 711 * includes PCM samples in overBuf (from last call to IMDCT) for OLA 712 * index of current granule and channel 713 * 714 * Outputs: PCM samples in outBuf, for input to subband transform 715 * PCM samples in overBuf, for OLA next time 716 * updated hi->nonZeroBound index for this channel 717 * 718 * Return: 0 on success, -1 if null input pointers 719 **************************************************************************************/ 720 // a bit faster in RAM 721 /*__attribute__ ((section (".data")))*/ int IMDCT(MP3DecInfo *mp3DecInfo, int gr, int ch) 722 { 723 int nBfly, blockCutoff; 724 FrameHeader *fh; 725 SideInfo *si; 726 HuffmanInfo *hi; 727 IMDCTInfo *mi; 728 BlockCount bc; 729 730 /* validate pointers */ 731 if (!mp3DecInfo || !mp3DecInfo->FrameHeaderPS || !mp3DecInfo->SideInfoPS || 732 !mp3DecInfo->HuffmanInfoPS || !mp3DecInfo->IMDCTInfoPS) 733 return -1; 734 735 /* si is an array of up to 4 structs, stored as gr0ch0, gr0ch1, gr1ch0, gr1ch1 */ 736 fh = (FrameHeader *)(mp3DecInfo->FrameHeaderPS); 737 si = (SideInfo *)(mp3DecInfo->SideInfoPS); 738 hi = (HuffmanInfo*)(mp3DecInfo->HuffmanInfoPS); 739 mi = (IMDCTInfo *)(mp3DecInfo->IMDCTInfoPS); 740 741 /* anti-aliasing done on whole long blocks only 742 * for mixed blocks, nBfly always 1, except 3 for 8 kHz MPEG 2.5 (see sfBandTab) 743 * nLongBlocks = number of blocks with (possibly) non-zero power 744 * nBfly = number of butterflies to do (nLongBlocks - 1, unless no long blocks) 745 */ 746 blockCutoff = fh->sfBand->l[(fh->ver == MPEG1 ? 8 : 6)] / 18; /* same as 3* num short sfb's in spec */ 747 if (si->sis[gr][ch].blockType != 2) { 748 /* all long transforms */ 749 bc.nBlocksLong = MIN((hi->nonZeroBound[ch] + 7) / 18 + 1, 32); 750 nBfly = bc.nBlocksLong - 1; 751 } else if (si->sis[gr][ch].blockType == 2 && si->sis[gr][ch].mixedBlock) { 752 /* mixed block - long transforms until cutoff, then short transforms */ 753 bc.nBlocksLong = blockCutoff; 754 nBfly = bc.nBlocksLong - 1; 755 } else { 756 /* all short transforms */ 757 bc.nBlocksLong = 0; 758 nBfly = 0; 759 } 760 761 AntiAlias(hi->huffDecBuf[ch], nBfly); 762 hi->nonZeroBound[ch] = MAX(hi->nonZeroBound[ch], (nBfly * 18) + 8); 763 764 ASSERT(hi->nonZeroBound[ch] <= MAX_NSAMP); 765 766 /* for readability, use a struct instead of passing a million parameters to HybridTransform() */ 767 bc.nBlocksTotal = (hi->nonZeroBound[ch] + 17) / 18; 768 bc.nBlocksPrev = mi->numPrevIMDCT[ch]; 769 bc.prevType = mi->prevType[ch]; 770 bc.prevWinSwitch = mi->prevWinSwitch[ch]; 771 bc.currWinSwitch = (si->sis[gr][ch].mixedBlock ? blockCutoff : 0); /* where WINDOW switches (not nec. transform) */ 772 bc.gbIn = hi->gb[ch]; 773 774 mi->numPrevIMDCT[ch] = HybridTransform(hi->huffDecBuf[ch], mi->overBuf[ch], mi->outBuf[ch], &si->sis[gr][ch], &bc); 775 mi->prevType[ch] = si->sis[gr][ch].blockType; 776 mi->prevWinSwitch[ch] = bc.currWinSwitch; /* 0 means not a mixed block (either all short or all long) */ 777 mi->gb[ch] = bc.gbOut; 778 779 ASSERT(mi->numPrevIMDCT[ch] <= NBANDS); 780 781 /* output has gained 2 int bits */ 782 return 0; 783 }