dct32.cpp
1 /* ***** BEGIN LICENSE BLOCK ***** 2 * Version: RCSL 1.0/RPSL 1.0 3 * 4 * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved. 5 * 6 * The contents of this file, and the files included with this file, are 7 * subject to the current version of the RealNetworks Public Source License 8 * Version 1.0 (the "RPSL") available at 9 * http://www.helixcommunity.org/content/rpsl unless you have licensed 10 * the file under the RealNetworks Community Source License Version 1.0 11 * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl, 12 * in which case the RCSL will apply. You may also obtain the license terms 13 * directly from RealNetworks. You may not use this file except in 14 * compliance with the RPSL or, if you have a valid RCSL with RealNetworks 15 * applicable to this file, the RCSL. Please see the applicable RPSL or 16 * RCSL for the rights, obligations and limitations governing use of the 17 * contents of the file. 18 * 19 * This file is part of the Helix DNA Technology. RealNetworks is the 20 * developer of the Original Code and owns the copyrights in the portions 21 * it created. 22 * 23 * This file, and the files included with this file, is distributed and made 24 * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 25 * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES, 26 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS 27 * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 28 * 29 * Technology Compatibility Kit Test Suite(s) Location: 30 * http://www.helixcommunity.org/content/tck 31 * 32 * Contributor(s): 33 * 34 * ***** END LICENSE BLOCK ***** */ 35 36 /************************************************************************************** 37 * Fixed-point MP3 decoder 38 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) 39 * June 2003 40 * 41 * dct32.c - optimized implementations of 32-point DCT for matrixing stage of 42 * polyphase filter 43 **************************************************************************************/ 44 45 #include "coder.h" 46 #include "assembly.h" 47 48 #define COS0_0 0x4013c251 /* Q31 */ 49 #define COS0_1 0x40b345bd /* Q31 */ 50 #define COS0_2 0x41fa2d6d /* Q31 */ 51 #define COS0_3 0x43f93421 /* Q31 */ 52 #define COS0_4 0x46cc1bc4 /* Q31 */ 53 #define COS0_5 0x4a9d9cf0 /* Q31 */ 54 #define COS0_6 0x4fae3711 /* Q31 */ 55 #define COS0_7 0x56601ea7 /* Q31 */ 56 #define COS0_8 0x5f4cf6eb /* Q31 */ 57 #define COS0_9 0x6b6fcf26 /* Q31 */ 58 #define COS0_10 0x7c7d1db3 /* Q31 */ 59 #define COS0_11 0x4ad81a97 /* Q30 */ 60 #define COS0_12 0x5efc8d96 /* Q30 */ 61 #define COS0_13 0x41d95790 /* Q29 */ 62 #define COS0_14 0x6d0b20cf /* Q29 */ 63 #define COS0_15 0x518522fb /* Q27 */ 64 65 #define COS1_0 0x404f4672 /* Q31 */ 66 #define COS1_1 0x42e13c10 /* Q31 */ 67 #define COS1_2 0x48919f44 /* Q31 */ 68 #define COS1_3 0x52cb0e63 /* Q31 */ 69 #define COS1_4 0x64e2402e /* Q31 */ 70 #define COS1_5 0x43e224a9 /* Q30 */ 71 #define COS1_6 0x6e3c92c1 /* Q30 */ 72 #define COS1_7 0x519e4e04 /* Q28 */ 73 74 #define COS2_0 0x4140fb46 /* Q31 */ 75 #define COS2_1 0x4cf8de88 /* Q31 */ 76 #define COS2_2 0x73326bbf /* Q31 */ 77 #define COS2_3 0x52036742 /* Q29 */ 78 79 #define COS3_0 0x4545e9ef /* Q31 */ 80 #define COS3_1 0x539eba45 /* Q30 */ 81 82 #define COS4_0 0x5a82799a /* Q31 */ 83 84 // faster in ROM 85 static const int dcttab[48] = { 86 /* first pass */ 87 COS0_0, COS0_15, COS1_0, /* 31, 27, 31 */ 88 COS0_1, COS0_14, COS1_1, /* 31, 29, 31 */ 89 COS0_2, COS0_13, COS1_2, /* 31, 29, 31 */ 90 COS0_3, COS0_12, COS1_3, /* 31, 30, 31 */ 91 COS0_4, COS0_11, COS1_4, /* 31, 30, 31 */ 92 COS0_5, COS0_10, COS1_5, /* 31, 31, 30 */ 93 COS0_6, COS0_9, COS1_6, /* 31, 31, 30 */ 94 COS0_7, COS0_8, COS1_7, /* 31, 31, 28 */ 95 /* second pass */ 96 COS2_0, COS2_3, COS3_0, /* 31, 29, 31 */ 97 COS2_1, COS2_2, COS3_1, /* 31, 31, 30 */ 98 -COS2_0, -COS2_3, COS3_0, /* 31, 29, 31 */ 99 -COS2_1, -COS2_2, COS3_1, /* 31, 31, 30 */ 100 COS2_0, COS2_3, COS3_0, /* 31, 29, 31 */ 101 COS2_1, COS2_2, COS3_1, /* 31, 31, 30 */ 102 -COS2_0, -COS2_3, COS3_0, /* 31, 29, 31 */ 103 -COS2_1, -COS2_2, COS3_1, /* 31, 31, 30 */ 104 }; 105 106 #define D32FP(i, s0, s1, s2) { \ 107 a0 = buf[i]; a3 = buf[31-i]; \ 108 a1 = buf[15-i]; a2 = buf[16+i]; \ 109 b0 = a0 + a3; b3 = MULSHIFT32(*cptr++, a0 - a3) << (s0); \ 110 b1 = a1 + a2; b2 = MULSHIFT32(*cptr++, a1 - a2) << (s1); \ 111 buf[i] = b0 + b1; buf[15-i] = MULSHIFT32(*cptr, b0 - b1) << (s2); \ 112 buf[16+i] = b2 + b3; buf[31-i] = MULSHIFT32(*cptr++, b3 - b2) << (s2); \ 113 } 114 115 /************************************************************************************** 116 * Function: FDCT32 117 * 118 * Description: Ken's highly-optimized 32-point DCT (radix-4 + radix-8) 119 * 120 * Inputs: input buffer, length = 32 samples 121 * require at least 6 guard bits in input vector x to avoid possibility 122 * of overflow in internal calculations (see bbtest_imdct test app) 123 * buffer offset and oddblock flag for polyphase filter input buffer 124 * number of guard bits in input 125 * 126 * Outputs: output buffer, data copied and interleaved for polyphase filter 127 * no guarantees about number of guard bits in output 128 * 129 * Return: none 130 * 131 * Notes: number of muls = 4*8 + 12*4 = 80 132 * final stage of DCT is hardcoded to shuffle data into the proper order 133 * for the polyphase filterbank 134 * fully unrolled stage 1, for max precision (scale the 1/cos() factors 135 * differently, depending on magnitude) 136 * guard bit analysis verified by exhaustive testing of all 2^32 137 * combinations of max pos/max neg values in x[] 138 * 139 * TODO: code organization and optimization for ARM 140 * possibly interleave stereo (cut # of coef loads in half - may not have 141 * enough registers) 142 **************************************************************************************/ 143 // about 1ms faster in RAM 144 /*__attribute__ ((section (".data")))*/ void FDCT32(int *buf, int *dest, int offset, int oddBlock, int gb) 145 { 146 int i, s, tmp, es; 147 const int *cptr = dcttab; 148 int a0, a1, a2, a3, a4, a5, a6, a7; 149 int b0, b1, b2, b3, b4, b5, b6, b7; 150 int *d; 151 152 /* scaling - ensure at least 6 guard bits for DCT 153 * (in practice this is already true 99% of time, so this code is 154 * almost never triggered) 155 */ 156 es = 0; 157 if (gb < 6) { 158 es = 6 - gb; 159 for (i = 0; i < 32; i++) 160 buf[i] >>= es; 161 } 162 163 /* first pass */ 164 D32FP(0, 1, 5, 1); 165 D32FP(1, 1, 3, 1); 166 D32FP(2, 1, 3, 1); 167 D32FP(3, 1, 2, 1); 168 D32FP(4, 1, 2, 1); 169 D32FP(5, 1, 1, 2); 170 D32FP(6, 1, 1, 2); 171 D32FP(7, 1, 1, 4); 172 173 /* second pass */ 174 for (i = 4; i > 0; i--) { 175 a0 = buf[0]; a7 = buf[7]; a3 = buf[3]; a4 = buf[4]; 176 b0 = a0 + a7; b7 = MULSHIFT32(*cptr++, a0 - a7) << 1; 177 b3 = a3 + a4; b4 = MULSHIFT32(*cptr++, a3 - a4) << 3; 178 a0 = b0 + b3; a3 = MULSHIFT32(*cptr, b0 - b3) << 1; 179 a4 = b4 + b7; a7 = MULSHIFT32(*cptr++, b7 - b4) << 1; 180 181 a1 = buf[1]; a6 = buf[6]; a2 = buf[2]; a5 = buf[5]; 182 b1 = a1 + a6; b6 = MULSHIFT32(*cptr++, a1 - a6) << 1; 183 b2 = a2 + a5; b5 = MULSHIFT32(*cptr++, a2 - a5) << 1; 184 a1 = b1 + b2; a2 = MULSHIFT32(*cptr, b1 - b2) << 2; 185 a5 = b5 + b6; a6 = MULSHIFT32(*cptr++, b6 - b5) << 2; 186 187 b0 = a0 + a1; b1 = MULSHIFT32(COS4_0, a0 - a1) << 1; 188 b2 = a2 + a3; b3 = MULSHIFT32(COS4_0, a3 - a2) << 1; 189 buf[0] = b0; buf[1] = b1; 190 buf[2] = b2 + b3; buf[3] = b3; 191 192 b4 = a4 + a5; b5 = MULSHIFT32(COS4_0, a4 - a5) << 1; 193 b6 = a6 + a7; b7 = MULSHIFT32(COS4_0, a7 - a6) << 1; 194 b6 += b7; 195 buf[4] = b4 + b6; buf[5] = b5 + b7; 196 buf[6] = b5 + b6; buf[7] = b7; 197 198 buf += 8; 199 } 200 buf -= 32; /* reset */ 201 202 /* sample 0 - always delayed one block */ 203 d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH); 204 s = buf[ 0]; d[0] = d[8] = s; 205 206 /* samples 16 to 31 */ 207 d = dest + offset + (oddBlock ? VBUF_LENGTH : 0); 208 209 s = buf[ 1]; d[0] = d[8] = s; d += 64; 210 211 tmp = buf[25] + buf[29]; 212 s = buf[17] + tmp; d[0] = d[8] = s; d += 64; 213 s = buf[ 9] + buf[13]; d[0] = d[8] = s; d += 64; 214 s = buf[21] + tmp; d[0] = d[8] = s; d += 64; 215 216 tmp = buf[29] + buf[27]; 217 s = buf[ 5]; d[0] = d[8] = s; d += 64; 218 s = buf[21] + tmp; d[0] = d[8] = s; d += 64; 219 s = buf[13] + buf[11]; d[0] = d[8] = s; d += 64; 220 s = buf[19] + tmp; d[0] = d[8] = s; d += 64; 221 222 tmp = buf[27] + buf[31]; 223 s = buf[ 3]; d[0] = d[8] = s; d += 64; 224 s = buf[19] + tmp; d[0] = d[8] = s; d += 64; 225 s = buf[11] + buf[15]; d[0] = d[8] = s; d += 64; 226 s = buf[23] + tmp; d[0] = d[8] = s; d += 64; 227 228 tmp = buf[31]; 229 s = buf[ 7]; d[0] = d[8] = s; d += 64; 230 s = buf[23] + tmp; d[0] = d[8] = s; d += 64; 231 s = buf[15]; d[0] = d[8] = s; d += 64; 232 s = tmp; d[0] = d[8] = s; 233 234 /* samples 16 to 1 (sample 16 used again) */ 235 d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH); 236 237 s = buf[ 1]; d[0] = d[8] = s; d += 64; 238 239 tmp = buf[30] + buf[25]; 240 s = buf[17] + tmp; d[0] = d[8] = s; d += 64; 241 s = buf[14] + buf[ 9]; d[0] = d[8] = s; d += 64; 242 s = buf[22] + tmp; d[0] = d[8] = s; d += 64; 243 s = buf[ 6]; d[0] = d[8] = s; d += 64; 244 245 tmp = buf[26] + buf[30]; 246 s = buf[22] + tmp; d[0] = d[8] = s; d += 64; 247 s = buf[10] + buf[14]; d[0] = d[8] = s; d += 64; 248 s = buf[18] + tmp; d[0] = d[8] = s; d += 64; 249 s = buf[ 2]; d[0] = d[8] = s; d += 64; 250 251 tmp = buf[28] + buf[26]; 252 s = buf[18] + tmp; d[0] = d[8] = s; d += 64; 253 s = buf[12] + buf[10]; d[0] = d[8] = s; d += 64; 254 s = buf[20] + tmp; d[0] = d[8] = s; d += 64; 255 s = buf[ 4]; d[0] = d[8] = s; d += 64; 256 257 tmp = buf[24] + buf[28]; 258 s = buf[20] + tmp; d[0] = d[8] = s; d += 64; 259 s = buf[ 8] + buf[12]; d[0] = d[8] = s; d += 64; 260 s = buf[16] + tmp; d[0] = d[8] = s; 261 262 /* this is so rarely invoked that it's not worth making two versions of the output 263 * shuffle code (one for no shift, one for clip + variable shift) like in IMDCT 264 * here we just load, clip, shift, and store on the rare instances that es != 0 265 */ 266 if (es) { 267 d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH); 268 s = d[0]; CLIP_2N(s, 31 - es); d[0] = d[8] = (s << es); 269 270 d = dest + offset + (oddBlock ? VBUF_LENGTH : 0); 271 for (i = 16; i <= 31; i++) { 272 s = d[0]; CLIP_2N(s, 31 - es); d[0] = d[8] = (s << es); d += 64; 273 } 274 275 d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH); 276 for (i = 15; i >= 0; i--) { 277 s = d[0]; CLIP_2N(s, 31 - es); d[0] = d[8] = (s << es); d += 64; 278 } 279 } 280 }