/ src / dct32.cpp
dct32.cpp
  1  /* ***** BEGIN LICENSE BLOCK ***** 
  2   * Version: RCSL 1.0/RPSL 1.0 
  3   *  
  4   * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved. 
  5   *      
  6   * The contents of this file, and the files included with this file, are 
  7   * subject to the current version of the RealNetworks Public Source License 
  8   * Version 1.0 (the "RPSL") available at 
  9   * http://www.helixcommunity.org/content/rpsl unless you have licensed 
 10   * the file under the RealNetworks Community Source License Version 1.0 
 11   * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl, 
 12   * in which case the RCSL will apply. You may also obtain the license terms 
 13   * directly from RealNetworks.  You may not use this file except in 
 14   * compliance with the RPSL or, if you have a valid RCSL with RealNetworks 
 15   * applicable to this file, the RCSL.  Please see the applicable RPSL or 
 16   * RCSL for the rights, obligations and limitations governing use of the 
 17   * contents of the file.  
 18   *  
 19   * This file is part of the Helix DNA Technology. RealNetworks is the 
 20   * developer of the Original Code and owns the copyrights in the portions 
 21   * it created. 
 22   *  
 23   * This file, and the files included with this file, is distributed and made 
 24   * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER 
 25   * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES, 
 26   * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS 
 27   * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. 
 28   * 
 29   * Technology Compatibility Kit Test Suite(s) Location: 
 30   *    http://www.helixcommunity.org/content/tck 
 31   * 
 32   * Contributor(s): 
 33   *  
 34   * ***** END LICENSE BLOCK ***** */ 
 35  
 36  /**************************************************************************************
 37   * Fixed-point MP3 decoder
 38   * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com)
 39   * June 2003
 40   *
 41   * dct32.c - optimized implementations of 32-point DCT for matrixing stage of 
 42   *             polyphase filter
 43   **************************************************************************************/
 44  
 45  #include "coder.h"
 46  #include "assembly.h"
 47  
 48  #define COS0_0  0x4013c251	/* Q31 */
 49  #define COS0_1  0x40b345bd	/* Q31 */
 50  #define COS0_2  0x41fa2d6d	/* Q31 */
 51  #define COS0_3  0x43f93421	/* Q31 */
 52  #define COS0_4  0x46cc1bc4	/* Q31 */
 53  #define COS0_5  0x4a9d9cf0	/* Q31 */
 54  #define COS0_6  0x4fae3711	/* Q31 */
 55  #define COS0_7  0x56601ea7	/* Q31 */
 56  #define COS0_8  0x5f4cf6eb	/* Q31 */
 57  #define COS0_9  0x6b6fcf26	/* Q31 */
 58  #define COS0_10 0x7c7d1db3	/* Q31 */
 59  #define COS0_11 0x4ad81a97	/* Q30 */
 60  #define COS0_12 0x5efc8d96	/* Q30 */
 61  #define COS0_13 0x41d95790	/* Q29 */
 62  #define COS0_14 0x6d0b20cf	/* Q29 */
 63  #define COS0_15 0x518522fb	/* Q27 */
 64  
 65  #define COS1_0  0x404f4672	/* Q31 */
 66  #define COS1_1  0x42e13c10	/* Q31 */
 67  #define COS1_2  0x48919f44	/* Q31 */
 68  #define COS1_3  0x52cb0e63	/* Q31 */
 69  #define COS1_4  0x64e2402e	/* Q31 */
 70  #define COS1_5  0x43e224a9	/* Q30 */
 71  #define COS1_6  0x6e3c92c1	/* Q30 */
 72  #define COS1_7  0x519e4e04	/* Q28 */
 73  
 74  #define COS2_0  0x4140fb46	/* Q31 */
 75  #define COS2_1  0x4cf8de88	/* Q31 */
 76  #define COS2_2  0x73326bbf	/* Q31 */
 77  #define COS2_3  0x52036742	/* Q29 */
 78  
 79  #define COS3_0  0x4545e9ef	/* Q31 */
 80  #define COS3_1  0x539eba45	/* Q30 */
 81  
 82  #define COS4_0  0x5a82799a	/* Q31 */
 83  
 84  // faster in ROM
 85  static const int dcttab[48] = {
 86  	/* first pass */
 87  	COS0_0, COS0_15, COS1_0,	/* 31, 27, 31 */
 88  	COS0_1, COS0_14, COS1_1,	/* 31, 29, 31 */
 89  	COS0_2, COS0_13, COS1_2,	/* 31, 29, 31 */
 90  	COS0_3, COS0_12, COS1_3,	/* 31, 30, 31 */
 91  	COS0_4, COS0_11, COS1_4,	/* 31, 30, 31 */
 92  	COS0_5, COS0_10, COS1_5,	/* 31, 31, 30 */
 93  	COS0_6, COS0_9,  COS1_6,	/* 31, 31, 30 */
 94  	COS0_7, COS0_8,  COS1_7,	/* 31, 31, 28 */
 95  	/* second pass */
 96  	 COS2_0,  COS2_3, COS3_0,	/* 31, 29, 31 */
 97  	 COS2_1,  COS2_2, COS3_1,	/* 31, 31, 30 */
 98  	-COS2_0, -COS2_3, COS3_0, 	/* 31, 29, 31 */
 99  	-COS2_1, -COS2_2, COS3_1, 	/* 31, 31, 30 */
100  	 COS2_0,  COS2_3, COS3_0, 	/* 31, 29, 31 */
101  	 COS2_1,  COS2_2, COS3_1, 	/* 31, 31, 30 */
102  	-COS2_0, -COS2_3, COS3_0, 	/* 31, 29, 31 */
103  	-COS2_1, -COS2_2, COS3_1, 	/* 31, 31, 30 */
104  };
105  
106  #define D32FP(i, s0, s1, s2) { \
107      a0 = buf[i];			a3 = buf[31-i]; \
108  	a1 = buf[15-i];			a2 = buf[16+i]; \
109      b0 = a0 + a3;			b3 = MULSHIFT32(*cptr++, a0 - a3) << (s0);	\
110  	b1 = a1 + a2;			b2 = MULSHIFT32(*cptr++, a1 - a2) << (s1);	\
111  	buf[i] = b0 + b1;		buf[15-i] = MULSHIFT32(*cptr,   b0 - b1) << (s2); \
112  	buf[16+i] = b2 + b3;    buf[31-i] = MULSHIFT32(*cptr++, b3 - b2) << (s2); \
113  }
114  
115  /**************************************************************************************
116   * Function:    FDCT32
117   *
118   * Description: Ken's highly-optimized 32-point DCT (radix-4 + radix-8) 
119   *
120   * Inputs:      input buffer, length = 32 samples
121   *              require at least 6 guard bits in input vector x to avoid possibility
122   *                of overflow in internal calculations (see bbtest_imdct test app)
123   *              buffer offset and oddblock flag for polyphase filter input buffer
124   *              number of guard bits in input
125   *
126   * Outputs:     output buffer, data copied and interleaved for polyphase filter
127   *              no guarantees about number of guard bits in output
128   *
129   * Return:      none
130   *
131   * Notes:       number of muls = 4*8 + 12*4 = 80
132   *              final stage of DCT is hardcoded to shuffle data into the proper order
133   *                for the polyphase filterbank
134   *              fully unrolled stage 1, for max precision (scale the 1/cos() factors
135   *                differently, depending on magnitude)
136   *              guard bit analysis verified by exhaustive testing of all 2^32 
137   *                combinations of max pos/max neg values in x[]
138   *
139   * TODO:        code organization and optimization for ARM
140   *              possibly interleave stereo (cut # of coef loads in half - may not have
141   *                enough registers)
142   **************************************************************************************/
143  // about 1ms faster in RAM
144  /*__attribute__ ((section (".data")))*/ void FDCT32(int *buf, int *dest, int offset, int oddBlock, int gb)
145  {
146      int i, s, tmp, es;
147      const int *cptr = dcttab;
148      int a0, a1, a2, a3, a4, a5, a6, a7;
149      int b0, b1, b2, b3, b4, b5, b6, b7;
150  	int *d;
151  
152  	/* scaling - ensure at least 6 guard bits for DCT 
153  	 * (in practice this is already true 99% of time, so this code is
154  	 *  almost never triggered)
155  	 */
156  	es = 0;
157  	if (gb < 6) {
158  		es = 6 - gb;
159  		for (i = 0; i < 32; i++)
160  			buf[i] >>= es;
161  	}
162  
163  	/* first pass */    
164  	D32FP(0, 1, 5, 1);
165  	D32FP(1, 1, 3, 1);
166  	D32FP(2, 1, 3, 1);
167  	D32FP(3, 1, 2, 1);
168  	D32FP(4, 1, 2, 1);
169  	D32FP(5, 1, 1, 2);
170  	D32FP(6, 1, 1, 2);
171  	D32FP(7, 1, 1, 4);
172  
173  	/* second pass */
174  	for (i = 4; i > 0; i--) {
175  		a0 = buf[0]; 	    a7 = buf[7];		a3 = buf[3];	    a4 = buf[4];
176  		b0 = a0 + a7;	    b7 = MULSHIFT32(*cptr++, a0 - a7) << 1;
177  		b3 = a3 + a4;	    b4 = MULSHIFT32(*cptr++, a3 - a4) << 3;
178  		a0 = b0 + b3;	    a3 = MULSHIFT32(*cptr,   b0 - b3) << 1;
179  		a4 = b4 + b7;		a7 = MULSHIFT32(*cptr++, b7 - b4) << 1;
180  
181  		a1 = buf[1];	    a6 = buf[6];	    a2 = buf[2];	    a5 = buf[5];
182  		b1 = a1 + a6;	    b6 = MULSHIFT32(*cptr++, a1 - a6) << 1;
183  		b2 = a2 + a5;	    b5 = MULSHIFT32(*cptr++, a2 - a5) << 1;
184  		a1 = b1 + b2;		a2 = MULSHIFT32(*cptr,   b1 - b2) << 2;
185  		a5 = b5 + b6;	    a6 = MULSHIFT32(*cptr++, b6 - b5) << 2;
186  
187  		b0 = a0 + a1;	    b1 = MULSHIFT32(COS4_0, a0 - a1) << 1;
188  		b2 = a2 + a3;	    b3 = MULSHIFT32(COS4_0, a3 - a2) << 1;
189  		buf[0] = b0;	    buf[1] = b1;
190  		buf[2] = b2 + b3;	buf[3] = b3;
191  
192  		b4 = a4 + a5;	    b5 = MULSHIFT32(COS4_0, a4 - a5) << 1;
193  		b6 = a6 + a7;	    b7 = MULSHIFT32(COS4_0, a7 - a6) << 1;
194  		b6 += b7;
195  		buf[4] = b4 + b6;	buf[5] = b5 + b7;
196  		buf[6] = b5 + b6;	buf[7] = b7;
197  
198  		buf += 8;
199  	}
200  	buf -= 32;	/* reset */
201  
202  	/* sample 0 - always delayed one block */
203  	d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
204  	s = buf[ 0];				d[0] = d[8] = s;
205      
206  	/* samples 16 to 31 */
207  	d = dest + offset + (oddBlock ? VBUF_LENGTH  : 0);
208  
209  	s = buf[ 1];				d[0] = d[8] = s;	d += 64;
210  
211  	tmp = buf[25] + buf[29];
212  	s = buf[17] + tmp;			d[0] = d[8] = s;	d += 64;
213  	s = buf[ 9] + buf[13];		d[0] = d[8] = s;	d += 64;
214  	s = buf[21] + tmp;			d[0] = d[8] = s;	d += 64;
215  
216  	tmp = buf[29] + buf[27];
217  	s = buf[ 5];				d[0] = d[8] = s;	d += 64;
218  	s = buf[21] + tmp;			d[0] = d[8] = s;	d += 64;
219  	s = buf[13] + buf[11];		d[0] = d[8] = s;	d += 64;
220  	s = buf[19] + tmp;			d[0] = d[8] = s;	d += 64;
221  
222  	tmp = buf[27] + buf[31];
223  	s = buf[ 3];				d[0] = d[8] = s;	d += 64;
224  	s = buf[19] + tmp;			d[0] = d[8] = s;	d += 64;
225  	s = buf[11] + buf[15];		d[0] = d[8] = s;	d += 64;
226  	s = buf[23] + tmp;			d[0] = d[8] = s;	d += 64;
227  
228  	tmp = buf[31];
229  	s = buf[ 7];				d[0] = d[8] = s;	d += 64;
230  	s = buf[23] + tmp;			d[0] = d[8] = s;	d += 64;
231  	s = buf[15];				d[0] = d[8] = s;	d += 64;
232  	s = tmp;					d[0] = d[8] = s;
233  
234  	/* samples 16 to 1 (sample 16 used again) */
235  	d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
236  
237  	s = buf[ 1];				d[0] = d[8] = s;	d += 64;
238  
239  	tmp = buf[30] + buf[25];
240  	s = buf[17] + tmp;			d[0] = d[8] = s;	d += 64;
241  	s = buf[14] + buf[ 9];		d[0] = d[8] = s;	d += 64;
242  	s = buf[22] + tmp;			d[0] = d[8] = s;	d += 64;
243  	s = buf[ 6];				d[0] = d[8] = s;	d += 64;
244  
245  	tmp = buf[26] + buf[30];
246  	s = buf[22] + tmp;			d[0] = d[8] = s;	d += 64;
247  	s = buf[10] + buf[14];		d[0] = d[8] = s;	d += 64;
248  	s = buf[18] + tmp;			d[0] = d[8] = s;	d += 64;
249  	s = buf[ 2];				d[0] = d[8] = s;	d += 64;
250  
251  	tmp = buf[28] + buf[26];
252  	s = buf[18] + tmp;			d[0] = d[8] = s;	d += 64;
253  	s = buf[12] + buf[10];		d[0] = d[8] = s;	d += 64;
254  	s = buf[20] + tmp;			d[0] = d[8] = s;	d += 64;
255  	s = buf[ 4];				d[0] = d[8] = s;	d += 64;
256  
257  	tmp = buf[24] + buf[28];
258  	s = buf[20] + tmp;			d[0] = d[8] = s;	d += 64;
259  	s = buf[ 8] + buf[12];		d[0] = d[8] = s;	d += 64;
260  	s = buf[16] + tmp;			d[0] = d[8] = s;
261  
262  	/* this is so rarely invoked that it's not worth making two versions of the output
263  	 *   shuffle code (one for no shift, one for clip + variable shift) like in IMDCT
264  	 * here we just load, clip, shift, and store on the rare instances that es != 0
265  	 */
266  	if (es) {
267  		d = dest + 64*16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
268  		s = d[0];	CLIP_2N(s, 31 - es);	d[0] = d[8] = (s << es);
269  	
270  		d = dest + offset + (oddBlock ? VBUF_LENGTH  : 0);
271  		for (i = 16; i <= 31; i++) {
272  			s = d[0];	CLIP_2N(s, 31 - es);	d[0] = d[8] = (s << es);	d += 64;
273  		}
274  
275  		d = dest + 16 + ((offset - oddBlock) & 7) + (oddBlock ? 0 : VBUF_LENGTH);
276  		for (i = 15; i >= 0; i--) {
277  			s = d[0];	CLIP_2N(s, 31 - es);	d[0] = d[8] = (s << es);	d += 64;
278  		}
279  	}
280  }