/* assembly.h */
/* ***** BEGIN LICENSE BLOCK *****
 * Version: RCSL 1.0/RPSL 1.0
 *
 * Portions Copyright (c) 1995-2002 RealNetworks, Inc. All Rights Reserved.
 *
 * The contents of this file, and the files included with this file, are
 * subject to the current version of the RealNetworks Public Source License
 * Version 1.0 (the "RPSL") available at
 * http://www.helixcommunity.org/content/rpsl unless you have licensed
 * the file under the RealNetworks Community Source License Version 1.0
 * (the "RCSL") available at http://www.helixcommunity.org/content/rcsl,
 * in which case the RCSL will apply. You may also obtain the license terms
 * directly from RealNetworks. You may not use this file except in
 * compliance with the RPSL or, if you have a valid RCSL with RealNetworks
 * applicable to this file, the RCSL. Please see the applicable RPSL or
 * RCSL for the rights, obligations and limitations governing use of the
 * contents of the file.
 *
 * This file is part of the Helix DNA Technology. RealNetworks is the
 * developer of the Original Code and owns the copyrights in the portions
 * it created.
 *
 * This file, and the files included with this file, is distributed and made
 * available on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND REALNETWORKS HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, FITNESS
 * FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
28 * 29 * Technology Compatibility Kit Test Suite(s) Location: 30 * http://www.helixcommunity.org/content/tck 31 * 32 * Contributor(s): 33 * 34 * ***** END LICENSE BLOCK ***** */ 35 36 /************************************************************************************** 37 * Fixed-point MP3 decoder 38 * Jon Recker (jrecker@real.com), Ken Cooke (kenc@real.com) 39 * June 2003 40 * 41 * assembly.h - assembly language functions and prototypes for supported platforms 42 * 43 * - inline rountines with access to 64-bit multiply results 44 * - x86 (_WIN32) and ARM (ARM_ADS, _WIN32_WCE) versions included 45 * - some inline functions are mix of asm and C for speed 46 * - some functions are in native asm files, so only the prototype is given here 47 * 48 * MULSHIFT32(x, y) signed multiply of two 32-bit integers (x and y), returns top 32 bits of 64-bit result 49 * FASTABS(x) branchless absolute value of signed integer x 50 * CLZ(x) count leading zeros in x 51 * MADD64(sum, x, y) (Windows only) sum [64-bit] += x [32-bit] * y [32-bit] 52 * SHL64(sum, x, y) (Windows only) 64-bit left shift using __int64 53 * SAR64(sum, x, y) (Windows only) 64-bit right shift using __int64 54 */ 55 56 #ifndef _ASSEMBLY_H 57 #define _ASSEMBLY_H 58 59 #if (defined _WIN32 && !defined _WIN32_WCE) || (defined __WINS__ && defined _SYMBIAN) || defined(_OPENWAVE_SIMULATOR) || defined(WINCE_EMULATOR) /* Symbian emulator for Ix86 */ 60 61 #pragma warning( disable : 4035 ) /* complains about inline asm not returning a value */ 62 63 static __inline int MULSHIFT32(int x, int y) 64 { 65 __asm { 66 mov eax, x 67 imul y 68 mov eax, edx 69 } 70 } 71 72 static __inline int FASTABS(int x) 73 { 74 int sign; 75 76 sign = x >> (sizeof(int) * 8 - 1); 77 x ^= sign; 78 x -= sign; 79 80 return x; 81 } 82 83 static __inline int CLZ(int x) 84 { 85 int numZeros; 86 87 if (!x) 88 return (sizeof(int) * 8); 89 90 numZeros = 0; 91 while (!(x & 0x80000000)) { 92 numZeros++; 93 x <<= 1; 94 } 95 96 return numZeros; 97 } 98 99 /* 
MADD64, SHL64, SAR64: 100 * write in assembly to avoid dependency on run-time lib for 64-bit shifts, muls 101 * (sometimes compiler thunks to function calls instead of code generating) 102 * required for Symbian emulator 103 */ 104 #ifdef __CW32__ 105 typedef long long Word64; 106 #else 107 typedef __int64 Word64; 108 #endif 109 110 static __inline Word64 MADD64(Word64 sum, int x, int y) 111 { 112 unsigned int sumLo = ((unsigned int *)&sum)[0]; 113 int sumHi = ((int *)&sum)[1]; 114 115 __asm { 116 mov eax, x 117 imul y 118 add eax, sumLo 119 adc edx, sumHi 120 } 121 122 /* equivalent to return (sum + ((__int64)x * y)); */ 123 } 124 125 static __inline Word64 SHL64(Word64 x, int n) 126 { 127 unsigned int xLo = ((unsigned int *)&x)[0]; 128 int xHi = ((int *)&x)[1]; 129 unsigned char nb = (unsigned char)n; 130 131 if (n < 32) { 132 __asm { 133 mov edx, xHi 134 mov eax, xLo 135 mov cl, nb 136 shld edx, eax, cl 137 shl eax, cl 138 } 139 } else if (n < 64) { 140 /* shl masks cl to 0x1f */ 141 __asm { 142 mov edx, xLo 143 mov cl, nb 144 xor eax, eax 145 shl edx, cl 146 } 147 } else { 148 __asm { 149 xor edx, edx 150 xor eax, eax 151 } 152 } 153 } 154 155 static __inline Word64 SAR64(Word64 x, int n) 156 { 157 unsigned int xLo = ((unsigned int *)&x)[0]; 158 int xHi = ((int *)&x)[1]; 159 unsigned char nb = (unsigned char)n; 160 161 if (n < 32) { 162 __asm { 163 mov edx, xHi 164 mov eax, xLo 165 mov cl, nb 166 shrd eax, edx, cl 167 sar edx, cl 168 } 169 } else if (n < 64) { 170 /* sar masks cl to 0x1f */ 171 __asm { 172 mov edx, xHi 173 mov eax, xHi 174 mov cl, nb 175 sar edx, 31 176 sar eax, cl 177 } 178 } else { 179 __asm { 180 sar xHi, 31 181 mov eax, xHi 182 mov edx, xHi 183 } 184 } 185 } 186 187 #elif (defined _WIN32) && (defined _WIN32_WCE) 188 189 /* use asm function for now (EVC++ 3.0 does horrible job compiling __int64 version) */ 190 #define MULSHIFT32 xmp3_MULSHIFT32 191 int MULSHIFT32(int x, int y); 192 193 static __inline int FASTABS(int x) 194 { 195 int sign; 
196 197 sign = x >> (sizeof(int) * 8 - 1); 198 x ^= sign; 199 x -= sign; 200 201 return x; 202 } 203 204 static __inline int CLZ(int x) 205 { 206 int numZeros; 207 208 if (!x) 209 return (sizeof(int) * 8); 210 211 numZeros = 0; 212 while (!(x & 0x80000000)) { 213 numZeros++; 214 x <<= 1; 215 } 216 217 return numZeros; 218 } 219 220 #elif defined ARM_ADS 221 222 static __inline int MULSHIFT32(int x, int y) 223 { 224 /* important rules for smull RdLo, RdHi, Rm, Rs: 225 * RdHi and Rm can't be the same register 226 * RdLo and Rm can't be the same register 227 * RdHi and RdLo can't be the same register 228 * Note: Rs determines early termination (leading sign bits) so if you want to specify 229 * which operand is Rs, put it in the SECOND argument (y) 230 * For inline assembly, x and y are not assumed to be R0, R1 so it shouldn't matter 231 * which one is returned. (If this were a function call, returning y (R1) would 232 * require an extra "mov r0, r1") 233 */ 234 int zlow; 235 __asm { 236 smull zlow,y,x,y 237 } 238 239 return y; 240 } 241 242 static __inline int FASTABS(int x) 243 { 244 int t=0; /*Really is not necessary to initialiaze only to avoid warning*/ 245 246 __asm { 247 eor t, x, x, asr #31 248 sub t, t, x, asr #31 249 } 250 251 return t; 252 } 253 254 static __inline int CLZ(int x) 255 { 256 int numZeros; 257 258 if (!x) 259 return (sizeof(int) * 8); 260 261 numZeros = 0; 262 while (!(x & 0x80000000)) { 263 numZeros++; 264 x <<= 1; 265 } 266 267 return numZeros; 268 } 269 270 #elif defined(__GNUC__) && defined(ARM) 271 272 static __inline int MULSHIFT32(int x, int y) 273 { 274 /* important rules for smull RdLo, RdHi, Rm, Rs: 275 * RdHi and Rm can't be the same register 276 * RdLo and Rm can't be the same register 277 * RdHi and RdLo can't be the same register 278 * Note: Rs determines early termination (leading sign bits) so if you want to specify 279 * which operand is Rs, put it in the SECOND argument (y) 280 * For inline assembly, x and y are not assumed 
to be R0, R1 so it shouldn't matter 281 * which one is returned. (If this were a function call, returning y (R1) would 282 * require an extra "mov r0, r1") 283 */ 284 int zlow; 285 __asm__ volatile ("smull %0,%1,%2,%3" : "=&r" (zlow), "=r" (y) : "r" (x), "1" (y)) ; 286 287 return y; 288 } 289 290 static __inline int FASTABS(int x) 291 { 292 int t=0; /*Really is not necessary to initialiaze only to avoid warning*/ 293 294 __asm__ volatile ( 295 "eor %0,%2,%2, asr #31;" 296 "sub %0,%1,%2, asr #31;" 297 : "=&r" (t) 298 : "0" (t), "r" (x) 299 ); 300 301 return t; 302 } 303 304 static __inline int CLZ(int x) 305 { 306 int numZeros; 307 308 if (!x) 309 return (sizeof(int) * 8); 310 311 numZeros = 0; 312 while (!(x & 0x80000000)) { 313 numZeros++; 314 x <<= 1; 315 } 316 317 return numZeros; 318 } 319 320 #elif defined(__GNUC__) && defined(__AVR32_UC__) 321 322 typedef signed long long int Word64; // 64-bit signed integer. 323 324 325 __attribute__((__always_inline__)) static __inline int MULSHIFT32(int x, int y) 326 { 327 signed long long int s64Tmp; 328 __asm__ __volatile__( "muls.d %0, %1, %2" 329 : "=r" (s64Tmp) 330 : "r" (x), "r" (y) ); 331 return( s64Tmp >> 32 ); 332 } 333 334 __attribute__((__always_inline__)) static __inline int FASTABS(int x) 335 { 336 int tmp; 337 __asm__ __volatile__( "abs %0" 338 : "=r" (tmp) 339 : "r" (x) ); 340 return tmp; 341 342 } 343 344 345 __attribute__((__always_inline__)) static __inline int CLZ(int x) 346 { 347 int tmp; 348 __asm__ __volatile__( "clz %0,%1" 349 : "=r" (tmp) 350 : "r" (x) ); 351 return tmp; 352 } 353 354 355 /* MADD64, SAR64: 356 * write in assembly to avoid dependency on run-time lib for 64-bit shifts, muls 357 * (sometimes compiler do function calls instead of code generating) 358 */ 359 __attribute__((__always_inline__)) static __inline Word64 MADD64(Word64 sum, int x, int y) 360 { 361 __asm__ __volatile__( "macs.d %0, %1, %2" 362 : "+r" (sum) 363 : "r" (x), "r" (y) ); 364 return( sum ); 365 } 366 367 368 
__attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n) 369 { 370 unsigned int xLo = (unsigned int) x; 371 int xHi = (int) (x >> 32); 372 int nComp = 32-n; 373 int tmp; 374 // Shortcut: n is always < 32. 375 __asm__ __volatile__( "lsl %2, %0, %3\n\t" // tmp <- xHi<<(32-n) 376 "asr %0, %0, %4\n\t" // xHi <- xHi>>n 377 "lsr %1, %1, %4\n\t" // xLo <- xLo>>n 378 "or %1, %2\n\t" // xLo <= xLo || tmp 379 : "+&r" (xHi), "+r" (xLo), "=&r" (tmp) 380 : "r" (nComp), "r" (n) ); 381 x = xLo | ((Word64)xHi << 32); 382 return( x ); 383 } 384 385 #elif (defined(__CORTEX_M) && __CORTEX_M == 0x04U) || defined(__MK66FX1M0__) || defined(__MK64FX512__) || defined(__MK20DX256__) /* teensy 3.6, 3.5, or 3.1/2 */ 386 387 /* ARM cortex m4 */ 388 389 typedef signed long long int Word64; // 64-bit signed integer. 390 391 392 static __inline int MULSHIFT32(int x, int y) 393 { 394 /* important rules for smull RdLo, RdHi, Rm, Rs: 395 * RdHi and Rm can't be the same register 396 * RdLo and Rm can't be the same register 397 * RdHi and RdLo can't be the same register 398 * Note: Rs determines early termination (leading sign bits) so if you want to specify 399 * which operand is Rs, put it in the SECOND argument (y) 400 * For inline assembly, x and y are not assumed to be R0, R1 so it shouldn't matter 401 * which one is returned. 
(If this were a function call, returning y (R1) would 402 * require an extra "mov r0, r1") 403 */ 404 int zlow; 405 __asm__ volatile ("smull %0,%1,%2,%3" : "=&r" (zlow), "=r" (y) : "r" (x), "1" (y)) ; 406 407 return y; 408 } 409 410 static __inline int FASTABS(int x) 411 { 412 int sign; 413 414 sign = x >> (sizeof(int) * 8 - 1); 415 x ^= sign; 416 x -= sign; 417 418 return x; 419 } 420 421 static __inline int CLZ(int x) 422 { 423 #if defined(__MK66FX1M0__) || defined(__MK64FX512__) || defined(__MK20DX256__) /* teensy 3.6, 3.5, or 3.1/2 */ 424 return __builtin_clz(x); 425 #else 426 return __CLZ(x); 427 #endif 428 } 429 430 typedef union _U64 { 431 Word64 w64; 432 struct { 433 /* ARM ADS = little endian */ 434 unsigned int lo32; 435 signed int hi32; 436 } r; 437 } U64; 438 439 static __inline Word64 MADD64(Word64 sum64, int x, int y) 440 { 441 U64 u; 442 u.w64 = sum64; 443 444 __asm__ volatile ("smlal %0,%1,%2,%3" : "+&r" (u.r.lo32), "+&r" (u.r.hi32) : "r" (x), "r" (y) : "cc"); 445 446 return u.w64; 447 } 448 449 450 __attribute__((__always_inline__)) static __inline Word64 SAR64(Word64 x, int n) 451 { 452 unsigned int xLo = (unsigned int) x; 453 int xHi = (int) (x >> 32); 454 int nComp = 32-n; 455 int tmp; 456 // Shortcut: n is always < 32. 457 __asm__ __volatile__( "lsl %2, %0, %3\n\t" // tmp <- xHi<<(32-n) 458 "asr %0, %0, %4\n\t" // xHi <- xHi>>n 459 "lsr %1, %1, %4\n\t" // xLo <- xLo>>n 460 "orr %1, %2\n\t" // xLo <= xLo || tmp 461 : "+&r" (xHi), "+r" (xLo), "=&r" (tmp) 462 : "r" (nComp), "r" (n) ); 463 x = xLo | ((Word64)xHi << 32); 464 return( x ); 465 } 466 467 //END cortex m4 468 469 470 #else 471 472 #error Unsupported platform in assembly.h 473 474 #endif /* platforms */ 475 476 #endif /* _ASSEMBLY_H */