InvTxfm.cs
1 using Ryujinx.Graphics.Nvdec.Vp9.Common; 2 using System; 3 using System.Diagnostics; 4 using System.Runtime.CompilerServices; 5 using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon; 6 7 namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp 8 { 9 internal static class InvTxfm 10 { 11 // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse 12 // transform amplify bits + 1 bit for contingency in rounding and quantizing 13 private const int HighbdValidTxfmMagnitudeRange = (1 << 25); 14 15 [MethodImpl(MethodImplOptions.AggressiveInlining)] 16 private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size) 17 { 18 int i; 19 for (i = 0; i < size; ++i) 20 { 21 if (Math.Abs(input[i]) >= HighbdValidTxfmMagnitudeRange) 22 { 23 return 1; 24 } 25 } 26 27 return 0; 28 } 29 30 [MethodImpl(MethodImplOptions.AggressiveInlining)] 31 private static long CheckRange(long input) 32 { 33 // For valid VP9 input streams, intermediate stage coefficients should always 34 // stay within the range of a signed 16 bit integer. Coefficients can go out 35 // of this range for invalid/corrupt VP9 streams. 36 Debug.Assert(short.MinValue <= input); 37 Debug.Assert(input <= short.MaxValue); 38 39 return input; 40 } 41 42 [MethodImpl(MethodImplOptions.AggressiveInlining)] 43 public static long HighbdCheckRange(long input, int bd) 44 { 45 // For valid highbitdepth VP9 streams, intermediate stage coefficients will 46 // stay within the ranges: 47 // - 8 bit: signed 16 bit integer 48 // - 10 bit: signed 18 bit integer 49 // - 12 bit: signed 20 bit integer 50 int intMax = (1 << (7 + bd)) - 1; 51 int intMin = -intMax - 1; 52 Debug.Assert(intMin <= input); 53 Debug.Assert(input <= intMax); 54 55 return input; 56 } 57 58 [MethodImpl(MethodImplOptions.AggressiveInlining)] 59 private static int WrapLow(long x) 60 { 61 return (short)CheckRange(x); 62 } 63 64 [MethodImpl(MethodImplOptions.AggressiveInlining)] 65 private static int HighbdWrapLow(long x, int bd) 66 { 67 return ((int)HighbdCheckRange(x, bd) << (24 - bd)) >> (24 - bd); 68 } 69 70 [MethodImpl(MethodImplOptions.AggressiveInlining)] 71 public static byte ClipPixelAdd(byte dest, long trans) 72 { 73 trans = WrapLow(trans); 74 75 return BitUtils.ClipPixel(dest + (int)trans); 76 } 77 78 [MethodImpl(MethodImplOptions.AggressiveInlining)] 79 public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd) 80 { 81 trans = HighbdWrapLow(trans, bd); 82 83 return BitUtils.ClipPixelHighbd(dest + (int)trans, bd); 84 } 85 86 [MethodImpl(MethodImplOptions.AggressiveInlining)] 87 private static long DctConstRoundShift(long input) 88 { 89 long rv = BitUtils.RoundPowerOfTwo(input, DctConstBits); 90 91 return rv; 92 } 93 94 [SkipLocalsInit] 95 public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 96 { 97 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 98 0.5 shifts per pixel. */ 99 int i; 100 Span<int> output = stackalloc int[16]; 101 long a1, b1, c1, d1, e1; 102 ReadOnlySpan<int> ip = input; 103 Span<int> op = output; 104 105 for (i = 0; i < 4; i++) 106 { 107 a1 = ip[0] >> UnitQuantShift; 108 c1 = ip[1] >> UnitQuantShift; 109 d1 = ip[2] >> UnitQuantShift; 110 b1 = ip[3] >> UnitQuantShift; 111 a1 += c1; 112 d1 -= b1; 113 e1 = (a1 - d1) >> 1; 114 b1 = e1 - b1; 115 c1 = e1 - c1; 116 a1 -= b1; 117 d1 += c1; 118 op[0] = WrapLow(a1); 119 op[1] = WrapLow(b1); 120 op[2] = WrapLow(c1); 121 op[3] = WrapLow(d1); 122 ip = ip[4..]; 123 op = op[4..]; 124 } 125 126 Span<int> ip2 = output; 127 for (i = 0; i < 4; i++) 128 { 129 a1 = ip2[4 * 0]; 130 c1 = ip2[4 * 1]; 131 d1 = ip2[4 * 2]; 132 b1 = ip2[4 * 3]; 133 a1 += c1; 134 d1 -= b1; 135 e1 = (a1 - d1) >> 1; 136 b1 = e1 - b1; 137 c1 = e1 - c1; 138 a1 -= b1; 139 d1 += c1; 140 dest[stride * 0] = ClipPixelAdd(dest[stride * 0], WrapLow(a1)); 141 dest[stride * 1] = ClipPixelAdd(dest[stride * 1], WrapLow(b1)); 142 dest[stride * 2] = ClipPixelAdd(dest[stride * 2], WrapLow(c1)); 143 dest[stride * 3] = ClipPixelAdd(dest[stride * 3], WrapLow(d1)); 144 145 ip2 = ip2[1..]; 146 dest = dest[1..]; 147 } 148 } 149 150 [SkipLocalsInit] 151 public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 152 { 153 int i; 154 long a1, e1; 155 Span<int> tmp = stackalloc int[4]; 156 ReadOnlySpan<int> ip = input; 157 Span<int> op = tmp; 158 159 a1 = ip[0] >> UnitQuantShift; 160 e1 = a1 >> 1; 161 a1 -= e1; 162 op[0] = WrapLow(a1); 163 op[1] = op[2] = op[3] = WrapLow(e1); 164 165 Span<int> ip2 = tmp; 166 for (i = 0; i < 4; i++) 167 { 168 e1 = ip2[0] >> 1; 169 a1 = ip2[0] - e1; 170 dest[stride * 0] = ClipPixelAdd(dest[stride * 0], a1); 171 dest[stride * 1] = ClipPixelAdd(dest[stride * 1], e1); 172 dest[stride * 2] = ClipPixelAdd(dest[stride * 2], e1); 173 dest[stride * 3] = ClipPixelAdd(dest[stride * 3], e1); 174 ip2 = ip2[1..]; 175 dest = dest[1..]; 176 } 177 } 178 179 public static void Iadst4(ReadOnlySpan<int> input, Span<int> output) 180 { 181 long s0, s1, s2, s3, s4, s5, s6, s7; 182 int x0 = input[0]; 183 int x1 = input[1]; 184 int x2 = input[2]; 185 int x3 = input[3]; 186 187 if ((x0 | x1 | x2 | x3) == 0) 188 { 189 output[..4].Clear(); 190 191 return; 192 } 193 194 // 32-bit result is enough for the following multiplications. 195 s0 = SinPi1_9 * x0; 196 s1 = SinPi2_9 * x0; 197 s2 = SinPi3_9 * x1; 198 s3 = SinPi4_9 * x2; 199 s4 = SinPi1_9 * x2; 200 s5 = SinPi2_9 * x3; 201 s6 = SinPi4_9 * x3; 202 s7 = WrapLow(x0 - x2 + x3); 203 204 s0 = s0 + s3 + s5; 205 s1 = s1 - s4 - s6; 206 s3 = s2; 207 s2 = SinPi3_9 * s7; 208 209 // 1-D transform scaling factor is sqrt(2). 210 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 211 // + 1b (addition) = 29b. 212 // Hence the output bit depth is 15b. 213 output[0] = WrapLow(DctConstRoundShift(s0 + s3)); 214 output[1] = WrapLow(DctConstRoundShift(s1 + s3)); 215 output[2] = WrapLow(DctConstRoundShift(s2)); 216 output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3)); 217 } 218 219 [SkipLocalsInit] 220 public static void Idct4(ReadOnlySpan<int> input, Span<int> output) 221 { 222 Span<short> step = stackalloc short[4]; 223 long temp1, temp2; 224 225 // stage 1 226 temp1 = ((short)input[0] + (short)input[2]) * CosPi16_64; 227 temp2 = ((short)input[0] - (short)input[2]) * CosPi16_64; 228 step[0] = (short)WrapLow(DctConstRoundShift(temp1)); 229 step[1] = (short)WrapLow(DctConstRoundShift(temp2)); 230 temp1 = (short)input[1] * CosPi24_64 - (short)input[3] * CosPi8_64; 231 temp2 = (short)input[1] * CosPi8_64 + (short)input[3] * CosPi24_64; 232 step[2] = (short)WrapLow(DctConstRoundShift(temp1)); 233 step[3] = (short)WrapLow(DctConstRoundShift(temp2)); 234 235 // stage 2 236 output[0] = WrapLow(step[0] + step[3]); 237 output[1] = WrapLow(step[1] + step[2]); 238 output[2] = WrapLow(step[1] - step[2]); 239 output[3] = WrapLow(step[0] - step[3]); 240 } 241 242 [SkipLocalsInit] 243 public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 244 { 245 int i, j; 246 Span<int> output = stackalloc int[4 * 4]; 247 Span<int> outptr = output; 248 Span<int> tempIn = stackalloc int[4]; 249 Span<int> tempOut = stackalloc int[4]; 250 251 // Rows 252 for (i = 0; i < 4; ++i) 253 { 254 Idct4(input, outptr); 255 input = input[4..]; 256 outptr = outptr[4..]; 257 } 258 259 // Columns 260 for (i = 0; i < 4; ++i) 261 { 262 for (j = 0; j < 4; ++j) 263 { 264 tempIn[j] = output[j * 4 + i]; 265 } 266 267 Idct4(tempIn, tempOut); 268 for (j = 0; j < 4; ++j) 269 { 270 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4)); 271 } 272 } 273 } 274 275 public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 276 { 277 int i; 278 long a1; 279 int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); 280 281 output = WrapLow(DctConstRoundShift(output * CosPi16_64)); 282 a1 = BitUtils.RoundPowerOfTwo(output, 4); 283 284 for (i = 0; i < 4; i++) 285 { 286 dest[0] = ClipPixelAdd(dest[0], a1); 287 dest[1] = ClipPixelAdd(dest[1], a1); 288 dest[2] = ClipPixelAdd(dest[2], a1); 289 dest[3] = ClipPixelAdd(dest[3], a1); 290 dest = dest[stride..]; 291 } 292 } 293 294 public static void Iadst8(ReadOnlySpan<int> input, Span<int> output) 295 { 296 int s0, s1, s2, s3, s4, s5, s6, s7; 297 long x0 = input[7]; 298 long x1 = input[0]; 299 long x2 = input[5]; 300 long x3 = input[2]; 301 long x4 = input[3]; 302 long x5 = input[4]; 303 long x6 = input[1]; 304 long x7 = input[6]; 305 306 if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) 307 { 308 output[..8].Clear(); 309 310 return; 311 } 312 313 // stage 1 314 s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1); 315 s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1); 316 s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3); 317 s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3); 318 s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5); 319 s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5); 320 s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7); 321 s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7); 322 323 x0 = WrapLow(DctConstRoundShift(s0 + s4)); 324 x1 = WrapLow(DctConstRoundShift(s1 + s5)); 325 x2 = WrapLow(DctConstRoundShift(s2 + s6)); 326 x3 = WrapLow(DctConstRoundShift(s3 + s7)); 327 x4 = WrapLow(DctConstRoundShift(s0 - s4)); 328 x5 = WrapLow(DctConstRoundShift(s1 - s5)); 329 x6 = WrapLow(DctConstRoundShift(s2 - s6)); 330 x7 = WrapLow(DctConstRoundShift(s3 - s7)); 331 332 // stage 2 333 s0 = (int)x0; 334 s1 = (int)x1; 335 s2 = (int)x2; 336 s3 = (int)x3; 337 s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5); 338 s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5); 339 s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7); 340 s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7); 341 342 x0 = WrapLow(s0 + s2); 343 x1 = WrapLow(s1 + s3); 344 x2 = WrapLow(s0 - s2); 345 x3 = WrapLow(s1 - s3); 346 x4 = WrapLow(DctConstRoundShift(s4 + s6)); 347 x5 = WrapLow(DctConstRoundShift(s5 + s7)); 348 x6 = WrapLow(DctConstRoundShift(s4 - s6)); 349 x7 = WrapLow(DctConstRoundShift(s5 - s7)); 350 351 // stage 3 352 s2 = (int)(CosPi16_64 * (x2 + x3)); 353 s3 = (int)(CosPi16_64 * (x2 - x3)); 354 s6 = (int)(CosPi16_64 * (x6 + x7)); 355 s7 = (int)(CosPi16_64 * (x6 - x7)); 356 357 x2 = WrapLow(DctConstRoundShift(s2)); 358 x3 = WrapLow(DctConstRoundShift(s3)); 359 x6 = WrapLow(DctConstRoundShift(s6)); 360 x7 = WrapLow(DctConstRoundShift(s7)); 361 362 output[0] = WrapLow(x0); 363 output[1] = WrapLow(-x4); 364 output[2] = WrapLow(x6); 365 output[3] = WrapLow(-x2); 366 output[4] = WrapLow(x3); 367 output[5] = WrapLow(-x7); 368 output[6] = WrapLow(x5); 369 output[7] = WrapLow(-x1); 370 } 371 372 [SkipLocalsInit] 373 public static void Idct8(ReadOnlySpan<int> input, Span<int> output) 374 { 375 Span<short> step1 = stackalloc short[8]; 376 Span<short> step2 = stackalloc short[8]; 377 long temp1, temp2; 378 379 // stage 1 380 step1[0] = (short)input[0]; 381 step1[2] = (short)input[4]; 382 step1[1] = (short)input[2]; 383 step1[3] = (short)input[6]; 384 temp1 = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64; 385 temp2 = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64; 386 step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); 387 step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); 388 temp1 = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64; 389 temp2 = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64; 390 step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); 391 step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); 392 393 // stage 2 394 temp1 = (step1[0] + step1[2]) * CosPi16_64; 395 temp2 = (step1[0] - step1[2]) * CosPi16_64; 396 step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); 397 step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); 398 temp1 = step1[1] * CosPi24_64 - step1[3] * CosPi8_64; 399 temp2 = step1[1] * CosPi8_64 + step1[3] * CosPi24_64; 400 step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); 401 step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); 402 step2[4] = (short)WrapLow(step1[4] + step1[5]); 403 step2[5] = (short)WrapLow(step1[4] - step1[5]); 404 step2[6] = (short)WrapLow(-step1[6] + step1[7]); 405 step2[7] = (short)WrapLow(step1[6] + step1[7]); 406 407 // stage 3 408 step1[0] = (short)WrapLow(step2[0] + step2[3]); 409 step1[1] = (short)WrapLow(step2[1] + step2[2]); 410 step1[2] = (short)WrapLow(step2[1] - step2[2]); 411 step1[3] = (short)WrapLow(step2[0] - step2[3]); 412 step1[4] = step2[4]; 413 temp1 = (step2[6] - step2[5]) * CosPi16_64; 414 temp2 = (step2[5] + step2[6]) * CosPi16_64; 415 step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); 416 step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); 417 step1[7] = step2[7]; 418 419 // stage 4 420 output[0] = WrapLow(step1[0] + step1[7]); 421 output[1] = WrapLow(step1[1] + step1[6]); 422 output[2] = WrapLow(step1[2] + step1[5]); 423 output[3] = WrapLow(step1[3] + step1[4]); 424 output[4] = WrapLow(step1[3] - step1[4]); 425 output[5] = WrapLow(step1[2] - step1[5]); 426 output[6] = WrapLow(step1[1] - step1[6]); 427 output[7] = WrapLow(step1[0] - step1[7]); 428 } 429 430 [SkipLocalsInit] 431 public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 432 { 433 int i, j; 434 Span<int> output = stackalloc int[8 * 8]; 435 Span<int> outptr = output; 436 Span<int> tempIn = stackalloc int[8]; 437 Span<int> tempOut = stackalloc int[8]; 438 439 // First transform rows 440 for (i = 0; i < 8; ++i) 441 { 442 Idct8(input, outptr); 443 input = input[8..]; 444 outptr = outptr[8..]; 445 } 446 447 // Then transform columns 448 for (i = 0; i < 8; ++i) 449 { 450 for (j = 0; j < 8; ++j) 451 { 452 tempIn[j] = output[j * 8 + i]; 453 } 454 455 Idct8(tempIn, tempOut); 456 for (j = 0; j < 8; ++j) 457 { 458 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], 459 BitUtils.RoundPowerOfTwo(tempOut[j], 5)); 460 } 461 } 462 } 463 464 [SkipLocalsInit] 465 public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 466 { 467 int i, j; 468 Span<int> output = stackalloc int[8 * 8]; 469 Span<int> outptr = output; 470 Span<int> tempIn = stackalloc int[8]; 471 Span<int> tempOut = stackalloc int[8]; 472 473 output.Clear(); 474 475 // First transform rows 476 // Only first 4 row has non-zero coefs 477 for (i = 0; i < 4; ++i) 478 { 479 Idct8(input, outptr); 480 input = input[8..]; 481 outptr = outptr[8..]; 482 } 483 484 // Then transform columns 485 for (i = 0; i < 8; ++i) 486 { 487 for (j = 0; j < 8; ++j) 488 { 489 tempIn[j] = output[j * 8 + i]; 490 } 491 492 Idct8(tempIn, tempOut); 493 for (j = 0; j < 8; ++j) 494 { 495 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5)); 496 } 497 } 498 } 499 500 public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 501 { 502 int i, j; 503 long a1; 504 int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); 505 506 output = WrapLow(DctConstRoundShift(output * CosPi16_64)); 507 a1 = BitUtils.RoundPowerOfTwo(output, 5); 508 for (j = 0; j < 8; ++j) 509 { 510 for (i = 0; i < 8; ++i) 511 { 512 dest[i] = ClipPixelAdd(dest[i], a1); 513 } 514 515 dest = dest[stride..]; 516 } 517 } 518 519 public static void Iadst16(ReadOnlySpan<int> input, Span<int> output) 520 { 521 long s0, s1, s2, s3, s4, s5, s6, s7, s8; 522 long s9, s10, s11, s12, s13, s14, s15; 523 long x0 = input[15]; 524 long x1 = input[0]; 525 long x2 = input[13]; 526 long x3 = input[2]; 527 long x4 = input[11]; 528 long x5 = input[4]; 529 long x6 = input[9]; 530 long x7 = input[6]; 531 long x8 = input[7]; 532 long x9 = input[8]; 533 long x10 = input[5]; 534 long x11 = input[10]; 535 long x12 = input[3]; 536 long x13 = input[12]; 537 long x14 = input[1]; 538 long x15 = input[14]; 539 540 if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) 541 { 542 output[..16].Clear(); 543 544 return; 545 } 546 547 // stage 1 548 s0 = x0 * CosPi1_64 + x1 * CosPi31_64; 549 s1 = x0 * CosPi31_64 - x1 * CosPi1_64; 550 s2 = x2 * CosPi5_64 + x3 * CosPi27_64; 551 s3 = x2 * CosPi27_64 - x3 * CosPi5_64; 552 s4 = x4 * CosPi9_64 + x5 * CosPi23_64; 553 s5 = x4 * CosPi23_64 - x5 * CosPi9_64; 554 s6 = x6 * CosPi13_64 + x7 * CosPi19_64; 555 s7 = x6 * CosPi19_64 - x7 * CosPi13_64; 556 s8 = x8 * CosPi17_64 + x9 * CosPi15_64; 557 s9 = x8 * CosPi15_64 - x9 * CosPi17_64; 558 s10 = x10 * CosPi21_64 + x11 * CosPi11_64; 559 s11 = x10 * CosPi11_64 - x11 * CosPi21_64; 560 s12 = x12 * CosPi25_64 + x13 * CosPi7_64; 561 s13 = x12 * CosPi7_64 - x13 * CosPi25_64; 562 s14 = x14 * CosPi29_64 + x15 * CosPi3_64; 563 s15 = x14 * CosPi3_64 - x15 * CosPi29_64; 564 565 x0 = WrapLow(DctConstRoundShift(s0 + s8)); 566 x1 = WrapLow(DctConstRoundShift(s1 + s9)); 567 x2 = WrapLow(DctConstRoundShift(s2 + s10)); 568 x3 = WrapLow(DctConstRoundShift(s3 + s11)); 569 x4 = WrapLow(DctConstRoundShift(s4 + s12)); 570 x5 = WrapLow(DctConstRoundShift(s5 + s13)); 571 x6 = WrapLow(DctConstRoundShift(s6 + s14)); 572 x7 = WrapLow(DctConstRoundShift(s7 + s15)); 573 x8 = WrapLow(DctConstRoundShift(s0 - s8)); 574 x9 = WrapLow(DctConstRoundShift(s1 - s9)); 575 x10 = WrapLow(DctConstRoundShift(s2 - s10)); 576 x11 = WrapLow(DctConstRoundShift(s3 - s11)); 577 x12 = WrapLow(DctConstRoundShift(s4 - s12)); 578 x13 = WrapLow(DctConstRoundShift(s5 - s13)); 579 x14 = WrapLow(DctConstRoundShift(s6 - s14)); 580 x15 = WrapLow(DctConstRoundShift(s7 - s15)); 581 582 // stage 2 583 s0 = x0; 584 s1 = x1; 585 s2 = x2; 586 s3 = x3; 587 s4 = x4; 588 s5 = x5; 589 s6 = x6; 590 s7 = x7; 591 s8 = x8 * CosPi4_64 + x9 * CosPi28_64; 592 s9 = x8 * CosPi28_64 - x9 * CosPi4_64; 593 s10 = x10 * CosPi20_64 + x11 * CosPi12_64; 594 s11 = x10 * CosPi12_64 - x11 * CosPi20_64; 595 s12 = -x12 * CosPi28_64 + x13 * CosPi4_64; 596 s13 = x12 * CosPi4_64 + x13 * CosPi28_64; 597 s14 = -x14 * CosPi12_64 + x15 * CosPi20_64; 598 s15 = x14 * CosPi20_64 + x15 * CosPi12_64; 599 600 x0 = WrapLow(s0 + s4); 601 x1 = WrapLow(s1 + s5); 602 x2 = WrapLow(s2 + s6); 603 x3 = WrapLow(s3 + s7); 604 x4 = WrapLow(s0 - s4); 605 x5 = WrapLow(s1 - s5); 606 x6 = WrapLow(s2 - s6); 607 x7 = WrapLow(s3 - s7); 608 x8 = WrapLow(DctConstRoundShift(s8 + s12)); 609 x9 = WrapLow(DctConstRoundShift(s9 + s13)); 610 x10 = WrapLow(DctConstRoundShift(s10 + s14)); 611 x11 = WrapLow(DctConstRoundShift(s11 + s15)); 612 x12 = WrapLow(DctConstRoundShift(s8 - s12)); 613 x13 = WrapLow(DctConstRoundShift(s9 - s13)); 614 x14 = WrapLow(DctConstRoundShift(s10 - s14)); 615 x15 = WrapLow(DctConstRoundShift(s11 - s15)); 616 617 // stage 3 618 s0 = x0; 619 s1 = x1; 620 s2 = x2; 621 s3 = x3; 622 s4 = x4 * CosPi8_64 + x5 * CosPi24_64; 623 s5 = x4 * CosPi24_64 - x5 * CosPi8_64; 624 s6 = -x6 * CosPi24_64 + x7 * CosPi8_64; 625 s7 = x6 * CosPi8_64 + x7 * CosPi24_64; 626 s8 = x8; 627 s9 = x9; 628 s10 = x10; 629 s11 = x11; 630 s12 = x12 * CosPi8_64 + x13 * CosPi24_64; 631 s13 = x12 * CosPi24_64 - x13 * CosPi8_64; 632 s14 = -x14 * CosPi24_64 + x15 * CosPi8_64; 633 s15 = x14 * CosPi8_64 + x15 * CosPi24_64; 634 635 x0 = WrapLow(s0 + s2); 636 x1 = WrapLow(s1 + s3); 637 x2 = WrapLow(s0 - s2); 638 x3 = WrapLow(s1 - s3); 639 x4 = WrapLow(DctConstRoundShift(s4 + s6)); 640 x5 = WrapLow(DctConstRoundShift(s5 + s7)); 641 x6 = WrapLow(DctConstRoundShift(s4 - s6)); 642 x7 = WrapLow(DctConstRoundShift(s5 - s7)); 643 x8 = WrapLow(s8 + s10); 644 x9 = WrapLow(s9 + s11); 645 x10 = WrapLow(s8 - s10); 646 x11 = WrapLow(s9 - s11); 647 x12 = WrapLow(DctConstRoundShift(s12 + s14)); 648 x13 = WrapLow(DctConstRoundShift(s13 + s15)); 649 x14 = WrapLow(DctConstRoundShift(s12 - s14)); 650 x15 = WrapLow(DctConstRoundShift(s13 - s15)); 651 652 // stage 4 653 s2 = (-CosPi16_64) * (x2 + x3); 654 s3 = CosPi16_64 * (x2 - x3); 655 s6 = CosPi16_64 * (x6 + x7); 656 s7 = CosPi16_64 * (-x6 + x7); 657 s10 = CosPi16_64 * (x10 + x11); 658 s11 = CosPi16_64 * (-x10 + x11); 659 s14 = (-CosPi16_64) * (x14 + x15); 660 s15 = CosPi16_64 * (x14 - x15); 661 662 x2 = WrapLow(DctConstRoundShift(s2)); 663 x3 = WrapLow(DctConstRoundShift(s3)); 664 x6 = WrapLow(DctConstRoundShift(s6)); 665 x7 = WrapLow(DctConstRoundShift(s7)); 666 x10 = WrapLow(DctConstRoundShift(s10)); 667 x11 = WrapLow(DctConstRoundShift(s11)); 668 x14 = WrapLow(DctConstRoundShift(s14)); 669 x15 = WrapLow(DctConstRoundShift(s15)); 670 671 output[0] = WrapLow(x0); 672 output[1] = WrapLow(-x8); 673 output[2] = WrapLow(x12); 674 output[3] = WrapLow(-x4); 675 output[4] = WrapLow(x6); 676 output[5] = WrapLow(x14); 677 output[6] = WrapLow(x10); 678 output[7] = WrapLow(x2); 679 output[8] = WrapLow(x3); 680 output[9] = WrapLow(x11); 681 output[10] = WrapLow(x15); 682 output[11] = WrapLow(x7); 683 output[12] = WrapLow(x5); 684 output[13] = WrapLow(-x13); 685 output[14] = WrapLow(x9); 686 output[15] = WrapLow(-x1); 687 } 688 689 [SkipLocalsInit] 690 public static void Idct16(ReadOnlySpan<int> input, Span<int> output) 691 { 692 Span<short> step1 = stackalloc short[16]; 693 Span<short> step2 = stackalloc short[16]; 694 long temp1, temp2; 695 696 // stage 1 697 step1[0] = (short)input[0 / 2]; 698 step1[1] = (short)input[16 / 2]; 699 step1[2] = (short)input[8 / 2]; 700 step1[3] = (short)input[24 / 2]; 701 step1[4] = (short)input[4 / 2]; 702 step1[5] = (short)input[20 / 2]; 703 step1[6] = (short)input[12 / 2]; 704 step1[7] = (short)input[28 / 2]; 705 step1[8] = (short)input[2 / 2]; 706 step1[9] = (short)input[18 / 2]; 707 step1[10] = (short)input[10 / 2]; 708 step1[11] = (short)input[26 / 2]; 709 step1[12] = (short)input[6 / 2]; 710 step1[13] = (short)input[22 / 2]; 711 step1[14] = (short)input[14 / 2]; 712 step1[15] = (short)input[30 / 2]; 713 714 // stage 2 715 step2[0] = step1[0]; 716 step2[1] = step1[1]; 717 step2[2] = step1[2]; 718 step2[3] = step1[3]; 719 step2[4] = step1[4]; 720 step2[5] = step1[5]; 721 step2[6] = step1[6]; 722 step2[7] = step1[7]; 723 724 temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64; 725 temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64; 726 step2[8] = (short)WrapLow(DctConstRoundShift(temp1)); 727 step2[15] = (short)WrapLow(DctConstRoundShift(temp2)); 728 729 temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64; 730 temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64; 731 step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); 732 step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); 733 734 temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64; 735 temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64; 736 step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 737 step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); 738 739 temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64; 740 temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64; 741 step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); 742 step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); 743 744 // stage 3 745 step1[0] = step2[0]; 746 step1[1] = step2[1]; 747 step1[2] = step2[2]; 748 step1[3] = step2[3]; 749 750 temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64; 751 temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64; 752 step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); 753 step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); 754 temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64; 755 temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64; 756 step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); 757 step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); 758 759 step1[8] = (short)WrapLow(step2[8] + step2[9]); 760 step1[9] = (short)WrapLow(step2[8] - step2[9]); 761 step1[10] = (short)WrapLow(-step2[10] + step2[11]); 762 step1[11] = (short)WrapLow(step2[10] + step2[11]); 763 step1[12] = (short)WrapLow(step2[12] + step2[13]); 764 step1[13] = (short)WrapLow(step2[12] - step2[13]); 765 step1[14] = (short)WrapLow(-step2[14] + step2[15]); 766 step1[15] = (short)WrapLow(step2[14] + step2[15]); 767 768 // stage 4 769 temp1 = (step1[0] + step1[1]) * CosPi16_64; 770 temp2 = (step1[0] - step1[1]) * CosPi16_64; 771 step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); 772 step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); 773 temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64; 774 temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64; 775 step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); 776 step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); 777 step2[4] = (short)WrapLow(step1[4] + step1[5]); 778 step2[5] = (short)WrapLow(step1[4] - step1[5]); 779 step2[6] = (short)WrapLow(-step1[6] + step1[7]); 780 step2[7] = (short)WrapLow(step1[6] + step1[7]); 781 782 step2[8] = step1[8]; 783 step2[15] = step1[15]; 784 temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64; 785 temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64; 786 step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); 787 step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); 788 temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64; 789 temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64; 790 step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 791 step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); 792 step2[11] = step1[11]; 793 step2[12] = step1[12]; 794 795 // stage 5 796 step1[0] = (short)WrapLow(step2[0] + step2[3]); 797 step1[1] = (short)WrapLow(step2[1] + step2[2]); 798 step1[2] = (short)WrapLow(step2[1] - step2[2]); 799 step1[3] = (short)WrapLow(step2[0] - step2[3]); 800 step1[4] = step2[4]; 801 temp1 = (step2[6] - step2[5]) * CosPi16_64; 802 temp2 = (step2[5] + step2[6]) * CosPi16_64; 803 step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); 804 step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); 805 step1[7] = step2[7]; 806 807 step1[8] = (short)WrapLow(step2[8] + step2[11]); 808 step1[9] = (short)WrapLow(step2[9] + step2[10]); 809 step1[10] = (short)WrapLow(step2[9] - step2[10]); 810 step1[11] = (short)WrapLow(step2[8] - step2[11]); 811 step1[12] = (short)WrapLow(-step2[12] + step2[15]); 812 step1[13] = (short)WrapLow(-step2[13] + step2[14]); 813 step1[14] = (short)WrapLow(step2[13] + step2[14]); 814 step1[15] = (short)WrapLow(step2[12] + step2[15]); 815 816 // stage 6 817 step2[0] = (short)WrapLow(step1[0] + step1[7]); 818 step2[1] = (short)WrapLow(step1[1] + step1[6]); 819 step2[2] = (short)WrapLow(step1[2] + step1[5]); 820 step2[3] = (short)WrapLow(step1[3] + step1[4]); 821 step2[4] = (short)WrapLow(step1[3] - step1[4]); 822 step2[5] = (short)WrapLow(step1[2] - step1[5]); 823 step2[6] = (short)WrapLow(step1[1] - step1[6]); 824 step2[7] = (short)WrapLow(step1[0] - step1[7]); 825 step2[8] = step1[8]; 826 step2[9] = step1[9]; 827 temp1 = (-step1[10] + step1[13]) * CosPi16_64; 828 temp2 = (step1[10] + step1[13]) * CosPi16_64; 829 step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 830 step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); 831 temp1 = (-step1[11] + step1[12]) * CosPi16_64; 832 temp2 = (step1[11] + step1[12]) * CosPi16_64; 833 step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); 834 step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); 835 step2[14] = step1[14]; 836 step2[15] = step1[15]; 837 838 // stage 7 839 output[0] = WrapLow(step2[0] + step2[15]); 840 output[1] = WrapLow(step2[1] + step2[14]); 841 output[2] = WrapLow(step2[2] + step2[13]); 842 output[3] = WrapLow(step2[3] + step2[12]); 843 output[4] = WrapLow(step2[4] + step2[11]); 844 output[5] = WrapLow(step2[5] + step2[10]); 845 output[6] = WrapLow(step2[6] + step2[9]); 846 output[7] = WrapLow(step2[7] + step2[8]); 847 output[8] = WrapLow(step2[7] - step2[8]); 848 output[9] = WrapLow(step2[6] - step2[9]); 849 output[10] = WrapLow(step2[5] - step2[10]); 850 output[11] = WrapLow(step2[4] - step2[11]); 851 output[12] = WrapLow(step2[3] - step2[12]); 852 output[13] = WrapLow(step2[2] - step2[13]); 853 output[14] = WrapLow(step2[1] - step2[14]); 854 output[15] = WrapLow(step2[0] - step2[15]); 855 } 856 857 [SkipLocalsInit] 858 public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 859 { 860 int i, j; 861 Span<int> output = stackalloc int[16 * 16]; 862 Span<int> outptr = output; 863 Span<int> tempIn = stackalloc int[16]; 864 Span<int> tempOut = stackalloc int[16]; 865 866 // First transform rows 867 for (i = 0; i < 16; ++i) 868 { 869 Idct16(input, outptr); 870 input = input[16..]; 871 outptr = outptr[16..]; 872 } 873 874 // Then transform columns 875 for (i = 0; i < 16; ++i) 876 { 877 for (j = 0; j < 16; ++j) 878 { 879 tempIn[j] = output[j * 16 + i]; 880 } 881 882 Idct16(tempIn, tempOut); 883 for (j = 0; j < 16; ++j) 884 { 885 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); 886 } 887 } 888 } 889 890 [SkipLocalsInit] 891 public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 892 { 893 int i, j; 894 Span<int> output = stackalloc int[16 * 16]; 895 Span<int> outptr = output; 896 Span<int> tempIn = stackalloc int[16]; 897 Span<int> tempOut = stackalloc int[16]; 898 899 output.Clear(); 900 901 // First transform rows. Since all non-zero dct coefficients are in 902 // upper-left 8x8 area, we only need to calculate first 8 rows here. 903 for (i = 0; i < 8; ++i) 904 { 905 Idct16(input, outptr); 906 input = input[16..]; 907 outptr = outptr[16..]; 908 } 909 910 // Then transform columns 911 for (i = 0; i < 16; ++i) 912 { 913 for (j = 0; j < 16; ++j) 914 { 915 tempIn[j] = output[j * 16 + i]; 916 } 917 918 Idct16(tempIn, tempOut); 919 for (j = 0; j < 16; ++j) 920 { 921 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); 922 } 923 } 924 } 925 926 [SkipLocalsInit] 927 public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 928 { 929 int i, j; 930 Span<int> output = stackalloc int[16 * 16]; 931 Span<int> outptr = output; 932 Span<int> tempIn = stackalloc int[16]; 933 Span<int> tempOut = stackalloc int[16]; 934 935 output.Clear(); 936 937 // First transform rows. Since all non-zero dct coefficients are in 938 // upper-left 4x4 area, we only need to calculate first 4 rows here. 939 for (i = 0; i < 4; ++i) 940 { 941 Idct16(input, outptr); 942 input = input[16..]; 943 outptr = outptr[16..]; 944 } 945 946 // Then transform columns 947 for (i = 0; i < 16; ++i) 948 { 949 for (j = 0; j < 16; ++j) 950 { 951 tempIn[j] = output[j * 16 + i]; 952 } 953 954 Idct16(tempIn, tempOut); 955 for (j = 0; j < 16; ++j) 956 { 957 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); 958 } 959 } 960 } 961 962 public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 963 { 964 int i, j; 965 long a1; 966 int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); 967 968 output = WrapLow(DctConstRoundShift(output * CosPi16_64)); 969 a1 = BitUtils.RoundPowerOfTwo(output, 6); 970 for (j = 0; j < 16; ++j) 971 { 972 for (i = 0; i < 16; ++i) 973 { 974 dest[i] = ClipPixelAdd(dest[i], a1); 975 } 976 977 dest = dest[stride..]; 978 } 979 } 980 981 [SkipLocalsInit] 982 public static void Idct32(ReadOnlySpan<int> input, Span<int> output) 983 { 984 Span<short> step1 = stackalloc short[32]; 985 Span<short> step2 = stackalloc short[32]; 986 long temp1, temp2; 987 988 // stage 1 989 step1[0] = (short)input[0]; 990 step1[1] = (short)input[16]; 991 step1[2] = (short)input[8]; 992 step1[3] = (short)input[24]; 993 step1[4] = (short)input[4]; 994 step1[5] = (short)input[20]; 995 step1[6] = (short)input[12]; 996 step1[7] = (short)input[28]; 997 step1[8] = (short)input[2]; 998 step1[9] = (short)input[18]; 999 step1[10] = (short)input[10]; 1000 step1[11] = (short)input[26]; 1001 step1[12] = (short)input[6]; 1002 step1[13] = (short)input[22]; 1003 step1[14] = (short)input[14]; 1004 step1[15] = (short)input[30]; 1005 1006 temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64; 1007 temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64; 1008 step1[16] = (short)WrapLow(DctConstRoundShift(temp1)); 1009 step1[31] = (short)WrapLow(DctConstRoundShift(temp2)); 1010 1011 temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64; 1012 temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64; 1013 step1[17] = (short)WrapLow(DctConstRoundShift(temp1)); 1014 step1[30] = (short)WrapLow(DctConstRoundShift(temp2)); 1015 1016 temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64; 1017 temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64; 1018 step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); 1019 step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); 1020 1021 temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64; 1022 temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64; 1023 step1[19] = (short)WrapLow(DctConstRoundShift(temp1)); 1024 step1[28] = (short)WrapLow(DctConstRoundShift(temp2)); 1025 1026 temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64; 1027 temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64; 1028 step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); 1029 step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); 1030 1031 temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64; 1032 temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64; 1033 step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); 1034 step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); 1035 1036 temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64; 1037 temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64; 1038 step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); 1039 step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); 1040 1041 temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64; 1042 temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64; 1043 step1[23] = (short)WrapLow(DctConstRoundShift(temp1)); 1044 step1[24] = (short)WrapLow(DctConstRoundShift(temp2)); 1045 1046 // stage 2 1047 step2[0] = step1[0]; 1048 step2[1] = step1[1]; 1049 step2[2] = step1[2]; 1050 step2[3] = step1[3]; 1051 step2[4] = step1[4]; 1052 step2[5] = step1[5]; 1053 step2[6] = step1[6]; 1054 step2[7] = step1[7]; 1055 1056 temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64; 1057 temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64; 1058 step2[8] = (short)WrapLow(DctConstRoundShift(temp1)); 1059 step2[15] = (short)WrapLow(DctConstRoundShift(temp2)); 1060 1061 temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64; 1062 temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64; 1063 step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); 1064 step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); 1065 1066 temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64; 1067 temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64; 1068 step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 1069 step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); 1070 1071 temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64; 1072 temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64; 1073 step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); 1074 step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); 1075 1076 step2[16] = (short)WrapLow(step1[16] + step1[17]); 1077 step2[17] = (short)WrapLow(step1[16] - step1[17]); 1078 step2[18] = (short)WrapLow(-step1[18] + step1[19]); 1079 step2[19] = (short)WrapLow(step1[18] + step1[19]); 1080 step2[20] = (short)WrapLow(step1[20] + step1[21]); 1081 step2[21] = (short)WrapLow(step1[20] - step1[21]); 1082 step2[22] = (short)WrapLow(-step1[22] + step1[23]); 1083 step2[23] = (short)WrapLow(step1[22] + step1[23]); 1084 step2[24] = (short)WrapLow(step1[24] + step1[25]); 1085 step2[25] = (short)WrapLow(step1[24] - step1[25]); 1086 step2[26] = (short)WrapLow(-step1[26] + step1[27]); 1087 step2[27] = (short)WrapLow(step1[26] + step1[27]); 1088 step2[28] = (short)WrapLow(step1[28] + step1[29]); 1089 step2[29] = (short)WrapLow(step1[28] - step1[29]); 1090 step2[30] = (short)WrapLow(-step1[30] + step1[31]); 1091 step2[31] = (short)WrapLow(step1[30] + step1[31]); 1092 1093 // stage 3 1094 step1[0] = step2[0]; 1095 step1[1] = step2[1]; 1096 step1[2] = step2[2]; 1097 step1[3] = step2[3]; 1098 1099 temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64; 1100 temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64; 1101 step1[4] = (short)WrapLow(DctConstRoundShift(temp1)); 1102 step1[7] = (short)WrapLow(DctConstRoundShift(temp2)); 1103 temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64; 1104 temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64; 1105 step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); 1106 step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); 1107 1108 step1[8] = (short)WrapLow(step2[8] + step2[9]); 1109 step1[9] = (short)WrapLow(step2[8] - step2[9]); 1110 step1[10] = (short)WrapLow(-step2[10] + step2[11]); 1111 step1[11] = (short)WrapLow(step2[10] + step2[11]); 1112 step1[12] = (short)WrapLow(step2[12] + step2[13]); 1113 step1[13] = (short)WrapLow(step2[12] - step2[13]); 1114 step1[14] = (short)WrapLow(-step2[14] + step2[15]); 1115 step1[15] = (short)WrapLow(step2[14] + step2[15]); 1116 1117 step1[16] = step2[16]; 1118 step1[31] = step2[31]; 1119 temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64; 1120 temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64; 1121 step1[17] = (short)WrapLow(DctConstRoundShift(temp1)); 1122 step1[30] = (short)WrapLow(DctConstRoundShift(temp2)); 1123 temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64; 1124 temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64; 1125 step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); 1126 step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); 1127 step1[19] = step2[19]; 1128 step1[20] = step2[20]; 1129 temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64; 1130 temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64; 1131 step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); 1132 step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); 1133 temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64; 1134 temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64; 1135 step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); 1136 step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); 1137 step1[23] = step2[23]; 1138 step1[24] = step2[24]; 1139 step1[27] = step2[27]; 1140 step1[28] = step2[28]; 1141 1142 // stage 4 1143 temp1 = (step1[0] + step1[1]) * CosPi16_64; 1144 temp2 = (step1[0] - step1[1]) * CosPi16_64; 1145 step2[0] = (short)WrapLow(DctConstRoundShift(temp1)); 1146 step2[1] = (short)WrapLow(DctConstRoundShift(temp2)); 1147 temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64; 1148 temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64; 1149 step2[2] = (short)WrapLow(DctConstRoundShift(temp1)); 1150 step2[3] = (short)WrapLow(DctConstRoundShift(temp2)); 1151 step2[4] = (short)WrapLow(step1[4] + step1[5]); 1152 step2[5] = (short)WrapLow(step1[4] - step1[5]); 1153 step2[6] = (short)WrapLow(-step1[6] + step1[7]); 1154 step2[7] = (short)WrapLow(step1[6] + step1[7]); 1155 1156 step2[8] = step1[8]; 1157 step2[15] = step1[15]; 1158 temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64; 1159 temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64; 1160 step2[9] = (short)WrapLow(DctConstRoundShift(temp1)); 1161 step2[14] = (short)WrapLow(DctConstRoundShift(temp2)); 1162 temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64; 1163 temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64; 1164 step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 1165 step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); 1166 step2[11] = step1[11]; 1167 step2[12] = step1[12]; 1168 1169 step2[16] = (short)WrapLow(step1[16] + step1[19]); 1170 step2[17] = (short)WrapLow(step1[17] + step1[18]); 1171 step2[18] = (short)WrapLow(step1[17] - step1[18]); 1172 step2[19] = (short)WrapLow(step1[16] - step1[19]); 1173 step2[20] = (short)WrapLow(-step1[20] + step1[23]); 1174 step2[21] = (short)WrapLow(-step1[21] + step1[22]); 1175 step2[22] = (short)WrapLow(step1[21] + step1[22]); 1176 step2[23] = (short)WrapLow(step1[20] + step1[23]); 1177 1178 step2[24] = (short)WrapLow(step1[24] + step1[27]); 1179 step2[25] = (short)WrapLow(step1[25] + step1[26]); 1180 step2[26] = (short)WrapLow(step1[25] - step1[26]); 1181 step2[27] = (short)WrapLow(step1[24] - step1[27]); 1182 step2[28] = (short)WrapLow(-step1[28] + step1[31]); 1183 step2[29] = (short)WrapLow(-step1[29] + step1[30]); 1184 step2[30] = (short)WrapLow(step1[29] + step1[30]); 1185 step2[31] = (short)WrapLow(step1[28] + step1[31]); 1186 1187 // stage 5 1188 step1[0] = (short)WrapLow(step2[0] + step2[3]); 1189 step1[1] = (short)WrapLow(step2[1] + step2[2]); 1190 step1[2] = (short)WrapLow(step2[1] - step2[2]); 1191 step1[3] = (short)WrapLow(step2[0] - step2[3]); 1192 step1[4] = step2[4]; 1193 temp1 = (step2[6] - step2[5]) * CosPi16_64; 1194 temp2 = (step2[5] + step2[6]) * CosPi16_64; 1195 step1[5] = (short)WrapLow(DctConstRoundShift(temp1)); 1196 step1[6] = (short)WrapLow(DctConstRoundShift(temp2)); 1197 step1[7] = step2[7]; 1198 1199 step1[8] = (short)WrapLow(step2[8] + step2[11]); 1200 step1[9] = (short)WrapLow(step2[9] + step2[10]); 1201 step1[10] = (short)WrapLow(step2[9] - step2[10]); 1202 step1[11] = (short)WrapLow(step2[8] - step2[11]); 1203 step1[12] = (short)WrapLow(-step2[12] + step2[15]); 1204 step1[13] = (short)WrapLow(-step2[13] + step2[14]); 1205 step1[14] = (short)WrapLow(step2[13] + step2[14]); 1206 step1[15] = (short)WrapLow(step2[12] + step2[15]); 1207 1208 step1[16] = step2[16]; 1209 step1[17] = step2[17]; 1210 temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64; 1211 temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64; 1212 step1[18] = (short)WrapLow(DctConstRoundShift(temp1)); 1213 step1[29] = (short)WrapLow(DctConstRoundShift(temp2)); 1214 temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64; 1215 temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64; 1216 step1[19] = (short)WrapLow(DctConstRoundShift(temp1)); 1217 step1[28] = (short)WrapLow(DctConstRoundShift(temp2)); 1218 temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64; 1219 temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64; 1220 step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); 1221 step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); 1222 temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64; 1223 temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64; 1224 step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); 1225 step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); 1226 step1[22] = step2[22]; 1227 step1[23] = step2[23]; 1228 step1[24] = step2[24]; 1229 step1[25] = step2[25]; 1230 step1[30] = step2[30]; 1231 step1[31] = step2[31]; 1232 1233 // stage 6 1234 step2[0] = (short)WrapLow(step1[0] + step1[7]); 1235 step2[1] = (short)WrapLow(step1[1] + step1[6]); 1236 step2[2] = (short)WrapLow(step1[2] + step1[5]); 1237 step2[3] = (short)WrapLow(step1[3] + step1[4]); 1238 step2[4] = (short)WrapLow(step1[3] - step1[4]); 1239 step2[5] = (short)WrapLow(step1[2] - step1[5]); 1240 step2[6] = (short)WrapLow(step1[1] - step1[6]); 1241 step2[7] = (short)WrapLow(step1[0] - step1[7]); 1242 step2[8] = step1[8]; 1243 step2[9] = step1[9]; 1244 temp1 = (-step1[10] + step1[13]) * CosPi16_64; 1245 temp2 = (step1[10] + step1[13]) * CosPi16_64; 1246 step2[10] = (short)WrapLow(DctConstRoundShift(temp1)); 1247 step2[13] = (short)WrapLow(DctConstRoundShift(temp2)); 1248 temp1 = (-step1[11] + step1[12]) * CosPi16_64; 1249 temp2 = (step1[11] + step1[12]) * CosPi16_64; 1250 step2[11] = (short)WrapLow(DctConstRoundShift(temp1)); 1251 step2[12] = (short)WrapLow(DctConstRoundShift(temp2)); 1252 step2[14] = step1[14]; 1253 step2[15] = step1[15]; 1254 1255 step2[16] = (short)WrapLow(step1[16] + step1[23]); 1256 step2[17] = (short)WrapLow(step1[17] + step1[22]); 1257 step2[18] = (short)WrapLow(step1[18] + step1[21]); 1258 step2[19] = (short)WrapLow(step1[19] + step1[20]); 1259 step2[20] = (short)WrapLow(step1[19] - step1[20]); 1260 step2[21] = (short)WrapLow(step1[18] - step1[21]); 1261 step2[22] = (short)WrapLow(step1[17] - step1[22]); 1262 step2[23] = (short)WrapLow(step1[16] - step1[23]); 1263 1264 step2[24] = (short)WrapLow(-step1[24] + step1[31]); 1265 step2[25] = (short)WrapLow(-step1[25] + step1[30]); 1266 step2[26] = (short)WrapLow(-step1[26] + step1[29]); 1267 step2[27] = (short)WrapLow(-step1[27] + step1[28]); 1268 step2[28] = (short)WrapLow(step1[27] + step1[28]); 1269 step2[29] = (short)WrapLow(step1[26] + step1[29]); 1270 step2[30] = (short)WrapLow(step1[25] + step1[30]); 1271 step2[31] = (short)WrapLow(step1[24] + step1[31]); 1272 1273 // stage 7 1274 step1[0] = (short)WrapLow(step2[0] + step2[15]); 1275 step1[1] = (short)WrapLow(step2[1] + step2[14]); 1276 step1[2] = (short)WrapLow(step2[2] + step2[13]); 1277 step1[3] = (short)WrapLow(step2[3] + step2[12]); 1278 step1[4] = (short)WrapLow(step2[4] + step2[11]); 1279 step1[5] = (short)WrapLow(step2[5] + step2[10]); 1280 step1[6] = (short)WrapLow(step2[6] + step2[9]); 1281 step1[7] = (short)WrapLow(step2[7] + step2[8]); 1282 step1[8] = (short)WrapLow(step2[7] - step2[8]); 1283 step1[9] = (short)WrapLow(step2[6] - step2[9]); 1284 step1[10] = (short)WrapLow(step2[5] - step2[10]); 1285 step1[11] = (short)WrapLow(step2[4] - step2[11]); 1286 step1[12] = (short)WrapLow(step2[3] - step2[12]); 1287 step1[13] = (short)WrapLow(step2[2] - step2[13]); 1288 step1[14] = (short)WrapLow(step2[1] - step2[14]); 1289 step1[15] = (short)WrapLow(step2[0] - step2[15]); 1290 1291 step1[16] = step2[16]; 1292 step1[17] = step2[17]; 1293 step1[18] = step2[18]; 1294 step1[19] = step2[19]; 1295 temp1 = (-step2[20] + step2[27]) * CosPi16_64; 1296 temp2 = (step2[20] + step2[27]) * CosPi16_64; 1297 step1[20] = (short)WrapLow(DctConstRoundShift(temp1)); 1298 step1[27] = (short)WrapLow(DctConstRoundShift(temp2)); 1299 temp1 = (-step2[21] + step2[26]) * CosPi16_64; 1300 temp2 = (step2[21] + step2[26]) * CosPi16_64; 1301 step1[21] = (short)WrapLow(DctConstRoundShift(temp1)); 1302 step1[26] = (short)WrapLow(DctConstRoundShift(temp2)); 1303 temp1 = (-step2[22] + step2[25]) * CosPi16_64; 1304 temp2 = (step2[22] + step2[25]) * CosPi16_64; 1305 step1[22] = (short)WrapLow(DctConstRoundShift(temp1)); 1306 step1[25] = (short)WrapLow(DctConstRoundShift(temp2)); 1307 temp1 = (-step2[23] + step2[24]) * CosPi16_64; 1308 temp2 = (step2[23] + step2[24]) * CosPi16_64; 1309 step1[23] = (short)WrapLow(DctConstRoundShift(temp1)); 1310 step1[24] = (short)WrapLow(DctConstRoundShift(temp2)); 1311 step1[28] = step2[28]; 1312 step1[29] = step2[29]; 1313 step1[30] = step2[30]; 1314 step1[31] = step2[31]; 1315 1316 // final stage 1317 output[0] = WrapLow(step1[0] + step1[31]); 1318 output[1] = WrapLow(step1[1] + step1[30]); 1319 output[2] = WrapLow(step1[2] + step1[29]); 1320 output[3] = WrapLow(step1[3] + step1[28]); 1321 output[4] = WrapLow(step1[4] + step1[27]); 1322 output[5] = WrapLow(step1[5] + step1[26]); 1323 output[6] = WrapLow(step1[6] + step1[25]); 1324 output[7] = WrapLow(step1[7] + step1[24]); 1325 output[8] = WrapLow(step1[8] + step1[23]); 1326 output[9] = WrapLow(step1[9] + step1[22]); 1327 output[10] = WrapLow(step1[10] + step1[21]); 1328 output[11] = WrapLow(step1[11] + step1[20]); 1329 output[12] = WrapLow(step1[12] + step1[19]); 1330 output[13] = WrapLow(step1[13] + step1[18]); 1331 output[14] = WrapLow(step1[14] + step1[17]); 1332 output[15] = WrapLow(step1[15] + step1[16]); 1333 output[16] = WrapLow(step1[15] - step1[16]); 1334 output[17] = WrapLow(step1[14] - step1[17]); 1335 output[18] = WrapLow(step1[13] - step1[18]); 1336 output[19] = WrapLow(step1[12] - step1[19]); 1337 output[20] = WrapLow(step1[11] - step1[20]); 1338 output[21] = WrapLow(step1[10] - step1[21]); 1339 output[22] = WrapLow(step1[9] - step1[22]); 1340 output[23] = WrapLow(step1[8] - step1[23]); 1341 output[24] = WrapLow(step1[7] - step1[24]); 1342 output[25] = WrapLow(step1[6] - step1[25]); 1343 output[26] = WrapLow(step1[5] - step1[26]); 1344 output[27] = WrapLow(step1[4] - step1[27]); 1345 output[28] = WrapLow(step1[3] - step1[28]); 1346 output[29] = WrapLow(step1[2] - step1[29]); 1347 output[30] = WrapLow(step1[1] - step1[30]); 1348 output[31] = WrapLow(step1[0] - step1[31]); 1349 } 1350 1351 [SkipLocalsInit] 1352 public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 1353 { 1354 int i, j; 1355 Span<int> output = stackalloc int[32 * 32]; 1356 Span<int> outptr = output; 1357 Span<int> tempIn = stackalloc int[32]; 1358 Span<int> tempOut = stackalloc int[32]; 1359 1360 // Rows 1361 for (i = 0; i < 32; ++i) 1362 { 1363 short zeroCoeff = 0; 1364 for (j = 0; j < 32; ++j) 1365 { 1366 zeroCoeff |= (short)input[j]; 1367 } 1368 1369 if (zeroCoeff != 0) 1370 { 1371 Idct32(input, outptr); 1372 } 1373 else 1374 { 1375 outptr[..32].Clear(); 1376 } 1377 1378 input = input[32..]; 1379 outptr = outptr[32..]; 1380 } 1381 1382 // Columns 1383 for (i = 0; i < 32; ++i) 1384 { 1385 for (j = 0; j < 32; ++j) 1386 { 1387 tempIn[j] = output[j * 32 + i]; 1388 } 1389 1390 Idct32(tempIn, tempOut); 1391 for (j = 0; j < 32; ++j) 1392 { 1393 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); 1394 } 1395 } 1396 } 1397 1398 [SkipLocalsInit] 1399 public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 1400 { 1401 int i, j; 1402 Span<int> output = stackalloc int[32 * 32]; 1403 Span<int> outptr = output; 1404 Span<int> tempIn = stackalloc int[32]; 1405 Span<int> tempOut = stackalloc int[32]; 1406 1407 output.Clear(); 1408 1409 // Rows 1410 // Only upper-left 16x16 has non-zero coeff 1411 for (i = 0; i < 16; ++i) 1412 { 1413 Idct32(input, outptr); 1414 input = input[32..]; 1415 outptr = outptr[32..]; 1416 } 1417 1418 // Columns 1419 for (i = 0; i < 32; ++i) 1420 { 1421 for (j = 0; j < 32; ++j) 1422 { 1423 tempIn[j] = output[j * 32 + i]; 1424 } 1425 1426 Idct32(tempIn, tempOut); 1427 for (j = 0; j < 32; ++j) 1428 { 1429 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); 1430 } 1431 } 1432 } 1433 1434 [SkipLocalsInit] 1435 public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 1436 { 1437 int i, j; 1438 Span<int> output = stackalloc int[32 * 32]; 1439 Span<int> outptr = output; 1440 Span<int> tempIn = stackalloc int[32]; 1441 Span<int> tempOut = stackalloc int[32]; 1442 1443 output.Clear(); 1444 1445 // Rows 1446 // Only upper-left 8x8 has non-zero coeff 1447 for (i = 0; i < 8; ++i) 1448 { 1449 Idct32(input, outptr); 1450 input = input[32..]; 1451 outptr = outptr[32..]; 1452 } 1453 1454 // Columns 1455 for (i = 0; i < 32; ++i) 1456 { 1457 for (j = 0; j < 32; ++j) 1458 { 1459 tempIn[j] = output[j * 32 + i]; 1460 } 1461 1462 Idct32(tempIn, tempOut); 1463 for (j = 0; j < 32; ++j) 1464 { 1465 dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6)); 1466 } 1467 } 1468 } 1469 1470 public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride) 1471 { 1472 int i, j; 1473 long a1; 1474 int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64)); 1475 1476 output = WrapLow(DctConstRoundShift(output * CosPi16_64)); 1477 a1 = BitUtils.RoundPowerOfTwo(output, 6); 1478 1479 for (j = 0; j < 32; ++j) 1480 { 1481 for (i = 0; i < 32; ++i) 1482 { 1483 dest[i] = ClipPixelAdd(dest[i], a1); 1484 } 1485 1486 dest = dest[stride..]; 1487 } 1488 } 1489 1490 [SkipLocalsInit] 1491 public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1492 { 1493 /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds, 1494 0.5 shifts per pixel. */ 1495 int i; 1496 Span<int> output = stackalloc int[16]; 1497 long a1, b1, c1, d1, e1; 1498 ReadOnlySpan<int> ip = input; 1499 Span<int> op = output; 1500 1501 for (i = 0; i < 4; i++) 1502 { 1503 a1 = ip[0] >> UnitQuantShift; 1504 c1 = ip[1] >> UnitQuantShift; 1505 d1 = ip[2] >> UnitQuantShift; 1506 b1 = ip[3] >> UnitQuantShift; 1507 a1 += c1; 1508 d1 -= b1; 1509 e1 = (a1 - d1) >> 1; 1510 b1 = e1 - b1; 1511 c1 = e1 - c1; 1512 a1 -= b1; 1513 d1 += c1; 1514 op[0] = HighbdWrapLow(a1, bd); 1515 op[1] = HighbdWrapLow(b1, bd); 1516 op[2] = HighbdWrapLow(c1, bd); 1517 op[3] = HighbdWrapLow(d1, bd); 1518 ip = ip[4..]; 1519 op = op[4..]; 1520 } 1521 1522 ReadOnlySpan<int> ip2 = output; 1523 for (i = 0; i < 4; i++) 1524 { 1525 a1 = ip2[4 * 0]; 1526 c1 = ip2[4 * 1]; 1527 d1 = ip2[4 * 2]; 1528 b1 = ip2[4 * 3]; 1529 a1 += c1; 1530 d1 -= b1; 1531 e1 = (a1 - d1) >> 1; 1532 b1 = e1 - b1; 1533 c1 = e1 - c1; 1534 a1 -= b1; 1535 d1 += c1; 1536 dest[stride * 0] = HighbdClipPixelAdd(dest[stride * 0], HighbdWrapLow(a1, bd), bd); 1537 dest[stride * 1] = HighbdClipPixelAdd(dest[stride * 1], HighbdWrapLow(b1, bd), bd); 1538 dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], HighbdWrapLow(c1, bd), bd); 1539 dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], HighbdWrapLow(d1, bd), bd); 1540 1541 ip2 = ip2[1..]; 1542 dest = dest[1..]; 1543 } 1544 } 1545 1546 [SkipLocalsInit] 1547 public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1548 { 1549 int i; 1550 long a1, e1; 1551 Span<int> tmp = stackalloc int[4]; 1552 ReadOnlySpan<int> ip = input; 1553 Span<int> op = tmp; 1554 1555 a1 = ip[0] >> UnitQuantShift; 1556 e1 = a1 >> 1; 1557 a1 -= e1; 1558 op[0] = HighbdWrapLow(a1, bd); 1559 op[1] = op[2] = op[3] = HighbdWrapLow(e1, bd); 1560 1561 ReadOnlySpan<int> ip2 = tmp; 1562 for (i = 0; i < 4; i++) 1563 { 1564 e1 = ip2[0] >> 1; 1565 a1 = ip2[0] - e1; 1566 dest[stride * 0] = HighbdClipPixelAdd(dest[stride * 0], a1, bd); 1567 dest[stride * 1] = HighbdClipPixelAdd(dest[stride * 1], e1, bd); 1568 dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], e1, bd); 1569 dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], e1, bd); 1570 ip2 = ip2[1..]; 1571 dest = dest[1..]; 1572 } 1573 } 1574 1575 public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd) 1576 { 1577 long s0, s1, s2, s3, s4, s5, s6, s7; 1578 int x0 = input[0]; 1579 int x1 = input[1]; 1580 int x2 = input[2]; 1581 int x3 = input[3]; 1582 1583 if (DetectInvalidHighbdInput(input, 4) != 0) 1584 { 1585 Debug.Assert(false, "invalid highbd txfm input"); 1586 output[..4].Clear(); 1587 1588 return; 1589 } 1590 1591 if ((x0 | x1 | x2 | x3) == 0) 1592 { 1593 output[..4].Clear(); 1594 1595 return; 1596 } 1597 1598 s0 = (long)SinPi1_9 * x0; 1599 s1 = (long)SinPi2_9 * x0; 1600 s2 = (long)SinPi3_9 * x1; 1601 s3 = (long)SinPi4_9 * x2; 1602 s4 = (long)SinPi1_9 * x2; 1603 s5 = (long)SinPi2_9 * x3; 1604 s6 = (long)SinPi4_9 * x3; 1605 s7 = HighbdWrapLow(x0 - x2 + x3, bd); 1606 1607 s0 = s0 + s3 + s5; 1608 s1 = s1 - s4 - s6; 1609 s3 = s2; 1610 s2 = SinPi3_9 * s7; 1611 1612 // 1-D transform scaling factor is sqrt(2). 1613 // The overall dynamic range is 14b (input) + 14b (multiplication scaling) 1614 // + 1b (addition) = 29b. 1615 // Hence the output bit depth is 15b. 1616 output[0] = HighbdWrapLow(DctConstRoundShift(s0 + s3), bd); 1617 output[1] = HighbdWrapLow(DctConstRoundShift(s1 + s3), bd); 1618 output[2] = HighbdWrapLow(DctConstRoundShift(s2), bd); 1619 output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd); 1620 } 1621 1622 [SkipLocalsInit] 1623 public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd) 1624 { 1625 Span<int> step = stackalloc int[4]; 1626 long temp1, temp2; 1627 1628 if (DetectInvalidHighbdInput(input, 4) != 0) 1629 { 1630 Debug.Assert(false, "invalid highbd txfm input"); 1631 output[..4].Clear(); 1632 1633 return; 1634 } 1635 1636 // stage 1 1637 temp1 = (input[0] + input[2]) * (long)CosPi16_64; 1638 temp2 = (input[0] - input[2]) * (long)CosPi16_64; 1639 step[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 1640 step[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 1641 temp1 = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64; 1642 temp2 = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64; 1643 step[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 1644 step[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 1645 1646 // stage 2 1647 output[0] = HighbdWrapLow(step[0] + step[3], bd); 1648 output[1] = HighbdWrapLow(step[1] + step[2], bd); 1649 output[2] = HighbdWrapLow(step[1] - step[2], bd); 1650 output[3] = HighbdWrapLow(step[0] - step[3], bd); 1651 } 1652 1653 [SkipLocalsInit] 1654 public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1655 { 1656 int i, j; 1657 Span<int> output = stackalloc int[4 * 4]; 1658 Span<int> outptr = output; 1659 Span<int> tempIn = stackalloc int[4]; 1660 Span<int> tempOut = stackalloc int[4]; 1661 1662 // Rows 1663 for (i = 0; i < 4; ++i) 1664 { 1665 HighbdIdct4(input, outptr, bd); 1666 input = input[4..]; 1667 outptr = outptr[4..]; 1668 } 1669 1670 // Columns 1671 for (i = 0; i < 4; ++i) 1672 { 1673 for (j = 0; j < 4; ++j) 1674 { 1675 tempIn[j] = output[j * 4 + i]; 1676 } 1677 1678 HighbdIdct4(tempIn, tempOut, bd); 1679 for (j = 0; j < 4; ++j) 1680 { 1681 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd); 1682 } 1683 } 1684 } 1685 1686 public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1687 { 1688 int i; 1689 long a1; 1690 int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); 1691 1692 output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); 1693 a1 = BitUtils.RoundPowerOfTwo(output, 4); 1694 1695 for (i = 0; i < 4; i++) 1696 { 1697 dest[0] = HighbdClipPixelAdd(dest[0], a1, bd); 1698 dest[1] = HighbdClipPixelAdd(dest[1], a1, bd); 1699 dest[2] = HighbdClipPixelAdd(dest[2], a1, bd); 1700 dest[3] = HighbdClipPixelAdd(dest[3], a1, bd); 1701 dest = dest[stride..]; 1702 } 1703 } 1704 1705 public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd) 1706 { 1707 long s0, s1, s2, s3, s4, s5, s6, s7; 1708 int x0 = input[7]; 1709 int x1 = input[0]; 1710 int x2 = input[5]; 1711 int x3 = input[2]; 1712 int x4 = input[3]; 1713 int x5 = input[4]; 1714 int x6 = input[1]; 1715 int x7 = input[6]; 1716 1717 if (DetectInvalidHighbdInput(input, 8) != 0) 1718 { 1719 Debug.Assert(false, "invalid highbd txfm input"); 1720 output[..8].Clear(); 1721 1722 return; 1723 } 1724 1725 if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0) 1726 { 1727 output[..8].Clear(); 1728 1729 return; 1730 } 1731 1732 // stage 1 1733 s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1; 1734 s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1; 1735 s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3; 1736 s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3; 1737 s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5; 1738 s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5; 1739 s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7; 1740 s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7; 1741 1742 x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd); 1743 x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd); 1744 x2 = HighbdWrapLow(DctConstRoundShift(s2 + s6), bd); 1745 x3 = HighbdWrapLow(DctConstRoundShift(s3 + s7), bd); 1746 x4 = HighbdWrapLow(DctConstRoundShift(s0 - s4), bd); 1747 x5 = HighbdWrapLow(DctConstRoundShift(s1 - s5), bd); 1748 x6 = HighbdWrapLow(DctConstRoundShift(s2 - s6), bd); 1749 x7 = HighbdWrapLow(DctConstRoundShift(s3 - s7), bd); 1750 1751 // stage 2 1752 s0 = x0; 1753 s1 = x1; 1754 s2 = x2; 1755 s3 = x3; 1756 s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5; 1757 s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5; 1758 s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7; 1759 s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7; 1760 1761 x0 = HighbdWrapLow(s0 + s2, bd); 1762 x1 = HighbdWrapLow(s1 + s3, bd); 1763 x2 = HighbdWrapLow(s0 - s2, bd); 1764 x3 = HighbdWrapLow(s1 - s3, bd); 1765 x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd); 1766 x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd); 1767 x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd); 1768 x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd); 1769 1770 // stage 3 1771 s2 = (long)CosPi16_64 * (x2 + x3); 1772 s3 = (long)CosPi16_64 * (x2 - x3); 1773 s6 = (long)CosPi16_64 * (x6 + x7); 1774 s7 = (long)CosPi16_64 * (x6 - x7); 1775 1776 x2 = HighbdWrapLow(DctConstRoundShift(s2), bd); 1777 x3 = HighbdWrapLow(DctConstRoundShift(s3), bd); 1778 x6 = HighbdWrapLow(DctConstRoundShift(s6), bd); 1779 x7 = HighbdWrapLow(DctConstRoundShift(s7), bd); 1780 1781 output[0] = HighbdWrapLow(x0, bd); 1782 output[1] = HighbdWrapLow(-x4, bd); 1783 output[2] = HighbdWrapLow(x6, bd); 1784 output[3] = HighbdWrapLow(-x2, bd); 1785 output[4] = HighbdWrapLow(x3, bd); 1786 output[5] = HighbdWrapLow(-x7, bd); 1787 output[6] = HighbdWrapLow(x5, bd); 1788 output[7] = HighbdWrapLow(-x1, bd); 1789 } 1790 1791 [SkipLocalsInit] 1792 public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd) 1793 { 1794 Span<int> step1 = stackalloc int[8]; 1795 Span<int> step2 = stackalloc int[8]; 1796 long temp1, temp2; 1797 1798 if (DetectInvalidHighbdInput(input, 8) != 0) 1799 { 1800 Debug.Assert(false, "invalid highbd txfm input"); 1801 output[..8].Clear(); 1802 1803 return; 1804 } 1805 1806 // stage 1 1807 step1[0] = input[0]; 1808 step1[2] = input[4]; 1809 step1[1] = input[2]; 1810 step1[3] = input[6]; 1811 temp1 = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64; 1812 temp2 = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64; 1813 step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 1814 step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 1815 temp1 = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64; 1816 temp2 = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64; 1817 step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 1818 step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 1819 1820 // stage 2 & stage 3 - even half 1821 HighbdIdct4(step1, step1, bd); 1822 1823 // stage 2 - odd half 1824 step2[4] = HighbdWrapLow(step1[4] + step1[5], bd); 1825 step2[5] = HighbdWrapLow(step1[4] - step1[5], bd); 1826 step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd); 1827 step2[7] = HighbdWrapLow(step1[6] + step1[7], bd); 1828 1829 // stage 3 - odd half 1830 step1[4] = step2[4]; 1831 temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; 1832 temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; 1833 step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 1834 step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 1835 step1[7] = step2[7]; 1836 1837 // stage 4 1838 output[0] = HighbdWrapLow(step1[0] + step1[7], bd); 1839 output[1] = HighbdWrapLow(step1[1] + step1[6], bd); 1840 output[2] = HighbdWrapLow(step1[2] + step1[5], bd); 1841 output[3] = HighbdWrapLow(step1[3] + step1[4], bd); 1842 output[4] = HighbdWrapLow(step1[3] - step1[4], bd); 1843 output[5] = HighbdWrapLow(step1[2] - step1[5], bd); 1844 output[6] = HighbdWrapLow(step1[1] - step1[6], bd); 1845 output[7] = HighbdWrapLow(step1[0] - step1[7], bd); 1846 } 1847 1848 [SkipLocalsInit] 1849 public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1850 { 1851 int i, j; 1852 Span<int> output = stackalloc int[8 * 8]; 1853 Span<int> outptr = output; 1854 Span<int> tempIn = stackalloc int[8]; 1855 Span<int> tempOut = stackalloc int[8]; 1856 1857 // First transform rows 1858 for (i = 0; i < 8; ++i) 1859 { 1860 HighbdIdct8(input, outptr, bd); 1861 input = input[8..]; 1862 outptr = outptr[8..]; 1863 } 1864 1865 // Then transform columns 1866 for (i = 0; i < 8; ++i) 1867 { 1868 for (j = 0; j < 8; ++j) 1869 { 1870 tempIn[j] = output[j * 8 + i]; 1871 } 1872 1873 HighbdIdct8(tempIn, tempOut, bd); 1874 for (j = 0; j < 8; ++j) 1875 { 1876 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); 1877 } 1878 } 1879 } 1880 1881 [SkipLocalsInit] 1882 public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1883 { 1884 int i, j; 1885 Span<int> output = stackalloc int[8 * 8]; 1886 Span<int> outptr = output; 1887 Span<int> tempIn = stackalloc int[8]; 1888 Span<int> tempOut = stackalloc int[8]; 1889 1890 output.Clear(); 1891 1892 // First transform rows 1893 // Only first 4 row has non-zero coefs 1894 for (i = 0; i < 4; ++i) 1895 { 1896 HighbdIdct8(input, outptr, bd); 1897 input = input[8..]; 1898 outptr = outptr[8..]; 1899 } 1900 1901 // Then transform columns 1902 for (i = 0; i < 8; ++i) 1903 { 1904 for (j = 0; j < 8; ++j) 1905 { 1906 tempIn[j] = output[j * 8 + i]; 1907 } 1908 1909 HighbdIdct8(tempIn, tempOut, bd); 1910 for (j = 0; j < 8; ++j) 1911 { 1912 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd); 1913 } 1914 } 1915 } 1916 1917 public static void Vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 1918 { 1919 int i, j; 1920 long a1; 1921 int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); 1922 1923 output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); 1924 a1 = BitUtils.RoundPowerOfTwo(output, 5); 1925 for (j = 0; j < 8; ++j) 1926 { 1927 for (i = 0; i < 8; ++i) 1928 { 1929 dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); 1930 } 1931 1932 dest = dest[stride..]; 1933 } 1934 } 1935 1936 public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd) 1937 { 1938 long s0, s1, s2, s3, s4, s5, s6, s7, s8; 1939 long s9, s10, s11, s12, s13, s14, s15; 1940 int x0 = input[15]; 1941 int x1 = input[0]; 1942 int x2 = input[13]; 1943 int x3 = input[2]; 1944 int x4 = input[11]; 1945 int x5 = input[4]; 1946 int x6 = input[9]; 1947 int x7 = input[6]; 1948 int x8 = input[7]; 1949 int x9 = input[8]; 1950 int x10 = input[5]; 1951 int x11 = input[10]; 1952 int x12 = input[3]; 1953 int x13 = input[12]; 1954 int x14 = input[1]; 1955 int x15 = input[14]; 1956 1957 if (DetectInvalidHighbdInput(input, 16) != 0) 1958 { 1959 Debug.Assert(false, "invalid highbd txfm input"); 1960 output[..16].Clear(); 1961 1962 return; 1963 } 1964 1965 if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0) 1966 { 1967 output[..16].Clear(); 1968 1969 return; 1970 } 1971 1972 // stage 1 1973 s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64; 1974 s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64; 1975 s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64; 1976 s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64; 1977 s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64; 1978 s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64; 1979 s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64; 1980 s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64; 1981 s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64; 1982 s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64; 1983 s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64; 1984 s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64; 1985 s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64; 1986 s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64; 1987 s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64; 1988 s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64; 1989 1990 x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd); 1991 x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd); 1992 x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd); 1993 x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd); 1994 x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd); 1995 x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd); 1996 x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd); 1997 x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd); 1998 x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd); 1999 x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd); 2000 x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd); 2001 x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd); 2002 x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd); 2003 x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd); 2004 x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd); 2005 x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd); 2006 2007 // stage 2 2008 s0 = x0; 2009 s1 = x1; 2010 s2 = x2; 2011 s3 = x3; 2012 s4 = x4; 2013 s5 = x5; 2014 s6 = x6; 2015 s7 = x7; 2016 s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64; 2017 s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64; 2018 s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64; 2019 s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64; 2020 s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64; 2021 s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64; 2022 s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64; 2023 s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64; 2024 2025 x0 = HighbdWrapLow(s0 + s4, bd); 2026 x1 = HighbdWrapLow(s1 + s5, bd); 2027 x2 = HighbdWrapLow(s2 + s6, bd); 2028 x3 = HighbdWrapLow(s3 + s7, bd); 2029 x4 = HighbdWrapLow(s0 - s4, bd); 2030 x5 = HighbdWrapLow(s1 - s5, bd); 2031 x6 = HighbdWrapLow(s2 - s6, bd); 2032 x7 = HighbdWrapLow(s3 - s7, bd); 2033 x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd); 2034 x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd); 2035 x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd); 2036 x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd); 2037 x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd); 2038 x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd); 2039 x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd); 2040 x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd); 2041 2042 // stage 3 2043 s0 = x0; 2044 s1 = x1; 2045 s2 = x2; 2046 s3 = x3; 2047 s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64; 2048 s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64; 2049 s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64; 2050 s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64; 2051 s8 = x8; 2052 s9 = x9; 2053 s10 = x10; 2054 s11 = x11; 2055 s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64; 2056 s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64; 2057 s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64; 2058 s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64; 2059 2060 x0 = HighbdWrapLow(s0 + s2, bd); 2061 x1 = HighbdWrapLow(s1 + s3, bd); 2062 x2 = HighbdWrapLow(s0 - s2, bd); 2063 x3 = HighbdWrapLow(s1 - s3, bd); 2064 x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd); 2065 x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd); 2066 x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd); 2067 x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd); 2068 x8 = HighbdWrapLow(s8 + s10, bd); 2069 x9 = HighbdWrapLow(s9 + s11, bd); 2070 x10 = HighbdWrapLow(s8 - s10, bd); 2071 x11 = HighbdWrapLow(s9 - s11, bd); 2072 x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd); 2073 x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd); 2074 x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd); 2075 x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd); 2076 2077 // stage 4 2078 s2 = (long)(-CosPi16_64) * (x2 + x3); 2079 s3 = (long)CosPi16_64 * (x2 - x3); 2080 s6 = (long)CosPi16_64 * (x6 + x7); 2081 s7 = (long)CosPi16_64 * (-x6 + x7); 2082 s10 = (long)CosPi16_64 * (x10 + x11); 2083 s11 = (long)CosPi16_64 * (-x10 + x11); 2084 s14 = (long)(-CosPi16_64) * (x14 + x15); 2085 s15 = (long)CosPi16_64 * (x14 - x15); 2086 2087 x2 = HighbdWrapLow(DctConstRoundShift(s2), bd); 2088 x3 = HighbdWrapLow(DctConstRoundShift(s3), bd); 2089 x6 = HighbdWrapLow(DctConstRoundShift(s6), bd); 2090 x7 = HighbdWrapLow(DctConstRoundShift(s7), bd); 2091 x10 = HighbdWrapLow(DctConstRoundShift(s10), bd); 2092 x11 = HighbdWrapLow(DctConstRoundShift(s11), bd); 2093 x14 = HighbdWrapLow(DctConstRoundShift(s14), bd); 2094 x15 = HighbdWrapLow(DctConstRoundShift(s15), bd); 2095 2096 output[0] = HighbdWrapLow(x0, bd); 2097 output[1] = HighbdWrapLow(-x8, bd); 2098 output[2] = HighbdWrapLow(x12, bd); 2099 output[3] = HighbdWrapLow(-x4, bd); 2100 output[4] = HighbdWrapLow(x6, bd); 2101 output[5] = HighbdWrapLow(x14, bd); 2102 output[6] = HighbdWrapLow(x10, bd); 2103 output[7] = HighbdWrapLow(x2, bd); 2104 output[8] = HighbdWrapLow(x3, bd); 2105 output[9] = HighbdWrapLow(x11, bd); 2106 output[10] = HighbdWrapLow(x15, bd); 2107 output[11] = HighbdWrapLow(x7, bd); 2108 output[12] = HighbdWrapLow(x5, bd); 2109 output[13] = HighbdWrapLow(-x13, bd); 2110 output[14] = HighbdWrapLow(x9, bd); 2111 output[15] = HighbdWrapLow(-x1, bd); 2112 } 2113 2114 [SkipLocalsInit] 2115 public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd) 2116 { 2117 Span<int> step1 = stackalloc int[16]; 2118 Span<int> step2 = stackalloc int[16]; 2119 long temp1, temp2; 2120 2121 if (DetectInvalidHighbdInput(input, 16) != 0) 2122 { 2123 Debug.Assert(false, "invalid highbd txfm input"); 2124 output[..16].Clear(); 2125 2126 return; 2127 } 2128 2129 // stage 1 2130 step1[0] = input[0 / 2]; 2131 step1[1] = input[16 / 2]; 2132 step1[2] = input[8 / 2]; 2133 step1[3] = input[24 / 2]; 2134 step1[4] = input[4 / 2]; 2135 step1[5] = input[20 / 2]; 2136 step1[6] = input[12 / 2]; 2137 step1[7] = input[28 / 2]; 2138 step1[8] = input[2 / 2]; 2139 step1[9] = input[18 / 2]; 2140 step1[10] = input[10 / 2]; 2141 step1[11] = input[26 / 2]; 2142 step1[12] = input[6 / 2]; 2143 step1[13] = input[22 / 2]; 2144 step1[14] = input[14 / 2]; 2145 step1[15] = input[30 / 2]; 2146 2147 // stage 2 2148 step2[0] = step1[0]; 2149 step2[1] = step1[1]; 2150 step2[2] = step1[2]; 2151 step2[3] = step1[3]; 2152 step2[4] = step1[4]; 2153 step2[5] = step1[5]; 2154 step2[6] = step1[6]; 2155 step2[7] = step1[7]; 2156 2157 temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64; 2158 temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64; 2159 step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2160 step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2161 2162 temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64; 2163 temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64; 2164 step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2165 step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2166 2167 temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64; 2168 temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64; 2169 step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2170 step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2171 2172 temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64; 2173 temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64; 2174 step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2175 step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2176 2177 // stage 3 2178 step1[0] = step2[0]; 2179 step1[1] = step2[1]; 2180 step1[2] = step2[2]; 2181 step1[3] = step2[3]; 2182 2183 temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64; 2184 temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64; 2185 step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2186 step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2187 temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64; 2188 temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64; 2189 step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2190 step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2191 2192 step1[8] = HighbdWrapLow(step2[8] + step2[9], bd); 2193 step1[9] = HighbdWrapLow(step2[8] - step2[9], bd); 2194 step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd); 2195 step1[11] = HighbdWrapLow(step2[10] + step2[11], bd); 2196 step1[12] = HighbdWrapLow(step2[12] + step2[13], bd); 2197 step1[13] = HighbdWrapLow(step2[12] - step2[13], bd); 2198 step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd); 2199 step1[15] = HighbdWrapLow(step2[14] + step2[15], bd); 2200 2201 // stage 4 2202 temp1 = (step1[0] + step1[1]) * (long)CosPi16_64; 2203 temp2 = (step1[0] - step1[1]) * (long)CosPi16_64; 2204 step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2205 step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2206 temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64; 2207 temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64; 2208 step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2209 step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2210 step2[4] = HighbdWrapLow(step1[4] + step1[5], bd); 2211 step2[5] = HighbdWrapLow(step1[4] - step1[5], bd); 2212 step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd); 2213 step2[7] = HighbdWrapLow(step1[6] + step1[7], bd); 2214 2215 step2[8] = step1[8]; 2216 step2[15] = step1[15]; 2217 temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64; 2218 temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; 2219 step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2220 step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2221 temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64; 2222 temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64; 2223 step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2224 step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2225 step2[11] = step1[11]; 2226 step2[12] = step1[12]; 2227 2228 // stage 5 2229 step1[0] = HighbdWrapLow(step2[0] + step2[3], bd); 2230 step1[1] = HighbdWrapLow(step2[1] + step2[2], bd); 2231 step1[2] = HighbdWrapLow(step2[1] - step2[2], bd); 2232 step1[3] = HighbdWrapLow(step2[0] - step2[3], bd); 2233 step1[4] = step2[4]; 2234 temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; 2235 temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; 2236 step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2237 step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2238 step1[7] = step2[7]; 2239 2240 step1[8] = HighbdWrapLow(step2[8] + step2[11], bd); 2241 step1[9] = HighbdWrapLow(step2[9] + step2[10], bd); 2242 step1[10] = HighbdWrapLow(step2[9] - step2[10], bd); 2243 step1[11] = HighbdWrapLow(step2[8] - step2[11], bd); 2244 step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd); 2245 step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd); 2246 step1[14] = HighbdWrapLow(step2[13] + step2[14], bd); 2247 step1[15] = HighbdWrapLow(step2[12] + step2[15], bd); 2248 2249 // stage 6 2250 step2[0] = HighbdWrapLow(step1[0] + step1[7], bd); 2251 step2[1] = HighbdWrapLow(step1[1] + step1[6], bd); 2252 step2[2] = HighbdWrapLow(step1[2] + step1[5], bd); 2253 step2[3] = HighbdWrapLow(step1[3] + step1[4], bd); 2254 step2[4] = HighbdWrapLow(step1[3] - step1[4], bd); 2255 step2[5] = HighbdWrapLow(step1[2] - step1[5], bd); 2256 step2[6] = HighbdWrapLow(step1[1] - step1[6], bd); 2257 step2[7] = HighbdWrapLow(step1[0] - step1[7], bd); 2258 step2[8] = step1[8]; 2259 step2[9] = step1[9]; 2260 temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64; 2261 temp2 = (step1[10] + step1[13]) * (long)CosPi16_64; 2262 step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2263 step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2264 temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64; 2265 temp2 = (step1[11] + step1[12]) * (long)CosPi16_64; 2266 step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2267 step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2268 step2[14] = step1[14]; 2269 step2[15] = step1[15]; 2270 2271 // stage 7 2272 output[0] = HighbdWrapLow(step2[0] + step2[15], bd); 2273 output[1] = HighbdWrapLow(step2[1] + step2[14], bd); 2274 output[2] = HighbdWrapLow(step2[2] + step2[13], bd); 2275 output[3] = HighbdWrapLow(step2[3] + step2[12], bd); 2276 output[4] = HighbdWrapLow(step2[4] + step2[11], bd); 2277 output[5] = HighbdWrapLow(step2[5] + step2[10], bd); 2278 output[6] = HighbdWrapLow(step2[6] + step2[9], bd); 2279 output[7] = HighbdWrapLow(step2[7] + step2[8], bd); 2280 output[8] = HighbdWrapLow(step2[7] - step2[8], bd); 2281 output[9] = HighbdWrapLow(step2[6] - step2[9], bd); 2282 output[10] = HighbdWrapLow(step2[5] - step2[10], bd); 2283 output[11] = HighbdWrapLow(step2[4] - step2[11], bd); 2284 output[12] = HighbdWrapLow(step2[3] - step2[12], bd); 2285 output[13] = HighbdWrapLow(step2[2] - step2[13], bd); 2286 output[14] = HighbdWrapLow(step2[1] - step2[14], bd); 2287 output[15] = HighbdWrapLow(step2[0] - step2[15], bd); 2288 } 2289 2290 [SkipLocalsInit] 2291 public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2292 { 2293 int i, j; 2294 Span<int> output = stackalloc int[16 * 16]; 2295 Span<int> outptr = output; 2296 Span<int> tempIn = stackalloc int[16]; 2297 Span<int> tempOut = stackalloc int[16]; 2298 2299 // First transform rows 2300 for (i = 0; i < 16; ++i) 2301 { 2302 HighbdIdct16(input, outptr, bd); 2303 input = input[16..]; 2304 outptr = outptr[16..]; 2305 } 2306 2307 // Then transform columns 2308 for (i = 0; i < 16; ++i) 2309 { 2310 for (j = 0; j < 16; ++j) 2311 { 2312 tempIn[j] = output[j * 16 + i]; 2313 } 2314 2315 HighbdIdct16(tempIn, tempOut, bd); 2316 for (j = 0; j < 16; ++j) 2317 { 2318 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 2319 } 2320 } 2321 } 2322 2323 [SkipLocalsInit] 2324 public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2325 { 2326 int i, j; 2327 Span<int> output = stackalloc int[16 * 16]; 2328 Span<int> outptr = output; 2329 Span<int> tempIn = stackalloc int[16]; 2330 Span<int> tempOut = stackalloc int[16]; 2331 2332 output.Clear(); 2333 2334 // First transform rows. Since all non-zero dct coefficients are in 2335 // upper-left 8x8 area, we only need to calculate first 8 rows here. 2336 for (i = 0; i < 8; ++i) 2337 { 2338 HighbdIdct16(input, outptr, bd); 2339 input = input[16..]; 2340 outptr = outptr[16..]; 2341 } 2342 2343 // Then transform columns 2344 for (i = 0; i < 16; ++i) 2345 { 2346 Span<ushort> destT = dest; 2347 for (j = 0; j < 16; ++j) 2348 { 2349 tempIn[j] = output[j * 16 + i]; 2350 } 2351 2352 HighbdIdct16(tempIn, tempOut, bd); 2353 for (j = 0; j < 16; ++j) 2354 { 2355 destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 2356 destT = destT[stride..]; 2357 } 2358 } 2359 } 2360 2361 [SkipLocalsInit] 2362 public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2363 { 2364 int i, j; 2365 Span<int> output = stackalloc int[16 * 16]; 2366 Span<int> outptr = output; 2367 Span<int> tempIn = stackalloc int[16]; 2368 Span<int> tempOut = stackalloc int[16]; 2369 2370 output.Clear(); 2371 2372 // First transform rows. Since all non-zero dct coefficients are in 2373 // upper-left 4x4 area, we only need to calculate first 4 rows here. 2374 for (i = 0; i < 4; ++i) 2375 { 2376 HighbdIdct16(input, outptr, bd); 2377 input = input[16..]; 2378 outptr = outptr[16..]; 2379 } 2380 2381 // Then transform columns 2382 for (i = 0; i < 16; ++i) 2383 { 2384 for (j = 0; j < 16; ++j) 2385 { 2386 tempIn[j] = output[j * 16 + i]; 2387 } 2388 2389 HighbdIdct16(tempIn, tempOut, bd); 2390 for (j = 0; j < 16; ++j) 2391 { 2392 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 2393 } 2394 } 2395 } 2396 2397 public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2398 { 2399 int i, j; 2400 long a1; 2401 int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); 2402 2403 output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); 2404 a1 = BitUtils.RoundPowerOfTwo(output, 6); 2405 for (j = 0; j < 16; ++j) 2406 { 2407 for (i = 0; i < 16; ++i) 2408 { 2409 dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); 2410 } 2411 2412 dest = dest[stride..]; 2413 } 2414 } 2415 2416 [SkipLocalsInit] 2417 public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd) 2418 { 2419 Span<int> step1 = stackalloc int[32]; 2420 Span<int> step2 = stackalloc int[32]; 2421 long temp1, temp2; 2422 2423 if (DetectInvalidHighbdInput(input, 32) != 0) 2424 { 2425 Debug.Assert(false, "invalid highbd txfm input"); 2426 output[..32].Clear(); 2427 2428 return; 2429 } 2430 2431 // stage 1 2432 step1[0] = input[0]; 2433 step1[1] = input[16]; 2434 step1[2] = input[8]; 2435 step1[3] = input[24]; 2436 step1[4] = input[4]; 2437 step1[5] = input[20]; 2438 step1[6] = input[12]; 2439 step1[7] = input[28]; 2440 step1[8] = input[2]; 2441 step1[9] = input[18]; 2442 step1[10] = input[10]; 2443 step1[11] = input[26]; 2444 step1[12] = input[6]; 2445 step1[13] = input[22]; 2446 step1[14] = input[14]; 2447 step1[15] = input[30]; 2448 2449 temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64; 2450 temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64; 2451 step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2452 step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2453 2454 temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64; 2455 temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64; 2456 step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2457 step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2458 2459 temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64; 2460 temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64; 2461 step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2462 step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2463 2464 temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64; 2465 temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64; 2466 step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2467 step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2468 2469 temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64; 2470 temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64; 2471 step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2472 step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2473 2474 temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64; 2475 temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64; 2476 step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2477 step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2478 2479 temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64; 2480 temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64; 2481 step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2482 step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2483 2484 temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64; 2485 temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64; 2486 step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2487 step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2488 2489 // stage 2 2490 step2[0] = step1[0]; 2491 step2[1] = step1[1]; 2492 step2[2] = step1[2]; 2493 step2[3] = step1[3]; 2494 step2[4] = step1[4]; 2495 step2[5] = step1[5]; 2496 step2[6] = step1[6]; 2497 step2[7] = step1[7]; 2498 2499 temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64; 2500 temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64; 2501 step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2502 step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2503 2504 temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64; 2505 temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64; 2506 step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2507 step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2508 2509 temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64; 2510 temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64; 2511 step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2512 step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2513 2514 temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64; 2515 temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64; 2516 step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2517 step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2518 2519 step2[16] = HighbdWrapLow(step1[16] + step1[17], bd); 2520 step2[17] = HighbdWrapLow(step1[16] - step1[17], bd); 2521 step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd); 2522 step2[19] = HighbdWrapLow(step1[18] + step1[19], bd); 2523 step2[20] = HighbdWrapLow(step1[20] + step1[21], bd); 2524 step2[21] = HighbdWrapLow(step1[20] - step1[21], bd); 2525 step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd); 2526 step2[23] = HighbdWrapLow(step1[22] + step1[23], bd); 2527 step2[24] = HighbdWrapLow(step1[24] + step1[25], bd); 2528 step2[25] = HighbdWrapLow(step1[24] - step1[25], bd); 2529 step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd); 2530 step2[27] = HighbdWrapLow(step1[26] + step1[27], bd); 2531 step2[28] = HighbdWrapLow(step1[28] + step1[29], bd); 2532 step2[29] = HighbdWrapLow(step1[28] - step1[29], bd); 2533 step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd); 2534 step2[31] = HighbdWrapLow(step1[30] + step1[31], bd); 2535 2536 // stage 3 2537 step1[0] = step2[0]; 2538 step1[1] = step2[1]; 2539 step1[2] = step2[2]; 2540 step1[3] = step2[3]; 2541 2542 temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64; 2543 temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64; 2544 step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2545 step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2546 temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64; 2547 temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64; 2548 step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2549 step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2550 2551 step1[8] = HighbdWrapLow(step2[8] + step2[9], bd); 2552 step1[9] = HighbdWrapLow(step2[8] - step2[9], bd); 2553 step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd); 2554 step1[11] = HighbdWrapLow(step2[10] + step2[11], bd); 2555 step1[12] = HighbdWrapLow(step2[12] + step2[13], bd); 2556 step1[13] = HighbdWrapLow(step2[12] - step2[13], bd); 2557 step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd); 2558 step1[15] = HighbdWrapLow(step2[14] + step2[15], bd); 2559 2560 step1[16] = step2[16]; 2561 step1[31] = step2[31]; 2562 temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64; 2563 temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64; 2564 step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2565 step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2566 temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64; 2567 temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64; 2568 step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2569 step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2570 step1[19] = step2[19]; 2571 step1[20] = step2[20]; 2572 temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64; 2573 temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64; 2574 step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2575 step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2576 temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64; 2577 temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64; 2578 step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2579 step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2580 step1[23] = step2[23]; 2581 step1[24] = step2[24]; 2582 step1[27] = step2[27]; 2583 step1[28] = step2[28]; 2584 2585 // stage 4 2586 temp1 = (step1[0] + step1[1]) * (long)CosPi16_64; 2587 temp2 = (step1[0] - step1[1]) * (long)CosPi16_64; 2588 step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2589 step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2590 temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64; 2591 temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64; 2592 step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2593 step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2594 step2[4] = HighbdWrapLow(step1[4] + step1[5], bd); 2595 step2[5] = HighbdWrapLow(step1[4] - step1[5], bd); 2596 step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd); 2597 step2[7] = HighbdWrapLow(step1[6] + step1[7], bd); 2598 2599 step2[8] = step1[8]; 2600 step2[15] = step1[15]; 2601 temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64; 2602 temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64; 2603 step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2604 step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2605 temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64; 2606 temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64; 2607 step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2608 step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2609 step2[11] = step1[11]; 2610 step2[12] = step1[12]; 2611 2612 step2[16] = HighbdWrapLow(step1[16] + step1[19], bd); 2613 step2[17] = HighbdWrapLow(step1[17] + step1[18], bd); 2614 step2[18] = HighbdWrapLow(step1[17] - step1[18], bd); 2615 step2[19] = HighbdWrapLow(step1[16] - step1[19], bd); 2616 step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd); 2617 step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd); 2618 step2[22] = HighbdWrapLow(step1[21] + step1[22], bd); 2619 step2[23] = HighbdWrapLow(step1[20] + step1[23], bd); 2620 2621 step2[24] = HighbdWrapLow(step1[24] + step1[27], bd); 2622 step2[25] = HighbdWrapLow(step1[25] + step1[26], bd); 2623 step2[26] = HighbdWrapLow(step1[25] - step1[26], bd); 2624 step2[27] = HighbdWrapLow(step1[24] - step1[27], bd); 2625 step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd); 2626 step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd); 2627 step2[30] = HighbdWrapLow(step1[29] + step1[30], bd); 2628 step2[31] = HighbdWrapLow(step1[28] + step1[31], bd); 2629 2630 // stage 5 2631 step1[0] = HighbdWrapLow(step2[0] + step2[3], bd); 2632 step1[1] = HighbdWrapLow(step2[1] + step2[2], bd); 2633 step1[2] = HighbdWrapLow(step2[1] - step2[2], bd); 2634 step1[3] = HighbdWrapLow(step2[0] - step2[3], bd); 2635 step1[4] = step2[4]; 2636 temp1 = (step2[6] - step2[5]) * (long)CosPi16_64; 2637 temp2 = (step2[5] + step2[6]) * (long)CosPi16_64; 2638 step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2639 step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2640 step1[7] = step2[7]; 2641 2642 step1[8] = HighbdWrapLow(step2[8] + step2[11], bd); 2643 step1[9] = HighbdWrapLow(step2[9] + step2[10], bd); 2644 step1[10] = HighbdWrapLow(step2[9] - step2[10], bd); 2645 step1[11] = HighbdWrapLow(step2[8] - step2[11], bd); 2646 step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd); 2647 step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd); 2648 step1[14] = HighbdWrapLow(step2[13] + step2[14], bd); 2649 step1[15] = HighbdWrapLow(step2[12] + step2[15], bd); 2650 2651 step1[16] = step2[16]; 2652 step1[17] = step2[17]; 2653 temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64; 2654 temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64; 2655 step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2656 step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2657 temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64; 2658 temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64; 2659 step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2660 step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2661 temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64; 2662 temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64; 2663 step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2664 step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2665 temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64; 2666 temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64; 2667 step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2668 step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2669 step1[22] = step2[22]; 2670 step1[23] = step2[23]; 2671 step1[24] = step2[24]; 2672 step1[25] = step2[25]; 2673 step1[30] = step2[30]; 2674 step1[31] = step2[31]; 2675 2676 // stage 6 2677 step2[0] = HighbdWrapLow(step1[0] + step1[7], bd); 2678 step2[1] = HighbdWrapLow(step1[1] + step1[6], bd); 2679 step2[2] = HighbdWrapLow(step1[2] + step1[5], bd); 2680 step2[3] = HighbdWrapLow(step1[3] + step1[4], bd); 2681 step2[4] = HighbdWrapLow(step1[3] - step1[4], bd); 2682 step2[5] = HighbdWrapLow(step1[2] - step1[5], bd); 2683 step2[6] = HighbdWrapLow(step1[1] - step1[6], bd); 2684 step2[7] = HighbdWrapLow(step1[0] - step1[7], bd); 2685 step2[8] = step1[8]; 2686 step2[9] = step1[9]; 2687 temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64; 2688 temp2 = (step1[10] + step1[13]) * (long)CosPi16_64; 2689 step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2690 step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2691 temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64; 2692 temp2 = (step1[11] + step1[12]) * (long)CosPi16_64; 2693 step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2694 step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2695 step2[14] = step1[14]; 2696 step2[15] = step1[15]; 2697 2698 step2[16] = HighbdWrapLow(step1[16] + step1[23], bd); 2699 step2[17] = HighbdWrapLow(step1[17] + step1[22], bd); 2700 step2[18] = HighbdWrapLow(step1[18] + step1[21], bd); 2701 step2[19] = HighbdWrapLow(step1[19] + step1[20], bd); 2702 step2[20] = HighbdWrapLow(step1[19] - step1[20], bd); 2703 step2[21] = HighbdWrapLow(step1[18] - step1[21], bd); 2704 step2[22] = HighbdWrapLow(step1[17] - step1[22], bd); 2705 step2[23] = HighbdWrapLow(step1[16] - step1[23], bd); 2706 2707 step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd); 2708 step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd); 2709 step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd); 2710 step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd); 2711 step2[28] = HighbdWrapLow(step1[27] + step1[28], bd); 2712 step2[29] = HighbdWrapLow(step1[26] + step1[29], bd); 2713 step2[30] = HighbdWrapLow(step1[25] + step1[30], bd); 2714 step2[31] = HighbdWrapLow(step1[24] + step1[31], bd); 2715 2716 // stage 7 2717 step1[0] = HighbdWrapLow(step2[0] + step2[15], bd); 2718 step1[1] = HighbdWrapLow(step2[1] + step2[14], bd); 2719 step1[2] = HighbdWrapLow(step2[2] + step2[13], bd); 2720 step1[3] = HighbdWrapLow(step2[3] + step2[12], bd); 2721 step1[4] = HighbdWrapLow(step2[4] + step2[11], bd); 2722 step1[5] = HighbdWrapLow(step2[5] + step2[10], bd); 2723 step1[6] = HighbdWrapLow(step2[6] + step2[9], bd); 2724 step1[7] = HighbdWrapLow(step2[7] + step2[8], bd); 2725 step1[8] = HighbdWrapLow(step2[7] - step2[8], bd); 2726 step1[9] = HighbdWrapLow(step2[6] - step2[9], bd); 2727 step1[10] = HighbdWrapLow(step2[5] - step2[10], bd); 2728 step1[11] = HighbdWrapLow(step2[4] - step2[11], bd); 2729 step1[12] = HighbdWrapLow(step2[3] - step2[12], bd); 2730 step1[13] = HighbdWrapLow(step2[2] - step2[13], bd); 2731 step1[14] = HighbdWrapLow(step2[1] - step2[14], bd); 2732 step1[15] = HighbdWrapLow(step2[0] - step2[15], bd); 2733 2734 step1[16] = step2[16]; 2735 step1[17] = step2[17]; 2736 step1[18] = step2[18]; 2737 step1[19] = step2[19]; 2738 temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64; 2739 temp2 = (step2[20] + step2[27]) * (long)CosPi16_64; 2740 step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2741 step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2742 temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64; 2743 temp2 = (step2[21] + step2[26]) * (long)CosPi16_64; 2744 step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2745 step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2746 temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64; 2747 temp2 = (step2[22] + step2[25]) * (long)CosPi16_64; 2748 step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2749 step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2750 temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64; 2751 temp2 = (step2[23] + step2[24]) * (long)CosPi16_64; 2752 step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd); 2753 step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd); 2754 step1[28] = step2[28]; 2755 step1[29] = step2[29]; 2756 step1[30] = step2[30]; 2757 step1[31] = step2[31]; 2758 2759 // final stage 2760 output[0] = HighbdWrapLow(step1[0] + step1[31], bd); 2761 output[1] = HighbdWrapLow(step1[1] + step1[30], bd); 2762 output[2] = HighbdWrapLow(step1[2] + step1[29], bd); 2763 output[3] = HighbdWrapLow(step1[3] + step1[28], bd); 2764 output[4] = HighbdWrapLow(step1[4] + step1[27], bd); 2765 output[5] = HighbdWrapLow(step1[5] + step1[26], bd); 2766 output[6] = HighbdWrapLow(step1[6] + step1[25], bd); 2767 output[7] = HighbdWrapLow(step1[7] + step1[24], bd); 2768 output[8] = HighbdWrapLow(step1[8] + step1[23], bd); 2769 output[9] = HighbdWrapLow(step1[9] + step1[22], bd); 2770 output[10] = HighbdWrapLow(step1[10] + step1[21], bd); 2771 output[11] = HighbdWrapLow(step1[11] + step1[20], bd); 2772 output[12] = HighbdWrapLow(step1[12] + step1[19], bd); 2773 output[13] = HighbdWrapLow(step1[13] + step1[18], bd); 2774 output[14] = HighbdWrapLow(step1[14] + step1[17], bd); 2775 output[15] = HighbdWrapLow(step1[15] + step1[16], bd); 2776 output[16] = HighbdWrapLow(step1[15] - step1[16], bd); 2777 output[17] = HighbdWrapLow(step1[14] - step1[17], bd); 2778 output[18] = HighbdWrapLow(step1[13] - step1[18], bd); 2779 output[19] = HighbdWrapLow(step1[12] - step1[19], bd); 2780 output[20] = HighbdWrapLow(step1[11] - step1[20], bd); 2781 output[21] = HighbdWrapLow(step1[10] - step1[21], bd); 2782 output[22] = HighbdWrapLow(step1[9] - step1[22], bd); 2783 output[23] = HighbdWrapLow(step1[8] - step1[23], bd); 2784 output[24] = HighbdWrapLow(step1[7] - step1[24], bd); 2785 output[25] = HighbdWrapLow(step1[6] - step1[25], bd); 2786 output[26] = HighbdWrapLow(step1[5] - step1[26], bd); 2787 output[27] = HighbdWrapLow(step1[4] - step1[27], bd); 2788 output[28] = HighbdWrapLow(step1[3] - step1[28], bd); 2789 output[29] = HighbdWrapLow(step1[2] - step1[29], bd); 2790 output[30] = HighbdWrapLow(step1[1] - step1[30], bd); 2791 output[31] = HighbdWrapLow(step1[0] - step1[31], bd); 2792 } 2793 2794 [SkipLocalsInit] 2795 public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2796 { 2797 int i, j; 2798 Span<int> output = stackalloc int[32 * 32]; 2799 Span<int> outptr = output; 2800 Span<int> tempIn = stackalloc int[32]; 2801 Span<int> tempOut = stackalloc int[32]; 2802 2803 // Rows 2804 for (i = 0; i < 32; ++i) 2805 { 2806 int zeroCoeff = 0; 2807 for (j = 0; j < 32; ++j) 2808 { 2809 zeroCoeff |= input[j]; 2810 } 2811 2812 if (zeroCoeff != 0) 2813 { 2814 HighbdIdct32(input, outptr, bd); 2815 } 2816 else 2817 { 2818 outptr[..32].Clear(); 2819 } 2820 2821 input = input[32..]; 2822 outptr = outptr[32..]; 2823 } 2824 2825 // Columns 2826 for (i = 0; i < 32; ++i) 2827 { 2828 for (j = 0; j < 32; ++j) 2829 { 2830 tempIn[j] = output[j * 32 + i]; 2831 } 2832 2833 HighbdIdct32(tempIn, tempOut, bd); 2834 for (j = 0; j < 32; ++j) 2835 { 2836 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 2837 } 2838 } 2839 } 2840 2841 [SkipLocalsInit] 2842 public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2843 { 2844 int i, j; 2845 Span<int> output = stackalloc int[32 * 32]; 2846 Span<int> outptr = output; 2847 Span<int> tempIn = stackalloc int[32]; 2848 Span<int> tempOut = stackalloc int[32]; 2849 2850 output.Clear(); 2851 2852 // Rows 2853 // Only upper-left 16x16 has non-zero coeff 2854 for (i = 0; i < 16; ++i) 2855 { 2856 HighbdIdct32(input, outptr, bd); 2857 input = input[32..]; 2858 outptr = outptr[32..]; 2859 } 2860 2861 // Columns 2862 for (i = 0; i < 32; ++i) 2863 { 2864 Span<ushort> destT = dest; 2865 for (j = 0; j < 32; ++j) 2866 { 2867 tempIn[j] = output[j * 32 + i]; 2868 } 2869 2870 HighbdIdct32(tempIn, tempOut, bd); 2871 for (j = 0; j < 32; ++j) 2872 { 2873 destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 2874 destT = destT[stride..]; 2875 } 2876 } 2877 } 2878 2879 [SkipLocalsInit] 2880 public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2881 { 2882 int i, j; 2883 Span<int> output = stackalloc int[32 * 32]; 2884 Span<int> outptr = output; 2885 Span<int> tempIn = stackalloc int[32]; 2886 Span<int> tempOut = stackalloc int[32]; 2887 2888 output.Clear(); 2889 2890 // Rows 2891 // Only upper-left 8x8 has non-zero coeff 2892 for (i = 0; i < 8; ++i) 2893 { 2894 HighbdIdct32(input, outptr, bd); 2895 input = input[32..]; 2896 outptr = outptr[32..]; 2897 } 2898 2899 // Columns 2900 for (i = 0; i < 32; ++i) 2901 { 2902 for (j = 0; j < 32; ++j) 2903 { 2904 tempIn[j] = output[j * 32 + i]; 2905 } 2906 2907 HighbdIdct32(tempIn, tempOut, bd); 2908 for (j = 0; j < 32; ++j) 2909 { 2910 dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd); 2911 } 2912 } 2913 } 2914 2915 public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd) 2916 { 2917 int i, j; 2918 int a1; 2919 int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd); 2920 2921 output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd); 2922 a1 = BitUtils.RoundPowerOfTwo(output, 6); 2923 2924 for (j = 0; j < 32; ++j) 2925 { 2926 for (i = 0; i < 32; ++i) 2927 { 2928 dest[i] = HighbdClipPixelAdd(dest[i], a1, bd); 2929 } 2930 2931 dest = dest[stride..]; 2932 } 2933 } 2934 } 2935 }