InvTxfm.cs
   1  using Ryujinx.Graphics.Nvdec.Vp9.Common;
   2  using System;
   3  using System.Diagnostics;
   4  using System.Runtime.CompilerServices;
   5  using static Ryujinx.Graphics.Nvdec.Vp9.Dsp.TxfmCommon;
   6  
   7  namespace Ryujinx.Graphics.Nvdec.Vp9.Dsp
   8  {
   9      internal static class InvTxfm
  10      {
  11          // 12 signal input bits + 7 2D forward transform amplify bits + 5 1D inverse
  12          // transform amplify bits + 1 bit for contingency in rounding and quantizing
  13          private const int HighbdValidTxfmMagnitudeRange = (1 << 25);
  14  
  15          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  16          private static int DetectInvalidHighbdInput(ReadOnlySpan<int> input, int size)
  17          {
  18              int i;
  19              for (i = 0; i < size; ++i)
  20              {
  21                  if (Math.Abs(input[i]) >= HighbdValidTxfmMagnitudeRange)
  22                  {
  23                      return 1;
  24                  }
  25              }
  26  
  27              return 0;
  28          }
  29  
  30          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  31          private static long CheckRange(long input)
  32          {
  33              // For valid VP9 input streams, intermediate stage coefficients should always
  34              // stay within the range of a signed 16 bit integer. Coefficients can go out
  35              // of this range for invalid/corrupt VP9 streams.
  36              Debug.Assert(short.MinValue <= input);
  37              Debug.Assert(input <= short.MaxValue);
  38  
  39              return input;
  40          }
  41  
  42          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  43          public static long HighbdCheckRange(long input, int bd)
  44          {
  45              // For valid highbitdepth VP9 streams, intermediate stage coefficients will
  46              // stay within the ranges:
  47              // - 8 bit: signed 16 bit integer
  48              // - 10 bit: signed 18 bit integer
  49              // - 12 bit: signed 20 bit integer
  50              int intMax = (1 << (7 + bd)) - 1;
  51              int intMin = -intMax - 1;
  52              Debug.Assert(intMin <= input);
  53              Debug.Assert(input <= intMax);
  54  
  55              return input;
  56          }
  57  
  58          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  59          private static int WrapLow(long x)
  60          {
  61              return (short)CheckRange(x);
  62          }
  63  
  64          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  65          private static int HighbdWrapLow(long x, int bd)
  66          {
  67              return ((int)HighbdCheckRange(x, bd) << (24 - bd)) >> (24 - bd);
  68          }
  69  
  70          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  71          public static byte ClipPixelAdd(byte dest, long trans)
  72          {
  73              trans = WrapLow(trans);
  74  
  75              return BitUtils.ClipPixel(dest + (int)trans);
  76          }
  77  
  78          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  79          public static ushort HighbdClipPixelAdd(ushort dest, long trans, int bd)
  80          {
  81              trans = HighbdWrapLow(trans, bd);
  82  
  83              return BitUtils.ClipPixelHighbd(dest + (int)trans, bd);
  84          }
  85  
  86          [MethodImpl(MethodImplOptions.AggressiveInlining)]
  87          private static long DctConstRoundShift(long input)
  88          {
  89              long rv = BitUtils.RoundPowerOfTwo(input, DctConstBits);
  90  
  91              return rv;
  92          }
  93  
  94          [SkipLocalsInit]
  95          public static void Iwht4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
  96          {
  97              /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
  98                 0.5 shifts per pixel. */
  99              int i;
 100              Span<int> output = stackalloc int[16];
 101              long a1, b1, c1, d1, e1;
 102              ReadOnlySpan<int> ip = input;
 103              Span<int> op = output;
 104  
 105              for (i = 0; i < 4; i++)
 106              {
 107                  a1 = ip[0] >> UnitQuantShift;
 108                  c1 = ip[1] >> UnitQuantShift;
 109                  d1 = ip[2] >> UnitQuantShift;
 110                  b1 = ip[3] >> UnitQuantShift;
 111                  a1 += c1;
 112                  d1 -= b1;
 113                  e1 = (a1 - d1) >> 1;
 114                  b1 = e1 - b1;
 115                  c1 = e1 - c1;
 116                  a1 -= b1;
 117                  d1 += c1;
 118                  op[0] = WrapLow(a1);
 119                  op[1] = WrapLow(b1);
 120                  op[2] = WrapLow(c1);
 121                  op[3] = WrapLow(d1);
 122                  ip = ip[4..];
 123                  op = op[4..];
 124              }
 125  
 126              Span<int> ip2 = output;
 127              for (i = 0; i < 4; i++)
 128              {
 129                  a1 = ip2[4 * 0];
 130                  c1 = ip2[4 * 1];
 131                  d1 = ip2[4 * 2];
 132                  b1 = ip2[4 * 3];
 133                  a1 += c1;
 134                  d1 -= b1;
 135                  e1 = (a1 - d1) >> 1;
 136                  b1 = e1 - b1;
 137                  c1 = e1 - c1;
 138                  a1 -= b1;
 139                  d1 += c1;
 140                  dest[stride * 0] = ClipPixelAdd(dest[stride * 0], WrapLow(a1));
 141                  dest[stride * 1] = ClipPixelAdd(dest[stride * 1], WrapLow(b1));
 142                  dest[stride * 2] = ClipPixelAdd(dest[stride * 2], WrapLow(c1));
 143                  dest[stride * 3] = ClipPixelAdd(dest[stride * 3], WrapLow(d1));
 144  
 145                  ip2 = ip2[1..];
 146                  dest = dest[1..];
 147              }
 148          }
 149  
 150          [SkipLocalsInit]
 151          public static void Iwht4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 152          {
 153              int i;
 154              long a1, e1;
 155              Span<int> tmp = stackalloc int[4];
 156              ReadOnlySpan<int> ip = input;
 157              Span<int> op = tmp;
 158  
 159              a1 = ip[0] >> UnitQuantShift;
 160              e1 = a1 >> 1;
 161              a1 -= e1;
 162              op[0] = WrapLow(a1);
 163              op[1] = op[2] = op[3] = WrapLow(e1);
 164  
 165              Span<int> ip2 = tmp;
 166              for (i = 0; i < 4; i++)
 167              {
 168                  e1 = ip2[0] >> 1;
 169                  a1 = ip2[0] - e1;
 170                  dest[stride * 0] = ClipPixelAdd(dest[stride * 0], a1);
 171                  dest[stride * 1] = ClipPixelAdd(dest[stride * 1], e1);
 172                  dest[stride * 2] = ClipPixelAdd(dest[stride * 2], e1);
 173                  dest[stride * 3] = ClipPixelAdd(dest[stride * 3], e1);
 174                  ip2 = ip2[1..];
 175                  dest = dest[1..];
 176              }
 177          }
 178  
 179          public static void Iadst4(ReadOnlySpan<int> input, Span<int> output)
 180          {
 181              long s0, s1, s2, s3, s4, s5, s6, s7;
 182              int x0 = input[0];
 183              int x1 = input[1];
 184              int x2 = input[2];
 185              int x3 = input[3];
 186  
 187              if ((x0 | x1 | x2 | x3) == 0)
 188              {
 189                  output[..4].Clear();
 190  
 191                  return;
 192              }
 193  
 194              // 32-bit result is enough for the following multiplications.
 195              s0 = SinPi1_9 * x0;
 196              s1 = SinPi2_9 * x0;
 197              s2 = SinPi3_9 * x1;
 198              s3 = SinPi4_9 * x2;
 199              s4 = SinPi1_9 * x2;
 200              s5 = SinPi2_9 * x3;
 201              s6 = SinPi4_9 * x3;
 202              s7 = WrapLow(x0 - x2 + x3);
 203  
 204              s0 = s0 + s3 + s5;
 205              s1 = s1 - s4 - s6;
 206              s3 = s2;
 207              s2 = SinPi3_9 * s7;
 208  
 209              // 1-D transform scaling factor is sqrt(2).
 210              // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
 211              // + 1b (addition) = 29b.
 212              // Hence the output bit depth is 15b.
 213              output[0] = WrapLow(DctConstRoundShift(s0 + s3));
 214              output[1] = WrapLow(DctConstRoundShift(s1 + s3));
 215              output[2] = WrapLow(DctConstRoundShift(s2));
 216              output[3] = WrapLow(DctConstRoundShift(s0 + s1 - s3));
 217          }
 218  
 219          [SkipLocalsInit]
 220          public static void Idct4(ReadOnlySpan<int> input, Span<int> output)
 221          {
 222              Span<short> step = stackalloc short[4];
 223              long temp1, temp2;
 224  
 225              // stage 1
 226              temp1 = ((short)input[0] + (short)input[2]) * CosPi16_64;
 227              temp2 = ((short)input[0] - (short)input[2]) * CosPi16_64;
 228              step[0] = (short)WrapLow(DctConstRoundShift(temp1));
 229              step[1] = (short)WrapLow(DctConstRoundShift(temp2));
 230              temp1 = (short)input[1] * CosPi24_64 - (short)input[3] * CosPi8_64;
 231              temp2 = (short)input[1] * CosPi8_64 + (short)input[3] * CosPi24_64;
 232              step[2] = (short)WrapLow(DctConstRoundShift(temp1));
 233              step[3] = (short)WrapLow(DctConstRoundShift(temp2));
 234  
 235              // stage 2
 236              output[0] = WrapLow(step[0] + step[3]);
 237              output[1] = WrapLow(step[1] + step[2]);
 238              output[2] = WrapLow(step[1] - step[2]);
 239              output[3] = WrapLow(step[0] - step[3]);
 240          }
 241  
 242          [SkipLocalsInit]
 243          public static void Idct4x416Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 244          {
 245              int i, j;
 246              Span<int> output = stackalloc int[4 * 4];
 247              Span<int> outptr = output;
 248              Span<int> tempIn = stackalloc int[4];
 249              Span<int> tempOut = stackalloc int[4];
 250  
 251              // Rows
 252              for (i = 0; i < 4; ++i)
 253              {
 254                  Idct4(input, outptr);
 255                  input = input[4..];
 256                  outptr = outptr[4..];
 257              }
 258  
 259              // Columns
 260              for (i = 0; i < 4; ++i)
 261              {
 262                  for (j = 0; j < 4; ++j)
 263                  {
 264                      tempIn[j] = output[j * 4 + i];
 265                  }
 266  
 267                  Idct4(tempIn, tempOut);
 268                  for (j = 0; j < 4; ++j)
 269                  {
 270                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4));
 271                  }
 272              }
 273          }
 274  
 275          public static void Idct4x41Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 276          {
 277              int i;
 278              long a1;
 279              int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
 280  
 281              output = WrapLow(DctConstRoundShift(output * CosPi16_64));
 282              a1 = BitUtils.RoundPowerOfTwo(output, 4);
 283  
 284              for (i = 0; i < 4; i++)
 285              {
 286                  dest[0] = ClipPixelAdd(dest[0], a1);
 287                  dest[1] = ClipPixelAdd(dest[1], a1);
 288                  dest[2] = ClipPixelAdd(dest[2], a1);
 289                  dest[3] = ClipPixelAdd(dest[3], a1);
 290                  dest = dest[stride..];
 291              }
 292          }
 293  
 294          public static void Iadst8(ReadOnlySpan<int> input, Span<int> output)
 295          {
 296              int s0, s1, s2, s3, s4, s5, s6, s7;
 297              long x0 = input[7];
 298              long x1 = input[0];
 299              long x2 = input[5];
 300              long x3 = input[2];
 301              long x4 = input[3];
 302              long x5 = input[4];
 303              long x6 = input[1];
 304              long x7 = input[6];
 305  
 306              if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
 307              {
 308                  output[..8].Clear();
 309  
 310                  return;
 311              }
 312  
 313              // stage 1
 314              s0 = (int)(CosPi2_64 * x0 + CosPi30_64 * x1);
 315              s1 = (int)(CosPi30_64 * x0 - CosPi2_64 * x1);
 316              s2 = (int)(CosPi10_64 * x2 + CosPi22_64 * x3);
 317              s3 = (int)(CosPi22_64 * x2 - CosPi10_64 * x3);
 318              s4 = (int)(CosPi18_64 * x4 + CosPi14_64 * x5);
 319              s5 = (int)(CosPi14_64 * x4 - CosPi18_64 * x5);
 320              s6 = (int)(CosPi26_64 * x6 + CosPi6_64 * x7);
 321              s7 = (int)(CosPi6_64 * x6 - CosPi26_64 * x7);
 322  
 323              x0 = WrapLow(DctConstRoundShift(s0 + s4));
 324              x1 = WrapLow(DctConstRoundShift(s1 + s5));
 325              x2 = WrapLow(DctConstRoundShift(s2 + s6));
 326              x3 = WrapLow(DctConstRoundShift(s3 + s7));
 327              x4 = WrapLow(DctConstRoundShift(s0 - s4));
 328              x5 = WrapLow(DctConstRoundShift(s1 - s5));
 329              x6 = WrapLow(DctConstRoundShift(s2 - s6));
 330              x7 = WrapLow(DctConstRoundShift(s3 - s7));
 331  
 332              // stage 2
 333              s0 = (int)x0;
 334              s1 = (int)x1;
 335              s2 = (int)x2;
 336              s3 = (int)x3;
 337              s4 = (int)(CosPi8_64 * x4 + CosPi24_64 * x5);
 338              s5 = (int)(CosPi24_64 * x4 - CosPi8_64 * x5);
 339              s6 = (int)(-CosPi24_64 * x6 + CosPi8_64 * x7);
 340              s7 = (int)(CosPi8_64 * x6 + CosPi24_64 * x7);
 341  
 342              x0 = WrapLow(s0 + s2);
 343              x1 = WrapLow(s1 + s3);
 344              x2 = WrapLow(s0 - s2);
 345              x3 = WrapLow(s1 - s3);
 346              x4 = WrapLow(DctConstRoundShift(s4 + s6));
 347              x5 = WrapLow(DctConstRoundShift(s5 + s7));
 348              x6 = WrapLow(DctConstRoundShift(s4 - s6));
 349              x7 = WrapLow(DctConstRoundShift(s5 - s7));
 350  
 351              // stage 3
 352              s2 = (int)(CosPi16_64 * (x2 + x3));
 353              s3 = (int)(CosPi16_64 * (x2 - x3));
 354              s6 = (int)(CosPi16_64 * (x6 + x7));
 355              s7 = (int)(CosPi16_64 * (x6 - x7));
 356  
 357              x2 = WrapLow(DctConstRoundShift(s2));
 358              x3 = WrapLow(DctConstRoundShift(s3));
 359              x6 = WrapLow(DctConstRoundShift(s6));
 360              x7 = WrapLow(DctConstRoundShift(s7));
 361  
 362              output[0] = WrapLow(x0);
 363              output[1] = WrapLow(-x4);
 364              output[2] = WrapLow(x6);
 365              output[3] = WrapLow(-x2);
 366              output[4] = WrapLow(x3);
 367              output[5] = WrapLow(-x7);
 368              output[6] = WrapLow(x5);
 369              output[7] = WrapLow(-x1);
 370          }
 371  
 372          [SkipLocalsInit]
 373          public static void Idct8(ReadOnlySpan<int> input, Span<int> output)
 374          {
 375              Span<short> step1 = stackalloc short[8];
 376              Span<short> step2 = stackalloc short[8];
 377              long temp1, temp2;
 378  
 379              // stage 1
 380              step1[0] = (short)input[0];
 381              step1[2] = (short)input[4];
 382              step1[1] = (short)input[2];
 383              step1[3] = (short)input[6];
 384              temp1 = (short)input[1] * CosPi28_64 - (short)input[7] * CosPi4_64;
 385              temp2 = (short)input[1] * CosPi4_64 + (short)input[7] * CosPi28_64;
 386              step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
 387              step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
 388              temp1 = (short)input[5] * CosPi12_64 - (short)input[3] * CosPi20_64;
 389              temp2 = (short)input[5] * CosPi20_64 + (short)input[3] * CosPi12_64;
 390              step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
 391              step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
 392  
 393              // stage 2
 394              temp1 = (step1[0] + step1[2]) * CosPi16_64;
 395              temp2 = (step1[0] - step1[2]) * CosPi16_64;
 396              step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
 397              step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
 398              temp1 = step1[1] * CosPi24_64 - step1[3] * CosPi8_64;
 399              temp2 = step1[1] * CosPi8_64 + step1[3] * CosPi24_64;
 400              step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
 401              step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
 402              step2[4] = (short)WrapLow(step1[4] + step1[5]);
 403              step2[5] = (short)WrapLow(step1[4] - step1[5]);
 404              step2[6] = (short)WrapLow(-step1[6] + step1[7]);
 405              step2[7] = (short)WrapLow(step1[6] + step1[7]);
 406  
 407              // stage 3
 408              step1[0] = (short)WrapLow(step2[0] + step2[3]);
 409              step1[1] = (short)WrapLow(step2[1] + step2[2]);
 410              step1[2] = (short)WrapLow(step2[1] - step2[2]);
 411              step1[3] = (short)WrapLow(step2[0] - step2[3]);
 412              step1[4] = step2[4];
 413              temp1 = (step2[6] - step2[5]) * CosPi16_64;
 414              temp2 = (step2[5] + step2[6]) * CosPi16_64;
 415              step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
 416              step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
 417              step1[7] = step2[7];
 418  
 419              // stage 4
 420              output[0] = WrapLow(step1[0] + step1[7]);
 421              output[1] = WrapLow(step1[1] + step1[6]);
 422              output[2] = WrapLow(step1[2] + step1[5]);
 423              output[3] = WrapLow(step1[3] + step1[4]);
 424              output[4] = WrapLow(step1[3] - step1[4]);
 425              output[5] = WrapLow(step1[2] - step1[5]);
 426              output[6] = WrapLow(step1[1] - step1[6]);
 427              output[7] = WrapLow(step1[0] - step1[7]);
 428          }
 429  
 430          [SkipLocalsInit]
 431          public static void Idct8x864Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 432          {
 433              int i, j;
 434              Span<int> output = stackalloc int[8 * 8];
 435              Span<int> outptr = output;
 436              Span<int> tempIn = stackalloc int[8];
 437              Span<int> tempOut = stackalloc int[8];
 438  
 439              // First transform rows
 440              for (i = 0; i < 8; ++i)
 441              {
 442                  Idct8(input, outptr);
 443                  input = input[8..];
 444                  outptr = outptr[8..];
 445              }
 446  
 447              // Then transform columns
 448              for (i = 0; i < 8; ++i)
 449              {
 450                  for (j = 0; j < 8; ++j)
 451                  {
 452                      tempIn[j] = output[j * 8 + i];
 453                  }
 454  
 455                  Idct8(tempIn, tempOut);
 456                  for (j = 0; j < 8; ++j)
 457                  {
 458                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i],
 459                                                            BitUtils.RoundPowerOfTwo(tempOut[j], 5));
 460                  }
 461              }
 462          }
 463  
 464          [SkipLocalsInit]
 465          public static void Idct8x812Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 466          {
 467              int i, j;
 468              Span<int> output = stackalloc int[8 * 8];
 469              Span<int> outptr = output;
 470              Span<int> tempIn = stackalloc int[8];
 471              Span<int> tempOut = stackalloc int[8];
 472  
 473              output.Clear();
 474  
 475              // First transform rows
 476              // Only first 4 row has non-zero coefs
 477              for (i = 0; i < 4; ++i)
 478              {
 479                  Idct8(input, outptr);
 480                  input = input[8..];
 481                  outptr = outptr[8..];
 482              }
 483  
 484              // Then transform columns
 485              for (i = 0; i < 8; ++i)
 486              {
 487                  for (j = 0; j < 8; ++j)
 488                  {
 489                      tempIn[j] = output[j * 8 + i];
 490                  }
 491  
 492                  Idct8(tempIn, tempOut);
 493                  for (j = 0; j < 8; ++j)
 494                  {
 495                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5));
 496                  }
 497              }
 498          }
 499  
 500          public static void Idct8x81Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 501          {
 502              int i, j;
 503              long a1;
 504              int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
 505  
 506              output = WrapLow(DctConstRoundShift(output * CosPi16_64));
 507              a1 = BitUtils.RoundPowerOfTwo(output, 5);
 508              for (j = 0; j < 8; ++j)
 509              {
 510                  for (i = 0; i < 8; ++i)
 511                  {
 512                      dest[i] = ClipPixelAdd(dest[i], a1);
 513                  }
 514  
 515                  dest = dest[stride..];
 516              }
 517          }
 518  
 519          public static void Iadst16(ReadOnlySpan<int> input, Span<int> output)
 520          {
 521              long s0, s1, s2, s3, s4, s5, s6, s7, s8;
 522              long s9, s10, s11, s12, s13, s14, s15;
 523              long x0 = input[15];
 524              long x1 = input[0];
 525              long x2 = input[13];
 526              long x3 = input[2];
 527              long x4 = input[11];
 528              long x5 = input[4];
 529              long x6 = input[9];
 530              long x7 = input[6];
 531              long x8 = input[7];
 532              long x9 = input[8];
 533              long x10 = input[5];
 534              long x11 = input[10];
 535              long x12 = input[3];
 536              long x13 = input[12];
 537              long x14 = input[1];
 538              long x15 = input[14];
 539  
 540              if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
 541              {
 542                  output[..16].Clear();
 543  
 544                  return;
 545              }
 546  
 547              // stage 1
 548              s0 = x0 * CosPi1_64 + x1 * CosPi31_64;
 549              s1 = x0 * CosPi31_64 - x1 * CosPi1_64;
 550              s2 = x2 * CosPi5_64 + x3 * CosPi27_64;
 551              s3 = x2 * CosPi27_64 - x3 * CosPi5_64;
 552              s4 = x4 * CosPi9_64 + x5 * CosPi23_64;
 553              s5 = x4 * CosPi23_64 - x5 * CosPi9_64;
 554              s6 = x6 * CosPi13_64 + x7 * CosPi19_64;
 555              s7 = x6 * CosPi19_64 - x7 * CosPi13_64;
 556              s8 = x8 * CosPi17_64 + x9 * CosPi15_64;
 557              s9 = x8 * CosPi15_64 - x9 * CosPi17_64;
 558              s10 = x10 * CosPi21_64 + x11 * CosPi11_64;
 559              s11 = x10 * CosPi11_64 - x11 * CosPi21_64;
 560              s12 = x12 * CosPi25_64 + x13 * CosPi7_64;
 561              s13 = x12 * CosPi7_64 - x13 * CosPi25_64;
 562              s14 = x14 * CosPi29_64 + x15 * CosPi3_64;
 563              s15 = x14 * CosPi3_64 - x15 * CosPi29_64;
 564  
 565              x0 = WrapLow(DctConstRoundShift(s0 + s8));
 566              x1 = WrapLow(DctConstRoundShift(s1 + s9));
 567              x2 = WrapLow(DctConstRoundShift(s2 + s10));
 568              x3 = WrapLow(DctConstRoundShift(s3 + s11));
 569              x4 = WrapLow(DctConstRoundShift(s4 + s12));
 570              x5 = WrapLow(DctConstRoundShift(s5 + s13));
 571              x6 = WrapLow(DctConstRoundShift(s6 + s14));
 572              x7 = WrapLow(DctConstRoundShift(s7 + s15));
 573              x8 = WrapLow(DctConstRoundShift(s0 - s8));
 574              x9 = WrapLow(DctConstRoundShift(s1 - s9));
 575              x10 = WrapLow(DctConstRoundShift(s2 - s10));
 576              x11 = WrapLow(DctConstRoundShift(s3 - s11));
 577              x12 = WrapLow(DctConstRoundShift(s4 - s12));
 578              x13 = WrapLow(DctConstRoundShift(s5 - s13));
 579              x14 = WrapLow(DctConstRoundShift(s6 - s14));
 580              x15 = WrapLow(DctConstRoundShift(s7 - s15));
 581  
 582              // stage 2
 583              s0 = x0;
 584              s1 = x1;
 585              s2 = x2;
 586              s3 = x3;
 587              s4 = x4;
 588              s5 = x5;
 589              s6 = x6;
 590              s7 = x7;
 591              s8 = x8 * CosPi4_64 + x9 * CosPi28_64;
 592              s9 = x8 * CosPi28_64 - x9 * CosPi4_64;
 593              s10 = x10 * CosPi20_64 + x11 * CosPi12_64;
 594              s11 = x10 * CosPi12_64 - x11 * CosPi20_64;
 595              s12 = -x12 * CosPi28_64 + x13 * CosPi4_64;
 596              s13 = x12 * CosPi4_64 + x13 * CosPi28_64;
 597              s14 = -x14 * CosPi12_64 + x15 * CosPi20_64;
 598              s15 = x14 * CosPi20_64 + x15 * CosPi12_64;
 599  
 600              x0 = WrapLow(s0 + s4);
 601              x1 = WrapLow(s1 + s5);
 602              x2 = WrapLow(s2 + s6);
 603              x3 = WrapLow(s3 + s7);
 604              x4 = WrapLow(s0 - s4);
 605              x5 = WrapLow(s1 - s5);
 606              x6 = WrapLow(s2 - s6);
 607              x7 = WrapLow(s3 - s7);
 608              x8 = WrapLow(DctConstRoundShift(s8 + s12));
 609              x9 = WrapLow(DctConstRoundShift(s9 + s13));
 610              x10 = WrapLow(DctConstRoundShift(s10 + s14));
 611              x11 = WrapLow(DctConstRoundShift(s11 + s15));
 612              x12 = WrapLow(DctConstRoundShift(s8 - s12));
 613              x13 = WrapLow(DctConstRoundShift(s9 - s13));
 614              x14 = WrapLow(DctConstRoundShift(s10 - s14));
 615              x15 = WrapLow(DctConstRoundShift(s11 - s15));
 616  
 617              // stage 3
 618              s0 = x0;
 619              s1 = x1;
 620              s2 = x2;
 621              s3 = x3;
 622              s4 = x4 * CosPi8_64 + x5 * CosPi24_64;
 623              s5 = x4 * CosPi24_64 - x5 * CosPi8_64;
 624              s6 = -x6 * CosPi24_64 + x7 * CosPi8_64;
 625              s7 = x6 * CosPi8_64 + x7 * CosPi24_64;
 626              s8 = x8;
 627              s9 = x9;
 628              s10 = x10;
 629              s11 = x11;
 630              s12 = x12 * CosPi8_64 + x13 * CosPi24_64;
 631              s13 = x12 * CosPi24_64 - x13 * CosPi8_64;
 632              s14 = -x14 * CosPi24_64 + x15 * CosPi8_64;
 633              s15 = x14 * CosPi8_64 + x15 * CosPi24_64;
 634  
 635              x0 = WrapLow(s0 + s2);
 636              x1 = WrapLow(s1 + s3);
 637              x2 = WrapLow(s0 - s2);
 638              x3 = WrapLow(s1 - s3);
 639              x4 = WrapLow(DctConstRoundShift(s4 + s6));
 640              x5 = WrapLow(DctConstRoundShift(s5 + s7));
 641              x6 = WrapLow(DctConstRoundShift(s4 - s6));
 642              x7 = WrapLow(DctConstRoundShift(s5 - s7));
 643              x8 = WrapLow(s8 + s10);
 644              x9 = WrapLow(s9 + s11);
 645              x10 = WrapLow(s8 - s10);
 646              x11 = WrapLow(s9 - s11);
 647              x12 = WrapLow(DctConstRoundShift(s12 + s14));
 648              x13 = WrapLow(DctConstRoundShift(s13 + s15));
 649              x14 = WrapLow(DctConstRoundShift(s12 - s14));
 650              x15 = WrapLow(DctConstRoundShift(s13 - s15));
 651  
 652              // stage 4
 653              s2 = (-CosPi16_64) * (x2 + x3);
 654              s3 = CosPi16_64 * (x2 - x3);
 655              s6 = CosPi16_64 * (x6 + x7);
 656              s7 = CosPi16_64 * (-x6 + x7);
 657              s10 = CosPi16_64 * (x10 + x11);
 658              s11 = CosPi16_64 * (-x10 + x11);
 659              s14 = (-CosPi16_64) * (x14 + x15);
 660              s15 = CosPi16_64 * (x14 - x15);
 661  
 662              x2 = WrapLow(DctConstRoundShift(s2));
 663              x3 = WrapLow(DctConstRoundShift(s3));
 664              x6 = WrapLow(DctConstRoundShift(s6));
 665              x7 = WrapLow(DctConstRoundShift(s7));
 666              x10 = WrapLow(DctConstRoundShift(s10));
 667              x11 = WrapLow(DctConstRoundShift(s11));
 668              x14 = WrapLow(DctConstRoundShift(s14));
 669              x15 = WrapLow(DctConstRoundShift(s15));
 670  
 671              output[0] = WrapLow(x0);
 672              output[1] = WrapLow(-x8);
 673              output[2] = WrapLow(x12);
 674              output[3] = WrapLow(-x4);
 675              output[4] = WrapLow(x6);
 676              output[5] = WrapLow(x14);
 677              output[6] = WrapLow(x10);
 678              output[7] = WrapLow(x2);
 679              output[8] = WrapLow(x3);
 680              output[9] = WrapLow(x11);
 681              output[10] = WrapLow(x15);
 682              output[11] = WrapLow(x7);
 683              output[12] = WrapLow(x5);
 684              output[13] = WrapLow(-x13);
 685              output[14] = WrapLow(x9);
 686              output[15] = WrapLow(-x1);
 687          }
 688  
 689          [SkipLocalsInit]
 690          public static void Idct16(ReadOnlySpan<int> input, Span<int> output)
 691          {
 692              Span<short> step1 = stackalloc short[16];
 693              Span<short> step2 = stackalloc short[16];
 694              long temp1, temp2;
 695  
 696              // stage 1
 697              step1[0] = (short)input[0 / 2];
 698              step1[1] = (short)input[16 / 2];
 699              step1[2] = (short)input[8 / 2];
 700              step1[3] = (short)input[24 / 2];
 701              step1[4] = (short)input[4 / 2];
 702              step1[5] = (short)input[20 / 2];
 703              step1[6] = (short)input[12 / 2];
 704              step1[7] = (short)input[28 / 2];
 705              step1[8] = (short)input[2 / 2];
 706              step1[9] = (short)input[18 / 2];
 707              step1[10] = (short)input[10 / 2];
 708              step1[11] = (short)input[26 / 2];
 709              step1[12] = (short)input[6 / 2];
 710              step1[13] = (short)input[22 / 2];
 711              step1[14] = (short)input[14 / 2];
 712              step1[15] = (short)input[30 / 2];
 713  
 714              // stage 2
 715              step2[0] = step1[0];
 716              step2[1] = step1[1];
 717              step2[2] = step1[2];
 718              step2[3] = step1[3];
 719              step2[4] = step1[4];
 720              step2[5] = step1[5];
 721              step2[6] = step1[6];
 722              step2[7] = step1[7];
 723  
 724              temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
 725              temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
 726              step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
 727              step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
 728  
 729              temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
 730              temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
 731              step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
 732              step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
 733  
 734              temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
 735              temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
 736              step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
 737              step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
 738  
 739              temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
 740              temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
 741              step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
 742              step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
 743  
 744              // stage 3
 745              step1[0] = step2[0];
 746              step1[1] = step2[1];
 747              step1[2] = step2[2];
 748              step1[3] = step2[3];
 749  
 750              temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
 751              temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
 752              step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
 753              step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
 754              temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
 755              temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
 756              step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
 757              step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
 758  
 759              step1[8] = (short)WrapLow(step2[8] + step2[9]);
 760              step1[9] = (short)WrapLow(step2[8] - step2[9]);
 761              step1[10] = (short)WrapLow(-step2[10] + step2[11]);
 762              step1[11] = (short)WrapLow(step2[10] + step2[11]);
 763              step1[12] = (short)WrapLow(step2[12] + step2[13]);
 764              step1[13] = (short)WrapLow(step2[12] - step2[13]);
 765              step1[14] = (short)WrapLow(-step2[14] + step2[15]);
 766              step1[15] = (short)WrapLow(step2[14] + step2[15]);
 767  
 768              // stage 4
 769              temp1 = (step1[0] + step1[1]) * CosPi16_64;
 770              temp2 = (step1[0] - step1[1]) * CosPi16_64;
 771              step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
 772              step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
 773              temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
 774              temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
 775              step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
 776              step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
 777              step2[4] = (short)WrapLow(step1[4] + step1[5]);
 778              step2[5] = (short)WrapLow(step1[4] - step1[5]);
 779              step2[6] = (short)WrapLow(-step1[6] + step1[7]);
 780              step2[7] = (short)WrapLow(step1[6] + step1[7]);
 781  
 782              step2[8] = step1[8];
 783              step2[15] = step1[15];
 784              temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
 785              temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
 786              step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
 787              step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
 788              temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
 789              temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
 790              step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
 791              step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
 792              step2[11] = step1[11];
 793              step2[12] = step1[12];
 794  
 795              // stage 5
 796              step1[0] = (short)WrapLow(step2[0] + step2[3]);
 797              step1[1] = (short)WrapLow(step2[1] + step2[2]);
 798              step1[2] = (short)WrapLow(step2[1] - step2[2]);
 799              step1[3] = (short)WrapLow(step2[0] - step2[3]);
 800              step1[4] = step2[4];
 801              temp1 = (step2[6] - step2[5]) * CosPi16_64;
 802              temp2 = (step2[5] + step2[6]) * CosPi16_64;
 803              step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
 804              step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
 805              step1[7] = step2[7];
 806  
 807              step1[8] = (short)WrapLow(step2[8] + step2[11]);
 808              step1[9] = (short)WrapLow(step2[9] + step2[10]);
 809              step1[10] = (short)WrapLow(step2[9] - step2[10]);
 810              step1[11] = (short)WrapLow(step2[8] - step2[11]);
 811              step1[12] = (short)WrapLow(-step2[12] + step2[15]);
 812              step1[13] = (short)WrapLow(-step2[13] + step2[14]);
 813              step1[14] = (short)WrapLow(step2[13] + step2[14]);
 814              step1[15] = (short)WrapLow(step2[12] + step2[15]);
 815  
 816              // stage 6
 817              step2[0] = (short)WrapLow(step1[0] + step1[7]);
 818              step2[1] = (short)WrapLow(step1[1] + step1[6]);
 819              step2[2] = (short)WrapLow(step1[2] + step1[5]);
 820              step2[3] = (short)WrapLow(step1[3] + step1[4]);
 821              step2[4] = (short)WrapLow(step1[3] - step1[4]);
 822              step2[5] = (short)WrapLow(step1[2] - step1[5]);
 823              step2[6] = (short)WrapLow(step1[1] - step1[6]);
 824              step2[7] = (short)WrapLow(step1[0] - step1[7]);
 825              step2[8] = step1[8];
 826              step2[9] = step1[9];
 827              temp1 = (-step1[10] + step1[13]) * CosPi16_64;
 828              temp2 = (step1[10] + step1[13]) * CosPi16_64;
 829              step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
 830              step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
 831              temp1 = (-step1[11] + step1[12]) * CosPi16_64;
 832              temp2 = (step1[11] + step1[12]) * CosPi16_64;
 833              step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
 834              step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
 835              step2[14] = step1[14];
 836              step2[15] = step1[15];
 837  
 838              // stage 7
 839              output[0] = WrapLow(step2[0] + step2[15]);
 840              output[1] = WrapLow(step2[1] + step2[14]);
 841              output[2] = WrapLow(step2[2] + step2[13]);
 842              output[3] = WrapLow(step2[3] + step2[12]);
 843              output[4] = WrapLow(step2[4] + step2[11]);
 844              output[5] = WrapLow(step2[5] + step2[10]);
 845              output[6] = WrapLow(step2[6] + step2[9]);
 846              output[7] = WrapLow(step2[7] + step2[8]);
 847              output[8] = WrapLow(step2[7] - step2[8]);
 848              output[9] = WrapLow(step2[6] - step2[9]);
 849              output[10] = WrapLow(step2[5] - step2[10]);
 850              output[11] = WrapLow(step2[4] - step2[11]);
 851              output[12] = WrapLow(step2[3] - step2[12]);
 852              output[13] = WrapLow(step2[2] - step2[13]);
 853              output[14] = WrapLow(step2[1] - step2[14]);
 854              output[15] = WrapLow(step2[0] - step2[15]);
 855          }
 856  
 857          [SkipLocalsInit]
 858          public static void Idct16x16256Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 859          {
 860              int i, j;
 861              Span<int> output = stackalloc int[16 * 16];
 862              Span<int> outptr = output;
 863              Span<int> tempIn = stackalloc int[16];
 864              Span<int> tempOut = stackalloc int[16];
 865  
 866              // First transform rows
 867              for (i = 0; i < 16; ++i)
 868              {
 869                  Idct16(input, outptr);
 870                  input = input[16..];
 871                  outptr = outptr[16..];
 872              }
 873  
 874              // Then transform columns
 875              for (i = 0; i < 16; ++i)
 876              {
 877                  for (j = 0; j < 16; ++j)
 878                  {
 879                      tempIn[j] = output[j * 16 + i];
 880                  }
 881  
 882                  Idct16(tempIn, tempOut);
 883                  for (j = 0; j < 16; ++j)
 884                  {
 885                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
 886                  }
 887              }
 888          }
 889  
 890          [SkipLocalsInit]
 891          public static void Idct16x1638Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 892          {
 893              int i, j;
 894              Span<int> output = stackalloc int[16 * 16];
 895              Span<int> outptr = output;
 896              Span<int> tempIn = stackalloc int[16];
 897              Span<int> tempOut = stackalloc int[16];
 898  
 899              output.Clear();
 900  
 901              // First transform rows. Since all non-zero dct coefficients are in
 902              // upper-left 8x8 area, we only need to calculate first 8 rows here.
 903              for (i = 0; i < 8; ++i)
 904              {
 905                  Idct16(input, outptr);
 906                  input = input[16..];
 907                  outptr = outptr[16..];
 908              }
 909  
 910              // Then transform columns
 911              for (i = 0; i < 16; ++i)
 912              {
 913                  for (j = 0; j < 16; ++j)
 914                  {
 915                      tempIn[j] = output[j * 16 + i];
 916                  }
 917  
 918                  Idct16(tempIn, tempOut);
 919                  for (j = 0; j < 16; ++j)
 920                  {
 921                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
 922                  }
 923              }
 924          }
 925  
 926          [SkipLocalsInit]
 927          public static void Idct16x1610Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 928          {
 929              int i, j;
 930              Span<int> output = stackalloc int[16 * 16];
 931              Span<int> outptr = output;
 932              Span<int> tempIn = stackalloc int[16];
 933              Span<int> tempOut = stackalloc int[16];
 934  
 935              output.Clear();
 936  
 937              // First transform rows. Since all non-zero dct coefficients are in
 938              // upper-left 4x4 area, we only need to calculate first 4 rows here.
 939              for (i = 0; i < 4; ++i)
 940              {
 941                  Idct16(input, outptr);
 942                  input = input[16..];
 943                  outptr = outptr[16..];
 944              }
 945  
 946              // Then transform columns
 947              for (i = 0; i < 16; ++i)
 948              {
 949                  for (j = 0; j < 16; ++j)
 950                  {
 951                      tempIn[j] = output[j * 16 + i];
 952                  }
 953  
 954                  Idct16(tempIn, tempOut);
 955                  for (j = 0; j < 16; ++j)
 956                  {
 957                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
 958                  }
 959              }
 960          }
 961  
 962          public static void Idct16x161Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
 963          {
 964              int i, j;
 965              long a1;
 966              int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
 967  
 968              output = WrapLow(DctConstRoundShift(output * CosPi16_64));
 969              a1 = BitUtils.RoundPowerOfTwo(output, 6);
 970              for (j = 0; j < 16; ++j)
 971              {
 972                  for (i = 0; i < 16; ++i)
 973                  {
 974                      dest[i] = ClipPixelAdd(dest[i], a1);
 975                  }
 976  
 977                  dest = dest[stride..];
 978              }
 979          }
 980  
              // 32-point one-dimensional inverse DCT.
              // Reads input[0..31] (each value truncated to a short) and writes output[0..31].
              // step1 and step2 are scratch buffers used alternately: every stage reads the
              // buffer filled by the previous stage and fills the other one completely.
              // Intermediate results go through WrapLow before being stored back as shorts.
              // [SkipLocalsInit] leaves the stackalloc'd buffers unzeroed; this is fine because
              // stage 1 writes all 32 entries of step1 and stage 2 all 32 entries of step2
              // before anything is read from them.
 981          [SkipLocalsInit]
 982          public static void Idct32(ReadOnlySpan<int> input, Span<int> output)
 983          {
 984              Span<short> step1 = stackalloc short[32];
 985              Span<short> step2 = stackalloc short[32];
 986              long temp1, temp2;
 987
 988              // stage 1
                  // Even-indexed inputs are copied into step1[0..15] in permuted order.
 989              step1[0] = (short)input[0];
 990              step1[1] = (short)input[16];
 991              step1[2] = (short)input[8];
 992              step1[3] = (short)input[24];
 993              step1[4] = (short)input[4];
 994              step1[5] = (short)input[20];
 995              step1[6] = (short)input[12];
 996              step1[7] = (short)input[28];
 997              step1[8] = (short)input[2];
 998              step1[9] = (short)input[18];
 999              step1[10] = (short)input[10];
1000              step1[11] = (short)input[26];
1001              step1[12] = (short)input[6];
1002              step1[13] = (short)input[22];
1003              step1[14] = (short)input[14];
1004              step1[15] = (short)input[30];
1005
                  // Odd-indexed inputs: each pair (input[k], input[32 - k]) is combined with two
                  // CosPi constants, rounded by DctConstRoundShift and wrapped, giving step1[16..31].
1006              temp1 = (short)input[1] * CosPi31_64 - (short)input[31] * CosPi1_64;
1007              temp2 = (short)input[1] * CosPi1_64 + (short)input[31] * CosPi31_64;
1008              step1[16] = (short)WrapLow(DctConstRoundShift(temp1));
1009              step1[31] = (short)WrapLow(DctConstRoundShift(temp2));
1010
1011              temp1 = (short)input[17] * CosPi15_64 - (short)input[15] * CosPi17_64;
1012              temp2 = (short)input[17] * CosPi17_64 + (short)input[15] * CosPi15_64;
1013              step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
1014              step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
1015
1016              temp1 = (short)input[9] * CosPi23_64 - (short)input[23] * CosPi9_64;
1017              temp2 = (short)input[9] * CosPi9_64 + (short)input[23] * CosPi23_64;
1018              step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
1019              step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
1020
1021              temp1 = (short)input[25] * CosPi7_64 - (short)input[7] * CosPi25_64;
1022              temp2 = (short)input[25] * CosPi25_64 + (short)input[7] * CosPi7_64;
1023              step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
1024              step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
1025
1026              temp1 = (short)input[5] * CosPi27_64 - (short)input[27] * CosPi5_64;
1027              temp2 = (short)input[5] * CosPi5_64 + (short)input[27] * CosPi27_64;
1028              step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
1029              step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
1030
1031              temp1 = (short)input[21] * CosPi11_64 - (short)input[11] * CosPi21_64;
1032              temp2 = (short)input[21] * CosPi21_64 + (short)input[11] * CosPi11_64;
1033              step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
1034              step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
1035
1036              temp1 = (short)input[13] * CosPi19_64 - (short)input[19] * CosPi13_64;
1037              temp2 = (short)input[13] * CosPi13_64 + (short)input[19] * CosPi19_64;
1038              step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
1039              step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
1040
1041              temp1 = (short)input[29] * CosPi3_64 - (short)input[3] * CosPi29_64;
1042              temp2 = (short)input[29] * CosPi29_64 + (short)input[3] * CosPi3_64;
1043              step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
1044              step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
1045
1046              // stage 2
                  // Entries 0..7 pass through unchanged.
1047              step2[0] = step1[0];
1048              step2[1] = step1[1];
1049              step2[2] = step1[2];
1050              step2[3] = step1[3];
1051              step2[4] = step1[4];
1052              step2[5] = step1[5];
1053              step2[6] = step1[6];
1054              step2[7] = step1[7];
1055
                  // Entries 8..15: pairs (8,15), (9,14), (10,13), (11,12) combined with CosPi constants.
1056              temp1 = step1[8] * CosPi30_64 - step1[15] * CosPi2_64;
1057              temp2 = step1[8] * CosPi2_64 + step1[15] * CosPi30_64;
1058              step2[8] = (short)WrapLow(DctConstRoundShift(temp1));
1059              step2[15] = (short)WrapLow(DctConstRoundShift(temp2));
1060
1061              temp1 = step1[9] * CosPi14_64 - step1[14] * CosPi18_64;
1062              temp2 = step1[9] * CosPi18_64 + step1[14] * CosPi14_64;
1063              step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
1064              step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
1065
1066              temp1 = step1[10] * CosPi22_64 - step1[13] * CosPi10_64;
1067              temp2 = step1[10] * CosPi10_64 + step1[13] * CosPi22_64;
1068              step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
1069              step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
1070
1071              temp1 = step1[11] * CosPi6_64 - step1[12] * CosPi26_64;
1072              temp2 = step1[11] * CosPi26_64 + step1[12] * CosPi6_64;
1073              step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
1074              step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
1075
                  // Entries 16..31: wrapped sums and differences of adjacent pairs.
1076              step2[16] = (short)WrapLow(step1[16] + step1[17]);
1077              step2[17] = (short)WrapLow(step1[16] - step1[17]);
1078              step2[18] = (short)WrapLow(-step1[18] + step1[19]);
1079              step2[19] = (short)WrapLow(step1[18] + step1[19]);
1080              step2[20] = (short)WrapLow(step1[20] + step1[21]);
1081              step2[21] = (short)WrapLow(step1[20] - step1[21]);
1082              step2[22] = (short)WrapLow(-step1[22] + step1[23]);
1083              step2[23] = (short)WrapLow(step1[22] + step1[23]);
1084              step2[24] = (short)WrapLow(step1[24] + step1[25]);
1085              step2[25] = (short)WrapLow(step1[24] - step1[25]);
1086              step2[26] = (short)WrapLow(-step1[26] + step1[27]);
1087              step2[27] = (short)WrapLow(step1[26] + step1[27]);
1088              step2[28] = (short)WrapLow(step1[28] + step1[29]);
1089              step2[29] = (short)WrapLow(step1[28] - step1[29]);
1090              step2[30] = (short)WrapLow(-step1[30] + step1[31]);
1091              step2[31] = (short)WrapLow(step1[30] + step1[31]);
1092
1093              // stage 3
                  // Entries 0..3 pass through unchanged.
1094              step1[0] = step2[0];
1095              step1[1] = step2[1];
1096              step1[2] = step2[2];
1097              step1[3] = step2[3];
1098
                  // Entries 4..7: pairs (4,7) and (5,6) combined with CosPi constants.
1099              temp1 = step2[4] * CosPi28_64 - step2[7] * CosPi4_64;
1100              temp2 = step2[4] * CosPi4_64 + step2[7] * CosPi28_64;
1101              step1[4] = (short)WrapLow(DctConstRoundShift(temp1));
1102              step1[7] = (short)WrapLow(DctConstRoundShift(temp2));
1103              temp1 = step2[5] * CosPi12_64 - step2[6] * CosPi20_64;
1104              temp2 = step2[5] * CosPi20_64 + step2[6] * CosPi12_64;
1105              step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
1106              step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
1107
                  // Entries 8..15: wrapped sums and differences of adjacent pairs.
1108              step1[8] = (short)WrapLow(step2[8] + step2[9]);
1109              step1[9] = (short)WrapLow(step2[8] - step2[9]);
1110              step1[10] = (short)WrapLow(-step2[10] + step2[11]);
1111              step1[11] = (short)WrapLow(step2[10] + step2[11]);
1112              step1[12] = (short)WrapLow(step2[12] + step2[13]);
1113              step1[13] = (short)WrapLow(step2[12] - step2[13]);
1114              step1[14] = (short)WrapLow(-step2[14] + step2[15]);
1115              step1[15] = (short)WrapLow(step2[14] + step2[15]);
1116
                  // Entries 16..31: pairs (17,30), (18,29), (21,26), (22,25) are combined with
                  // CosPi constants; the remaining entries are copied unchanged.
1117              step1[16] = step2[16];
1118              step1[31] = step2[31];
1119              temp1 = -step2[17] * CosPi4_64 + step2[30] * CosPi28_64;
1120              temp2 = step2[17] * CosPi28_64 + step2[30] * CosPi4_64;
1121              step1[17] = (short)WrapLow(DctConstRoundShift(temp1));
1122              step1[30] = (short)WrapLow(DctConstRoundShift(temp2));
1123              temp1 = -step2[18] * CosPi28_64 - step2[29] * CosPi4_64;
1124              temp2 = -step2[18] * CosPi4_64 + step2[29] * CosPi28_64;
1125              step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
1126              step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
1127              step1[19] = step2[19];
1128              step1[20] = step2[20];
1129              temp1 = -step2[21] * CosPi20_64 + step2[26] * CosPi12_64;
1130              temp2 = step2[21] * CosPi12_64 + step2[26] * CosPi20_64;
1131              step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
1132              step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
1133              temp1 = -step2[22] * CosPi12_64 - step2[25] * CosPi20_64;
1134              temp2 = -step2[22] * CosPi20_64 + step2[25] * CosPi12_64;
1135              step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
1136              step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
1137              step1[23] = step2[23];
1138              step1[24] = step2[24];
1139              step1[27] = step2[27];
1140              step1[28] = step2[28];
1141
1142              // stage 4
                  // Entries 0..3: (0,1) and (2,3) combined with CosPi constants; 4..7: sums/differences.
1143              temp1 = (step1[0] + step1[1]) * CosPi16_64;
1144              temp2 = (step1[0] - step1[1]) * CosPi16_64;
1145              step2[0] = (short)WrapLow(DctConstRoundShift(temp1));
1146              step2[1] = (short)WrapLow(DctConstRoundShift(temp2));
1147              temp1 = step1[2] * CosPi24_64 - step1[3] * CosPi8_64;
1148              temp2 = step1[2] * CosPi8_64 + step1[3] * CosPi24_64;
1149              step2[2] = (short)WrapLow(DctConstRoundShift(temp1));
1150              step2[3] = (short)WrapLow(DctConstRoundShift(temp2));
1151              step2[4] = (short)WrapLow(step1[4] + step1[5]);
1152              step2[5] = (short)WrapLow(step1[4] - step1[5]);
1153              step2[6] = (short)WrapLow(-step1[6] + step1[7]);
1154              step2[7] = (short)WrapLow(step1[6] + step1[7]);
1155
                  // Entries 8..15: pairs (9,14) and (10,13) are combined with CosPi constants;
                  // 8, 11, 12 and 15 are copied unchanged.
1156              step2[8] = step1[8];
1157              step2[15] = step1[15];
1158              temp1 = -step1[9] * CosPi8_64 + step1[14] * CosPi24_64;
1159              temp2 = step1[9] * CosPi24_64 + step1[14] * CosPi8_64;
1160              step2[9] = (short)WrapLow(DctConstRoundShift(temp1));
1161              step2[14] = (short)WrapLow(DctConstRoundShift(temp2));
1162              temp1 = -step1[10] * CosPi24_64 - step1[13] * CosPi8_64;
1163              temp2 = -step1[10] * CosPi8_64 + step1[13] * CosPi24_64;
1164              step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
1165              step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
1166              step2[11] = step1[11];
1167              step2[12] = step1[12];
1168
                  // Entries 16..31: wrapped sums and differences within each group of four.
1169              step2[16] = (short)WrapLow(step1[16] + step1[19]);
1170              step2[17] = (short)WrapLow(step1[17] + step1[18]);
1171              step2[18] = (short)WrapLow(step1[17] - step1[18]);
1172              step2[19] = (short)WrapLow(step1[16] - step1[19]);
1173              step2[20] = (short)WrapLow(-step1[20] + step1[23]);
1174              step2[21] = (short)WrapLow(-step1[21] + step1[22]);
1175              step2[22] = (short)WrapLow(step1[21] + step1[22]);
1176              step2[23] = (short)WrapLow(step1[20] + step1[23]);
1177
1178              step2[24] = (short)WrapLow(step1[24] + step1[27]);
1179              step2[25] = (short)WrapLow(step1[25] + step1[26]);
1180              step2[26] = (short)WrapLow(step1[25] - step1[26]);
1181              step2[27] = (short)WrapLow(step1[24] - step1[27]);
1182              step2[28] = (short)WrapLow(-step1[28] + step1[31]);
1183              step2[29] = (short)WrapLow(-step1[29] + step1[30]);
1184              step2[30] = (short)WrapLow(step1[29] + step1[30]);
1185              step2[31] = (short)WrapLow(step1[28] + step1[31]);
1186
1187              // stage 5
                  // Entries 0..3: sums/differences; 4 and 7 copied; (5,6) scaled by CosPi16_64.
1188              step1[0] = (short)WrapLow(step2[0] + step2[3]);
1189              step1[1] = (short)WrapLow(step2[1] + step2[2]);
1190              step1[2] = (short)WrapLow(step2[1] - step2[2]);
1191              step1[3] = (short)WrapLow(step2[0] - step2[3]);
1192              step1[4] = step2[4];
1193              temp1 = (step2[6] - step2[5]) * CosPi16_64;
1194              temp2 = (step2[5] + step2[6]) * CosPi16_64;
1195              step1[5] = (short)WrapLow(DctConstRoundShift(temp1));
1196              step1[6] = (short)WrapLow(DctConstRoundShift(temp2));
1197              step1[7] = step2[7];
1198
                  // Entries 8..15: wrapped sums and differences within each group of four.
1199              step1[8] = (short)WrapLow(step2[8] + step2[11]);
1200              step1[9] = (short)WrapLow(step2[9] + step2[10]);
1201              step1[10] = (short)WrapLow(step2[9] - step2[10]);
1202              step1[11] = (short)WrapLow(step2[8] - step2[11]);
1203              step1[12] = (short)WrapLow(-step2[12] + step2[15]);
1204              step1[13] = (short)WrapLow(-step2[13] + step2[14]);
1205              step1[14] = (short)WrapLow(step2[13] + step2[14]);
1206              step1[15] = (short)WrapLow(step2[12] + step2[15]);
1207
                  // Entries 16..31: pairs (18,29), (19,28), (20,27), (21,26) are combined with
                  // CosPi8_64 / CosPi24_64; the remaining entries are copied unchanged.
1208              step1[16] = step2[16];
1209              step1[17] = step2[17];
1210              temp1 = -step2[18] * CosPi8_64 + step2[29] * CosPi24_64;
1211              temp2 = step2[18] * CosPi24_64 + step2[29] * CosPi8_64;
1212              step1[18] = (short)WrapLow(DctConstRoundShift(temp1));
1213              step1[29] = (short)WrapLow(DctConstRoundShift(temp2));
1214              temp1 = -step2[19] * CosPi8_64 + step2[28] * CosPi24_64;
1215              temp2 = step2[19] * CosPi24_64 + step2[28] * CosPi8_64;
1216              step1[19] = (short)WrapLow(DctConstRoundShift(temp1));
1217              step1[28] = (short)WrapLow(DctConstRoundShift(temp2));
1218              temp1 = -step2[20] * CosPi24_64 - step2[27] * CosPi8_64;
1219              temp2 = -step2[20] * CosPi8_64 + step2[27] * CosPi24_64;
1220              step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
1221              step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
1222              temp1 = -step2[21] * CosPi24_64 - step2[26] * CosPi8_64;
1223              temp2 = -step2[21] * CosPi8_64 + step2[26] * CosPi24_64;
1224              step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
1225              step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
1226              step1[22] = step2[22];
1227              step1[23] = step2[23];
1228              step1[24] = step2[24];
1229              step1[25] = step2[25];
1230              step1[30] = step2[30];
1231              step1[31] = step2[31];
1232
1233              // stage 6
                  // Entries 0..7: step2[k] = step1[k] + step1[7 - k], step2[7 - k] = step1[k] - step1[7 - k].
1234              step2[0] = (short)WrapLow(step1[0] + step1[7]);
1235              step2[1] = (short)WrapLow(step1[1] + step1[6]);
1236              step2[2] = (short)WrapLow(step1[2] + step1[5]);
1237              step2[3] = (short)WrapLow(step1[3] + step1[4]);
1238              step2[4] = (short)WrapLow(step1[3] - step1[4]);
1239              step2[5] = (short)WrapLow(step1[2] - step1[5]);
1240              step2[6] = (short)WrapLow(step1[1] - step1[6]);
1241              step2[7] = (short)WrapLow(step1[0] - step1[7]);
                  // Entries 8..15: (10,13) and (11,12) scaled by CosPi16_64; the rest copied.
1242              step2[8] = step1[8];
1243              step2[9] = step1[9];
1244              temp1 = (-step1[10] + step1[13]) * CosPi16_64;
1245              temp2 = (step1[10] + step1[13]) * CosPi16_64;
1246              step2[10] = (short)WrapLow(DctConstRoundShift(temp1));
1247              step2[13] = (short)WrapLow(DctConstRoundShift(temp2));
1248              temp1 = (-step1[11] + step1[12]) * CosPi16_64;
1249              temp2 = (step1[11] + step1[12]) * CosPi16_64;
1250              step2[11] = (short)WrapLow(DctConstRoundShift(temp1));
1251              step2[12] = (short)WrapLow(DctConstRoundShift(temp2));
1252              step2[14] = step1[14];
1253              step2[15] = step1[15];
1254
                  // Entries 16..31: wrapped sums and differences within each group of eight.
1255              step2[16] = (short)WrapLow(step1[16] + step1[23]);
1256              step2[17] = (short)WrapLow(step1[17] + step1[22]);
1257              step2[18] = (short)WrapLow(step1[18] + step1[21]);
1258              step2[19] = (short)WrapLow(step1[19] + step1[20]);
1259              step2[20] = (short)WrapLow(step1[19] - step1[20]);
1260              step2[21] = (short)WrapLow(step1[18] - step1[21]);
1261              step2[22] = (short)WrapLow(step1[17] - step1[22]);
1262              step2[23] = (short)WrapLow(step1[16] - step1[23]);
1263
1264              step2[24] = (short)WrapLow(-step1[24] + step1[31]);
1265              step2[25] = (short)WrapLow(-step1[25] + step1[30]);
1266              step2[26] = (short)WrapLow(-step1[26] + step1[29]);
1267              step2[27] = (short)WrapLow(-step1[27] + step1[28]);
1268              step2[28] = (short)WrapLow(step1[27] + step1[28]);
1269              step2[29] = (short)WrapLow(step1[26] + step1[29]);
1270              step2[30] = (short)WrapLow(step1[25] + step1[30]);
1271              step2[31] = (short)WrapLow(step1[24] + step1[31]);
1272
1273              // stage 7
                  // Entries 0..15: step1[k] = step2[k] + step2[15 - k], step1[15 - k] = step2[k] - step2[15 - k].
1274              step1[0] = (short)WrapLow(step2[0] + step2[15]);
1275              step1[1] = (short)WrapLow(step2[1] + step2[14]);
1276              step1[2] = (short)WrapLow(step2[2] + step2[13]);
1277              step1[3] = (short)WrapLow(step2[3] + step2[12]);
1278              step1[4] = (short)WrapLow(step2[4] + step2[11]);
1279              step1[5] = (short)WrapLow(step2[5] + step2[10]);
1280              step1[6] = (short)WrapLow(step2[6] + step2[9]);
1281              step1[7] = (short)WrapLow(step2[7] + step2[8]);
1282              step1[8] = (short)WrapLow(step2[7] - step2[8]);
1283              step1[9] = (short)WrapLow(step2[6] - step2[9]);
1284              step1[10] = (short)WrapLow(step2[5] - step2[10]);
1285              step1[11] = (short)WrapLow(step2[4] - step2[11]);
1286              step1[12] = (short)WrapLow(step2[3] - step2[12]);
1287              step1[13] = (short)WrapLow(step2[2] - step2[13]);
1288              step1[14] = (short)WrapLow(step2[1] - step2[14]);
1289              step1[15] = (short)WrapLow(step2[0] - step2[15]);
1290
                  // Entries 16..31: pairs (20,27), (21,26), (22,25), (23,24) scaled by CosPi16_64;
                  // 16..19 and 28..31 copied unchanged.
1291              step1[16] = step2[16];
1292              step1[17] = step2[17];
1293              step1[18] = step2[18];
1294              step1[19] = step2[19];
1295              temp1 = (-step2[20] + step2[27]) * CosPi16_64;
1296              temp2 = (step2[20] + step2[27]) * CosPi16_64;
1297              step1[20] = (short)WrapLow(DctConstRoundShift(temp1));
1298              step1[27] = (short)WrapLow(DctConstRoundShift(temp2));
1299              temp1 = (-step2[21] + step2[26]) * CosPi16_64;
1300              temp2 = (step2[21] + step2[26]) * CosPi16_64;
1301              step1[21] = (short)WrapLow(DctConstRoundShift(temp1));
1302              step1[26] = (short)WrapLow(DctConstRoundShift(temp2));
1303              temp1 = (-step2[22] + step2[25]) * CosPi16_64;
1304              temp2 = (step2[22] + step2[25]) * CosPi16_64;
1305              step1[22] = (short)WrapLow(DctConstRoundShift(temp1));
1306              step1[25] = (short)WrapLow(DctConstRoundShift(temp2));
1307              temp1 = (-step2[23] + step2[24]) * CosPi16_64;
1308              temp2 = (step2[23] + step2[24]) * CosPi16_64;
1309              step1[23] = (short)WrapLow(DctConstRoundShift(temp1));
1310              step1[24] = (short)WrapLow(DctConstRoundShift(temp2));
1311              step1[28] = step2[28];
1312              step1[29] = step2[29];
1313              step1[30] = step2[30];
1314              step1[31] = step2[31];
1315
1316              // final stage
                  // output[k] = step1[k] + step1[31 - k] and output[31 - k] = step1[k] - step1[31 - k]
                  // for k = 0..15, each passed through WrapLow and stored as an int.
1317              output[0] = WrapLow(step1[0] + step1[31]);
1318              output[1] = WrapLow(step1[1] + step1[30]);
1319              output[2] = WrapLow(step1[2] + step1[29]);
1320              output[3] = WrapLow(step1[3] + step1[28]);
1321              output[4] = WrapLow(step1[4] + step1[27]);
1322              output[5] = WrapLow(step1[5] + step1[26]);
1323              output[6] = WrapLow(step1[6] + step1[25]);
1324              output[7] = WrapLow(step1[7] + step1[24]);
1325              output[8] = WrapLow(step1[8] + step1[23]);
1326              output[9] = WrapLow(step1[9] + step1[22]);
1327              output[10] = WrapLow(step1[10] + step1[21]);
1328              output[11] = WrapLow(step1[11] + step1[20]);
1329              output[12] = WrapLow(step1[12] + step1[19]);
1330              output[13] = WrapLow(step1[13] + step1[18]);
1331              output[14] = WrapLow(step1[14] + step1[17]);
1332              output[15] = WrapLow(step1[15] + step1[16]);
1333              output[16] = WrapLow(step1[15] - step1[16]);
1334              output[17] = WrapLow(step1[14] - step1[17]);
1335              output[18] = WrapLow(step1[13] - step1[18]);
1336              output[19] = WrapLow(step1[12] - step1[19]);
1337              output[20] = WrapLow(step1[11] - step1[20]);
1338              output[21] = WrapLow(step1[10] - step1[21]);
1339              output[22] = WrapLow(step1[9] - step1[22]);
1340              output[23] = WrapLow(step1[8] - step1[23]);
1341              output[24] = WrapLow(step1[7] - step1[24]);
1342              output[25] = WrapLow(step1[6] - step1[25]);
1343              output[26] = WrapLow(step1[5] - step1[26]);
1344              output[27] = WrapLow(step1[4] - step1[27]);
1345              output[28] = WrapLow(step1[3] - step1[28]);
1346              output[29] = WrapLow(step1[2] - step1[29]);
1347              output[30] = WrapLow(step1[1] - step1[30]);
1348              output[31] = WrapLow(step1[0] - step1[31]);
1349          }
1350  
1351          [SkipLocalsInit]
1352          public static void Idct32x321024Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
1353          {
1354              int i, j;
1355              Span<int> output = stackalloc int[32 * 32];
1356              Span<int> outptr = output;
1357              Span<int> tempIn = stackalloc int[32];
1358              Span<int> tempOut = stackalloc int[32];
1359  
1360              // Rows
1361              for (i = 0; i < 32; ++i)
1362              {
1363                  short zeroCoeff = 0;
1364                  for (j = 0; j < 32; ++j)
1365                  {
1366                      zeroCoeff |= (short)input[j];
1367                  }
1368  
1369                  if (zeroCoeff != 0)
1370                  {
1371                      Idct32(input, outptr);
1372                  }
1373                  else
1374                  {
1375                      outptr[..32].Clear();
1376                  }
1377  
1378                  input = input[32..];
1379                  outptr = outptr[32..];
1380              }
1381  
1382              // Columns
1383              for (i = 0; i < 32; ++i)
1384              {
1385                  for (j = 0; j < 32; ++j)
1386                  {
1387                      tempIn[j] = output[j * 32 + i];
1388                  }
1389  
1390                  Idct32(tempIn, tempOut);
1391                  for (j = 0; j < 32; ++j)
1392                  {
1393                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
1394                  }
1395              }
1396          }
1397  
1398          [SkipLocalsInit]
1399          public static void Idct32x32135Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
1400          {
1401              int i, j;
1402              Span<int> output = stackalloc int[32 * 32];
1403              Span<int> outptr = output;
1404              Span<int> tempIn = stackalloc int[32];
1405              Span<int> tempOut = stackalloc int[32];
1406  
1407              output.Clear();
1408  
1409              // Rows
1410              // Only upper-left 16x16 has non-zero coeff
1411              for (i = 0; i < 16; ++i)
1412              {
1413                  Idct32(input, outptr);
1414                  input = input[32..];
1415                  outptr = outptr[32..];
1416              }
1417  
1418              // Columns
1419              for (i = 0; i < 32; ++i)
1420              {
1421                  for (j = 0; j < 32; ++j)
1422                  {
1423                      tempIn[j] = output[j * 32 + i];
1424                  }
1425  
1426                  Idct32(tempIn, tempOut);
1427                  for (j = 0; j < 32; ++j)
1428                  {
1429                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
1430                  }
1431              }
1432          }
1433  
1434          [SkipLocalsInit]
1435          public static void Idct32x3234Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
1436          {
1437              int i, j;
1438              Span<int> output = stackalloc int[32 * 32];
1439              Span<int> outptr = output;
1440              Span<int> tempIn = stackalloc int[32];
1441              Span<int> tempOut = stackalloc int[32];
1442  
1443              output.Clear();
1444  
1445              // Rows
1446              // Only upper-left 8x8 has non-zero coeff
1447              for (i = 0; i < 8; ++i)
1448              {
1449                  Idct32(input, outptr);
1450                  input = input[32..];
1451                  outptr = outptr[32..];
1452              }
1453  
1454              // Columns
1455              for (i = 0; i < 32; ++i)
1456              {
1457                  for (j = 0; j < 32; ++j)
1458                  {
1459                      tempIn[j] = output[j * 32 + i];
1460                  }
1461  
1462                  Idct32(tempIn, tempOut);
1463                  for (j = 0; j < 32; ++j)
1464                  {
1465                      dest[j * stride + i] = ClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6));
1466                  }
1467              }
1468          }
1469  
1470          public static void Idct32x321Add(ReadOnlySpan<int> input, Span<byte> dest, int stride)
1471          {
1472              int i, j;
1473              long a1;
1474              int output = WrapLow(DctConstRoundShift((short)input[0] * CosPi16_64));
1475  
1476              output = WrapLow(DctConstRoundShift(output * CosPi16_64));
1477              a1 = BitUtils.RoundPowerOfTwo(output, 6);
1478  
1479              for (j = 0; j < 32; ++j)
1480              {
1481                  for (i = 0; i < 32; ++i)
1482                  {
1483                      dest[i] = ClipPixelAdd(dest[i], a1);
1484                  }
1485  
1486                  dest = dest[stride..];
1487              }
1488          }
1489  
1490          [SkipLocalsInit]
1491          public static void HighbdIwht4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1492          {
1493              /* 4-point reversible, orthonormal inverse Walsh-Hadamard in 3.5 adds,
1494                 0.5 shifts per pixel. */
1495              int i;
1496              Span<int> output = stackalloc int[16];
1497              long a1, b1, c1, d1, e1;
1498              ReadOnlySpan<int> ip = input;
1499              Span<int> op = output;
1500  
1501              for (i = 0; i < 4; i++)
1502              {
1503                  a1 = ip[0] >> UnitQuantShift;
1504                  c1 = ip[1] >> UnitQuantShift;
1505                  d1 = ip[2] >> UnitQuantShift;
1506                  b1 = ip[3] >> UnitQuantShift;
1507                  a1 += c1;
1508                  d1 -= b1;
1509                  e1 = (a1 - d1) >> 1;
1510                  b1 = e1 - b1;
1511                  c1 = e1 - c1;
1512                  a1 -= b1;
1513                  d1 += c1;
1514                  op[0] = HighbdWrapLow(a1, bd);
1515                  op[1] = HighbdWrapLow(b1, bd);
1516                  op[2] = HighbdWrapLow(c1, bd);
1517                  op[3] = HighbdWrapLow(d1, bd);
1518                  ip = ip[4..];
1519                  op = op[4..];
1520              }
1521  
1522              ReadOnlySpan<int> ip2 = output;
1523              for (i = 0; i < 4; i++)
1524              {
1525                  a1 = ip2[4 * 0];
1526                  c1 = ip2[4 * 1];
1527                  d1 = ip2[4 * 2];
1528                  b1 = ip2[4 * 3];
1529                  a1 += c1;
1530                  d1 -= b1;
1531                  e1 = (a1 - d1) >> 1;
1532                  b1 = e1 - b1;
1533                  c1 = e1 - c1;
1534                  a1 -= b1;
1535                  d1 += c1;
1536                  dest[stride * 0] = HighbdClipPixelAdd(dest[stride * 0], HighbdWrapLow(a1, bd), bd);
1537                  dest[stride * 1] = HighbdClipPixelAdd(dest[stride * 1], HighbdWrapLow(b1, bd), bd);
1538                  dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], HighbdWrapLow(c1, bd), bd);
1539                  dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], HighbdWrapLow(d1, bd), bd);
1540  
1541                  ip2 = ip2[1..];
1542                  dest = dest[1..];
1543              }
1544          }
1545  
1546          [SkipLocalsInit]
1547          public static void HighbdIwht4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1548          {
1549              int i;
1550              long a1, e1;
1551              Span<int> tmp = stackalloc int[4];
1552              ReadOnlySpan<int> ip = input;
1553              Span<int> op = tmp;
1554  
1555              a1 = ip[0] >> UnitQuantShift;
1556              e1 = a1 >> 1;
1557              a1 -= e1;
1558              op[0] = HighbdWrapLow(a1, bd);
1559              op[1] = op[2] = op[3] = HighbdWrapLow(e1, bd);
1560  
1561              ReadOnlySpan<int> ip2 = tmp;
1562              for (i = 0; i < 4; i++)
1563              {
1564                  e1 = ip2[0] >> 1;
1565                  a1 = ip2[0] - e1;
1566                  dest[stride * 0] = HighbdClipPixelAdd(dest[stride * 0], a1, bd);
1567                  dest[stride * 1] = HighbdClipPixelAdd(dest[stride * 1], e1, bd);
1568                  dest[stride * 2] = HighbdClipPixelAdd(dest[stride * 2], e1, bd);
1569                  dest[stride * 3] = HighbdClipPixelAdd(dest[stride * 3], e1, bd);
1570                  ip2 = ip2[1..];
1571                  dest = dest[1..];
1572              }
1573          }
1574  
1575          public static void HighbdIadst4(ReadOnlySpan<int> input, Span<int> output, int bd)
1576          {
1577              long s0, s1, s2, s3, s4, s5, s6, s7;
1578              int x0 = input[0];
1579              int x1 = input[1];
1580              int x2 = input[2];
1581              int x3 = input[3];
1582  
1583              if (DetectInvalidHighbdInput(input, 4) != 0)
1584              {
1585                  Debug.Assert(false, "invalid highbd txfm input");
1586                  output[..4].Clear();
1587  
1588                  return;
1589              }
1590  
1591              if ((x0 | x1 | x2 | x3) == 0)
1592              {
1593                  output[..4].Clear();
1594  
1595                  return;
1596              }
1597  
1598              s0 = (long)SinPi1_9 * x0;
1599              s1 = (long)SinPi2_9 * x0;
1600              s2 = (long)SinPi3_9 * x1;
1601              s3 = (long)SinPi4_9 * x2;
1602              s4 = (long)SinPi1_9 * x2;
1603              s5 = (long)SinPi2_9 * x3;
1604              s6 = (long)SinPi4_9 * x3;
1605              s7 = HighbdWrapLow(x0 - x2 + x3, bd);
1606  
1607              s0 = s0 + s3 + s5;
1608              s1 = s1 - s4 - s6;
1609              s3 = s2;
1610              s2 = SinPi3_9 * s7;
1611  
1612              // 1-D transform scaling factor is sqrt(2).
1613              // The overall dynamic range is 14b (input) + 14b (multiplication scaling)
1614              // + 1b (addition) = 29b.
1615              // Hence the output bit depth is 15b.
1616              output[0] = HighbdWrapLow(DctConstRoundShift(s0 + s3), bd);
1617              output[1] = HighbdWrapLow(DctConstRoundShift(s1 + s3), bd);
1618              output[2] = HighbdWrapLow(DctConstRoundShift(s2), bd);
1619              output[3] = HighbdWrapLow(DctConstRoundShift(s0 + s1 - s3), bd);
1620          }
1621  
1622          [SkipLocalsInit]
1623          public static void HighbdIdct4(ReadOnlySpan<int> input, Span<int> output, int bd)
1624          {
1625              Span<int> step = stackalloc int[4];
1626              long temp1, temp2;
1627  
1628              if (DetectInvalidHighbdInput(input, 4) != 0)
1629              {
1630                  Debug.Assert(false, "invalid highbd txfm input");
1631                  output[..4].Clear();
1632  
1633                  return;
1634              }
1635  
1636              // stage 1
1637              temp1 = (input[0] + input[2]) * (long)CosPi16_64;
1638              temp2 = (input[0] - input[2]) * (long)CosPi16_64;
1639              step[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
1640              step[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
1641              temp1 = input[1] * (long)CosPi24_64 - input[3] * (long)CosPi8_64;
1642              temp2 = input[1] * (long)CosPi8_64 + input[3] * (long)CosPi24_64;
1643              step[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
1644              step[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
1645  
1646              // stage 2
1647              output[0] = HighbdWrapLow(step[0] + step[3], bd);
1648              output[1] = HighbdWrapLow(step[1] + step[2], bd);
1649              output[2] = HighbdWrapLow(step[1] - step[2], bd);
1650              output[3] = HighbdWrapLow(step[0] - step[3], bd);
1651          }
1652  
1653          [SkipLocalsInit]
1654          public static void HighbdIdct4x416Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1655          {
1656              int i, j;
1657              Span<int> output = stackalloc int[4 * 4];
1658              Span<int> outptr = output;
1659              Span<int> tempIn = stackalloc int[4];
1660              Span<int> tempOut = stackalloc int[4];
1661  
1662              // Rows
1663              for (i = 0; i < 4; ++i)
1664              {
1665                  HighbdIdct4(input, outptr, bd);
1666                  input = input[4..];
1667                  outptr = outptr[4..];
1668              }
1669  
1670              // Columns
1671              for (i = 0; i < 4; ++i)
1672              {
1673                  for (j = 0; j < 4; ++j)
1674                  {
1675                      tempIn[j] = output[j * 4 + i];
1676                  }
1677  
1678                  HighbdIdct4(tempIn, tempOut, bd);
1679                  for (j = 0; j < 4; ++j)
1680                  {
1681                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 4), bd);
1682                  }
1683              }
1684          }
1685  
1686          public static void HighbdIdct4x41Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1687          {
1688              int i;
1689              long a1;
1690              int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
1691  
1692              output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
1693              a1 = BitUtils.RoundPowerOfTwo(output, 4);
1694  
1695              for (i = 0; i < 4; i++)
1696              {
1697                  dest[0] = HighbdClipPixelAdd(dest[0], a1, bd);
1698                  dest[1] = HighbdClipPixelAdd(dest[1], a1, bd);
1699                  dest[2] = HighbdClipPixelAdd(dest[2], a1, bd);
1700                  dest[3] = HighbdClipPixelAdd(dest[3], a1, bd);
1701                  dest = dest[stride..];
1702              }
1703          }
1704  
1705          public static void HighbdIadst8(ReadOnlySpan<int> input, Span<int> output, int bd)
1706          {
1707              long s0, s1, s2, s3, s4, s5, s6, s7;
1708              int x0 = input[7];
1709              int x1 = input[0];
1710              int x2 = input[5];
1711              int x3 = input[2];
1712              int x4 = input[3];
1713              int x5 = input[4];
1714              int x6 = input[1];
1715              int x7 = input[6];
1716  
1717              if (DetectInvalidHighbdInput(input, 8) != 0)
1718              {
1719                  Debug.Assert(false, "invalid highbd txfm input");
1720                  output[..8].Clear();
1721  
1722                  return;
1723              }
1724  
1725              if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7) == 0)
1726              {
1727                  output[..8].Clear();
1728  
1729                  return;
1730              }
1731  
1732              // stage 1
1733              s0 = (long)CosPi2_64 * x0 + (long)CosPi30_64 * x1;
1734              s1 = (long)CosPi30_64 * x0 - (long)CosPi2_64 * x1;
1735              s2 = (long)CosPi10_64 * x2 + (long)CosPi22_64 * x3;
1736              s3 = (long)CosPi22_64 * x2 - (long)CosPi10_64 * x3;
1737              s4 = (long)CosPi18_64 * x4 + (long)CosPi14_64 * x5;
1738              s5 = (long)CosPi14_64 * x4 - (long)CosPi18_64 * x5;
1739              s6 = (long)CosPi26_64 * x6 + (long)CosPi6_64 * x7;
1740              s7 = (long)CosPi6_64 * x6 - (long)CosPi26_64 * x7;
1741  
1742              x0 = HighbdWrapLow(DctConstRoundShift(s0 + s4), bd);
1743              x1 = HighbdWrapLow(DctConstRoundShift(s1 + s5), bd);
1744              x2 = HighbdWrapLow(DctConstRoundShift(s2 + s6), bd);
1745              x3 = HighbdWrapLow(DctConstRoundShift(s3 + s7), bd);
1746              x4 = HighbdWrapLow(DctConstRoundShift(s0 - s4), bd);
1747              x5 = HighbdWrapLow(DctConstRoundShift(s1 - s5), bd);
1748              x6 = HighbdWrapLow(DctConstRoundShift(s2 - s6), bd);
1749              x7 = HighbdWrapLow(DctConstRoundShift(s3 - s7), bd);
1750  
1751              // stage 2
1752              s0 = x0;
1753              s1 = x1;
1754              s2 = x2;
1755              s3 = x3;
1756              s4 = (long)CosPi8_64 * x4 + (long)CosPi24_64 * x5;
1757              s5 = (long)CosPi24_64 * x4 - (long)CosPi8_64 * x5;
1758              s6 = (long)(-CosPi24_64) * x6 + (long)CosPi8_64 * x7;
1759              s7 = (long)CosPi8_64 * x6 + (long)CosPi24_64 * x7;
1760  
1761              x0 = HighbdWrapLow(s0 + s2, bd);
1762              x1 = HighbdWrapLow(s1 + s3, bd);
1763              x2 = HighbdWrapLow(s0 - s2, bd);
1764              x3 = HighbdWrapLow(s1 - s3, bd);
1765              x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
1766              x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
1767              x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
1768              x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
1769  
1770              // stage 3
1771              s2 = (long)CosPi16_64 * (x2 + x3);
1772              s3 = (long)CosPi16_64 * (x2 - x3);
1773              s6 = (long)CosPi16_64 * (x6 + x7);
1774              s7 = (long)CosPi16_64 * (x6 - x7);
1775  
1776              x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
1777              x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
1778              x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
1779              x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
1780  
1781              output[0] = HighbdWrapLow(x0, bd);
1782              output[1] = HighbdWrapLow(-x4, bd);
1783              output[2] = HighbdWrapLow(x6, bd);
1784              output[3] = HighbdWrapLow(-x2, bd);
1785              output[4] = HighbdWrapLow(x3, bd);
1786              output[5] = HighbdWrapLow(-x7, bd);
1787              output[6] = HighbdWrapLow(x5, bd);
1788              output[7] = HighbdWrapLow(-x1, bd);
1789          }
1790  
1791          [SkipLocalsInit]
1792          public static void HighbdIdct8(ReadOnlySpan<int> input, Span<int> output, int bd)
1793          {
1794              Span<int> step1 = stackalloc int[8];
1795              Span<int> step2 = stackalloc int[8];
1796              long temp1, temp2;
1797  
1798              if (DetectInvalidHighbdInput(input, 8) != 0)
1799              {
1800                  Debug.Assert(false, "invalid highbd txfm input");
1801                  output[..8].Clear();
1802  
1803                  return;
1804              }
1805  
1806              // stage 1
1807              step1[0] = input[0];
1808              step1[2] = input[4];
1809              step1[1] = input[2];
1810              step1[3] = input[6];
1811              temp1 = input[1] * (long)CosPi28_64 - input[7] * (long)CosPi4_64;
1812              temp2 = input[1] * (long)CosPi4_64 + input[7] * (long)CosPi28_64;
1813              step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
1814              step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
1815              temp1 = input[5] * (long)CosPi12_64 - input[3] * (long)CosPi20_64;
1816              temp2 = input[5] * (long)CosPi20_64 + input[3] * (long)CosPi12_64;
1817              step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
1818              step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
1819  
1820              // stage 2 & stage 3 - even half
1821              HighbdIdct4(step1, step1, bd);
1822  
1823              // stage 2 - odd half
1824              step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
1825              step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
1826              step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
1827              step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
1828  
1829              // stage 3 - odd half
1830              step1[4] = step2[4];
1831              temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
1832              temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
1833              step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
1834              step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
1835              step1[7] = step2[7];
1836  
1837              // stage 4
1838              output[0] = HighbdWrapLow(step1[0] + step1[7], bd);
1839              output[1] = HighbdWrapLow(step1[1] + step1[6], bd);
1840              output[2] = HighbdWrapLow(step1[2] + step1[5], bd);
1841              output[3] = HighbdWrapLow(step1[3] + step1[4], bd);
1842              output[4] = HighbdWrapLow(step1[3] - step1[4], bd);
1843              output[5] = HighbdWrapLow(step1[2] - step1[5], bd);
1844              output[6] = HighbdWrapLow(step1[1] - step1[6], bd);
1845              output[7] = HighbdWrapLow(step1[0] - step1[7], bd);
1846          }
1847  
1848          [SkipLocalsInit]
1849          public static void HighbdIdct8x864Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1850          {
1851              int i, j;
1852              Span<int> output = stackalloc int[8 * 8];
1853              Span<int> outptr = output;
1854              Span<int> tempIn = stackalloc int[8];
1855              Span<int> tempOut = stackalloc int[8];
1856  
1857              // First transform rows
1858              for (i = 0; i < 8; ++i)
1859              {
1860                  HighbdIdct8(input, outptr, bd);
1861                  input = input[8..];
1862                  outptr = outptr[8..];
1863              }
1864  
1865              // Then transform columns
1866              for (i = 0; i < 8; ++i)
1867              {
1868                  for (j = 0; j < 8; ++j)
1869                  {
1870                      tempIn[j] = output[j * 8 + i];
1871                  }
1872  
1873                  HighbdIdct8(tempIn, tempOut, bd);
1874                  for (j = 0; j < 8; ++j)
1875                  {
1876                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
1877                  }
1878              }
1879          }
1880  
1881          [SkipLocalsInit]
1882          public static void HighbdIdct8x812Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1883          {
1884              int i, j;
1885              Span<int> output = stackalloc int[8 * 8];
1886              Span<int> outptr = output;
1887              Span<int> tempIn = stackalloc int[8];
1888              Span<int> tempOut = stackalloc int[8];
1889  
1890              output.Clear();
1891  
1892              // First transform rows
1893              // Only first 4 row has non-zero coefs
1894              for (i = 0; i < 4; ++i)
1895              {
1896                  HighbdIdct8(input, outptr, bd);
1897                  input = input[8..];
1898                  outptr = outptr[8..];
1899              }
1900  
1901              // Then transform columns
1902              for (i = 0; i < 8; ++i)
1903              {
1904                  for (j = 0; j < 8; ++j)
1905                  {
1906                      tempIn[j] = output[j * 8 + i];
1907                  }
1908  
1909                  HighbdIdct8(tempIn, tempOut, bd);
1910                  for (j = 0; j < 8; ++j)
1911                  {
1912                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 5), bd);
1913                  }
1914              }
1915          }
1916  
1917          public static void Vpx_Highbdidct8x8_1_add_c(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
1918          {
1919              int i, j;
1920              long a1;
1921              int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
1922  
1923              output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
1924              a1 = BitUtils.RoundPowerOfTwo(output, 5);
1925              for (j = 0; j < 8; ++j)
1926              {
1927                  for (i = 0; i < 8; ++i)
1928                  {
1929                      dest[i] = HighbdClipPixelAdd(dest[i], a1, bd);
1930                  }
1931  
1932                  dest = dest[stride..];
1933              }
1934          }
1935  
1936          public static void HighbdIadst16(ReadOnlySpan<int> input, Span<int> output, int bd)
1937          {
1938              long s0, s1, s2, s3, s4, s5, s6, s7, s8;
1939              long s9, s10, s11, s12, s13, s14, s15;
1940              int x0 = input[15];
1941              int x1 = input[0];
1942              int x2 = input[13];
1943              int x3 = input[2];
1944              int x4 = input[11];
1945              int x5 = input[4];
1946              int x6 = input[9];
1947              int x7 = input[6];
1948              int x8 = input[7];
1949              int x9 = input[8];
1950              int x10 = input[5];
1951              int x11 = input[10];
1952              int x12 = input[3];
1953              int x13 = input[12];
1954              int x14 = input[1];
1955              int x15 = input[14];
1956  
1957              if (DetectInvalidHighbdInput(input, 16) != 0)
1958              {
1959                  Debug.Assert(false, "invalid highbd txfm input");
1960                  output[..16].Clear();
1961  
1962                  return;
1963              }
1964  
1965              if ((x0 | x1 | x2 | x3 | x4 | x5 | x6 | x7 | x8 | x9 | x10 | x11 | x12 | x13 | x14 | x15) == 0)
1966              {
1967                  output[..16].Clear();
1968  
1969                  return;
1970              }
1971  
1972              // stage 1
1973              s0 = x0 * (long)CosPi1_64 + x1 * (long)CosPi31_64;
1974              s1 = x0 * (long)CosPi31_64 - x1 * (long)CosPi1_64;
1975              s2 = x2 * (long)CosPi5_64 + x3 * (long)CosPi27_64;
1976              s3 = x2 * (long)CosPi27_64 - x3 * (long)CosPi5_64;
1977              s4 = x4 * (long)CosPi9_64 + x5 * (long)CosPi23_64;
1978              s5 = x4 * (long)CosPi23_64 - x5 * (long)CosPi9_64;
1979              s6 = x6 * (long)CosPi13_64 + x7 * (long)CosPi19_64;
1980              s7 = x6 * (long)CosPi19_64 - x7 * (long)CosPi13_64;
1981              s8 = x8 * (long)CosPi17_64 + x9 * (long)CosPi15_64;
1982              s9 = x8 * (long)CosPi15_64 - x9 * (long)CosPi17_64;
1983              s10 = x10 * (long)CosPi21_64 + x11 * (long)CosPi11_64;
1984              s11 = x10 * (long)CosPi11_64 - x11 * (long)CosPi21_64;
1985              s12 = x12 * (long)CosPi25_64 + x13 * (long)CosPi7_64;
1986              s13 = x12 * (long)CosPi7_64 - x13 * (long)CosPi25_64;
1987              s14 = x14 * (long)CosPi29_64 + x15 * (long)CosPi3_64;
1988              s15 = x14 * (long)CosPi3_64 - x15 * (long)CosPi29_64;
1989  
1990              x0 = HighbdWrapLow(DctConstRoundShift(s0 + s8), bd);
1991              x1 = HighbdWrapLow(DctConstRoundShift(s1 + s9), bd);
1992              x2 = HighbdWrapLow(DctConstRoundShift(s2 + s10), bd);
1993              x3 = HighbdWrapLow(DctConstRoundShift(s3 + s11), bd);
1994              x4 = HighbdWrapLow(DctConstRoundShift(s4 + s12), bd);
1995              x5 = HighbdWrapLow(DctConstRoundShift(s5 + s13), bd);
1996              x6 = HighbdWrapLow(DctConstRoundShift(s6 + s14), bd);
1997              x7 = HighbdWrapLow(DctConstRoundShift(s7 + s15), bd);
1998              x8 = HighbdWrapLow(DctConstRoundShift(s0 - s8), bd);
1999              x9 = HighbdWrapLow(DctConstRoundShift(s1 - s9), bd);
2000              x10 = HighbdWrapLow(DctConstRoundShift(s2 - s10), bd);
2001              x11 = HighbdWrapLow(DctConstRoundShift(s3 - s11), bd);
2002              x12 = HighbdWrapLow(DctConstRoundShift(s4 - s12), bd);
2003              x13 = HighbdWrapLow(DctConstRoundShift(s5 - s13), bd);
2004              x14 = HighbdWrapLow(DctConstRoundShift(s6 - s14), bd);
2005              x15 = HighbdWrapLow(DctConstRoundShift(s7 - s15), bd);
2006  
2007              // stage 2
2008              s0 = x0;
2009              s1 = x1;
2010              s2 = x2;
2011              s3 = x3;
2012              s4 = x4;
2013              s5 = x5;
2014              s6 = x6;
2015              s7 = x7;
2016              s8 = x8 * (long)CosPi4_64 + x9 * (long)CosPi28_64;
2017              s9 = x8 * (long)CosPi28_64 - x9 * (long)CosPi4_64;
2018              s10 = x10 * (long)CosPi20_64 + x11 * (long)CosPi12_64;
2019              s11 = x10 * (long)CosPi12_64 - x11 * (long)CosPi20_64;
2020              s12 = -x12 * (long)CosPi28_64 + x13 * (long)CosPi4_64;
2021              s13 = x12 * (long)CosPi4_64 + x13 * (long)CosPi28_64;
2022              s14 = -x14 * (long)CosPi12_64 + x15 * (long)CosPi20_64;
2023              s15 = x14 * (long)CosPi20_64 + x15 * (long)CosPi12_64;
2024  
2025              x0 = HighbdWrapLow(s0 + s4, bd);
2026              x1 = HighbdWrapLow(s1 + s5, bd);
2027              x2 = HighbdWrapLow(s2 + s6, bd);
2028              x3 = HighbdWrapLow(s3 + s7, bd);
2029              x4 = HighbdWrapLow(s0 - s4, bd);
2030              x5 = HighbdWrapLow(s1 - s5, bd);
2031              x6 = HighbdWrapLow(s2 - s6, bd);
2032              x7 = HighbdWrapLow(s3 - s7, bd);
2033              x8 = HighbdWrapLow(DctConstRoundShift(s8 + s12), bd);
2034              x9 = HighbdWrapLow(DctConstRoundShift(s9 + s13), bd);
2035              x10 = HighbdWrapLow(DctConstRoundShift(s10 + s14), bd);
2036              x11 = HighbdWrapLow(DctConstRoundShift(s11 + s15), bd);
2037              x12 = HighbdWrapLow(DctConstRoundShift(s8 - s12), bd);
2038              x13 = HighbdWrapLow(DctConstRoundShift(s9 - s13), bd);
2039              x14 = HighbdWrapLow(DctConstRoundShift(s10 - s14), bd);
2040              x15 = HighbdWrapLow(DctConstRoundShift(s11 - s15), bd);
2041  
2042              // stage 3
2043              s0 = x0;
2044              s1 = x1;
2045              s2 = x2;
2046              s3 = x3;
2047              s4 = x4 * (long)CosPi8_64 + x5 * (long)CosPi24_64;
2048              s5 = x4 * (long)CosPi24_64 - x5 * (long)CosPi8_64;
2049              s6 = -x6 * (long)CosPi24_64 + x7 * (long)CosPi8_64;
2050              s7 = x6 * (long)CosPi8_64 + x7 * (long)CosPi24_64;
2051              s8 = x8;
2052              s9 = x9;
2053              s10 = x10;
2054              s11 = x11;
2055              s12 = x12 * (long)CosPi8_64 + x13 * (long)CosPi24_64;
2056              s13 = x12 * (long)CosPi24_64 - x13 * (long)CosPi8_64;
2057              s14 = -x14 * (long)CosPi24_64 + x15 * (long)CosPi8_64;
2058              s15 = x14 * (long)CosPi8_64 + x15 * (long)CosPi24_64;
2059  
2060              x0 = HighbdWrapLow(s0 + s2, bd);
2061              x1 = HighbdWrapLow(s1 + s3, bd);
2062              x2 = HighbdWrapLow(s0 - s2, bd);
2063              x3 = HighbdWrapLow(s1 - s3, bd);
2064              x4 = HighbdWrapLow(DctConstRoundShift(s4 + s6), bd);
2065              x5 = HighbdWrapLow(DctConstRoundShift(s5 + s7), bd);
2066              x6 = HighbdWrapLow(DctConstRoundShift(s4 - s6), bd);
2067              x7 = HighbdWrapLow(DctConstRoundShift(s5 - s7), bd);
2068              x8 = HighbdWrapLow(s8 + s10, bd);
2069              x9 = HighbdWrapLow(s9 + s11, bd);
2070              x10 = HighbdWrapLow(s8 - s10, bd);
2071              x11 = HighbdWrapLow(s9 - s11, bd);
2072              x12 = HighbdWrapLow(DctConstRoundShift(s12 + s14), bd);
2073              x13 = HighbdWrapLow(DctConstRoundShift(s13 + s15), bd);
2074              x14 = HighbdWrapLow(DctConstRoundShift(s12 - s14), bd);
2075              x15 = HighbdWrapLow(DctConstRoundShift(s13 - s15), bd);
2076  
2077              // stage 4
2078              s2 = (long)(-CosPi16_64) * (x2 + x3);
2079              s3 = (long)CosPi16_64 * (x2 - x3);
2080              s6 = (long)CosPi16_64 * (x6 + x7);
2081              s7 = (long)CosPi16_64 * (-x6 + x7);
2082              s10 = (long)CosPi16_64 * (x10 + x11);
2083              s11 = (long)CosPi16_64 * (-x10 + x11);
2084              s14 = (long)(-CosPi16_64) * (x14 + x15);
2085              s15 = (long)CosPi16_64 * (x14 - x15);
2086  
2087              x2 = HighbdWrapLow(DctConstRoundShift(s2), bd);
2088              x3 = HighbdWrapLow(DctConstRoundShift(s3), bd);
2089              x6 = HighbdWrapLow(DctConstRoundShift(s6), bd);
2090              x7 = HighbdWrapLow(DctConstRoundShift(s7), bd);
2091              x10 = HighbdWrapLow(DctConstRoundShift(s10), bd);
2092              x11 = HighbdWrapLow(DctConstRoundShift(s11), bd);
2093              x14 = HighbdWrapLow(DctConstRoundShift(s14), bd);
2094              x15 = HighbdWrapLow(DctConstRoundShift(s15), bd);
2095  
2096              output[0] = HighbdWrapLow(x0, bd);
2097              output[1] = HighbdWrapLow(-x8, bd);
2098              output[2] = HighbdWrapLow(x12, bd);
2099              output[3] = HighbdWrapLow(-x4, bd);
2100              output[4] = HighbdWrapLow(x6, bd);
2101              output[5] = HighbdWrapLow(x14, bd);
2102              output[6] = HighbdWrapLow(x10, bd);
2103              output[7] = HighbdWrapLow(x2, bd);
2104              output[8] = HighbdWrapLow(x3, bd);
2105              output[9] = HighbdWrapLow(x11, bd);
2106              output[10] = HighbdWrapLow(x15, bd);
2107              output[11] = HighbdWrapLow(x7, bd);
2108              output[12] = HighbdWrapLow(x5, bd);
2109              output[13] = HighbdWrapLow(-x13, bd);
2110              output[14] = HighbdWrapLow(x9, bd);
2111              output[15] = HighbdWrapLow(-x1, bd);
2112          }
2113  
2114          [SkipLocalsInit]
2115          public static void HighbdIdct16(ReadOnlySpan<int> input, Span<int> output, int bd)
2116          {
2117              Span<int> step1 = stackalloc int[16];
2118              Span<int> step2 = stackalloc int[16];
2119              long temp1, temp2;
2120  
2121              if (DetectInvalidHighbdInput(input, 16) != 0)
2122              {
2123                  Debug.Assert(false, "invalid highbd txfm input");
2124                  output[..16].Clear();
2125  
2126                  return;
2127              }
2128  
2129              // stage 1
2130              step1[0] = input[0 / 2];
2131              step1[1] = input[16 / 2];
2132              step1[2] = input[8 / 2];
2133              step1[3] = input[24 / 2];
2134              step1[4] = input[4 / 2];
2135              step1[5] = input[20 / 2];
2136              step1[6] = input[12 / 2];
2137              step1[7] = input[28 / 2];
2138              step1[8] = input[2 / 2];
2139              step1[9] = input[18 / 2];
2140              step1[10] = input[10 / 2];
2141              step1[11] = input[26 / 2];
2142              step1[12] = input[6 / 2];
2143              step1[13] = input[22 / 2];
2144              step1[14] = input[14 / 2];
2145              step1[15] = input[30 / 2];
2146  
2147              // stage 2
2148              step2[0] = step1[0];
2149              step2[1] = step1[1];
2150              step2[2] = step1[2];
2151              step2[3] = step1[3];
2152              step2[4] = step1[4];
2153              step2[5] = step1[5];
2154              step2[6] = step1[6];
2155              step2[7] = step1[7];
2156  
2157              temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
2158              temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
2159              step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2160              step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2161  
2162              temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
2163              temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
2164              step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2165              step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2166  
2167              temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
2168              temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
2169              step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2170              step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2171  
2172              temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
2173              temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
2174              step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2175              step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2176  
2177              // stage 3
2178              step1[0] = step2[0];
2179              step1[1] = step2[1];
2180              step1[2] = step2[2];
2181              step1[3] = step2[3];
2182  
2183              temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
2184              temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
2185              step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2186              step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2187              temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
2188              temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
2189              step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2190              step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2191  
2192              step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
2193              step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
2194              step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
2195              step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
2196              step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
2197              step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
2198              step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
2199              step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
2200  
2201              // stage 4
2202              temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
2203              temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
2204              step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2205              step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2206              temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
2207              temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
2208              step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2209              step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2210              step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
2211              step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
2212              step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
2213              step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
2214  
2215              step2[8] = step1[8];
2216              step2[15] = step1[15];
2217              temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
2218              temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
2219              step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2220              step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2221              temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
2222              temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
2223              step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2224              step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2225              step2[11] = step1[11];
2226              step2[12] = step1[12];
2227  
2228              // stage 5
2229              step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
2230              step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
2231              step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
2232              step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
2233              step1[4] = step2[4];
2234              temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
2235              temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
2236              step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2237              step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2238              step1[7] = step2[7];
2239  
2240              step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
2241              step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
2242              step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
2243              step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
2244              step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
2245              step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
2246              step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
2247              step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
2248  
2249              // stage 6
2250              step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
2251              step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
2252              step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
2253              step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
2254              step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
2255              step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
2256              step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
2257              step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
2258              step2[8] = step1[8];
2259              step2[9] = step1[9];
2260              temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
2261              temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
2262              step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2263              step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2264              temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
2265              temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
2266              step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2267              step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2268              step2[14] = step1[14];
2269              step2[15] = step1[15];
2270  
2271              // stage 7
2272              output[0] = HighbdWrapLow(step2[0] + step2[15], bd);
2273              output[1] = HighbdWrapLow(step2[1] + step2[14], bd);
2274              output[2] = HighbdWrapLow(step2[2] + step2[13], bd);
2275              output[3] = HighbdWrapLow(step2[3] + step2[12], bd);
2276              output[4] = HighbdWrapLow(step2[4] + step2[11], bd);
2277              output[5] = HighbdWrapLow(step2[5] + step2[10], bd);
2278              output[6] = HighbdWrapLow(step2[6] + step2[9], bd);
2279              output[7] = HighbdWrapLow(step2[7] + step2[8], bd);
2280              output[8] = HighbdWrapLow(step2[7] - step2[8], bd);
2281              output[9] = HighbdWrapLow(step2[6] - step2[9], bd);
2282              output[10] = HighbdWrapLow(step2[5] - step2[10], bd);
2283              output[11] = HighbdWrapLow(step2[4] - step2[11], bd);
2284              output[12] = HighbdWrapLow(step2[3] - step2[12], bd);
2285              output[13] = HighbdWrapLow(step2[2] - step2[13], bd);
2286              output[14] = HighbdWrapLow(step2[1] - step2[14], bd);
2287              output[15] = HighbdWrapLow(step2[0] - step2[15], bd);
2288          }
2289  
2290          [SkipLocalsInit]
2291          public static void HighbdIdct16x16256Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2292          {
2293              int i, j;
2294              Span<int> output = stackalloc int[16 * 16];
2295              Span<int> outptr = output;
2296              Span<int> tempIn = stackalloc int[16];
2297              Span<int> tempOut = stackalloc int[16];
2298  
2299              // First transform rows
2300              for (i = 0; i < 16; ++i)
2301              {
2302                  HighbdIdct16(input, outptr, bd);
2303                  input = input[16..];
2304                  outptr = outptr[16..];
2305              }
2306  
2307              // Then transform columns
2308              for (i = 0; i < 16; ++i)
2309              {
2310                  for (j = 0; j < 16; ++j)
2311                  {
2312                      tempIn[j] = output[j * 16 + i];
2313                  }
2314  
2315                  HighbdIdct16(tempIn, tempOut, bd);
2316                  for (j = 0; j < 16; ++j)
2317                  {
2318                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
2319                  }
2320              }
2321          }
2322  
2323          [SkipLocalsInit]
2324          public static void HighbdIdct16x1638Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2325          {
2326              int i, j;
2327              Span<int> output = stackalloc int[16 * 16];
2328              Span<int> outptr = output;
2329              Span<int> tempIn = stackalloc int[16];
2330              Span<int> tempOut = stackalloc int[16];
2331  
2332              output.Clear();
2333  
2334              // First transform rows. Since all non-zero dct coefficients are in
2335              // upper-left 8x8 area, we only need to calculate first 8 rows here.
2336              for (i = 0; i < 8; ++i)
2337              {
2338                  HighbdIdct16(input, outptr, bd);
2339                  input = input[16..];
2340                  outptr = outptr[16..];
2341              }
2342  
2343              // Then transform columns
2344              for (i = 0; i < 16; ++i)
2345              {
2346                  Span<ushort> destT = dest;
2347                  for (j = 0; j < 16; ++j)
2348                  {
2349                      tempIn[j] = output[j * 16 + i];
2350                  }
2351  
2352                  HighbdIdct16(tempIn, tempOut, bd);
2353                  for (j = 0; j < 16; ++j)
2354                  {
2355                      destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
2356                      destT = destT[stride..];
2357                  }
2358              }
2359          }
2360  
2361          [SkipLocalsInit]
2362          public static void HighbdIdct16x1610Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2363          {
2364              int i, j;
2365              Span<int> output = stackalloc int[16 * 16];
2366              Span<int> outptr = output;
2367              Span<int> tempIn = stackalloc int[16];
2368              Span<int> tempOut = stackalloc int[16];
2369  
2370              output.Clear();
2371  
2372              // First transform rows. Since all non-zero dct coefficients are in
2373              // upper-left 4x4 area, we only need to calculate first 4 rows here.
2374              for (i = 0; i < 4; ++i)
2375              {
2376                  HighbdIdct16(input, outptr, bd);
2377                  input = input[16..];
2378                  outptr = outptr[16..];
2379              }
2380  
2381              // Then transform columns
2382              for (i = 0; i < 16; ++i)
2383              {
2384                  for (j = 0; j < 16; ++j)
2385                  {
2386                      tempIn[j] = output[j * 16 + i];
2387                  }
2388  
2389                  HighbdIdct16(tempIn, tempOut, bd);
2390                  for (j = 0; j < 16; ++j)
2391                  {
2392                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
2393                  }
2394              }
2395          }
2396  
2397          public static void HighbdIdct16x161Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2398          {
2399              int i, j;
2400              long a1;
2401              int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
2402  
2403              output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
2404              a1 = BitUtils.RoundPowerOfTwo(output, 6);
2405              for (j = 0; j < 16; ++j)
2406              {
2407                  for (i = 0; i < 16; ++i)
2408                  {
2409                      dest[i] = HighbdClipPixelAdd(dest[i], a1, bd);
2410                  }
2411  
2412                  dest = dest[stride..];
2413              }
2414          }
2415  
2416          [SkipLocalsInit]
2417          public static void HighbdIdct32(ReadOnlySpan<int> input, Span<int> output, int bd)
2418          {
2419              Span<int> step1 = stackalloc int[32];
2420              Span<int> step2 = stackalloc int[32];
2421              long temp1, temp2;
2422  
2423              if (DetectInvalidHighbdInput(input, 32) != 0)
2424              {
2425                  Debug.Assert(false, "invalid highbd txfm input");
2426                  output[..32].Clear();
2427  
2428                  return;
2429              }
2430  
2431              // stage 1
2432              step1[0] = input[0];
2433              step1[1] = input[16];
2434              step1[2] = input[8];
2435              step1[3] = input[24];
2436              step1[4] = input[4];
2437              step1[5] = input[20];
2438              step1[6] = input[12];
2439              step1[7] = input[28];
2440              step1[8] = input[2];
2441              step1[9] = input[18];
2442              step1[10] = input[10];
2443              step1[11] = input[26];
2444              step1[12] = input[6];
2445              step1[13] = input[22];
2446              step1[14] = input[14];
2447              step1[15] = input[30];
2448  
2449              temp1 = input[1] * (long)CosPi31_64 - input[31] * (long)CosPi1_64;
2450              temp2 = input[1] * (long)CosPi1_64 + input[31] * (long)CosPi31_64;
2451              step1[16] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2452              step1[31] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2453  
2454              temp1 = input[17] * (long)CosPi15_64 - input[15] * (long)CosPi17_64;
2455              temp2 = input[17] * (long)CosPi17_64 + input[15] * (long)CosPi15_64;
2456              step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2457              step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2458  
2459              temp1 = input[9] * (long)CosPi23_64 - input[23] * (long)CosPi9_64;
2460              temp2 = input[9] * (long)CosPi9_64 + input[23] * (long)CosPi23_64;
2461              step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2462              step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2463  
2464              temp1 = input[25] * (long)CosPi7_64 - input[7] * (long)CosPi25_64;
2465              temp2 = input[25] * (long)CosPi25_64 + input[7] * (long)CosPi7_64;
2466              step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2467              step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2468  
2469              temp1 = input[5] * (long)CosPi27_64 - input[27] * (long)CosPi5_64;
2470              temp2 = input[5] * (long)CosPi5_64 + input[27] * (long)CosPi27_64;
2471              step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2472              step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2473  
2474              temp1 = input[21] * (long)CosPi11_64 - input[11] * (long)CosPi21_64;
2475              temp2 = input[21] * (long)CosPi21_64 + input[11] * (long)CosPi11_64;
2476              step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2477              step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2478  
2479              temp1 = input[13] * (long)CosPi19_64 - input[19] * (long)CosPi13_64;
2480              temp2 = input[13] * (long)CosPi13_64 + input[19] * (long)CosPi19_64;
2481              step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2482              step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2483  
2484              temp1 = input[29] * (long)CosPi3_64 - input[3] * (long)CosPi29_64;
2485              temp2 = input[29] * (long)CosPi29_64 + input[3] * (long)CosPi3_64;
2486              step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2487              step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2488  
2489              // stage 2
2490              step2[0] = step1[0];
2491              step2[1] = step1[1];
2492              step2[2] = step1[2];
2493              step2[3] = step1[3];
2494              step2[4] = step1[4];
2495              step2[5] = step1[5];
2496              step2[6] = step1[6];
2497              step2[7] = step1[7];
2498  
2499              temp1 = step1[8] * (long)CosPi30_64 - step1[15] * (long)CosPi2_64;
2500              temp2 = step1[8] * (long)CosPi2_64 + step1[15] * (long)CosPi30_64;
2501              step2[8] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2502              step2[15] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2503  
2504              temp1 = step1[9] * (long)CosPi14_64 - step1[14] * (long)CosPi18_64;
2505              temp2 = step1[9] * (long)CosPi18_64 + step1[14] * (long)CosPi14_64;
2506              step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2507              step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2508  
2509              temp1 = step1[10] * (long)CosPi22_64 - step1[13] * (long)CosPi10_64;
2510              temp2 = step1[10] * (long)CosPi10_64 + step1[13] * (long)CosPi22_64;
2511              step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2512              step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2513  
2514              temp1 = step1[11] * (long)CosPi6_64 - step1[12] * (long)CosPi26_64;
2515              temp2 = step1[11] * (long)CosPi26_64 + step1[12] * (long)CosPi6_64;
2516              step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2517              step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2518  
2519              step2[16] = HighbdWrapLow(step1[16] + step1[17], bd);
2520              step2[17] = HighbdWrapLow(step1[16] - step1[17], bd);
2521              step2[18] = HighbdWrapLow(-step1[18] + step1[19], bd);
2522              step2[19] = HighbdWrapLow(step1[18] + step1[19], bd);
2523              step2[20] = HighbdWrapLow(step1[20] + step1[21], bd);
2524              step2[21] = HighbdWrapLow(step1[20] - step1[21], bd);
2525              step2[22] = HighbdWrapLow(-step1[22] + step1[23], bd);
2526              step2[23] = HighbdWrapLow(step1[22] + step1[23], bd);
2527              step2[24] = HighbdWrapLow(step1[24] + step1[25], bd);
2528              step2[25] = HighbdWrapLow(step1[24] - step1[25], bd);
2529              step2[26] = HighbdWrapLow(-step1[26] + step1[27], bd);
2530              step2[27] = HighbdWrapLow(step1[26] + step1[27], bd);
2531              step2[28] = HighbdWrapLow(step1[28] + step1[29], bd);
2532              step2[29] = HighbdWrapLow(step1[28] - step1[29], bd);
2533              step2[30] = HighbdWrapLow(-step1[30] + step1[31], bd);
2534              step2[31] = HighbdWrapLow(step1[30] + step1[31], bd);
2535  
2536              // stage 3
2537              step1[0] = step2[0];
2538              step1[1] = step2[1];
2539              step1[2] = step2[2];
2540              step1[3] = step2[3];
2541  
2542              temp1 = step2[4] * (long)CosPi28_64 - step2[7] * (long)CosPi4_64;
2543              temp2 = step2[4] * (long)CosPi4_64 + step2[7] * (long)CosPi28_64;
2544              step1[4] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2545              step1[7] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2546              temp1 = step2[5] * (long)CosPi12_64 - step2[6] * (long)CosPi20_64;
2547              temp2 = step2[5] * (long)CosPi20_64 + step2[6] * (long)CosPi12_64;
2548              step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2549              step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2550  
2551              step1[8] = HighbdWrapLow(step2[8] + step2[9], bd);
2552              step1[9] = HighbdWrapLow(step2[8] - step2[9], bd);
2553              step1[10] = HighbdWrapLow(-step2[10] + step2[11], bd);
2554              step1[11] = HighbdWrapLow(step2[10] + step2[11], bd);
2555              step1[12] = HighbdWrapLow(step2[12] + step2[13], bd);
2556              step1[13] = HighbdWrapLow(step2[12] - step2[13], bd);
2557              step1[14] = HighbdWrapLow(-step2[14] + step2[15], bd);
2558              step1[15] = HighbdWrapLow(step2[14] + step2[15], bd);
2559  
2560              step1[16] = step2[16];
2561              step1[31] = step2[31];
2562              temp1 = -step2[17] * (long)CosPi4_64 + step2[30] * (long)CosPi28_64;
2563              temp2 = step2[17] * (long)CosPi28_64 + step2[30] * (long)CosPi4_64;
2564              step1[17] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2565              step1[30] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2566              temp1 = -step2[18] * (long)CosPi28_64 - step2[29] * (long)CosPi4_64;
2567              temp2 = -step2[18] * (long)CosPi4_64 + step2[29] * (long)CosPi28_64;
2568              step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2569              step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2570              step1[19] = step2[19];
2571              step1[20] = step2[20];
2572              temp1 = -step2[21] * (long)CosPi20_64 + step2[26] * (long)CosPi12_64;
2573              temp2 = step2[21] * (long)CosPi12_64 + step2[26] * (long)CosPi20_64;
2574              step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2575              step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2576              temp1 = -step2[22] * (long)CosPi12_64 - step2[25] * (long)CosPi20_64;
2577              temp2 = -step2[22] * (long)CosPi20_64 + step2[25] * (long)CosPi12_64;
2578              step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2579              step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2580              step1[23] = step2[23];
2581              step1[24] = step2[24];
2582              step1[27] = step2[27];
2583              step1[28] = step2[28];
2584  
2585              // stage 4
2586              temp1 = (step1[0] + step1[1]) * (long)CosPi16_64;
2587              temp2 = (step1[0] - step1[1]) * (long)CosPi16_64;
2588              step2[0] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2589              step2[1] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2590              temp1 = step1[2] * (long)CosPi24_64 - step1[3] * (long)CosPi8_64;
2591              temp2 = step1[2] * (long)CosPi8_64 + step1[3] * (long)CosPi24_64;
2592              step2[2] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2593              step2[3] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2594              step2[4] = HighbdWrapLow(step1[4] + step1[5], bd);
2595              step2[5] = HighbdWrapLow(step1[4] - step1[5], bd);
2596              step2[6] = HighbdWrapLow(-step1[6] + step1[7], bd);
2597              step2[7] = HighbdWrapLow(step1[6] + step1[7], bd);
2598  
2599              step2[8] = step1[8];
2600              step2[15] = step1[15];
2601              temp1 = -step1[9] * (long)CosPi8_64 + step1[14] * (long)CosPi24_64;
2602              temp2 = step1[9] * (long)CosPi24_64 + step1[14] * (long)CosPi8_64;
2603              step2[9] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2604              step2[14] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2605              temp1 = -step1[10] * (long)CosPi24_64 - step1[13] * (long)CosPi8_64;
2606              temp2 = -step1[10] * (long)CosPi8_64 + step1[13] * (long)CosPi24_64;
2607              step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2608              step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2609              step2[11] = step1[11];
2610              step2[12] = step1[12];
2611  
2612              step2[16] = HighbdWrapLow(step1[16] + step1[19], bd);
2613              step2[17] = HighbdWrapLow(step1[17] + step1[18], bd);
2614              step2[18] = HighbdWrapLow(step1[17] - step1[18], bd);
2615              step2[19] = HighbdWrapLow(step1[16] - step1[19], bd);
2616              step2[20] = HighbdWrapLow(-step1[20] + step1[23], bd);
2617              step2[21] = HighbdWrapLow(-step1[21] + step1[22], bd);
2618              step2[22] = HighbdWrapLow(step1[21] + step1[22], bd);
2619              step2[23] = HighbdWrapLow(step1[20] + step1[23], bd);
2620  
2621              step2[24] = HighbdWrapLow(step1[24] + step1[27], bd);
2622              step2[25] = HighbdWrapLow(step1[25] + step1[26], bd);
2623              step2[26] = HighbdWrapLow(step1[25] - step1[26], bd);
2624              step2[27] = HighbdWrapLow(step1[24] - step1[27], bd);
2625              step2[28] = HighbdWrapLow(-step1[28] + step1[31], bd);
2626              step2[29] = HighbdWrapLow(-step1[29] + step1[30], bd);
2627              step2[30] = HighbdWrapLow(step1[29] + step1[30], bd);
2628              step2[31] = HighbdWrapLow(step1[28] + step1[31], bd);
2629  
2630              // stage 5
2631              step1[0] = HighbdWrapLow(step2[0] + step2[3], bd);
2632              step1[1] = HighbdWrapLow(step2[1] + step2[2], bd);
2633              step1[2] = HighbdWrapLow(step2[1] - step2[2], bd);
2634              step1[3] = HighbdWrapLow(step2[0] - step2[3], bd);
2635              step1[4] = step2[4];
2636              temp1 = (step2[6] - step2[5]) * (long)CosPi16_64;
2637              temp2 = (step2[5] + step2[6]) * (long)CosPi16_64;
2638              step1[5] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2639              step1[6] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2640              step1[7] = step2[7];
2641  
2642              step1[8] = HighbdWrapLow(step2[8] + step2[11], bd);
2643              step1[9] = HighbdWrapLow(step2[9] + step2[10], bd);
2644              step1[10] = HighbdWrapLow(step2[9] - step2[10], bd);
2645              step1[11] = HighbdWrapLow(step2[8] - step2[11], bd);
2646              step1[12] = HighbdWrapLow(-step2[12] + step2[15], bd);
2647              step1[13] = HighbdWrapLow(-step2[13] + step2[14], bd);
2648              step1[14] = HighbdWrapLow(step2[13] + step2[14], bd);
2649              step1[15] = HighbdWrapLow(step2[12] + step2[15], bd);
2650  
2651              step1[16] = step2[16];
2652              step1[17] = step2[17];
2653              temp1 = -step2[18] * (long)CosPi8_64 + step2[29] * (long)CosPi24_64;
2654              temp2 = step2[18] * (long)CosPi24_64 + step2[29] * (long)CosPi8_64;
2655              step1[18] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2656              step1[29] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2657              temp1 = -step2[19] * (long)CosPi8_64 + step2[28] * (long)CosPi24_64;
2658              temp2 = step2[19] * (long)CosPi24_64 + step2[28] * (long)CosPi8_64;
2659              step1[19] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2660              step1[28] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2661              temp1 = -step2[20] * (long)CosPi24_64 - step2[27] * (long)CosPi8_64;
2662              temp2 = -step2[20] * (long)CosPi8_64 + step2[27] * (long)CosPi24_64;
2663              step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2664              step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2665              temp1 = -step2[21] * (long)CosPi24_64 - step2[26] * (long)CosPi8_64;
2666              temp2 = -step2[21] * (long)CosPi8_64 + step2[26] * (long)CosPi24_64;
2667              step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2668              step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2669              step1[22] = step2[22];
2670              step1[23] = step2[23];
2671              step1[24] = step2[24];
2672              step1[25] = step2[25];
2673              step1[30] = step2[30];
2674              step1[31] = step2[31];
2675  
2676              // stage 6
2677              step2[0] = HighbdWrapLow(step1[0] + step1[7], bd);
2678              step2[1] = HighbdWrapLow(step1[1] + step1[6], bd);
2679              step2[2] = HighbdWrapLow(step1[2] + step1[5], bd);
2680              step2[3] = HighbdWrapLow(step1[3] + step1[4], bd);
2681              step2[4] = HighbdWrapLow(step1[3] - step1[4], bd);
2682              step2[5] = HighbdWrapLow(step1[2] - step1[5], bd);
2683              step2[6] = HighbdWrapLow(step1[1] - step1[6], bd);
2684              step2[7] = HighbdWrapLow(step1[0] - step1[7], bd);
2685              step2[8] = step1[8];
2686              step2[9] = step1[9];
2687              temp1 = (-step1[10] + step1[13]) * (long)CosPi16_64;
2688              temp2 = (step1[10] + step1[13]) * (long)CosPi16_64;
2689              step2[10] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2690              step2[13] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2691              temp1 = (-step1[11] + step1[12]) * (long)CosPi16_64;
2692              temp2 = (step1[11] + step1[12]) * (long)CosPi16_64;
2693              step2[11] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2694              step2[12] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2695              step2[14] = step1[14];
2696              step2[15] = step1[15];
2697  
2698              step2[16] = HighbdWrapLow(step1[16] + step1[23], bd);
2699              step2[17] = HighbdWrapLow(step1[17] + step1[22], bd);
2700              step2[18] = HighbdWrapLow(step1[18] + step1[21], bd);
2701              step2[19] = HighbdWrapLow(step1[19] + step1[20], bd);
2702              step2[20] = HighbdWrapLow(step1[19] - step1[20], bd);
2703              step2[21] = HighbdWrapLow(step1[18] - step1[21], bd);
2704              step2[22] = HighbdWrapLow(step1[17] - step1[22], bd);
2705              step2[23] = HighbdWrapLow(step1[16] - step1[23], bd);
2706  
2707              step2[24] = HighbdWrapLow(-step1[24] + step1[31], bd);
2708              step2[25] = HighbdWrapLow(-step1[25] + step1[30], bd);
2709              step2[26] = HighbdWrapLow(-step1[26] + step1[29], bd);
2710              step2[27] = HighbdWrapLow(-step1[27] + step1[28], bd);
2711              step2[28] = HighbdWrapLow(step1[27] + step1[28], bd);
2712              step2[29] = HighbdWrapLow(step1[26] + step1[29], bd);
2713              step2[30] = HighbdWrapLow(step1[25] + step1[30], bd);
2714              step2[31] = HighbdWrapLow(step1[24] + step1[31], bd);
2715  
2716              // stage 7
2717              step1[0] = HighbdWrapLow(step2[0] + step2[15], bd);
2718              step1[1] = HighbdWrapLow(step2[1] + step2[14], bd);
2719              step1[2] = HighbdWrapLow(step2[2] + step2[13], bd);
2720              step1[3] = HighbdWrapLow(step2[3] + step2[12], bd);
2721              step1[4] = HighbdWrapLow(step2[4] + step2[11], bd);
2722              step1[5] = HighbdWrapLow(step2[5] + step2[10], bd);
2723              step1[6] = HighbdWrapLow(step2[6] + step2[9], bd);
2724              step1[7] = HighbdWrapLow(step2[7] + step2[8], bd);
2725              step1[8] = HighbdWrapLow(step2[7] - step2[8], bd);
2726              step1[9] = HighbdWrapLow(step2[6] - step2[9], bd);
2727              step1[10] = HighbdWrapLow(step2[5] - step2[10], bd);
2728              step1[11] = HighbdWrapLow(step2[4] - step2[11], bd);
2729              step1[12] = HighbdWrapLow(step2[3] - step2[12], bd);
2730              step1[13] = HighbdWrapLow(step2[2] - step2[13], bd);
2731              step1[14] = HighbdWrapLow(step2[1] - step2[14], bd);
2732              step1[15] = HighbdWrapLow(step2[0] - step2[15], bd);
2733  
2734              step1[16] = step2[16];
2735              step1[17] = step2[17];
2736              step1[18] = step2[18];
2737              step1[19] = step2[19];
2738              temp1 = (-step2[20] + step2[27]) * (long)CosPi16_64;
2739              temp2 = (step2[20] + step2[27]) * (long)CosPi16_64;
2740              step1[20] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2741              step1[27] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2742              temp1 = (-step2[21] + step2[26]) * (long)CosPi16_64;
2743              temp2 = (step2[21] + step2[26]) * (long)CosPi16_64;
2744              step1[21] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2745              step1[26] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2746              temp1 = (-step2[22] + step2[25]) * (long)CosPi16_64;
2747              temp2 = (step2[22] + step2[25]) * (long)CosPi16_64;
2748              step1[22] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2749              step1[25] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2750              temp1 = (-step2[23] + step2[24]) * (long)CosPi16_64;
2751              temp2 = (step2[23] + step2[24]) * (long)CosPi16_64;
2752              step1[23] = HighbdWrapLow(DctConstRoundShift(temp1), bd);
2753              step1[24] = HighbdWrapLow(DctConstRoundShift(temp2), bd);
2754              step1[28] = step2[28];
2755              step1[29] = step2[29];
2756              step1[30] = step2[30];
2757              step1[31] = step2[31];
2758  
2759              // final stage
2760              output[0] = HighbdWrapLow(step1[0] + step1[31], bd);
2761              output[1] = HighbdWrapLow(step1[1] + step1[30], bd);
2762              output[2] = HighbdWrapLow(step1[2] + step1[29], bd);
2763              output[3] = HighbdWrapLow(step1[3] + step1[28], bd);
2764              output[4] = HighbdWrapLow(step1[4] + step1[27], bd);
2765              output[5] = HighbdWrapLow(step1[5] + step1[26], bd);
2766              output[6] = HighbdWrapLow(step1[6] + step1[25], bd);
2767              output[7] = HighbdWrapLow(step1[7] + step1[24], bd);
2768              output[8] = HighbdWrapLow(step1[8] + step1[23], bd);
2769              output[9] = HighbdWrapLow(step1[9] + step1[22], bd);
2770              output[10] = HighbdWrapLow(step1[10] + step1[21], bd);
2771              output[11] = HighbdWrapLow(step1[11] + step1[20], bd);
2772              output[12] = HighbdWrapLow(step1[12] + step1[19], bd);
2773              output[13] = HighbdWrapLow(step1[13] + step1[18], bd);
2774              output[14] = HighbdWrapLow(step1[14] + step1[17], bd);
2775              output[15] = HighbdWrapLow(step1[15] + step1[16], bd);
2776              output[16] = HighbdWrapLow(step1[15] - step1[16], bd);
2777              output[17] = HighbdWrapLow(step1[14] - step1[17], bd);
2778              output[18] = HighbdWrapLow(step1[13] - step1[18], bd);
2779              output[19] = HighbdWrapLow(step1[12] - step1[19], bd);
2780              output[20] = HighbdWrapLow(step1[11] - step1[20], bd);
2781              output[21] = HighbdWrapLow(step1[10] - step1[21], bd);
2782              output[22] = HighbdWrapLow(step1[9] - step1[22], bd);
2783              output[23] = HighbdWrapLow(step1[8] - step1[23], bd);
2784              output[24] = HighbdWrapLow(step1[7] - step1[24], bd);
2785              output[25] = HighbdWrapLow(step1[6] - step1[25], bd);
2786              output[26] = HighbdWrapLow(step1[5] - step1[26], bd);
2787              output[27] = HighbdWrapLow(step1[4] - step1[27], bd);
2788              output[28] = HighbdWrapLow(step1[3] - step1[28], bd);
2789              output[29] = HighbdWrapLow(step1[2] - step1[29], bd);
2790              output[30] = HighbdWrapLow(step1[1] - step1[30], bd);
2791              output[31] = HighbdWrapLow(step1[0] - step1[31], bd);
2792          }
2793  
2794          [SkipLocalsInit]
2795          public static void HighbdIdct32x321024Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2796          {
2797              int i, j;
2798              Span<int> output = stackalloc int[32 * 32];
2799              Span<int> outptr = output;
2800              Span<int> tempIn = stackalloc int[32];
2801              Span<int> tempOut = stackalloc int[32];
2802  
2803              // Rows
2804              for (i = 0; i < 32; ++i)
2805              {
2806                  int zeroCoeff = 0;
2807                  for (j = 0; j < 32; ++j)
2808                  {
2809                      zeroCoeff |= input[j];
2810                  }
2811  
2812                  if (zeroCoeff != 0)
2813                  {
2814                      HighbdIdct32(input, outptr, bd);
2815                  }
2816                  else
2817                  {
2818                      outptr[..32].Clear();
2819                  }
2820  
2821                  input = input[32..];
2822                  outptr = outptr[32..];
2823              }
2824  
2825              // Columns
2826              for (i = 0; i < 32; ++i)
2827              {
2828                  for (j = 0; j < 32; ++j)
2829                  {
2830                      tempIn[j] = output[j * 32 + i];
2831                  }
2832  
2833                  HighbdIdct32(tempIn, tempOut, bd);
2834                  for (j = 0; j < 32; ++j)
2835                  {
2836                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
2837                  }
2838              }
2839          }
2840  
2841          [SkipLocalsInit]
2842          public static void HighbdIdct32x32135Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2843          {
2844              int i, j;
2845              Span<int> output = stackalloc int[32 * 32];
2846              Span<int> outptr = output;
2847              Span<int> tempIn = stackalloc int[32];
2848              Span<int> tempOut = stackalloc int[32];
2849  
2850              output.Clear();
2851  
2852              // Rows
2853              // Only upper-left 16x16 has non-zero coeff
2854              for (i = 0; i < 16; ++i)
2855              {
2856                  HighbdIdct32(input, outptr, bd);
2857                  input = input[32..];
2858                  outptr = outptr[32..];
2859              }
2860  
2861              // Columns
2862              for (i = 0; i < 32; ++i)
2863              {
2864                  Span<ushort> destT = dest;
2865                  for (j = 0; j < 32; ++j)
2866                  {
2867                      tempIn[j] = output[j * 32 + i];
2868                  }
2869  
2870                  HighbdIdct32(tempIn, tempOut, bd);
2871                  for (j = 0; j < 32; ++j)
2872                  {
2873                      destT[i] = HighbdClipPixelAdd(destT[i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
2874                      destT = destT[stride..];
2875                  }
2876              }
2877          }
2878  
2879          [SkipLocalsInit]
2880          public static void HighbdIdct32x3234Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2881          {
2882              int i, j;
2883              Span<int> output = stackalloc int[32 * 32];
2884              Span<int> outptr = output;
2885              Span<int> tempIn = stackalloc int[32];
2886              Span<int> tempOut = stackalloc int[32];
2887  
2888              output.Clear();
2889  
2890              // Rows
2891              // Only upper-left 8x8 has non-zero coeff
2892              for (i = 0; i < 8; ++i)
2893              {
2894                  HighbdIdct32(input, outptr, bd);
2895                  input = input[32..];
2896                  outptr = outptr[32..];
2897              }
2898  
2899              // Columns
2900              for (i = 0; i < 32; ++i)
2901              {
2902                  for (j = 0; j < 32; ++j)
2903                  {
2904                      tempIn[j] = output[j * 32 + i];
2905                  }
2906  
2907                  HighbdIdct32(tempIn, tempOut, bd);
2908                  for (j = 0; j < 32; ++j)
2909                  {
2910                      dest[j * stride + i] = HighbdClipPixelAdd(dest[j * stride + i], BitUtils.RoundPowerOfTwo(tempOut[j], 6), bd);
2911                  }
2912              }
2913          }
2914  
2915          public static void HighbdIdct32x321Add(ReadOnlySpan<int> input, Span<ushort> dest, int stride, int bd)
2916          {
2917              int i, j;
2918              int a1;
2919              int output = HighbdWrapLow(DctConstRoundShift(input[0] * (long)CosPi16_64), bd);
2920  
2921              output = HighbdWrapLow(DctConstRoundShift(output * (long)CosPi16_64), bd);
2922              a1 = BitUtils.RoundPowerOfTwo(output, 6);
2923  
2924              for (j = 0; j < 32; ++j)
2925              {
2926                  for (i = 0; i < 32; ++i)
2927                  {
2928                      dest[i] = HighbdClipPixelAdd(dest[i], a1, bd);
2929                  }
2930  
2931                  dest = dest[stride..];
2932              }
2933          }
2934      }
2935  }