/ src / Ryujinx.Graphics.Texture / Utils / BC67Utils.cs
BC67Utils.cs
   1  using System;
   2  using System.Diagnostics;
   3  using System.Runtime.CompilerServices;
   4  using System.Runtime.Intrinsics;
   5  using System.Runtime.Intrinsics.X86;
   6  
   7  namespace Ryujinx.Graphics.Texture.Utils
   8  {
   9      static class BC67Utils
  10      {
  11          private static readonly byte[][] _quantizationLut;
  12          private static readonly byte[][] _quantizationLutNoPBit;
  13  
  14          static BC67Utils()
  15          {
  16              _quantizationLut = new byte[5][];
  17              _quantizationLutNoPBit = new byte[5][];
  18  
  19              for (int depth = 4; depth < 9; depth++)
  20              {
  21                  byte[] lut = new byte[512];
  22                  byte[] lutNoPBit = new byte[256];
  23  
  24                  for (int i = 0; i < lut.Length; i++)
  25                  {
  26                      lut[i] = QuantizeComponentForLut((byte)i, depth, i >> 8);
  27  
  28                      if (i < lutNoPBit.Length)
  29                      {
  30                          lutNoPBit[i] = QuantizeComponentForLut((byte)i, depth);
  31                      }
  32                  }
  33  
  34                  _quantizationLut[depth - 4] = lut;
  35                  _quantizationLutNoPBit[depth - 4] = lutNoPBit;
  36              }
  37          }
  38  
  39          public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan<uint> tile, int w, int h)
  40          {
  41              if (Sse41.IsSupported && w == 4 && h == 4)
  42              {
  43                  GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 minColor, out RgbaColor8 maxColor);
  44  
  45                  return (minColor, maxColor);
  46              }
  47              else
  48              {
  49                  RgbaColor8 minColor = new(255, 255, 255, 255);
  50                  RgbaColor8 maxColor = default;
  51  
  52                  for (int i = 0; i < tile.Length; i++)
  53                  {
  54                      RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);
  55  
  56                      minColor.R = Math.Min(minColor.R, color.R);
  57                      minColor.G = Math.Min(minColor.G, color.G);
  58                      minColor.B = Math.Min(minColor.B, color.B);
  59                      minColor.A = Math.Min(minColor.A, color.A);
  60  
  61                      maxColor.R = Math.Max(maxColor.R, color.R);
  62                      maxColor.G = Math.Max(maxColor.G, color.G);
  63                      maxColor.B = Math.Max(maxColor.B, color.B);
  64                      maxColor.A = Math.Max(maxColor.A, color.A);
  65                  }
  66  
  67                  return (minColor, maxColor);
  68              }
  69          }
  70  
  71          public static void GetMinMaxColors(
  72              ReadOnlySpan<byte> partitionTable,
  73              ReadOnlySpan<uint> tile,
  74              int w,
  75              int h,
  76              Span<RgbaColor8> minColors,
  77              Span<RgbaColor8> maxColors,
  78              int subsetCount)
  79          {
  80              if (Sse41.IsSupported && w == 4 && h == 4)
  81              {
  82                  if (subsetCount == 1)
  83                  {
  84                      GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]);
  85                      return;
  86                  }
  87                  else if (subsetCount == 2)
  88                  {
  89                      GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors);
  90                      return;
  91                  }
  92              }
  93  
  94              minColors.Fill(new RgbaColor8(255, 255, 255, 255));
  95  
  96              int i = 0;
  97              for (int ty = 0; ty < h; ty++)
  98              {
  99                  for (int tx = 0; tx < w; tx++)
 100                  {
 101                      int subset = partitionTable[ty * w + tx];
 102                      RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]);
 103  
 104                      minColors[subset].R = Math.Min(minColors[subset].R, color.R);
 105                      minColors[subset].G = Math.Min(minColors[subset].G, color.G);
 106                      minColors[subset].B = Math.Min(minColors[subset].B, color.B);
 107                      minColors[subset].A = Math.Min(minColors[subset].A, color.A);
 108  
 109                      maxColors[subset].R = Math.Max(maxColors[subset].R, color.R);
 110                      maxColors[subset].G = Math.Max(maxColors[subset].G, color.G);
 111                      maxColors[subset].B = Math.Max(maxColors[subset].B, color.B);
 112                      maxColors[subset].A = Math.Max(maxColors[subset].A, color.A);
 113                  }
 114              }
 115          }
 116  
 117          private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan<uint> tile, out RgbaColor8 minColor, out RgbaColor8 maxColor)
 118          {
 119              Vector128<byte> min = Vector128<byte>.AllBitsSet;
 120              Vector128<byte> max = Vector128<byte>.Zero;
 121              Vector128<byte> row0, row1, row2, row3;
 122  
 123              fixed (uint* pTile = tile)
 124              {
 125                  row0 = Sse2.LoadVector128(pTile).AsByte();
 126                  row1 = Sse2.LoadVector128(pTile + 4).AsByte();
 127                  row2 = Sse2.LoadVector128(pTile + 8).AsByte();
 128                  row3 = Sse2.LoadVector128(pTile + 12).AsByte();
 129              }
 130  
 131              min = Sse2.Min(min, row0);
 132              max = Sse2.Max(max, row0);
 133              min = Sse2.Min(min, row1);
 134              max = Sse2.Max(max, row1);
 135              min = Sse2.Min(min, row2);
 136              max = Sse2.Max(max, row2);
 137              min = Sse2.Min(min, row3);
 138              max = Sse2.Max(max, row3);
 139  
 140              minColor = HorizontalMin(min);
 141              maxColor = HorizontalMax(max);
 142          }
 143  
 144          private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41(
 145              ReadOnlySpan<byte> partitionTable,
 146              ReadOnlySpan<uint> tile,
 147              Span<RgbaColor8> minColors,
 148              Span<RgbaColor8> maxColors)
 149          {
 150              Vector128<byte> partitionMask;
 151  
 152              fixed (byte* pPartitionTable = partitionTable)
 153              {
 154                  partitionMask = Sse2.LoadVector128(pPartitionTable);
 155              }
 156  
 157              Vector128<byte> subset0Mask = Sse2.CompareEqual(partitionMask, Vector128<byte>.Zero);
 158  
 159              Vector128<byte> subset0MaskRep16Low = Sse2.UnpackLow(subset0Mask, subset0Mask);
 160              Vector128<byte> subset0MaskRep16High = Sse2.UnpackHigh(subset0Mask, subset0Mask);
 161  
 162              Vector128<byte> subset0Mask0 = Sse2.UnpackLow(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte();
 163              Vector128<byte> subset0Mask1 = Sse2.UnpackHigh(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte();
 164              Vector128<byte> subset0Mask2 = Sse2.UnpackLow(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte();
 165              Vector128<byte> subset0Mask3 = Sse2.UnpackHigh(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte();
 166  
 167              Vector128<byte> min0 = Vector128<byte>.AllBitsSet;
 168              Vector128<byte> min1 = Vector128<byte>.AllBitsSet;
 169              Vector128<byte> max0 = Vector128<byte>.Zero;
 170              Vector128<byte> max1 = Vector128<byte>.Zero;
 171  
 172              Vector128<byte> row0, row1, row2, row3;
 173  
 174              fixed (uint* pTile = tile)
 175              {
 176                  row0 = Sse2.LoadVector128(pTile).AsByte();
 177                  row1 = Sse2.LoadVector128(pTile + 4).AsByte();
 178                  row2 = Sse2.LoadVector128(pTile + 8).AsByte();
 179                  row3 = Sse2.LoadVector128(pTile + 12).AsByte();
 180              }
 181  
 182              min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row0, subset0Mask0));
 183              min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row1, subset0Mask1));
 184              min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row2, subset0Mask2));
 185              min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row3, subset0Mask3));
 186  
 187              min1 = Sse2.Min(min1, Sse2.Or(row0, subset0Mask0));
 188              min1 = Sse2.Min(min1, Sse2.Or(row1, subset0Mask1));
 189              min1 = Sse2.Min(min1, Sse2.Or(row2, subset0Mask2));
 190              min1 = Sse2.Min(min1, Sse2.Or(row3, subset0Mask3));
 191  
 192              max0 = Sse2.Max(max0, Sse2.And(row0, subset0Mask0));
 193              max0 = Sse2.Max(max0, Sse2.And(row1, subset0Mask1));
 194              max0 = Sse2.Max(max0, Sse2.And(row2, subset0Mask2));
 195              max0 = Sse2.Max(max0, Sse2.And(row3, subset0Mask3));
 196  
 197              max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask0, row0));
 198              max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask1, row1));
 199              max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask2, row2));
 200              max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask3, row3));
 201  
 202              minColors[0] = HorizontalMin(min0);
 203              minColors[1] = HorizontalMin(min1);
 204              maxColors[0] = HorizontalMax(max0);
 205              maxColors[1] = HorizontalMax(max1);
 206          }
 207  
 208          private static RgbaColor8 HorizontalMin(Vector128<byte> x)
 209          {
 210              x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte());
 211              x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte());
 212              return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0));
 213          }
 214  
 215          private static RgbaColor8 HorizontalMax(Vector128<byte> x)
 216          {
 217              x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte());
 218              x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte());
 219              return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0));
 220          }
 221  
 222          public static int SelectIndices(
 223              ReadOnlySpan<uint> values,
 224              uint endPoint0,
 225              uint endPoint1,
 226              int pBit0,
 227              int pBit1,
 228              int indexBitCount,
 229              int indexCount,
 230              int colorDepth,
 231              int alphaDepth,
 232              uint alphaMask)
 233          {
 234              if (Sse41.IsSupported)
 235              {
 236                  if (indexBitCount == 2)
 237                  {
 238                      return Select2BitIndicesSse41(
 239                          values,
 240                          endPoint0,
 241                          endPoint1,
 242                          pBit0,
 243                          pBit1,
 244                          indexBitCount,
 245                          indexCount,
 246                          colorDepth,
 247                          alphaDepth,
 248                          alphaMask);
 249                  }
 250                  else if (indexBitCount == 3)
 251                  {
 252                      return Select3BitIndicesSse41(
 253                          values,
 254                          endPoint0,
 255                          endPoint1,
 256                          pBit0,
 257                          pBit1,
 258                          indexBitCount,
 259                          indexCount,
 260                          colorDepth,
 261                          alphaDepth,
 262                          alphaMask);
 263                  }
 264                  else if (indexBitCount == 4)
 265                  {
 266                      return Select4BitIndicesOneSubsetSse41(
 267                          values,
 268                          endPoint0,
 269                          endPoint1,
 270                          pBit0,
 271                          pBit1,
 272                          indexBitCount,
 273                          indexCount,
 274                          colorDepth,
 275                          alphaDepth,
 276                          alphaMask);
 277                  }
 278              }
 279  
 280              return SelectIndicesFallback(
 281                  values,
 282                  endPoint0,
 283                  endPoint1,
 284                  pBit0,
 285                  pBit1,
 286                  indexBitCount,
 287                  indexCount,
 288                  colorDepth,
 289                  alphaDepth,
 290                  alphaMask);
 291          }
 292  
 293          private static unsafe int Select2BitIndicesSse41(
 294              ReadOnlySpan<uint> values,
 295              uint endPoint0,
 296              uint endPoint1,
 297              int pBit0,
 298              int pBit1,
 299              int indexBitCount,
 300              int indexCount,
 301              int colorDepth,
 302              int alphaDepth,
 303              uint alphaMask)
 304          {
 305              uint alphaMaskForPalette = alphaMask;
 306  
 307              if (alphaDepth == 0)
 308              {
 309                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 310              }
 311  
 312              int errorSum = 0;
 313  
 314              RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
 315              RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
 316  
 317              Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
 318              Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
 319  
 320              Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 321  
 322              Vector128<byte> rWeights;
 323              Vector128<byte> lWeights;
 324  
 325              fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
 326              {
 327                  rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
 328                  lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
 329              }
 330  
 331              Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
 332              Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 333              Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 334              Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 335  
 336              Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 337              Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 338  
 339              for (int i = 0; i < values.Length; i++)
 340              {
 341                  uint c = values[i] | alphaMask;
 342  
 343                  Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
 344  
 345                  Vector128<short> delta0 = Sse2.Subtract(color, pal0);
 346                  Vector128<short> delta1 = Sse2.Subtract(color, pal1);
 347  
 348                  Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
 349                  Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
 350  
 351                  Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
 352  
 353                  Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);
 354  
 355                  Vector128<ushort> min = Sse41.MinHorizontal(delta);
 356  
 357                  ushort error = min.GetElement(0);
 358  
 359                  errorSum += error;
 360              }
 361  
 362              return errorSum;
 363          }
 364  
 365          private static unsafe int Select3BitIndicesSse41(
 366              ReadOnlySpan<uint> values,
 367              uint endPoint0,
 368              uint endPoint1,
 369              int pBit0,
 370              int pBit1,
 371              int indexBitCount,
 372              int indexCount,
 373              int colorDepth,
 374              int alphaDepth,
 375              uint alphaMask)
 376          {
 377              uint alphaMaskForPalette = alphaMask;
 378  
 379              if (alphaDepth == 0)
 380              {
 381                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 382              }
 383  
 384              int errorSum = 0;
 385  
 386              RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
 387              RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
 388  
 389              Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
 390              Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
 391  
 392              Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 393  
 394              Vector128<byte> rWeights;
 395              Vector128<byte> lWeights;
 396  
 397              fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
 398              {
 399                  rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
 400                  lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
 401              }
 402  
 403              Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
 404              Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 405              Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 406              Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 407              Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 408              Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 409              Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 410  
 411              Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 412              Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 413              Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
 414              Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
 415  
 416              for (int i = 0; i < values.Length; i++)
 417              {
 418                  uint c = values[i] | alphaMask;
 419  
 420                  Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
 421  
 422                  Vector128<short> delta0 = Sse2.Subtract(color, pal0);
 423                  Vector128<short> delta1 = Sse2.Subtract(color, pal1);
 424                  Vector128<short> delta2 = Sse2.Subtract(color, pal2);
 425                  Vector128<short> delta3 = Sse2.Subtract(color, pal3);
 426  
 427                  Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
 428                  Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
 429                  Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
 430                  Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
 431  
 432                  Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
 433                  Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
 434  
 435                  Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
 436  
 437                  Vector128<ushort> min = Sse41.MinHorizontal(delta);
 438  
 439                  ushort error = min.GetElement(0);
 440  
 441                  errorSum += error;
 442              }
 443  
 444              return errorSum;
 445          }
 446  
 447          private static unsafe int Select4BitIndicesOneSubsetSse41(
 448              ReadOnlySpan<uint> values,
 449              uint endPoint0,
 450              uint endPoint1,
 451              int pBit0,
 452              int pBit1,
 453              int indexBitCount,
 454              int indexCount,
 455              int colorDepth,
 456              int alphaDepth,
 457              uint alphaMask)
 458          {
 459              uint alphaMaskForPalette = alphaMask;
 460  
 461              if (alphaDepth == 0)
 462              {
 463                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 464              }
 465  
 466              int errorSum = 0;
 467  
 468              RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
 469              RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
 470  
 471              Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
 472              Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
 473  
 474              Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 475  
 476              Vector128<byte> rWeights;
 477              Vector128<byte> lWeights;
 478  
 479              fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
 480              {
 481                  rWeights = Sse2.LoadVector128(pWeights);
 482                  lWeights = Sse2.LoadVector128(pInvWeights);
 483              }
 484  
 485              Vector128<byte> iWeightsLow = Sse2.UnpackLow(lWeights, rWeights);
 486              Vector128<byte> iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights);
 487              Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte();
 488              Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte();
 489              Vector128<byte> iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte();
 490              Vector128<byte> iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte();
 491              Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 492              Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 493              Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 494              Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 495              Vector128<byte> iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte();
 496              Vector128<byte> iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte();
 497              Vector128<byte> iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte();
 498              Vector128<byte> iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte();
 499  
 500              Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 501              Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 502              Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
 503              Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
 504              Vector128<short> pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte()));
 505              Vector128<short> pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte()));
 506              Vector128<short> pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte()));
 507              Vector128<short> pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte()));
 508  
 509              for (int i = 0; i < values.Length; i++)
 510              {
 511                  uint c = values[i] | alphaMask;
 512  
 513                  Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
 514  
 515                  Vector128<short> delta0 = Sse2.Subtract(color, pal0);
 516                  Vector128<short> delta1 = Sse2.Subtract(color, pal1);
 517                  Vector128<short> delta2 = Sse2.Subtract(color, pal2);
 518                  Vector128<short> delta3 = Sse2.Subtract(color, pal3);
 519                  Vector128<short> delta4 = Sse2.Subtract(color, pal4);
 520                  Vector128<short> delta5 = Sse2.Subtract(color, pal5);
 521                  Vector128<short> delta6 = Sse2.Subtract(color, pal6);
 522                  Vector128<short> delta7 = Sse2.Subtract(color, pal7);
 523  
 524                  Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
 525                  Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
 526                  Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
 527                  Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
 528                  Vector128<int> deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4);
 529                  Vector128<int> deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5);
 530                  Vector128<int> deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6);
 531                  Vector128<int> deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7);
 532  
 533                  Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
 534                  Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
 535                  Vector128<int> deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5);
 536                  Vector128<int> deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7);
 537  
 538                  Vector128<ushort> delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
 539                  Vector128<ushort> delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67);
 540  
 541                  Vector128<ushort> min0123 = Sse41.MinHorizontal(delta0123);
 542                  Vector128<ushort> min4567 = Sse41.MinHorizontal(delta4567);
 543  
 544                  ushort minPos0123 = min0123.GetElement(0);
 545                  ushort minPos4567 = min4567.GetElement(0);
 546  
 547                  if (minPos4567 < minPos0123)
 548                  {
 549                      errorSum += minPos4567;
 550                  }
 551                  else
 552                  {
 553                      errorSum += minPos0123;
 554                  }
 555              }
 556  
 557              return errorSum;
 558          }
 559  
 560          private static int SelectIndicesFallback(
 561              ReadOnlySpan<uint> values,
 562              uint endPoint0,
 563              uint endPoint1,
 564              int pBit0,
 565              int pBit1,
 566              int indexBitCount,
 567              int indexCount,
 568              int colorDepth,
 569              int alphaDepth,
 570              uint alphaMask)
 571          {
 572              int errorSum = 0;
 573  
 574              uint alphaMaskForPalette = alphaMask;
 575  
 576              if (alphaDepth == 0)
 577              {
 578                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 579              }
 580  
 581              Span<uint> palette = stackalloc uint[indexCount];
 582  
 583              RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
 584              RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
 585  
 586              Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
 587              Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;
 588  
 589              palette[0] = c0.ToUInt32();
 590              palette[indexCount - 1] = c1.ToUInt32();
 591  
 592              for (int j = 1; j < indexCount - 1; j++)
 593              {
 594                  palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
 595              }
 596  
 597              for (int i = 0; i < values.Length; i++)
 598              {
 599                  uint color = values[i] | alphaMask;
 600  
 601                  int bestMatchScore = int.MaxValue;
 602                  int bestMatchIndex = 0;
 603  
 604                  for (int j = 0; j < indexCount; j++)
 605                  {
 606                      int score = SquaredDifference(
 607                          RgbaColor8.FromUInt32(color).GetColor32(),
 608                          RgbaColor8.FromUInt32(palette[j]).GetColor32());
 609  
 610                      if (score < bestMatchScore)
 611                      {
 612                          bestMatchScore = score;
 613                          bestMatchIndex = j;
 614                      }
 615                  }
 616  
 617                  errorSum += bestMatchScore;
 618              }
 619  
 620              return errorSum;
 621          }
 622  
 623          public static int SelectIndices(
 624              ReadOnlySpan<uint> tile,
 625              int w,
 626              int h,
 627              ReadOnlySpan<uint> endPoints0,
 628              ReadOnlySpan<uint> endPoints1,
 629              ReadOnlySpan<int> pBitValues,
 630              Span<byte> indices,
 631              int subsetCount,
 632              int partition,
 633              int indexBitCount,
 634              int indexCount,
 635              int colorDepth,
 636              int alphaDepth,
 637              int pBits,
 638              uint alphaMask)
 639          {
 640              if (Sse41.IsSupported)
 641              {
 642                  if (indexBitCount == 2)
 643                  {
 644                      return Select2BitIndicesSse41(
 645                          tile,
 646                          w,
 647                          h,
 648                          endPoints0,
 649                          endPoints1,
 650                          pBitValues,
 651                          indices,
 652                          subsetCount,
 653                          partition,
 654                          colorDepth,
 655                          alphaDepth,
 656                          pBits,
 657                          alphaMask);
 658                  }
 659                  else if (indexBitCount == 3)
 660                  {
 661                      return Select3BitIndicesSse41(
 662                          tile,
 663                          w,
 664                          h,
 665                          endPoints0,
 666                          endPoints1,
 667                          pBitValues,
 668                          indices,
 669                          subsetCount,
 670                          partition,
 671                          colorDepth,
 672                          alphaDepth,
 673                          pBits,
 674                          alphaMask);
 675                  }
 676                  else if (indexBitCount == 4)
 677                  {
 678                      Debug.Assert(subsetCount == 1);
 679  
 680                      return Select4BitIndicesOneSubsetSse41(
 681                          tile,
 682                          w,
 683                          h,
 684                          endPoints0[0],
 685                          endPoints1[0],
 686                          pBitValues,
 687                          indices,
 688                          partition,
 689                          colorDepth,
 690                          alphaDepth,
 691                          pBits,
 692                          alphaMask);
 693                  }
 694              }
 695  
 696              return SelectIndicesFallback(
 697                  tile,
 698                  w,
 699                  h,
 700                  endPoints0,
 701                  endPoints1,
 702                  pBitValues,
 703                  indices,
 704                  subsetCount,
 705                  partition,
 706                  indexBitCount,
 707                  indexCount,
 708                  colorDepth,
 709                  alphaDepth,
 710                  pBits,
 711                  alphaMask);
 712          }
 713  
 714          private static unsafe int Select2BitIndicesSse41(
 715              ReadOnlySpan<uint> tile,
 716              int w,
 717              int h,
 718              ReadOnlySpan<uint> endPoints0,
 719              ReadOnlySpan<uint> endPoints1,
 720              ReadOnlySpan<int> pBitValues,
 721              Span<byte> indices,
 722              int subsetCount,
 723              int partition,
 724              int colorDepth,
 725              int alphaDepth,
 726              int pBits,
 727              uint alphaMask)
 728          {
 729              byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
 730  
 731              uint alphaMaskForPalette = alphaMask;
 732  
 733              if (alphaDepth == 0)
 734              {
 735                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 736              }
 737  
 738              int errorSum = 0;
 739  
 740              for (int subset = 0; subset < subsetCount; subset++)
 741              {
 742                  int pBit0 = -1, pBit1 = -1;
 743  
 744                  if (pBits == subsetCount)
 745                  {
 746                      pBit0 = pBit1 = pBitValues[subset];
 747                  }
 748                  else if (pBits != 0)
 749                  {
 750                      pBit0 = pBitValues[subset * 2];
 751                      pBit1 = pBitValues[subset * 2 + 1];
 752                  }
 753  
 754                  RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
 755                  RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
 756  
 757                  Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
 758                  Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
 759  
 760                  Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 761  
 762                  Vector128<byte> rWeights;
 763                  Vector128<byte> lWeights;
 764  
 765                  fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
 766                  {
 767                      rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
 768                      lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
 769                  }
 770  
 771                  Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
 772                  Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 773                  Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 774                  Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 775  
 776                  Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 777                  Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 778  
 779                  int i = 0;
 780                  for (int ty = 0; ty < h; ty++)
 781                  {
 782                      for (int tx = 0; tx < w; tx++, i++)
 783                      {
 784                          int tileOffset = ty * 4 + tx;
 785                          if (partitionTable[tileOffset] != subset)
 786                          {
 787                              continue;
 788                          }
 789  
 790                          uint c = tile[i] | alphaMask;
 791  
 792                          Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
 793  
 794                          Vector128<short> delta0 = Sse2.Subtract(color, pal0);
 795                          Vector128<short> delta1 = Sse2.Subtract(color, pal1);
 796  
 797                          Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
 798                          Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
 799  
 800                          Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
 801  
 802                          Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);
 803  
 804                          Vector128<ushort> min = Sse41.MinHorizontal(delta);
 805  
 806                          uint minPos = min.AsUInt32().GetElement(0);
 807                          ushort error = (ushort)minPos;
 808                          uint index = minPos >> 16;
 809  
 810                          indices[tileOffset] = (byte)index;
 811                          errorSum += error;
 812                      }
 813                  }
 814              }
 815  
 816              return errorSum;
 817          }
 818  
 819          private static unsafe int Select3BitIndicesSse41(
 820              ReadOnlySpan<uint> tile,
 821              int w,
 822              int h,
 823              ReadOnlySpan<uint> endPoints0,
 824              ReadOnlySpan<uint> endPoints1,
 825              ReadOnlySpan<int> pBitValues,
 826              Span<byte> indices,
 827              int subsetCount,
 828              int partition,
 829              int colorDepth,
 830              int alphaDepth,
 831              int pBits,
 832              uint alphaMask)
 833          {
 834              byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
 835  
 836              uint alphaMaskForPalette = alphaMask;
 837  
 838              if (alphaDepth == 0)
 839              {
 840                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 841              }
 842  
 843              int errorSum = 0;
 844  
 845              for (int subset = 0; subset < subsetCount; subset++)
 846              {
 847                  int pBit0 = -1, pBit1 = -1;
 848  
 849                  if (pBits == subsetCount)
 850                  {
 851                      pBit0 = pBit1 = pBitValues[subset];
 852                  }
 853                  else if (pBits != 0)
 854                  {
 855                      pBit0 = pBitValues[subset * 2];
 856                      pBit1 = pBitValues[subset * 2 + 1];
 857                  }
 858  
 859                  RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
 860                  RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
 861  
 862                  Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
 863                  Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
 864  
 865                  Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 866  
 867                  Vector128<byte> rWeights;
 868                  Vector128<byte> lWeights;
 869  
 870                  fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
 871                  {
 872                      rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
 873                      lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
 874                  }
 875  
 876                  Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
 877                  Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 878                  Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 879                  Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 880                  Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 881                  Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 882                  Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 883  
 884                  Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 885                  Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 886                  Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
 887                  Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
 888  
 889                  int i = 0;
 890                  for (int ty = 0; ty < h; ty++)
 891                  {
 892                      for (int tx = 0; tx < w; tx++, i++)
 893                      {
 894                          int tileOffset = ty * 4 + tx;
 895                          if (partitionTable[tileOffset] != subset)
 896                          {
 897                              continue;
 898                          }
 899  
 900                          uint c = tile[i] | alphaMask;
 901  
 902                          Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
 903  
 904                          Vector128<short> delta0 = Sse2.Subtract(color, pal0);
 905                          Vector128<short> delta1 = Sse2.Subtract(color, pal1);
 906                          Vector128<short> delta2 = Sse2.Subtract(color, pal2);
 907                          Vector128<short> delta3 = Sse2.Subtract(color, pal3);
 908  
 909                          Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
 910                          Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
 911                          Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
 912                          Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
 913  
 914                          Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
 915                          Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
 916  
 917                          Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
 918  
 919                          Vector128<ushort> min = Sse41.MinHorizontal(delta);
 920  
 921                          uint minPos = min.AsUInt32().GetElement(0);
 922                          ushort error = (ushort)minPos;
 923                          uint index = minPos >> 16;
 924  
 925                          indices[tileOffset] = (byte)index;
 926                          errorSum += error;
 927                      }
 928                  }
 929              }
 930  
 931              return errorSum;
 932          }
 933  
 934          private static unsafe int Select4BitIndicesOneSubsetSse41(
 935              ReadOnlySpan<uint> tile,
 936              int w,
 937              int h,
 938              uint endPoint0,
 939              uint endPoint1,
 940              ReadOnlySpan<int> pBitValues,
 941              Span<byte> indices,
 942              int partition,
 943              int colorDepth,
 944              int alphaDepth,
 945              int pBits,
 946              uint alphaMask)
 947          {
 948              uint alphaMaskForPalette = alphaMask;
 949  
 950              if (alphaDepth == 0)
 951              {
 952                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
 953              }
 954  
 955              int errorSum = 0;
 956  
 957              int pBit0 = -1, pBit1 = -1;
 958  
 959              if (pBits != 0)
 960              {
 961                  pBit0 = pBitValues[0];
 962                  pBit1 = pBitValues[1];
 963              }
 964  
 965              RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
 966              RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);
 967  
 968              Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
 969              Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();
 970  
 971              Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 972  
 973              Vector128<byte> rWeights;
 974              Vector128<byte> lWeights;
 975  
 976              fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
 977              {
 978                  rWeights = Sse2.LoadVector128(pWeights);
 979                  lWeights = Sse2.LoadVector128(pInvWeights);
 980              }
 981  
 982              Vector128<byte> iWeightsLow = Sse2.UnpackLow(lWeights, rWeights);
 983              Vector128<byte> iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights);
 984              Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte();
 985              Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte();
 986              Vector128<byte> iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte();
 987              Vector128<byte> iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte();
 988              Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 989              Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 990              Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 991              Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 992              Vector128<byte> iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte();
 993              Vector128<byte> iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte();
 994              Vector128<byte> iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte();
 995              Vector128<byte> iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte();
 996  
 997              Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 998              Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 999              Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
1000              Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
1001              Vector128<short> pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte()));
1002              Vector128<short> pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte()));
1003              Vector128<short> pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte()));
1004              Vector128<short> pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte()));
1005  
1006              int i = 0;
1007              for (int ty = 0; ty < h; ty++)
1008              {
1009                  for (int tx = 0; tx < w; tx++, i++)
1010                  {
1011                      uint c = tile[i] | alphaMask;
1012  
1013                      Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
1014  
1015                      Vector128<short> delta0 = Sse2.Subtract(color, pal0);
1016                      Vector128<short> delta1 = Sse2.Subtract(color, pal1);
1017                      Vector128<short> delta2 = Sse2.Subtract(color, pal2);
1018                      Vector128<short> delta3 = Sse2.Subtract(color, pal3);
1019                      Vector128<short> delta4 = Sse2.Subtract(color, pal4);
1020                      Vector128<short> delta5 = Sse2.Subtract(color, pal5);
1021                      Vector128<short> delta6 = Sse2.Subtract(color, pal6);
1022                      Vector128<short> delta7 = Sse2.Subtract(color, pal7);
1023  
1024                      Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
1025                      Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
1026                      Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
1027                      Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
1028                      Vector128<int> deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4);
1029                      Vector128<int> deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5);
1030                      Vector128<int> deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6);
1031                      Vector128<int> deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7);
1032  
1033                      Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
1034                      Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
1035                      Vector128<int> deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5);
1036                      Vector128<int> deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7);
1037  
1038                      Vector128<ushort> delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
1039                      Vector128<ushort> delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67);
1040  
1041                      Vector128<ushort> min0123 = Sse41.MinHorizontal(delta0123);
1042                      Vector128<ushort> min4567 = Sse41.MinHorizontal(delta4567);
1043  
1044                      uint minPos0123 = min0123.AsUInt32().GetElement(0);
1045                      uint minPos4567 = min4567.AsUInt32().GetElement(0);
1046  
1047                      if ((ushort)minPos4567 < (ushort)minPos0123)
1048                      {
1049                          errorSum += (ushort)minPos4567;
1050                          indices[ty * 4 + tx] = (byte)(8 + (minPos4567 >> 16));
1051                      }
1052                      else
1053                      {
1054                          errorSum += (ushort)minPos0123;
1055                          indices[ty * 4 + tx] = (byte)(minPos0123 >> 16);
1056                      }
1057                  }
1058              }
1059  
1060              return errorSum;
1061          }
1062  
1063          private static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
1064          {
1065              return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6);
1066          }
1067  
1068          private static int SelectIndicesFallback(
1069              ReadOnlySpan<uint> tile,
1070              int w,
1071              int h,
1072              ReadOnlySpan<uint> endPoints0,
1073              ReadOnlySpan<uint> endPoints1,
1074              ReadOnlySpan<int> pBitValues,
1075              Span<byte> indices,
1076              int subsetCount,
1077              int partition,
1078              int indexBitCount,
1079              int indexCount,
1080              int colorDepth,
1081              int alphaDepth,
1082              int pBits,
1083              uint alphaMask)
1084          {
1085              int errorSum = 0;
1086  
1087              uint alphaMaskForPalette = alphaMask;
1088  
1089              if (alphaDepth == 0)
1090              {
1091                  alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
1092              }
1093  
1094              Span<uint> palette = stackalloc uint[subsetCount * indexCount];
1095  
1096              for (int subset = 0; subset < subsetCount; subset++)
1097              {
1098                  int palBase = subset * indexCount;
1099  
1100                  int pBit0 = -1, pBit1 = -1;
1101  
1102                  if (pBits == subsetCount)
1103                  {
1104                      pBit0 = pBit1 = pBitValues[subset];
1105                  }
1106                  else if (pBits != 0)
1107                  {
1108                      pBit0 = pBitValues[subset * 2];
1109                      pBit1 = pBitValues[subset * 2 + 1];
1110                  }
1111  
1112                  RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
1113                  RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);
1114  
1115                  Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
1116                  Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;
1117  
1118                  palette[palBase + 0] = c0.ToUInt32();
1119                  palette[palBase + indexCount - 1] = c1.ToUInt32();
1120  
1121                  for (int j = 1; j < indexCount - 1; j++)
1122                  {
1123                      palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
1124                  }
1125              }
1126  
1127              int i = 0;
1128              for (int ty = 0; ty < h; ty++)
1129              {
1130                  for (int tx = 0; tx < w; tx++)
1131                  {
1132                      int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][ty * 4 + tx];
1133                      uint color = tile[i++] | alphaMask;
1134  
1135                      int bestMatchScore = int.MaxValue;
1136                      int bestMatchIndex = 0;
1137  
1138                      for (int j = 0; j < indexCount; j++)
1139                      {
1140                          int score = SquaredDifference(
1141                              RgbaColor8.FromUInt32(color).GetColor32(),
1142                              RgbaColor8.FromUInt32(palette[subset * indexCount + j]).GetColor32());
1143  
1144                          if (score < bestMatchScore)
1145                          {
1146                              bestMatchScore = score;
1147                              bestMatchIndex = j;
1148                          }
1149                      }
1150  
1151                      indices[ty * 4 + tx] = (byte)bestMatchIndex;
1152                      errorSum += bestMatchScore;
1153                  }
1154              }
1155  
1156              return errorSum;
1157          }
1158  
1159          [MethodImpl(MethodImplOptions.AggressiveInlining)]
1160          public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2)
1161          {
1162              RgbaColor32 delta = color1 - color2;
1163              return RgbaColor32.Dot(delta, delta);
1164          }
1165  
1166          [MethodImpl(MethodImplOptions.AggressiveInlining)]
1167          public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount)
1168          {
1169              return Interpolate(color1.GetColor32(), color2.GetColor32(), weightIndex, indexBitCount).GetColor8();
1170          }
1171  
1172          [MethodImpl(MethodImplOptions.AggressiveInlining)]
1173          public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount)
1174          {
1175              Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4);
1176  
1177              int weight = (((weightIndex << 7) / ((1 << indexBitCount) - 1)) + 1) >> 1;
1178  
1179              RgbaColor32 weightV = new(weight);
1180              RgbaColor32 invWeightV = new(64 - weight);
1181  
1182              return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6;
1183          }
1184  
1185          [MethodImpl(MethodImplOptions.AggressiveInlining)]
1186          public static RgbaColor32 Interpolate(
1187              RgbaColor32 color1,
1188              RgbaColor32 color2,
1189              int colorWeightIndex,
1190              int alphaWeightIndex,
1191              int colorIndexBitCount,
1192              int alphaIndexBitCount)
1193          {
1194              Debug.Assert(colorIndexBitCount >= 2 && colorIndexBitCount <= 4);
1195              Debug.Assert(alphaIndexBitCount >= 2 && alphaIndexBitCount <= 4);
1196  
1197              int colorWeight = BC67Tables.Weights[colorIndexBitCount - 2][colorWeightIndex];
1198              int alphaWeight = BC67Tables.Weights[alphaIndexBitCount - 2][alphaWeightIndex];
1199  
1200              RgbaColor32 weightV = new(colorWeight)
1201              {
1202                  A = alphaWeight,
1203              };
1204              RgbaColor32 invWeightV = new RgbaColor32(64) - weightV;
1205  
1206              return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6;
1207          }
1208  
1209          public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
1210          {
1211              if (alphaBits == 0)
1212              {
1213                  int colorShift = 8 - colorBits;
1214  
1215                  uint c;
1216  
1217                  if (pBit >= 0)
1218                  {
1219                      byte[] lutColor = _quantizationLut[colorBits - 4];
1220  
1221                      Debug.Assert(pBit <= 1);
1222                      int high = pBit << 8;
1223                      uint mask = (0xffu >> (colorBits + 1)) * 0x10101;
1224  
1225                      c = lutColor[color.R | high];
1226                      c |= (uint)lutColor[color.G | high] << 8;
1227                      c |= (uint)lutColor[color.B | high] << 16;
1228  
1229                      c <<= colorShift;
1230                      c |= (c >> (colorBits + 1)) & mask;
1231                      c |= ((uint)pBit * 0x10101) << (colorShift - 1);
1232                  }
1233                  else
1234                  {
1235                      byte[] lutColor = _quantizationLutNoPBit[colorBits - 4];
1236  
1237                      uint mask = (0xffu >> colorBits) * 0x10101;
1238  
1239                      c = lutColor[color.R];
1240                      c |= (uint)lutColor[color.G] << 8;
1241                      c |= (uint)lutColor[color.B] << 16;
1242  
1243                      c <<= colorShift;
1244                      c |= (c >> colorBits) & mask;
1245                  }
1246  
1247                  c |= (uint)color.A << 24;
1248  
1249                  return RgbaColor8.FromUInt32(c);
1250              }
1251  
1252              return QuantizeFallback(color, colorBits, alphaBits, pBit);
1253          }
1254  
1255          private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1)
1256          {
1257              byte r = UnquantizeComponent(QuantizeComponent(color.R, colorBits, pBit), colorBits, pBit);
1258              byte g = UnquantizeComponent(QuantizeComponent(color.G, colorBits, pBit), colorBits, pBit);
1259              byte b = UnquantizeComponent(QuantizeComponent(color.B, colorBits, pBit), colorBits, pBit);
1260              byte a = alphaBits == 0 ? color.A : UnquantizeComponent(QuantizeComponent(color.A, alphaBits, pBit), alphaBits, pBit);
1261              return new RgbaColor8(r, g, b, a);
1262          }
1263  
1264          public static byte QuantizeComponent(byte component, int bits, int pBit = -1)
1265          {
1266              return pBit >= 0 ? _quantizationLut[bits - 4][component | (pBit << 8)] : _quantizationLutNoPBit[bits - 4][component];
1267          }
1268  
1269          private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1)
1270          {
1271              int shift = 8 - bits;
1272              int fill = component >> bits;
1273  
1274              if (pBit >= 0)
1275              {
1276                  Debug.Assert(pBit <= 1);
1277                  fill >>= 1;
1278                  fill |= pBit << (shift - 1);
1279              }
1280  
1281              int q1 = component >> shift;
1282              int q2 = Math.Max(q1 - 1, 0);
1283              int q3 = Math.Min(q1 + 1, (1 << bits) - 1);
1284  
1285              int delta1 = FastAbs(((q1 << shift) | fill) - component);
1286              int delta2 = component - ((q2 << shift) | fill);
1287              int delta3 = ((q3 << shift) | fill) - component;
1288  
1289              if (delta1 < delta2 && delta1 < delta3)
1290              {
1291                  return (byte)q1;
1292              }
1293              else if (delta2 < delta3)
1294              {
1295                  return (byte)q2;
1296              }
1297              else
1298              {
1299                  return (byte)q3;
1300              }
1301          }
1302  
1303          [MethodImpl(MethodImplOptions.AggressiveInlining)]
1304          private static int FastAbs(int x)
1305          {
1306              int sign = x >> 31;
1307              return (x + sign) ^ sign;
1308          }
1309  
1310          private static byte UnquantizeComponent(byte component, int bits, int pBit)
1311          {
1312              int shift = 8 - bits;
1313              int value = component << shift;
1314  
1315              if (pBit >= 0)
1316              {
1317                  Debug.Assert(pBit <= 1);
1318                  value |= value >> (bits + 1);
1319                  value |= pBit << (shift - 1);
1320              }
1321              else
1322              {
1323                  value |= value >> bits;
1324              }
1325  
1326              return (byte)value;
1327          }
1328      }
1329  }