BC7Encoder.cs
   1  using Ryujinx.Graphics.Texture.Utils;
   2  using System;
   3  using System.Diagnostics;
   4  using System.Numerics;
   5  using System.Runtime.CompilerServices;
   6  using System.Runtime.InteropServices;
   7  using System.Runtime.Intrinsics;
   8  using System.Runtime.Intrinsics.X86;
   9  using System.Threading.Tasks;
  10  
  11  namespace Ryujinx.Graphics.Texture.Encoders
  12  {
  13      static class BC7Encoder
  14      {
  15          private const int MinColorVarianceForModeChange = 160;
  16  
  17          public static void Encode(Memory<byte> outputStorage, ReadOnlyMemory<byte> data, int width, int height, EncodeMode mode)
  18          {
  19              int widthInBlocks = (width + 3) / 4;
  20              int heightInBlocks = (height + 3) / 4;
  21  
  22              bool fastMode = (mode & EncodeMode.ModeMask) == EncodeMode.Fast;
  23  
  24              if (mode.HasFlag(EncodeMode.Multithreaded))
  25              {
  26                  Parallel.For(0, heightInBlocks, (yInBlocks) =>
  27                  {
  28                      Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
  29                      int y = yInBlocks * 4;
  30  
  31                      for (int xInBlocks = 0; xInBlocks < widthInBlocks; xInBlocks++)
  32                      {
  33                          int x = xInBlocks * 4;
  34                          Block block = CompressBlock(data.Span, x, y, width, height, fastMode);
  35  
  36                          int offset = (yInBlocks * widthInBlocks + xInBlocks) * 2;
  37                          output[offset] = block.Low;
  38                          output[offset + 1] = block.High;
  39                      }
  40                  });
  41              }
  42              else
  43              {
  44                  Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
  45                  int offset = 0;
  46  
  47                  for (int y = 0; y < height; y += 4)
  48                  {
  49                      for (int x = 0; x < width; x += 4)
  50                      {
  51                          Block block = CompressBlock(data.Span, x, y, width, height, fastMode);
  52  
  53                          output[offset++] = block.Low;
  54                          output[offset++] = block.High;
  55                      }
  56                  }
  57              }
  58          }
  59  
  60          private static readonly int[] _mostFrequentPartitions = new int[]
  61          {
  62              0, 13, 2, 1, 15, 14, 10, 23,
  63          };
  64  
  65          private static Block CompressBlock(ReadOnlySpan<byte> data, int x, int y, int width, int height, bool fastMode)
  66          {
  67              int w = Math.Min(4, width - x);
  68              int h = Math.Min(4, height - y);
  69  
  70              var dataUint = MemoryMarshal.Cast<byte, uint>(data);
  71  
  72              int baseOffset = y * width + x;
  73  
  74              Span<uint> tile = stackalloc uint[w * h];
  75  
  76              for (int ty = 0; ty < h; ty++)
  77              {
  78                  int rowOffset = baseOffset + ty * width;
  79  
  80                  for (int tx = 0; tx < w; tx++)
  81                  {
  82                      tile[ty * w + tx] = dataUint[rowOffset + tx];
  83                  }
  84              }
  85  
  86              return fastMode ? EncodeFast(tile, w, h) : EncodeExhaustive(tile, w, h);
  87          }
  88  
  89          private static Block EncodeFast(ReadOnlySpan<uint> tile, int w, int h)
  90          {
  91              (RgbaColor8 minColor, RgbaColor8 maxColor) = BC67Utils.GetMinMaxColors(tile, w, h);
  92  
  93              bool alphaNotOne = minColor.A != 255 || maxColor.A != 255;
  94              int variance = BC67Utils.SquaredDifference(minColor.GetColor32(), maxColor.GetColor32());
  95              int selectedMode;
  96              int indexMode = 0;
  97  
  98              if (alphaNotOne)
  99              {
 100                  bool constantAlpha = minColor.A == maxColor.A;
 101                  if (constantAlpha)
 102                  {
 103                      selectedMode = variance > MinColorVarianceForModeChange ? 7 : 6;
 104                  }
 105                  else
 106                  {
 107                      if (variance > MinColorVarianceForModeChange)
 108                      {
 109                          Span<uint> uniqueRGB = stackalloc uint[16];
 110                          Span<uint> uniqueAlpha = stackalloc uint[16];
 111  
 112                          int uniqueRGBCount = 0;
 113                          int uniqueAlphaCount = 0;
 114  
 115                          uint rgbMask = new RgbaColor8(255, 255, 255, 0).ToUInt32();
 116                          uint alphaMask = new RgbaColor8(0, 0, 0, 255).ToUInt32();
 117  
 118                          for (int i = 0; i < tile.Length; i++)
 119                          {
 120                              uint c = tile[i];
 121  
 122                              if (!uniqueRGB[..uniqueRGBCount].Contains(c & rgbMask))
 123                              {
 124                                  uniqueRGB[uniqueRGBCount++] = c & rgbMask;
 125                              }
 126  
 127                              if (!uniqueAlpha[..uniqueAlphaCount].Contains(c & alphaMask))
 128                              {
 129                                  uniqueAlpha[uniqueAlphaCount++] = c & alphaMask;
 130                              }
 131                          }
 132  
 133                          selectedMode = 4;
 134                          indexMode = uniqueRGBCount > uniqueAlphaCount ? 1 : 0;
 135                      }
 136                      else
 137                      {
 138                          selectedMode = 5;
 139                      }
 140                  }
 141              }
 142              else
 143              {
 144                  if (variance > MinColorVarianceForModeChange)
 145                  {
 146                      selectedMode = 1;
 147                  }
 148                  else
 149                  {
 150                      selectedMode = 6;
 151                  }
 152              }
 153  
 154              int selectedPartition = 0;
 155  
 156              if (selectedMode == 1 || selectedMode == 7)
 157              {
 158                  int partitionSelectionLowestError = int.MaxValue;
 159  
 160                  for (int i = 0; i < _mostFrequentPartitions.Length; i++)
 161                  {
 162                      int p = _mostFrequentPartitions[i];
 163                      int error = GetEndPointSelectionErrorFast(tile, 2, p, w, h, partitionSelectionLowestError);
 164                      if (error < partitionSelectionLowestError)
 165                      {
 166                          partitionSelectionLowestError = error;
 167                          selectedPartition = p;
 168                      }
 169                  }
 170              }
 171  
 172              return Encode(selectedMode, selectedPartition, 0, indexMode, fastMode: true, tile, w, h, out _);
 173          }
 174  
 175          private static Block EncodeExhaustive(ReadOnlySpan<uint> tile, int w, int h)
 176          {
 177              Block bestBlock = default;
 178              int lowestError = int.MaxValue;
 179              int lowestErrorSubsets = int.MaxValue;
 180  
 181              for (int m = 0; m < 8; m++)
 182              {
 183                  for (int r = 0; r < (m == 4 || m == 5 ? 4 : 1); r++)
 184                  {
 185                      for (int im = 0; im < (m == 4 ? 2 : 1); im++)
 186                      {
 187                          for (int p = 0; p < 1 << BC67Tables.BC7ModeInfos[m].PartitionBitCount; p++)
 188                          {
 189                              Block block = Encode(m, p, r, im, fastMode: false, tile, w, h, out int maxError);
 190                              if (maxError < lowestError || (maxError == lowestError && BC67Tables.BC7ModeInfos[m].SubsetCount < lowestErrorSubsets))
 191                              {
 192                                  lowestError = maxError;
 193                                  lowestErrorSubsets = BC67Tables.BC7ModeInfos[m].SubsetCount;
 194                                  bestBlock = block;
 195                              }
 196                          }
 197                      }
 198                  }
 199              }
 200  
 201              return bestBlock;
 202          }
 203  
 204          private static Block Encode(
 205              int mode,
 206              int partition,
 207              int rotation,
 208              int indexMode,
 209              bool fastMode,
 210              ReadOnlySpan<uint> tile,
 211              int w,
 212              int h,
 213              out int errorSum)
 214          {
 215              BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode];
 216              int subsetCount = modeInfo.SubsetCount;
 217              int partitionBitCount = modeInfo.PartitionBitCount;
 218              int rotationBitCount = modeInfo.RotationBitCount;
 219              int indexModeBitCount = modeInfo.IndexModeBitCount;
 220              int colorDepth = modeInfo.ColorDepth;
 221              int alphaDepth = modeInfo.AlphaDepth;
 222              int pBits = modeInfo.PBits;
 223              int colorIndexBitCount = modeInfo.ColorIndexBitCount;
 224              int alphaIndexBitCount = modeInfo.AlphaIndexBitCount;
 225              bool separateAlphaIndices = alphaIndexBitCount != 0;
 226  
 227              uint alphaMask;
 228  
 229              if (separateAlphaIndices)
 230              {
 231                  alphaMask = rotation switch
 232                  {
 233                      1 => new RgbaColor8(255, 0, 0, 0).ToUInt32(),
 234                      2 => new RgbaColor8(0, 255, 0, 0).ToUInt32(),
 235                      3 => new RgbaColor8(0, 0, 255, 0).ToUInt32(),
 236                      _ => new RgbaColor8(0, 0, 0, 255).ToUInt32(),
 237                  };
 238              }
 239              else
 240              {
 241                  alphaMask = new RgbaColor8(0, 0, 0, 0).ToUInt32();
 242              }
 243  
 244              if (indexMode != 0)
 245              {
 246                  alphaMask = ~alphaMask;
 247              }
 248  
 249              //
 250              // Select color palette.
 251              //
 252  
 253              Span<uint> endPoints0 = stackalloc uint[subsetCount];
 254              Span<uint> endPoints1 = stackalloc uint[subsetCount];
 255  
 256              SelectEndPoints(
 257                  tile,
 258                  w,
 259                  h,
 260                  endPoints0,
 261                  endPoints1,
 262                  subsetCount,
 263                  partition,
 264                  colorIndexBitCount,
 265                  colorDepth,
 266                  alphaDepth,
 267                  ~alphaMask,
 268                  fastMode);
 269  
 270              if (separateAlphaIndices)
 271              {
 272                  SelectEndPoints(
 273                      tile,
 274                      w,
 275                      h,
 276                      endPoints0,
 277                      endPoints1,
 278                      subsetCount,
 279                      partition,
 280                      alphaIndexBitCount,
 281                      colorDepth,
 282                      alphaDepth,
 283                      alphaMask,
 284                      fastMode);
 285              }
 286  
 287              Span<int> pBitValues = stackalloc int[pBits];
 288  
 289              for (int i = 0; i < pBits; i++)
 290              {
 291                  int pBit;
 292  
 293                  if (pBits == subsetCount)
 294                  {
 295                      pBit = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth);
 296                  }
 297                  else
 298                  {
 299                      int subset = i >> 1;
 300                      uint color = (i & 1) == 0 ? endPoints0[subset] : endPoints1[subset];
 301                      pBit = GetPBit(color, colorDepth, alphaDepth);
 302                  }
 303  
 304                  pBitValues[i] = pBit;
 305              }
 306  
 307              int colorIndexCount = 1 << colorIndexBitCount;
 308              int alphaIndexCount = 1 << alphaIndexBitCount;
 309  
 310              Span<byte> colorIndices = stackalloc byte[16];
 311              Span<byte> alphaIndices = stackalloc byte[16];
 312  
 313              errorSum = BC67Utils.SelectIndices(
 314                  tile,
 315                  w,
 316                  h,
 317                  endPoints0,
 318                  endPoints1,
 319                  pBitValues,
 320                  colorIndices,
 321                  subsetCount,
 322                  partition,
 323                  colorIndexBitCount,
 324                  colorIndexCount,
 325                  colorDepth,
 326                  alphaDepth,
 327                  pBits,
 328                  alphaMask);
 329  
 330              if (separateAlphaIndices)
 331              {
 332                  errorSum += BC67Utils.SelectIndices(
 333                      tile,
 334                      w,
 335                      h,
 336                      endPoints0,
 337                      endPoints1,
 338                      pBitValues,
 339                      alphaIndices,
 340                      subsetCount,
 341                      partition,
 342                      alphaIndexBitCount,
 343                      alphaIndexCount,
 344                      colorDepth,
 345                      alphaDepth,
 346                      pBits,
 347                      ~alphaMask);
 348              }
 349  
 350              Span<bool> colorSwapSubset = stackalloc bool[3];
 351  
 352              for (int i = 0; i < 3; i++)
 353              {
 354                  colorSwapSubset[i] = colorIndices[BC67Tables.FixUpIndices[subsetCount - 1][partition][i]] >= (colorIndexCount >> 1);
 355              }
 356  
 357              bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1);
 358  
 359              Block block = new();
 360  
 361              int offset = 0;
 362  
 363              block.Encode(1UL << mode, ref offset, mode + 1);
 364              block.Encode((ulong)partition, ref offset, partitionBitCount);
 365              block.Encode((ulong)rotation, ref offset, rotationBitCount);
 366              block.Encode((ulong)indexMode, ref offset, indexModeBitCount);
 367  
 368              for (int comp = 0; comp < 3; comp++)
 369              {
 370                  int rotatedComp = comp;
 371  
 372                  if (((comp + 1) & 3) == rotation)
 373                  {
 374                      rotatedComp = 3;
 375                  }
 376  
 377                  for (int subset = 0; subset < subsetCount; subset++)
 378                  {
 379                      RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
 380                      RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);
 381  
 382                      int pBit0 = -1, pBit1 = -1;
 383  
 384                      if (pBits == subsetCount)
 385                      {
 386                          pBit0 = pBit1 = pBitValues[subset];
 387                      }
 388                      else if (pBits != 0)
 389                      {
 390                          pBit0 = pBitValues[subset * 2];
 391                          pBit1 = pBitValues[subset * 2 + 1];
 392                      }
 393  
 394                      if (indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset)
 395                      {
 396                          block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
 397                          block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
 398                      }
 399                      else
 400                      {
 401                          block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
 402                          block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
 403                      }
 404                  }
 405              }
 406  
 407              if (alphaDepth != 0)
 408              {
 409                  int rotatedComp = (rotation - 1) & 3;
 410  
 411                  for (int subset = 0; subset < subsetCount; subset++)
 412                  {
 413                      RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
 414                      RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);
 415  
 416                      int pBit0 = -1, pBit1 = -1;
 417  
 418                      if (pBits == subsetCount)
 419                      {
 420                          pBit0 = pBit1 = pBitValues[subset];
 421                      }
 422                      else if (pBits != 0)
 423                      {
 424                          pBit0 = pBitValues[subset * 2];
 425                          pBit1 = pBitValues[subset * 2 + 1];
 426                      }
 427  
 428                      if (separateAlphaIndices && indexMode == 0 ? alphaSwapSubset : colorSwapSubset[subset])
 429                      {
 430                          block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
 431                          block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
 432                      }
 433                      else
 434                      {
 435                          block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
 436                          block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
 437                      }
 438                  }
 439              }
 440  
 441              for (int i = 0; i < pBits; i++)
 442              {
 443                  block.Encode((ulong)pBitValues[i], ref offset, 1);
 444              }
 445  
 446              byte[] fixUpTable = BC67Tables.FixUpIndices[subsetCount - 1][partition];
 447  
 448              for (int i = 0; i < 16; i++)
 449              {
 450                  int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][i];
 451                  byte index = colorIndices[i];
 452  
 453                  if (colorSwapSubset[subset])
 454                  {
 455                      index = (byte)(index ^ (colorIndexCount - 1));
 456                  }
 457  
 458                  int finalIndexBitCount = i == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount;
 459  
 460                  Debug.Assert(index < (1 << finalIndexBitCount));
 461  
 462                  block.Encode(index, ref offset, finalIndexBitCount);
 463              }
 464  
 465              if (separateAlphaIndices)
 466              {
 467                  for (int i = 0; i < 16; i++)
 468                  {
 469                      byte index = alphaIndices[i];
 470  
 471                      if (alphaSwapSubset)
 472                      {
 473                          index = (byte)(index ^ (alphaIndexCount - 1));
 474                      }
 475  
 476                      int finalIndexBitCount = i == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount;
 477  
 478                      Debug.Assert(index < (1 << finalIndexBitCount));
 479  
 480                      block.Encode(index, ref offset, finalIndexBitCount);
 481                  }
 482              }
 483  
 484              return block;
 485          }
 486  
 487          private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan<uint> tile, int subsetCount, int partition, int w, int h, int maxError)
 488          {
 489              byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
 490  
 491              Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
 492              Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];
 493  
 494              BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);
 495  
 496              Span<uint> endPoints0 = stackalloc uint[subsetCount];
 497              Span<uint> endPoints1 = stackalloc uint[subsetCount];
 498  
 499              SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue);
 500  
 501              Span<RgbaColor32> palette = stackalloc RgbaColor32[8];
 502  
 503              int errorSum = 0;
 504  
 505              for (int subset = 0; subset < subsetCount; subset++)
 506              {
 507                  RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
 508                  int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A;
 509                  if (sum != 0)
 510                  {
 511                      blockDir = (blockDir << 6) / new RgbaColor32(sum);
 512                  }
 513  
 514                  uint c0 = endPoints0[subset];
 515                  uint c1 = endPoints1[subset];
 516  
 517                  int pBit0 = GetPBit(c0, 6, 0);
 518                  int pBit1 = GetPBit(c1, 6, 0);
 519  
 520                  c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32();
 521                  c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32();
 522  
 523                  if (Sse41.IsSupported)
 524                  {
 525                      Vector128<byte> c0Rep = Vector128.Create(c0).AsByte();
 526                      Vector128<byte> c1Rep = Vector128.Create(c1).AsByte();
 527  
 528                      Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);
 529  
 530                      Vector128<byte> rWeights;
 531                      Vector128<byte> lWeights;
 532  
 533                      fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
 534                      {
 535                          rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
 536                          lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
 537                      }
 538  
 539                      Vector128<byte> iWeights = Sse2.UnpackLow(rWeights, lWeights);
 540                      Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 541                      Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
 542                      Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 543                      Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
 544                      Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 545                      Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
 546  
 547                      static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
 548                      {
 549                          return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6);
 550                      }
 551  
 552                      Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
 553                      Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
 554                      Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
 555                      Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
 556  
 557                      for (int i = 0; i < tile.Length; i++)
 558                      {
 559                          if (partitionTable[i] != subset)
 560                          {
 561                              continue;
 562                          }
 563  
 564                          uint c = tile[i];
 565  
 566                          Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());
 567  
 568                          Vector128<short> delta0 = Sse2.Subtract(color, pal0);
 569                          Vector128<short> delta1 = Sse2.Subtract(color, pal1);
 570                          Vector128<short> delta2 = Sse2.Subtract(color, pal2);
 571                          Vector128<short> delta3 = Sse2.Subtract(color, pal3);
 572  
 573                          Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
 574                          Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
 575                          Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
 576                          Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
 577  
 578                          Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
 579                          Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
 580  
 581                          Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
 582  
 583                          Vector128<ushort> min = Sse41.MinHorizontal(delta);
 584  
 585                          errorSum += min.GetElement(0);
 586                      }
 587                  }
 588                  else
 589                  {
 590                      RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32();
 591                      RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32();
 592  
 593                      palette[0] = e032;
 594                      palette[^1] = e132;
 595  
 596                      for (int i = 1; i < palette.Length - 1; i++)
 597                      {
 598                          palette[i] = BC67Utils.Interpolate(e032, e132, i, 3);
 599                      }
 600  
 601                      for (int i = 0; i < tile.Length; i++)
 602                      {
 603                          if (partitionTable[i] != subset)
 604                          {
 605                              continue;
 606                          }
 607  
 608                          uint c = tile[i];
 609                          RgbaColor32 color = Unsafe.As<uint, RgbaColor8>(ref c).GetColor32();
 610  
 611                          int bestMatchScore = int.MaxValue;
 612  
 613                          for (int j = 0; j < palette.Length; j++)
 614                          {
 615                              int score = BC67Utils.SquaredDifference(color, palette[j]);
 616  
 617                              if (score < bestMatchScore)
 618                              {
 619                                  bestMatchScore = score;
 620                              }
 621                          }
 622  
 623                          errorSum += bestMatchScore;
 624                      }
 625                  }
 626  
 627                  // No point in continuing if we are already above maximum.
 628                  if (errorSum >= maxError)
 629                  {
 630                      return int.MaxValue;
 631                  }
 632              }
 633  
 634              return errorSum;
 635          }
 636  
 637          private static void SelectEndPoints(
 638              ReadOnlySpan<uint> tile,
 639              int w,
 640              int h,
 641              Span<uint> endPoints0,
 642              Span<uint> endPoints1,
 643              int subsetCount,
 644              int partition,
 645              int indexBitCount,
 646              int colorDepth,
 647              int alphaDepth,
 648              uint writeMask,
 649              bool fastMode)
 650          {
 651              byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];
 652  
 653              Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
 654              Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];
 655  
 656              BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);
 657  
 658              uint inverseMask = ~writeMask;
 659  
 660              for (int i = 0; i < subsetCount; i++)
 661              {
 662                  Unsafe.As<RgbaColor8, uint>(ref minColors[i]) |= inverseMask;
 663                  Unsafe.As<RgbaColor8, uint>(ref maxColors[i]) |= inverseMask;
 664              }
 665  
 666              if (fastMode)
 667              {
 668                  SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask);
 669              }
 670              else
 671              {
 672                  Span<RgbaColor8> colors = stackalloc RgbaColor8[subsetCount * 16];
 673                  Span<byte> counts = stackalloc byte[subsetCount];
 674  
 675                  int i = 0;
 676                  for (int ty = 0; ty < h; ty++)
 677                  {
 678                      for (int tx = 0; tx < w; tx++)
 679                      {
 680                          int subset = partitionTable[ty * 4 + tx];
 681                          RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++] | inverseMask);
 682  
 683                          static void AddIfNew(Span<RgbaColor8> values, RgbaColor8 value, int subset, ref byte count)
 684                          {
 685                              for (int i = 0; i < count; i++)
 686                              {
 687                                  if (values[subset * 16 + i] == value)
 688                                  {
 689                                      return;
 690                                  }
 691                              }
 692  
 693                              values[subset * 16 + count++] = value;
 694                          }
 695  
 696                          AddIfNew(colors, color, subset, ref counts[subset]);
 697                      }
 698                  }
 699  
 700                  for (int subset = 0; subset < subsetCount; subset++)
 701                  {
 702                      int offset = subset * 16;
 703  
 704                      RgbaColor8 minColor = minColors[subset];
 705                      RgbaColor8 maxColor = maxColors[subset];
 706  
 707                      ReadOnlySpan<RgbaColor8> subsetColors = colors.Slice(offset, counts[subset]);
 708  
 709                      (RgbaColor8 e0, RgbaColor8 e1) = SelectEndPoints(subsetColors, minColor, maxColor, indexBitCount, colorDepth, alphaDepth, inverseMask);
 710  
 711                      endPoints0[subset] = (endPoints0[subset] & inverseMask) | (e0.ToUInt32() & writeMask);
 712                      endPoints1[subset] = (endPoints1[subset] & inverseMask) | (e1.ToUInt32() & writeMask);
 713                  }
 714              }
 715          }
 716  
 717          private static unsafe void SelectEndPointsFast(
 718              ReadOnlySpan<byte> partitionTable,
 719              ReadOnlySpan<uint> tile,
 720              int w,
 721              int h,
 722              int subsetCount,
 723              ReadOnlySpan<RgbaColor8> minColors,
 724              ReadOnlySpan<RgbaColor8> maxColors,
 725              Span<uint> endPoints0,
 726              Span<uint> endPoints1,
 727              uint writeMask)
 728          {
 729              uint inverseMask = ~writeMask;
 730  
 731              if (Sse41.IsSupported && w == 4 && h == 4)
 732              {
 733                  Vector128<byte> row0, row1, row2, row3;
 734                  Vector128<short> ones = Vector128<short>.AllBitsSet;
 735  
 736                  fixed (uint* pTile = tile)
 737                  {
 738                      row0 = Sse2.LoadVector128(pTile).AsByte();
 739                      row1 = Sse2.LoadVector128(pTile + 4).AsByte();
 740                      row2 = Sse2.LoadVector128(pTile + 8).AsByte();
 741                      row3 = Sse2.LoadVector128(pTile + 12).AsByte();
 742                  }
 743  
 744                  Vector128<byte> partitionMask;
 745  
 746                  fixed (byte* pPartitionTable = partitionTable)
 747                  {
 748                      partitionMask = Sse2.LoadVector128(pPartitionTable);
 749                  }
 750  
 751                  for (int subset = 0; subset < subsetCount; subset++)
 752                  {
 753                      RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
 754                      int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A;
 755                      if (sum != 0)
 756                      {
 757                          blockDir = (blockDir << 6) / new RgbaColor32(sum);
 758                      }
 759  
 760                      Vector128<byte> bd = Vector128.Create(blockDir.GetColor8().ToUInt32()).AsByte();
 761  
 762                      Vector128<short> delta0 = Ssse3.MultiplyAddAdjacent(row0, bd.AsSByte());
 763                      Vector128<short> delta1 = Ssse3.MultiplyAddAdjacent(row1, bd.AsSByte());
 764                      Vector128<short> delta2 = Ssse3.MultiplyAddAdjacent(row2, bd.AsSByte());
 765                      Vector128<short> delta3 = Ssse3.MultiplyAddAdjacent(row3, bd.AsSByte());
 766  
 767                      Vector128<short> delta01 = Ssse3.HorizontalAdd(delta0, delta1);
 768                      Vector128<short> delta23 = Ssse3.HorizontalAdd(delta2, delta3);
 769  
 770                      Vector128<byte> subsetMask = Sse2.Xor(Sse2.CompareEqual(partitionMask, Vector128.Create((byte)subset)), ones.AsByte());
 771  
 772                      Vector128<short> subsetMask01 = Sse2.UnpackLow(subsetMask, subsetMask).AsInt16();
 773                      Vector128<short> subsetMask23 = Sse2.UnpackHigh(subsetMask, subsetMask).AsInt16();
 774  
 775                      Vector128<ushort> min01 = Sse41.MinHorizontal(Sse2.Or(delta01, subsetMask01).AsUInt16());
 776                      Vector128<ushort> min23 = Sse41.MinHorizontal(Sse2.Or(delta23, subsetMask23).AsUInt16());
 777                      Vector128<ushort> max01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask01, delta01), ones).AsUInt16());
 778                      Vector128<ushort> max23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask23, delta23), ones).AsUInt16());
 779  
 780                      uint minPos01 = min01.AsUInt32().GetElement(0);
 781                      uint minPos23 = min23.AsUInt32().GetElement(0);
 782                      uint maxPos01 = max01.AsUInt32().GetElement(0);
 783                      uint maxPos23 = max23.AsUInt32().GetElement(0);
 784  
 785                      uint minDistColor = (ushort)minPos23 < (ushort)minPos01
 786                          ? tile[(int)(minPos23 >> 16) + 8]
 787                          : tile[(int)(minPos01 >> 16)];
 788  
 789                      // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater.
 790                      uint maxDistColor = (ushort)maxPos23 < (ushort)maxPos01
 791                          ? tile[(int)(maxPos23 >> 16) + 8]
 792                          : tile[(int)(maxPos01 >> 16)];
 793  
 794                      endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor & writeMask);
 795                      endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor & writeMask);
 796                  }
 797              }
 798              else
 799              {
 800                  for (int subset = 0; subset < subsetCount; subset++)
 801                  {
 802                      RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
 803                      blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);
 804  
 805                      int minDist = int.MaxValue;
 806                      int maxDist = int.MinValue;
 807  
 808                      RgbaColor8 minDistColor = default;
 809                      RgbaColor8 maxDistColor = default;
 810  
 811                      int i = 0;
 812                      for (int ty = 0; ty < h; ty++)
 813                      {
 814                          for (int tx = 0; tx < w; tx++, i++)
 815                          {
 816                              if (partitionTable[ty * 4 + tx] != subset)
 817                              {
 818                                  continue;
 819                              }
 820  
 821                              RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);
 822                              int dist = RgbaColor32.Dot(color.GetColor32(), blockDir);
 823  
 824                              if (minDist > dist)
 825                              {
 826                                  minDist = dist;
 827                                  minDistColor = color;
 828                              }
 829  
 830                              if (maxDist < dist)
 831                              {
 832                                  maxDist = dist;
 833                                  maxDistColor = color;
 834                              }
 835                          }
 836                      }
 837  
 838                      endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor.ToUInt32() & writeMask);
 839                      endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor.ToUInt32() & writeMask);
 840                  }
 841              }
 842          }
 843  
 844          private static (RgbaColor8, RgbaColor8) SelectEndPoints(
 845              ReadOnlySpan<RgbaColor8> values,
 846              RgbaColor8 minValue,
 847              RgbaColor8 maxValue,
 848              int indexBitCount,
 849              int colorDepth,
 850              int alphaDepth,
 851              uint alphaMask)
 852          {
 853              int n = values.Length;
 854              int numInterpolatedColors = 1 << indexBitCount;
 855              int numInterpolatedColorsMinus1 = numInterpolatedColors - 1;
 856  
 857              if (n == 0)
 858              {
 859                  return (default, default);
 860              }
 861  
 862              minValue = BC67Utils.Quantize(minValue, colorDepth, alphaDepth);
 863              maxValue = BC67Utils.Quantize(maxValue, colorDepth, alphaDepth);
 864  
 865              RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32();
 866              blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);
 867  
 868              int minDist = int.MaxValue;
 869              int maxDist = 0;
 870  
 871              for (int i = 0; i < values.Length; i++)
 872              {
 873                  RgbaColor8 color = values[i];
 874                  int dist = RgbaColor32.Dot(BC67Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir);
 875  
 876                  if (minDist >= dist)
 877                  {
 878                      minDist = dist;
 879                  }
 880  
 881                  if (maxDist <= dist)
 882                  {
 883                      maxDist = dist;
 884                  }
 885              }
 886  
 887              Span<RgbaColor8> palette = stackalloc RgbaColor8[numInterpolatedColors];
 888  
 889              int distRange = Math.Max(1, maxDist - minDist);
 890  
 891              RgbaColor32 nV = new(n);
 892  
 893              int bestErrorSum = int.MaxValue;
 894              RgbaColor8 bestE0 = default;
 895              RgbaColor8 bestE1 = default;
 896  
 897              Span<int> indices = stackalloc int[n];
 898              Span<RgbaColor32> colors = stackalloc RgbaColor32[n];
 899  
 900              for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--)
 901              {
 902                  int sumX = 0;
 903                  int sumXX = 0;
 904                  int sumXXIncrement = 0;
 905  
 906                  for (int i = 0; i < values.Length; i++)
 907                  {
 908                      RgbaColor32 color = values[i].GetColor32();
 909  
 910                      int dist = RgbaColor32.Dot(color, blockDir);
 911  
 912                      int normalizedValue = ((dist - minDist) << 6) / distRange;
 913                      int texelIndex = (normalizedValue * maxIndex + 32) >> 6;
 914  
 915                      indices[i] = texelIndex;
 916                      colors[i] = color;
 917  
 918                      sumX += texelIndex;
 919                      sumXX += texelIndex * texelIndex;
 920                      sumXXIncrement += 1 + texelIndex * 2;
 921                  }
 922  
 923                  for (int start = 0; start < numInterpolatedColors - maxIndex; start++)
 924                  {
 925                      RgbaColor32 sumY = new(0);
 926                      RgbaColor32 sumXY = new(0);
 927  
 928                      for (int i = 0; i < indices.Length; i++)
 929                      {
 930                          RgbaColor32 y = colors[i];
 931  
 932                          sumY += y;
 933                          sumXY += new RgbaColor32(start + indices[i]) * y;
 934                      }
 935  
 936                      RgbaColor32 sumXV = new(sumX);
 937                      RgbaColor32 sumXXV = new(sumXX);
 938                      RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0);
 939                      RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV;
 940  
 941                      RgbaColor8 candidateE0 = (b >> 6).GetColor8();
 942                      RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8();
 943  
 944                      int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth);
 945                      int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth);
 946  
 947                      int errorSum = BC67Utils.SelectIndices(
 948                          MemoryMarshal.Cast<RgbaColor8, uint>(values),
 949                          candidateE0.ToUInt32(),
 950                          candidateE1.ToUInt32(),
 951                          pBit0,
 952                          pBit1,
 953                          indexBitCount,
 954                          numInterpolatedColors,
 955                          colorDepth,
 956                          alphaDepth,
 957                          alphaMask);
 958  
 959                      if (errorSum <= bestErrorSum)
 960                      {
 961                          bestErrorSum = errorSum;
 962                          bestE0 = candidateE0;
 963                          bestE1 = candidateE1;
 964                      }
 965  
 966                      sumX += n;
 967                      sumXX += sumXXIncrement;
 968                      sumXXIncrement += 2 * n;
 969                  }
 970              }
 971  
 972              return (bestE0, bestE1);
 973          }
 974  
 975          private static int GetPBit(uint color, int colorDepth, int alphaDepth)
 976          {
 977              uint mask = 0x808080u >> colorDepth;
 978  
 979              if (alphaDepth != 0)
 980              {
 981                  // If alpha is 0, let's assume the color information is not too important and prefer
 982                  // to preserve alpha instead.
 983                  if ((color >> 24) == 0)
 984                  {
 985                      return 0;
 986                  }
 987  
 988                  mask |= 0x80000000u >> alphaDepth;
 989              }
 990  
 991              color &= 0x7f7f7f7fu;
 992              color += mask >> 1;
 993  
 994              int onesCount = BitOperations.PopCount(color & mask);
 995              return onesCount >= 2 ? 1 : 0;
 996          }
 997  
 998          private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth)
 999          {
1000              // Giving preference to the first endpoint yields better results,
1001              // might be a side effect of the endpoint selection algorithm?
1002              return GetPBit(c0, colorDepth, alphaDepth);
1003          }
1004      }
1005  }