BC7Encoder.cs
using Ryujinx.Graphics.Texture.Utils;
using System;
using System.Diagnostics;
using System.Numerics;
using System.Runtime.CompilerServices;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;
using System.Threading.Tasks;

namespace Ryujinx.Graphics.Texture.Encoders
{
    /// <summary>
    /// Encodes 32-bit-per-pixel image data into 128-bit blocks, one block per tile of up to 4x4 pixels.
    /// </summary>
    static class BC7Encoder
    {
        // Squared min/max color difference above which the fast encoder switches to a different mode.
        private const int MinColorVarianceForModeChange = 160;

        /// <summary>
        /// Encodes an image, writing two 64-bit words per block to <paramref name="outputStorage"/> in row-major block order.
        /// </summary>
        /// <param name="outputStorage">Destination buffer, reinterpreted as 64-bit words</param>
        /// <param name="data">Source pixels, reinterpreted as one 32-bit value per pixel with a row stride of <paramref name="width"/></param>
        /// <param name="width">Image width in pixels</param>
        /// <param name="height">Image height in pixels</param>
        /// <param name="mode">Selects fast or exhaustive encoding, and optionally one parallel task per row of blocks</param>
        public static void Encode(Memory<byte> outputStorage, ReadOnlyMemory<byte> data, int width, int height, EncodeMode mode)
        {
            int widthInBlocks = (width + 3) / 4;
            int heightInBlocks = (height + 3) / 4;

            bool fastMode = (mode & EncodeMode.ModeMask) == EncodeMode.Fast;

            if (mode.HasFlag(EncodeMode.Multithreaded))
            {
                Parallel.For(0, heightInBlocks, (yInBlocks) =>
                {
                    // Spans can't be captured by the lambda, so the span is re-created for each row of blocks.
                    Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
                    int y = yInBlocks * 4;

                    for (int xInBlocks = 0; xInBlocks < widthInBlocks; xInBlocks++)
                    {
                        int x = xInBlocks * 4;
                        Block block = CompressBlock(data.Span, x, y, width, height, fastMode);

                        int offset = (yInBlocks * widthInBlocks + xInBlocks) * 2;
                        output[offset] = block.Low;
                        output[offset + 1] = block.High;
                    }
                });
            }
            else
            {
                Span<ulong> output = MemoryMarshal.Cast<byte, ulong>(outputStorage.Span);
                int offset = 0;

                for (int y = 0; y < height; y += 4)
                {
                    for (int x = 0; x < width; x += 4)
                    {
                        Block block = CompressBlock(data.Span, x, y, width, height, fastMode);

                        output[offset++] = block.Low;
                        output[offset++] = block.High;
                    }
                }
            }
        }

        // Partitions tried by the fast encoder for modes 1 and 7.
        private static readonly int[] _mostFrequentPartitions = new int[]
        {
            0, 13, 2, 1, 15, 14, 10, 23,
        };

        /// <summary>
        /// Copies the tile at (<paramref name="x"/>, <paramref name="y"/>) out of the image and encodes it.
        /// </summary>
        /// <remarks>
        /// Tiles on the right/bottom edge are clamped to the image size, and the copied tile is
        /// tightly packed (row stride equal to the clamped tile width, not 4).
        /// </remarks>
        private static Block CompressBlock(ReadOnlySpan<byte> data, int x, int y, int width, int height, bool fastMode)
        {
            int w = Math.Min(4, width - x);
            int h = Math.Min(4, height - y);

            var dataUint = MemoryMarshal.Cast<byte, uint>(data);

            int baseOffset = y * width + x;

            Span<uint> tile = stackalloc uint[w * h];

            for (int ty = 0; ty < h; ty++)
            {
                int rowOffset = baseOffset + ty * width;

                for (int tx = 0; tx < w; tx++)
                {
                    tile[ty * w + tx] = dataUint[rowOffset + tx];
                }
            }

            return fastMode ? EncodeFast(tile, w, h) : EncodeExhaustive(tile, w, h);
        }

        /// <summary>
        /// Encodes a tile using a single mode picked heuristically from the tile minimum and maximum colors.
        /// </summary>
        /// <param name="tile">Packed tile pixels (row stride <paramref name="w"/>)</param>
        /// <param name="w">Tile width in pixels</param>
        /// <param name="h">Tile height in pixels</param>
        /// <returns>The encoded block</returns>
        private static Block EncodeFast(ReadOnlySpan<uint> tile, int w, int h)
        {
            (RgbaColor8 minColor, RgbaColor8 maxColor) = BC67Utils.GetMinMaxColors(tile, w, h);

            bool alphaNotOne = minColor.A != 255 || maxColor.A != 255;
            int variance = BC67Utils.SquaredDifference(minColor.GetColor32(), maxColor.GetColor32());
            int selectedMode;
            int indexMode = 0;

            if (alphaNotOne)
            {
                bool constantAlpha = minColor.A == maxColor.A;
                if (constantAlpha)
                {
                    selectedMode = variance > MinColorVarianceForModeChange ? 7 : 6;
                }
                else
                {
                    if (variance > MinColorVarianceForModeChange)
                    {
                        Span<uint> uniqueRGB = stackalloc uint[16];
                        Span<uint> uniqueAlpha = stackalloc uint[16];

                        int uniqueRGBCount = 0;
                        int uniqueAlphaCount = 0;

                        uint rgbMask = new RgbaColor8(255, 255, 255, 0).ToUInt32();
                        uint alphaMask = new RgbaColor8(0, 0, 0, 255).ToUInt32();

                        for (int i = 0; i < tile.Length; i++)
                        {
                            uint c = tile[i];

                            if (!uniqueRGB[..uniqueRGBCount].Contains(c & rgbMask))
                            {
                                uniqueRGB[uniqueRGBCount++] = c & rgbMask;
                            }

                            if (!uniqueAlpha[..uniqueAlphaCount].Contains(c & alphaMask))
                            {
                                uniqueAlpha[uniqueAlphaCount++] = c & alphaMask;
                            }
                        }

                        // The index mode is chosen by comparing how many distinct color and alpha values the tile has.
                        selectedMode = 4;
                        indexMode = uniqueRGBCount > uniqueAlphaCount ? 1 : 0;
                    }
                    else
                    {
                        selectedMode = 5;
                    }
                }
            }
            else
            {
                if (variance > MinColorVarianceForModeChange)
                {
                    selectedMode = 1;
                }
                else
                {
                    selectedMode = 6;
                }
            }

            int selectedPartition = 0;

            if (selectedMode == 1 || selectedMode == 7)
            {
                int partitionSelectionLowestError = int.MaxValue;

                for (int i = 0; i < _mostFrequentPartitions.Length; i++)
                {
                    int p = _mostFrequentPartitions[i];
                    int error = GetEndPointSelectionErrorFast(tile, 2, p, w, h, partitionSelectionLowestError);
                    if (error < partitionSelectionLowestError)
                    {
                        partitionSelectionLowestError = error;
                        selectedPartition = p;
                    }
                }
            }

            return Encode(selectedMode, selectedPartition, 0, indexMode, fastMode: true, tile, w, h, out _);
        }

        /// <summary>
        /// Encodes a tile with every mode, rotation, index mode and partition combination and keeps the best result.
        /// </summary>
        /// <remarks>
        /// The block with the lowest reported error wins; on a tie, the mode with fewer subsets is preferred.
        /// </remarks>
        /// <param name="tile">Packed tile pixels (row stride <paramref name="w"/>)</param>
        /// <param name="w">Tile width in pixels</param>
        /// <param name="h">Tile height in pixels</param>
        /// <returns>The encoded block</returns>
        private static Block EncodeExhaustive(ReadOnlySpan<uint> tile, int w, int h)
        {
            Block bestBlock = default;
            int lowestError = int.MaxValue;
            int lowestErrorSubsets = int.MaxValue;

            for (int m = 0; m < 8; m++)
            {
                // Only modes 4 and 5 iterate over rotations, and only mode 4 iterates over index modes.
                for (int r = 0; r < (m == 4 || m == 5 ? 4 : 1); r++)
                {
                    for (int im = 0; im < (m == 4 ? 2 : 1); im++)
                    {
                        for (int p = 0; p < 1 << BC67Tables.BC7ModeInfos[m].PartitionBitCount; p++)
                        {
                            Block block = Encode(m, p, r, im, fastMode: false, tile, w, h, out int maxError);
                            if (maxError < lowestError || (maxError == lowestError && BC67Tables.BC7ModeInfos[m].SubsetCount < lowestErrorSubsets))
                            {
                                lowestError = maxError;
                                lowestErrorSubsets = BC67Tables.BC7ModeInfos[m].SubsetCount;
                                bestBlock = block;
                            }
                        }
                    }
                }
            }

            return bestBlock;
        }

        /// <summary>
        /// Encodes a tile using the specified mode, partition, rotation and index mode.
        /// </summary>
        /// <param name="mode">Index into <see cref="BC67Tables.BC7ModeInfos"/></param>
        /// <param name="partition">Partition index for the mode subset count</param>
        /// <param name="rotation">Rotation value written to the block</param>
        /// <param name="indexMode">Index mode value written to the block</param>
        /// <param name="fastMode">True to use the fast endpoint selection, false to use the slower search</param>
        /// <param name="tile">Packed tile pixels (row stride <paramref name="w"/>)</param>
        /// <param name="w">Tile width in pixels</param>
        /// <param name="h">Tile height in pixels</param>
        /// <param name="errorSum">Sum of the errors returned by the index selection passes</param>
        /// <returns>The encoded block</returns>
        private static Block Encode(
            int mode,
            int partition,
            int rotation,
            int indexMode,
            bool fastMode,
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            out int errorSum)
        {
            BC7ModeInfo modeInfo = BC67Tables.BC7ModeInfos[mode];
            int subsetCount = modeInfo.SubsetCount;
            int partitionBitCount = modeInfo.PartitionBitCount;
            int rotationBitCount = modeInfo.RotationBitCount;
            int indexModeBitCount = modeInfo.IndexModeBitCount;
            int colorDepth = modeInfo.ColorDepth;
            int alphaDepth = modeInfo.AlphaDepth;
            int pBits = modeInfo.PBits;
            int colorIndexBitCount = modeInfo.ColorIndexBitCount;
            int alphaIndexBitCount = modeInfo.AlphaIndexBitCount;
            bool separateAlphaIndices = alphaIndexBitCount != 0;

            uint alphaMask;

            if (separateAlphaIndices)
            {
                // The rotation picks which single channel is handled by the second set of indices.
                alphaMask = rotation switch
                {
                    1 => new RgbaColor8(255, 0, 0, 0).ToUInt32(),
                    2 => new RgbaColor8(0, 255, 0, 0).ToUInt32(),
                    3 => new RgbaColor8(0, 0, 255, 0).ToUInt32(),
                    _ => new RgbaColor8(0, 0, 0, 255).ToUInt32(),
                };
            }
            else
            {
                alphaMask = new RgbaColor8(0, 0, 0, 0).ToUInt32();
            }

            if (indexMode != 0)
            {
                alphaMask = ~alphaMask;
            }

            //
            // Select color palette.
            //

            Span<uint> endPoints0 = stackalloc uint[subsetCount];
            Span<uint> endPoints1 = stackalloc uint[subsetCount];

            SelectEndPoints(
                tile,
                w,
                h,
                endPoints0,
                endPoints1,
                subsetCount,
                partition,
                colorIndexBitCount,
                colorDepth,
                alphaDepth,
                ~alphaMask,
                fastMode);

            if (separateAlphaIndices)
            {
                // Second pass only writes the channels that were left out by the first pass.
                SelectEndPoints(
                    tile,
                    w,
                    h,
                    endPoints0,
                    endPoints1,
                    subsetCount,
                    partition,
                    alphaIndexBitCount,
                    colorDepth,
                    alphaDepth,
                    alphaMask,
                    fastMode);
            }

            Span<int> pBitValues = stackalloc int[pBits];

            for (int i = 0; i < pBits; i++)
            {
                int pBit;

                if (pBits == subsetCount)
                {
                    // One p-bit shared by both endpoints of the subset.
                    pBit = GetPBit(endPoints0[i], endPoints1[i], colorDepth, alphaDepth);
                }
                else
                {
                    // One p-bit per endpoint, stored as endpoint 0 then endpoint 1 of each subset.
                    int subset = i >> 1;
                    uint color = (i & 1) == 0 ? endPoints0[subset] : endPoints1[subset];
                    pBit = GetPBit(color, colorDepth, alphaDepth);
                }

                pBitValues[i] = pBit;
            }

            int colorIndexCount = 1 << colorIndexBitCount;
            int alphaIndexCount = 1 << alphaIndexBitCount;

            Span<byte> colorIndices = stackalloc byte[16];
            Span<byte> alphaIndices = stackalloc byte[16];

            errorSum = BC67Utils.SelectIndices(
                tile,
                w,
                h,
                endPoints0,
                endPoints1,
                pBitValues,
                colorIndices,
                subsetCount,
                partition,
                colorIndexBitCount,
                colorIndexCount,
                colorDepth,
                alphaDepth,
                pBits,
                alphaMask);

            if (separateAlphaIndices)
            {
                errorSum += BC67Utils.SelectIndices(
                    tile,
                    w,
                    h,
                    endPoints0,
                    endPoints1,
                    pBitValues,
                    alphaIndices,
                    subsetCount,
                    partition,
                    alphaIndexBitCount,
                    alphaIndexCount,
                    colorDepth,
                    alphaDepth,
                    pBits,
                    ~alphaMask);
            }

            // The index at each fix-up position is stored with one bit less, so its most significant bit must be 0.
            // When it is not, the subset endpoints are swapped and its indices inverted further below.
            Span<bool> colorSwapSubset = stackalloc bool[3];

            for (int i = 0; i < 3; i++)
            {
                colorSwapSubset[i] = colorIndices[BC67Tables.FixUpIndices[subsetCount - 1][partition][i]] >= (colorIndexCount >> 1);
            }

            bool alphaSwapSubset = alphaIndices[0] >= (alphaIndexCount >> 1);

            Block block = new();

            int offset = 0;

            // The mode is stored as a single set bit preceded by "mode" zero bits.
            block.Encode(1UL << mode, ref offset, mode + 1);
            block.Encode((ulong)partition, ref offset, partitionBitCount);
            block.Encode((ulong)rotation, ref offset, rotationBitCount);
            block.Encode((ulong)indexMode, ref offset, indexModeBitCount);

            for (int comp = 0; comp < 3; comp++)
            {
                int rotatedComp = comp;

                // The component selected by the rotation is replaced by component 3 here,
                // and is written in place of it by the alpha loop below.
                if (((comp + 1) & 3) == rotation)
                {
                    rotatedComp = 3;
                }

                for (int subset = 0; subset < subsetCount; subset++)
                {
                    RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
                    RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);

                    int pBit0 = -1, pBit1 = -1;

                    if (pBits == subsetCount)
                    {
                        pBit0 = pBit1 = pBitValues[subset];
                    }
                    else if (pBits != 0)
                    {
                        pBit0 = pBitValues[subset * 2];
                        pBit1 = pBitValues[subset * 2 + 1];
                    }

                    if (indexMode == 0 ? colorSwapSubset[subset] : alphaSwapSubset)
                    {
                        block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
                        block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
                    }
                    else
                    {
                        block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), colorDepth, pBit0), ref offset, colorDepth);
                        block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), colorDepth, pBit1), ref offset, colorDepth);
                    }
                }
            }

            if (alphaDepth != 0)
            {
                // Rotation 0 yields component 3, rotations 1 to 3 yield components 0 to 2.
                int rotatedComp = (rotation - 1) & 3;

                for (int subset = 0; subset < subsetCount; subset++)
                {
                    RgbaColor8 color0 = RgbaColor8.FromUInt32(endPoints0[subset]);
                    RgbaColor8 color1 = RgbaColor8.FromUInt32(endPoints1[subset]);

                    int pBit0 = -1, pBit1 = -1;

                    if (pBits == subsetCount)
                    {
                        pBit0 = pBit1 = pBitValues[subset];
                    }
                    else if (pBits != 0)
                    {
                        pBit0 = pBitValues[subset * 2];
                        pBit1 = pBitValues[subset * 2 + 1];
                    }

                    // Note: this parses as (separateAlphaIndices && indexMode == 0) ? ... : ...
                    if (separateAlphaIndices && indexMode == 0 ? alphaSwapSubset : colorSwapSubset[subset])
                    {
                        block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
                        block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
                    }
                    else
                    {
                        block.Encode(BC67Utils.QuantizeComponent(color0.GetComponent(rotatedComp), alphaDepth, pBit0), ref offset, alphaDepth);
                        block.Encode(BC67Utils.QuantizeComponent(color1.GetComponent(rotatedComp), alphaDepth, pBit1), ref offset, alphaDepth);
                    }
                }
            }

            for (int i = 0; i < pBits; i++)
            {
                block.Encode((ulong)pBitValues[i], ref offset, 1);
            }

            byte[] fixUpTable = BC67Tables.FixUpIndices[subsetCount - 1][partition];

            for (int i = 0; i < 16; i++)
            {
                int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][i];
                byte index = colorIndices[i];

                if (colorSwapSubset[subset])
                {
                    // Endpoints of this subset were swapped, so the index has to be inverted to match.
                    index = (byte)(index ^ (colorIndexCount - 1));
                }

                int finalIndexBitCount = i == fixUpTable[subset] ? colorIndexBitCount - 1 : colorIndexBitCount;

                Debug.Assert(index < (1 << finalIndexBitCount));

                block.Encode(index, ref offset, finalIndexBitCount);
            }

            if (separateAlphaIndices)
            {
                for (int i = 0; i < 16; i++)
                {
                    byte index = alphaIndices[i];

                    if (alphaSwapSubset)
                    {
                        index = (byte)(index ^ (alphaIndexCount - 1));
                    }

                    int finalIndexBitCount = i == 0 ? alphaIndexBitCount - 1 : alphaIndexBitCount;

                    Debug.Assert(index < (1 << finalIndexBitCount));

                    block.Encode(index, ref offset, finalIndexBitCount);
                }
            }

            return block;
        }

        /// <summary>
        /// Estimates the error of encoding the tile with the given partition, using the fast endpoint
        /// selection and an 8 entry palette per subset.
        /// </summary>
        /// <param name="tile">Packed tile pixels (row stride <paramref name="w"/>)</param>
        /// <param name="subsetCount">Number of subsets of the partition</param>
        /// <param name="partition">Partition index</param>
        /// <param name="w">Tile width in pixels</param>
        /// <param name="h">Tile height in pixels</param>
        /// <param name="maxError">Error at or above which the evaluation is abandoned</param>
        /// <returns>
        /// The sum of the squared differences between each pixel and its closest palette entry,
        /// or <see cref="int.MaxValue"/> once the sum reaches <paramref name="maxError"/>
        /// </returns>
        private static unsafe int GetEndPointSelectionErrorFast(ReadOnlySpan<uint> tile, int subsetCount, int partition, int w, int h, int maxError)
        {
            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

            Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
            Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];

            BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);

            Span<uint> endPoints0 = stackalloc uint[subsetCount];
            Span<uint> endPoints1 = stackalloc uint[subsetCount];

            SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, uint.MaxValue);

            Span<RgbaColor32> palette = stackalloc RgbaColor32[8];

            int errorSum = 0;

            for (int subset = 0; subset < subsetCount; subset++)
            {
                uint c0 = endPoints0[subset];
                uint c1 = endPoints1[subset];

                int pBit0 = GetPBit(c0, 6, 0);
                int pBit1 = GetPBit(c1, 6, 0);

                c0 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c0), 6, 0, pBit0).ToUInt32();
                c1 = BC67Utils.Quantize(RgbaColor8.FromUInt32(c1), 6, 0, pBit1).ToUInt32();

                if (Sse41.IsSupported)
                {
                    Vector128<byte> c0Rep = Vector128.Create(c0).AsByte();
                    Vector128<byte> c1Rep = Vector128.Create(c1).AsByte();

                    // Interleave the bytes of both endpoints, so that each pair can be blended with a single multiply-add.
                    Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

                    Vector128<byte> rWeights;
                    Vector128<byte> lWeights;

                    fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
                    {
                        rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
                        lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
                    }

                    // Interleave the 8 weight pairs and replicate each pair over 4 lanes, two palette entries per vector.
                    Vector128<byte> iWeights = Sse2.UnpackLow(rWeights, lWeights);
                    Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
                    Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
                    Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
                    Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
                    Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
                    Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();

                    static Vector128<short> ShiftRoundToNearest(Vector128<short> x)
                    {
                        return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6);
                    }

                    Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
                    Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
                    Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
                    Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));

                    for (int ty = 0, i = 0; ty < h; ty++)
                    {
                        for (int tx = 0; tx < w; tx++, i++)
                        {
                            // The tile is packed with a stride of w, but the partition table has a stride of 4,
                            // so the packed index can't be used for the lookup when the tile is narrower than 4.
                            if (partitionTable[ty * 4 + tx] != subset)
                            {
                                continue;
                            }

                            uint c = tile[i];

                            Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                            Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                            Vector128<short> delta1 = Sse2.Subtract(color, pal1);
                            Vector128<short> delta2 = Sse2.Subtract(color, pal2);
                            Vector128<short> delta3 = Sse2.Subtract(color, pal3);

                            Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                            Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
                            Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
                            Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);

                            Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
                            Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);

                            // One saturated 16-bit squared distance per palette entry.
                            Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);

                            Vector128<ushort> min = Sse41.MinHorizontal(delta);

                            errorSum += min.GetElement(0);
                        }
                    }
                }
                else
                {
                    RgbaColor32 e032 = RgbaColor8.FromUInt32(c0).GetColor32();
                    RgbaColor32 e132 = RgbaColor8.FromUInt32(c1).GetColor32();

                    palette[0] = e032;
                    palette[^1] = e132;

                    for (int i = 1; i < palette.Length - 1; i++)
                    {
                        palette[i] = BC67Utils.Interpolate(e032, e132, i, 3);
                    }

                    for (int ty = 0, i = 0; ty < h; ty++)
                    {
                        for (int tx = 0; tx < w; tx++, i++)
                        {
                            // The tile is packed with a stride of w, but the partition table has a stride of 4,
                            // so the packed index can't be used for the lookup when the tile is narrower than 4.
                            if (partitionTable[ty * 4 + tx] != subset)
                            {
                                continue;
                            }

                            uint c = tile[i];
                            RgbaColor32 color = Unsafe.As<uint, RgbaColor8>(ref c).GetColor32();

                            int bestMatchScore = int.MaxValue;

                            for (int j = 0; j < palette.Length; j++)
                            {
                                int score = BC67Utils.SquaredDifference(color, palette[j]);

                                if (score < bestMatchScore)
                                {
                                    bestMatchScore = score;
                                }
                            }

                            errorSum += bestMatchScore;
                        }
                    }
                }

                // No point in continuing if we are already above maximum.
                if (errorSum >= maxError)
                {
                    return int.MaxValue;
                }
            }

            return errorSum;
        }

        /// <summary>
        /// Selects the two endpoints of every subset of the tile, only replacing the bits of
        /// <paramref name="endPoints0"/> and <paramref name="endPoints1"/> that are set on <paramref name="writeMask"/>.
        /// </summary>
        /// <param name="tile">Packed tile pixels (row stride <paramref name="w"/>)</param>
        /// <param name="w">Tile width in pixels</param>
        /// <param name="h">Tile height in pixels</param>
        /// <param name="endPoints0">First endpoint of each subset, updated in place</param>
        /// <param name="endPoints1">Second endpoint of each subset, updated in place</param>
        /// <param name="subsetCount">Number of subsets of the partition</param>
        /// <param name="partition">Partition index</param>
        /// <param name="indexBitCount">Number of bits of the indices used with those endpoints</param>
        /// <param name="colorDepth">Color component depth of the mode</param>
        /// <param name="alphaDepth">Alpha component depth of the mode</param>
        /// <param name="writeMask">Mask of the endpoint bits that should be written</param>
        /// <param name="fastMode">True to use the fast selection, false to use the slower search</param>
        private static void SelectEndPoints(
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            Span<uint> endPoints0,
            Span<uint> endPoints1,
            int subsetCount,
            int partition,
            int indexBitCount,
            int colorDepth,
            int alphaDepth,
            uint writeMask,
            bool fastMode)
        {
            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

            Span<RgbaColor8> minColors = stackalloc RgbaColor8[subsetCount];
            Span<RgbaColor8> maxColors = stackalloc RgbaColor8[subsetCount];

            BC67Utils.GetMinMaxColors(partitionTable, tile, w, h, minColors, maxColors, subsetCount);

            uint inverseMask = ~writeMask;

            // Force the bits that are not being written to the same value on both colors,
            // so that they have no influence on the difference between minimum and maximum.
            for (int i = 0; i < subsetCount; i++)
            {
                Unsafe.As<RgbaColor8, uint>(ref minColors[i]) |= inverseMask;
                Unsafe.As<RgbaColor8, uint>(ref maxColors[i]) |= inverseMask;
            }

            if (fastMode)
            {
                SelectEndPointsFast(partitionTable, tile, w, h, subsetCount, minColors, maxColors, endPoints0, endPoints1, writeMask);
            }
            else
            {
                // Up to 16 unique colors per subset.
                Span<RgbaColor8> colors = stackalloc RgbaColor8[subsetCount * 16];
                Span<byte> counts = stackalloc byte[subsetCount];

                int i = 0;
                for (int ty = 0; ty < h; ty++)
                {
                    for (int tx = 0; tx < w; tx++)
                    {
                        int subset = partitionTable[ty * 4 + tx];
                        RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++] | inverseMask);

                        static void AddIfNew(Span<RgbaColor8> values, RgbaColor8 value, int subset, ref byte count)
                        {
                            for (int i = 0; i < count; i++)
                            {
                                if (values[subset * 16 + i] == value)
                                {
                                    return;
                                }
                            }

                            values[subset * 16 + count++] = value;
                        }

                        AddIfNew(colors, color, subset, ref counts[subset]);
                    }
                }

                for (int subset = 0; subset < subsetCount; subset++)
                {
                    int offset = subset * 16;

                    RgbaColor8 minColor = minColors[subset];
                    RgbaColor8 maxColor = maxColors[subset];

                    ReadOnlySpan<RgbaColor8> subsetColors = colors.Slice(offset, counts[subset]);

                    (RgbaColor8 e0, RgbaColor8 e1) = SelectEndPoints(subsetColors, minColor, maxColor, indexBitCount, colorDepth, alphaDepth, inverseMask);

                    endPoints0[subset] = (endPoints0[subset] & inverseMask) | (e0.ToUInt32() & writeMask);
                    endPoints1[subset] = (endPoints1[subset] & inverseMask) | (e1.ToUInt32() & writeMask);
                }
            }
        }

        /// <summary>
        /// Selects, for every subset, the two pixels of the tile that are the farthest apart along the
        /// direction going from the subset minimum color to its maximum color, and uses them as endpoints.
        /// </summary>
        /// <param name="partitionTable">Subset of each pixel of a 4x4 tile (row stride 4)</param>
        /// <param name="tile">Packed tile pixels (row stride <paramref name="w"/>)</param>
        /// <param name="w">Tile width in pixels</param>
        /// <param name="h">Tile height in pixels</param>
        /// <param name="subsetCount">Number of subsets of the partition</param>
        /// <param name="minColors">Minimum color of each subset</param>
        /// <param name="maxColors">Maximum color of each subset</param>
        /// <param name="endPoints0">First endpoint of each subset, updated in place</param>
        /// <param name="endPoints1">Second endpoint of each subset, updated in place</param>
        /// <param name="writeMask">Mask of the endpoint bits that should be written</param>
        private static unsafe void SelectEndPointsFast(
            ReadOnlySpan<byte> partitionTable,
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            int subsetCount,
            ReadOnlySpan<RgbaColor8> minColors,
            ReadOnlySpan<RgbaColor8> maxColors,
            Span<uint> endPoints0,
            Span<uint> endPoints1,
            uint writeMask)
        {
            uint inverseMask = ~writeMask;

            // The vectorized path loads 4 full rows of 4 pixels, so it can only be used with complete tiles.
            if (Sse41.IsSupported && w == 4 && h == 4)
            {
                Vector128<byte> row0, row1, row2, row3;
                Vector128<short> ones = Vector128<short>.AllBitsSet;

                fixed (uint* pTile = tile)
                {
                    row0 = Sse2.LoadVector128(pTile).AsByte();
                    row1 = Sse2.LoadVector128(pTile + 4).AsByte();
                    row2 = Sse2.LoadVector128(pTile + 8).AsByte();
                    row3 = Sse2.LoadVector128(pTile + 12).AsByte();
                }

                Vector128<byte> partitionMask;

                fixed (byte* pPartitionTable = partitionTable)
                {
                    partitionMask = Sse2.LoadVector128(pPartitionTable);
                }

                for (int subset = 0; subset < subsetCount; subset++)
                {
                    RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
                    int sum = blockDir.R + blockDir.G + blockDir.B + blockDir.A;
                    if (sum != 0)
                    {
                        blockDir = (blockDir << 6) / new RgbaColor32(sum);
                    }

                    Vector128<byte> bd = Vector128.Create(blockDir.GetColor8().ToUInt32()).AsByte();

                    // Dot product of every pixel with the direction, 8 results per vector after the horizontal add.
                    Vector128<short> delta0 = Ssse3.MultiplyAddAdjacent(row0, bd.AsSByte());
                    Vector128<short> delta1 = Ssse3.MultiplyAddAdjacent(row1, bd.AsSByte());
                    Vector128<short> delta2 = Ssse3.MultiplyAddAdjacent(row2, bd.AsSByte());
                    Vector128<short> delta3 = Ssse3.MultiplyAddAdjacent(row3, bd.AsSByte());

                    Vector128<short> delta01 = Ssse3.HorizontalAdd(delta0, delta1);
                    Vector128<short> delta23 = Ssse3.HorizontalAdd(delta2, delta3);

                    // All bits set for the pixels that do NOT belong to the current subset.
                    Vector128<byte> subsetMask = Sse2.Xor(Sse2.CompareEqual(partitionMask, Vector128.Create((byte)subset)), ones.AsByte());

                    Vector128<short> subsetMask01 = Sse2.UnpackLow(subsetMask, subsetMask).AsInt16();
                    Vector128<short> subsetMask23 = Sse2.UnpackHigh(subsetMask, subsetMask).AsInt16();

                    // Pixels outside of the subset are forced to the highest value, so that they never win the minimum search.
                    Vector128<ushort> min01 = Sse41.MinHorizontal(Sse2.Or(delta01, subsetMask01).AsUInt16());
                    Vector128<ushort> min23 = Sse41.MinHorizontal(Sse2.Or(delta23, subsetMask23).AsUInt16());
                    Vector128<ushort> max01 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask01, delta01), ones).AsUInt16());
                    Vector128<ushort> max23 = Sse41.MinHorizontal(Sse2.Xor(Sse2.AndNot(subsetMask23, delta23), ones).AsUInt16());

                    // Low 16 bits have the minimum value, and the bits above have its position.
                    uint minPos01 = min01.AsUInt32().GetElement(0);
                    uint minPos23 = min23.AsUInt32().GetElement(0);
                    uint maxPos01 = max01.AsUInt32().GetElement(0);
                    uint maxPos23 = max23.AsUInt32().GetElement(0);

                    uint minDistColor = (ushort)minPos23 < (ushort)minPos01
                        ? tile[(int)(minPos23 >> 16) + 8]
                        : tile[(int)(minPos01 >> 16)];

                    // Note that we calculate the maximum as the minimum of the inverse, so less here is actually greater.
                    uint maxDistColor = (ushort)maxPos23 < (ushort)maxPos01
                        ? tile[(int)(maxPos23 >> 16) + 8]
                        : tile[(int)(maxPos01 >> 16)];

                    endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor & writeMask);
                    endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor & writeMask);
                }
            }
            else
            {
                for (int subset = 0; subset < subsetCount; subset++)
                {
                    RgbaColor32 blockDir = maxColors[subset].GetColor32() - minColors[subset].GetColor32();
                    blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);

                    int minDist = int.MaxValue;
                    int maxDist = int.MinValue;

                    RgbaColor8 minDistColor = default;
                    RgbaColor8 maxDistColor = default;

                    int i = 0;
                    for (int ty = 0; ty < h; ty++)
                    {
                        for (int tx = 0; tx < w; tx++, i++)
                        {
                            if (partitionTable[ty * 4 + tx] != subset)
                            {
                                continue;
                            }

                            RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);
                            int dist = RgbaColor32.Dot(color.GetColor32(), blockDir);

                            if (minDist > dist)
                            {
                                minDist = dist;
                                minDistColor = color;
                            }

                            if (maxDist < dist)
                            {
                                maxDist = dist;
                                maxDistColor = color;
                            }
                        }
                    }

                    endPoints0[subset] = (endPoints0[subset] & inverseMask) | (minDistColor.ToUInt32() & writeMask);
                    endPoints1[subset] = (endPoints1[subset] & inverseMask) | (maxDistColor.ToUInt32() & writeMask);
                }
            }
        }

        /// <summary>
        /// Searches for the pair of endpoints that gives the lowest error for a set of colors.
        /// </summary>
        /// <remarks>
        /// Every color is assigned an index from its position along the direction going from
        /// <paramref name="minValue"/> to <paramref name="maxValue"/>, for every possible index range.
        /// For each assignment, a line is fitted to the colors as a function of their index (least squares),
        /// and the line values at the first and last index are evaluated as candidate endpoints.
        /// </remarks>
        /// <param name="values">Colors to be approximated</param>
        /// <param name="minValue">Minimum color of the set</param>
        /// <param name="maxValue">Maximum color of the set</param>
        /// <param name="indexBitCount">Number of bits of the indices used with those endpoints</param>
        /// <param name="colorDepth">Color component depth of the mode</param>
        /// <param name="alphaDepth">Alpha component depth of the mode</param>
        /// <param name="alphaMask">Mask that is forwarded to the index selection used to measure the error</param>
        /// <returns>The best pair of endpoints found, or default values if <paramref name="values"/> is empty</returns>
        private static (RgbaColor8, RgbaColor8) SelectEndPoints(
            ReadOnlySpan<RgbaColor8> values,
            RgbaColor8 minValue,
            RgbaColor8 maxValue,
            int indexBitCount,
            int colorDepth,
            int alphaDepth,
            uint alphaMask)
        {
            int n = values.Length;
            int numInterpolatedColors = 1 << indexBitCount;
            int numInterpolatedColorsMinus1 = numInterpolatedColors - 1;

            if (n == 0)
            {
                return (default, default);
            }

            minValue = BC67Utils.Quantize(minValue, colorDepth, alphaDepth);
            maxValue = BC67Utils.Quantize(maxValue, colorDepth, alphaDepth);

            RgbaColor32 blockDir = maxValue.GetColor32() - minValue.GetColor32();
            blockDir = RgbaColor32.DivideGuarded(blockDir << 6, new RgbaColor32(blockDir.R + blockDir.G + blockDir.B + blockDir.A), 0);

            int minDist = int.MaxValue;
            int maxDist = 0;

            for (int i = 0; i < values.Length; i++)
            {
                RgbaColor8 color = values[i];
                int dist = RgbaColor32.Dot(BC67Utils.Quantize(color, colorDepth, alphaDepth).GetColor32(), blockDir);

                if (minDist >= dist)
                {
                    minDist = dist;
                }

                if (maxDist <= dist)
                {
                    maxDist = dist;
                }
            }

            Span<RgbaColor8> palette = stackalloc RgbaColor8[numInterpolatedColors];

            // At least 1 to avoid a division by zero when all the distances are equal.
            int distRange = Math.Max(1, maxDist - minDist);

            RgbaColor32 nV = new(n);

            int bestErrorSum = int.MaxValue;
            RgbaColor8 bestE0 = default;
            RgbaColor8 bestE1 = default;

            Span<int> indices = stackalloc int[n];
            Span<RgbaColor32> colors = stackalloc RgbaColor32[n];

            for (int maxIndex = numInterpolatedColorsMinus1; maxIndex >= 1; maxIndex--)
            {
                int sumX = 0;
                int sumXX = 0;
                int sumXXIncrement = 0;

                for (int i = 0; i < values.Length; i++)
                {
                    RgbaColor32 color = values[i].GetColor32();

                    int dist = RgbaColor32.Dot(color, blockDir);

                    int normalizedValue = ((dist - minDist) << 6) / distRange;
                    int texelIndex = (normalizedValue * maxIndex + 32) >> 6;

                    indices[i] = texelIndex;
                    colors[i] = color;

                    sumX += texelIndex;
                    sumXX += texelIndex * texelIndex;
                    sumXXIncrement += 1 + texelIndex * 2;
                }

                // Slide the index range [start, start + maxIndex] over all the positions where it fits.
                for (int start = 0; start < numInterpolatedColors - maxIndex; start++)
                {
                    RgbaColor32 sumY = new(0);
                    RgbaColor32 sumXY = new(0);

                    for (int i = 0; i < indices.Length; i++)
                    {
                        RgbaColor32 y = colors[i];

                        sumY += y;
                        sumXY += new RgbaColor32(start + indices[i]) * y;
                    }

                    // Slope (m) and intercept (b) of the fitted line, both with 6 fractional bits.
                    RgbaColor32 sumXV = new(sumX);
                    RgbaColor32 sumXXV = new(sumXX);
                    RgbaColor32 m = RgbaColor32.DivideGuarded((nV * sumXY - sumXV * sumY) << 6, nV * sumXXV - sumXV * sumXV, 0);
                    RgbaColor32 b = ((sumY << 6) - m * sumXV) / nV;

                    RgbaColor8 candidateE0 = (b >> 6).GetColor8();
                    RgbaColor8 candidateE1 = ((b + m * new RgbaColor32(numInterpolatedColorsMinus1)) >> 6).GetColor8();

                    int pBit0 = GetPBit(candidateE0.ToUInt32(), colorDepth, alphaDepth);
                    int pBit1 = GetPBit(candidateE1.ToUInt32(), colorDepth, alphaDepth);

                    int errorSum = BC67Utils.SelectIndices(
                        MemoryMarshal.Cast<RgbaColor8, uint>(values),
                        candidateE0.ToUInt32(),
                        candidateE1.ToUInt32(),
                        pBit0,
                        pBit1,
                        indexBitCount,
                        numInterpolatedColors,
                        colorDepth,
                        alphaDepth,
                        alphaMask);

                    if (errorSum <= bestErrorSum)
                    {
                        bestErrorSum = errorSum;
                        bestE0 = candidateE0;
                        bestE1 = candidateE1;
                    }

                    // Update the sums for the next start, where every index is incremented by one:
                    // sum(x + 1) = sum(x) + n and sum((x + 1)^2) = sum(x^2) + sum(2x + 1).
                    sumX += n;
                    sumXX += sumXXIncrement;
                    sumXXIncrement += 2 * n;
                }
            }

            return (bestE0, bestE1);
        }

        /// <summary>
        /// Selects the p-bit value for an endpoint color.
        /// </summary>
        /// <remarks>
        /// Checks, on each component, the bit right below the bits kept by the given depth (after adding a
        /// rounding bias), and returns 1 when that bit is set on at least two components.
        /// </remarks>
        /// <param name="color">Endpoint color</param>
        /// <param name="colorDepth">Color component depth of the mode</param>
        /// <param name="alphaDepth">Alpha component depth of the mode, 0 if alpha does not take part</param>
        /// <returns>The p-bit value, 0 or 1</returns>
        private static int GetPBit(uint color, int colorDepth, int alphaDepth)
        {
            uint mask = 0x808080u >> colorDepth;

            if (alphaDepth != 0)
            {
                // If alpha is 0, let's assume the color information is not too important and prefer
                // to preserve alpha instead.
                if ((color >> 24) == 0)
                {
                    return 0;
                }

                mask |= 0x80000000u >> alphaDepth;
            }

            // Clear the top bit of each byte so that the rounding add below can't carry into the next component.
            color &= 0x7f7f7f7fu;
            color += mask >> 1;

            int onesCount = BitOperations.PopCount(color & mask);
            return onesCount >= 2 ? 1 : 0;
        }

        /// <summary>
        /// Selects the p-bit value shared by a pair of endpoints. Only the first endpoint is considered.
        /// </summary>
        /// <param name="c0">First endpoint color</param>
        /// <param name="c1">Second endpoint color (currently ignored)</param>
        /// <param name="colorDepth">Color component depth of the mode</param>
        /// <param name="alphaDepth">Alpha component depth of the mode, 0 if alpha does not take part</param>
        /// <returns>The p-bit value, 0 or 1</returns>
        private static int GetPBit(uint c0, uint c1, int colorDepth, int alphaDepth)
        {
            // Giving preference to the first endpoint yields better results,
            // might be a side effect of the endpoint selection algorithm?
            return GetPBit(c0, colorDepth, alphaDepth);
        }
    }
}