BC67Utils.cs
using System;
using System.Diagnostics;
using System.Runtime.CompilerServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace Ryujinx.Graphics.Texture.Utils
{
    /// <summary>
    /// Block-compression helpers: per-tile min/max colour search, palette index
    /// selection and endpoint quantization lookup tables.
    /// </summary>
    static class BC67Utils
    {
        // One table per component depth from 4 to 8 bits; slot = depth - 4.
        private static readonly byte[][] _quantizationLut;
        private static readonly byte[][] _quantizationLutNoPBit;

        /// <summary>
        /// Precomputes the quantization lookup tables for every supported component depth.
        /// </summary>
        static BC67Utils()
        {
            _quantizationLut = new byte[5][];
            _quantizationLutNoPBit = new byte[5][];

            for (int depth = 4; depth < 9; depth++)
            {
                byte[] lut = new byte[512];
                byte[] lutNoPBit = new byte[256];

                for (int i = 0; i < lut.Length; i++)
                {
                    // The low 8 bits of the index are the component value; bit 8 (0 or 1) is
                    // passed as the third argument, presumably the p-bit going by the table names.
                    lut[i] = QuantizeComponentForLut((byte)i, depth, i >> 8);

                    if (i < lutNoPBit.Length)
                    {
                        lutNoPBit[i] = QuantizeComponentForLut((byte)i, depth);
                    }
                }

                _quantizationLut[depth - 4] = lut;
                _quantizationLutNoPBit[depth - 4] = lutNoPBit;
            }
        }

        /// <summary>
        /// Computes the per-channel minimum and maximum over all texels of a tile.
        /// </summary>
        /// <param name="tile">Packed texels; every element is visited by the scalar path</param>
        /// <param name="w">Tile width; the SSE4.1 path is only taken for 4</param>
        /// <param name="h">Tile height; the SSE4.1 path is only taken for 4</param>
        /// <returns>The (minimum, maximum) colour pair</returns>
        public static (RgbaColor8, RgbaColor8) GetMinMaxColors(ReadOnlySpan<uint> tile, int w, int h)
        {
            if (Sse41.IsSupported && w == 4 && h == 4)
            {
                GetMinMaxColorsOneSubset4x4Sse41(tile, out RgbaColor8 minColor, out RgbaColor8 maxColor);

                return (minColor, maxColor);
            }
            else
            {
                RgbaColor8 minColor = new(255, 255, 255, 255);
                RgbaColor8 maxColor = default;

                for (int i = 0; i < tile.Length; i++)
                {
                    RgbaColor8 color = RgbaColor8.FromUInt32(tile[i]);

                    minColor.R = Math.Min(minColor.R, color.R);
                    minColor.G = Math.Min(minColor.G, color.G);
                    minColor.B = Math.Min(minColor.B, color.B);
                    minColor.A = Math.Min(minColor.A, color.A);

                    maxColor.R = Math.Max(maxColor.R, color.R);
                    maxColor.G = Math.Max(maxColor.G, color.G);
                    maxColor.B = Math.Max(maxColor.B, color.B);
                    maxColor.A = Math.Max(maxColor.A, color.A);
                }

                return (minColor, maxColor);
            }
        }

        /// <summary>
        /// Computes the per-channel minimum and maximum of each subset of a partitioned tile.
        /// </summary>
        /// <param name="partitionTable">Subset number of each texel, laid out as a 4-wide grid</param>
        /// <param name="tile">Packed texels stored row by row, <paramref name="w"/> per row</param>
        /// <param name="w">Tile width</param>
        /// <param name="h">Tile height</param>
        /// <param name="minColors">Receives the minimum colour of each subset</param>
        /// <param name="maxColors">Receives the maximum colour of each subset</param>
        /// <param name="subsetCount">Number of subsets; 1 and 2 have SSE4.1 paths for 4x4 tiles</param>
        public static void GetMinMaxColors(
            ReadOnlySpan<byte> partitionTable,
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            Span<RgbaColor8> minColors,
            Span<RgbaColor8> maxColors,
            int subsetCount)
        {
            if (Sse41.IsSupported && w == 4 && h == 4)
            {
                if (subsetCount == 1)
                {
                    GetMinMaxColorsOneSubset4x4Sse41(tile, out minColors[0], out maxColors[0]);
                    return;
                }
                else if (subsetCount == 2)
                {
                    GetMinMaxColorsTwoSubsets4x4Sse41(partitionTable, tile, minColors, maxColors);
                    return;
                }
            }

            // Start both accumulators from their identity values. The SIMD paths overwrite the
            // outputs, so the scalar path must not depend on what the caller left in maxColors.
            minColors.Fill(new RgbaColor8(255, 255, 255, 255));
            maxColors.Clear();

            int i = 0;
            for (int ty = 0; ty < h; ty++)
            {
                for (int tx = 0; tx < w; tx++)
                {
                    // The partition table always has a row stride of 4 (the index selection
                    // routines address it as ty * 4 + tx), while the tile is packed with stride w.
                    int subset = partitionTable[ty * 4 + tx];
                    RgbaColor8 color = RgbaColor8.FromUInt32(tile[i++]);

                    minColors[subset].R = Math.Min(minColors[subset].R, color.R);
                    minColors[subset].G = Math.Min(minColors[subset].G, color.G);
                    minColors[subset].B = Math.Min(minColors[subset].B, color.B);
                    minColors[subset].A = Math.Min(minColors[subset].A, color.A);

                    maxColors[subset].R = Math.Max(maxColors[subset].R, color.R);
                    maxColors[subset].G = Math.Max(maxColors[subset].G, color.G);
                    maxColors[subset].B = Math.Max(maxColors[subset].B, color.B);
                    maxColors[subset].A = Math.Max(maxColors[subset].A, color.A);
                }
            }
        }

        /// <summary>
        /// SSE implementation of the min/max search for an unpartitioned 4x4 tile.
        /// Reads 16 texels from <paramref name="tile"/> without a length check.
        /// </summary>
        private static unsafe void GetMinMaxColorsOneSubset4x4Sse41(ReadOnlySpan<uint> tile, out RgbaColor8 minColor, out RgbaColor8 maxColor)
        {
            Vector128<byte> min = Vector128<byte>.AllBitsSet;
            Vector128<byte> max = Vector128<byte>.Zero;
            Vector128<byte> row0, row1, row2, row3;

            fixed (uint* pTile = tile)
            {
                // Each vector holds one row of four texels.
                row0 = Sse2.LoadVector128(pTile).AsByte();
                row1 = Sse2.LoadVector128(pTile + 4).AsByte();
                row2 = Sse2.LoadVector128(pTile + 8).AsByte();
                row3 = Sse2.LoadVector128(pTile + 12).AsByte();
            }

            min = Sse2.Min(min, row0);
            max = Sse2.Max(max, row0);
            min = Sse2.Min(min, row1);
            max = Sse2.Max(max, row1);
            min = Sse2.Min(min, row2);
            max = Sse2.Max(max, row2);
            min = Sse2.Min(min, row3);
            max = Sse2.Max(max, row3);

            // Reduce the four column-wise results to a single colour.
            minColor = HorizontalMin(min);
maxColor = HorizontalMax(max);
        }

        /// <summary>
        /// SSE4.1 implementation of the min/max search for a 4x4 tile split into two subsets.
        /// Reads 16 bytes from <paramref name="partitionTable"/> and 16 texels from
        /// <paramref name="tile"/> without length checks.
        /// </summary>
        /// <param name="partitionTable">Subset number (0 or non-zero) of each of the 16 texels</param>
        /// <param name="tile">The 16 packed texels of the tile</param>
        /// <param name="minColors">Receives the minimum colour of subsets 0 and 1</param>
        /// <param name="maxColors">Receives the maximum colour of subsets 0 and 1</param>
        private static unsafe void GetMinMaxColorsTwoSubsets4x4Sse41(
            ReadOnlySpan<byte> partitionTable,
            ReadOnlySpan<uint> tile,
            Span<RgbaColor8> minColors,
            Span<RgbaColor8> maxColors)
        {
            Vector128<byte> partitionMask;

            fixed (byte* pPartitionTable = partitionTable)
            {
                partitionMask = Sse2.LoadVector128(pPartitionTable);
            }

            // 0xFF for every texel that belongs to subset 0, 0x00 otherwise.
            Vector128<byte> subset0Mask = Sse2.CompareEqual(partitionMask, Vector128<byte>.Zero);

            // Widen the per-texel byte mask to a per-texel 32-bit mask, one vector per tile row.
            Vector128<byte> subset0MaskRep16Low = Sse2.UnpackLow(subset0Mask, subset0Mask);
            Vector128<byte> subset0MaskRep16High = Sse2.UnpackHigh(subset0Mask, subset0Mask);

            Vector128<byte> subset0Mask0 = Sse2.UnpackLow(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte();
            Vector128<byte> subset0Mask1 = Sse2.UnpackHigh(subset0MaskRep16Low.AsInt16(), subset0MaskRep16Low.AsInt16()).AsByte();
            Vector128<byte> subset0Mask2 = Sse2.UnpackLow(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte();
            Vector128<byte> subset0Mask3 = Sse2.UnpackHigh(subset0MaskRep16High.AsInt16(), subset0MaskRep16High.AsInt16()).AsByte();

            Vector128<byte> min0 = Vector128<byte>.AllBitsSet;
            Vector128<byte> min1 = Vector128<byte>.AllBitsSet;
            Vector128<byte> max0 = Vector128<byte>.Zero;
            Vector128<byte> max1 = Vector128<byte>.Zero;

            Vector128<byte> row0, row1, row2, row3;

            fixed (uint* pTile = tile)
            {
                row0 = Sse2.LoadVector128(pTile).AsByte();
                row1 = Sse2.LoadVector128(pTile + 4).AsByte();
                row2 = Sse2.LoadVector128(pTile + 8).AsByte();
                row3 = Sse2.LoadVector128(pTile + 12).AsByte();
            }

            // Subset 0 minimum: texels of the other subset are replaced by the running minimum.
            min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row0, subset0Mask0));
            min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row1, subset0Mask1));
            min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row2, subset0Mask2));
            min0 = Sse2.Min(min0, Sse41.BlendVariable(min0, row3, subset0Mask3));

            // Subset 1 minimum: texels of subset 0 are forced to 0xFF so they never win.
            min1 = Sse2.Min(min1, Sse2.Or(row0, subset0Mask0));
            min1 = Sse2.Min(min1, Sse2.Or(row1, subset0Mask1));
            min1 = Sse2.Min(min1, Sse2.Or(row2, subset0Mask2));
            min1 = Sse2.Min(min1, Sse2.Or(row3, subset0Mask3));

            // Subset 0 maximum: texels of the other subset are forced to zero.
            max0 = Sse2.Max(max0, Sse2.And(row0, subset0Mask0));
            max0 = Sse2.Max(max0, Sse2.And(row1, subset0Mask1));
            max0 = Sse2.Max(max0, Sse2.And(row2, subset0Mask2));
            max0 = Sse2.Max(max0, Sse2.And(row3, subset0Mask3));

            // Subset 1 maximum: texels of subset 0 are forced to zero.
            max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask0, row0));
            max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask1, row1));
            max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask2, row2));
            max1 = Sse2.Max(max1, Sse2.AndNot(subset0Mask3, row3));

            minColors[0] = HorizontalMin(min0);
            minColors[1] = HorizontalMin(min1);
            maxColors[0] = HorizontalMax(max0);
            maxColors[1] = HorizontalMax(max1);
        }

        /// <summary>
        /// Reduces four packed colours to their per-channel minimum.
        /// </summary>
        private static RgbaColor8 HorizontalMin(Vector128<byte> x)
        {
            // 0x31 brings elements 1 and 3 next to elements 0 and 2; 2 then brings element 2 to slot 0.
            x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte());
            x = Sse2.Min(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte());
            return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0));
        }

        /// <summary>
        /// Reduces four packed colours to their per-channel maximum.
        /// </summary>
        private static RgbaColor8 HorizontalMax(Vector128<byte> x)
        {
            x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 0x31).AsByte());
            x = Sse2.Max(x, Sse2.Shuffle(x.AsInt32(), 2).AsByte());
            return RgbaColor8.FromUInt32(x.AsUInt32().GetElement(0));
        }

        /// <summary>
        /// Computes the total error obtained when every colour in <paramref name="values"/> is
        /// replaced by its closest palette entry, for the palette spanned by the two endpoints.
        /// </summary>
        /// <param name="values">Packed colours to match</param>
        /// <param name="endPoint0">First endpoint, quantized before use</param>
        /// <param name="endPoint1">Second endpoint, quantized before use</param>
        /// <param name="pBit0">Value forwarded to Quantize for the first endpoint</param>
        /// <param name="pBit1">Value forwarded to Quantize for the second endpoint</param>
        /// <param name="indexBitCount">Bits per index; 2, 3 and 4 have SSE4.1 implementations</param>
        /// <param name="indexCount">Number of palette entries (only used by the scalar fallback)</param>
        /// <param name="colorDepth">Colour component depth forwarded to Quantize</param>
        /// <param name="alphaDepth">Alpha depth forwarded to Quantize; 0 forces palette alpha to 255</param>
        /// <param name="alphaMask">Bits ORed into every input colour and palette endpoint</param>
        /// <returns>Sum of the per-colour minimum squared errors</returns>
        public static int SelectIndices(
            ReadOnlySpan<uint> values,
            uint endPoint0,
            uint endPoint1,
            int pBit0,
            int pBit1,
            int indexBitCount,
            int indexCount,
            int colorDepth,
            int alphaDepth,
            uint alphaMask)
        {
            if (Sse41.IsSupported)
            {
                if (indexBitCount == 2)
                {
                    return Select2BitIndicesSse41(
                        values,
                        endPoint0,
                        endPoint1,
                        pBit0,
                        pBit1,
                        indexBitCount,
                        indexCount,
                        colorDepth,
                        alphaDepth,
                        alphaMask);
                }
                else if (indexBitCount == 3)
                {
                    return Select3BitIndicesSse41(
                        values,
                        endPoint0,
                        endPoint1,
                        pBit0,
                        pBit1,
                        indexBitCount,
                        indexCount,
                        colorDepth,
                        alphaDepth,
                        alphaMask);
                }
                else if (indexBitCount == 4)
                {
                    return Select4BitIndicesOneSubsetSse41(
                        values,
                        endPoint0,
                        endPoint1,
                        pBit0,
                        pBit1,
                        indexBitCount,
                        indexCount,
                        colorDepth,
                        alphaDepth,
                        alphaMask);
                }
            }

            return SelectIndicesFallback(
                values,
                endPoint0,
                endPoint1,
                pBit0,
                pBit1,
                indexBitCount,
                indexCount,
                colorDepth,
                alphaDepth,
                alphaMask);
        }

        /// <summary>
        /// SSE4.1 error evaluation for a 4-entry (2-bit index) palette.
        /// Per-colour errors are saturated to 16 bits before being summed.
        /// </summary>
        private static unsafe int Select2BitIndicesSse41(
            ReadOnlySpan<uint> values,
            uint endPoint0,
            uint endPoint1,
            int pBit0,
            int pBit1,
            int indexBitCount,
            int indexCount,
            int colorDepth,
            int alphaDepth,
            uint alphaMask)
        {
            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            int errorSum = 0;

            RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
            RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

            Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
            Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

            // Interleave the endpoints channel by channel: (c0.ch, c1.ch) byte pairs.
            Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

            Vector128<byte> rWeights;
            Vector128<byte> lWeights;

            fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
            {
                // Four one-byte weights per table.
                rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
                lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
            }

            // Build (inverse weight, weight) byte pairs, then replicate each pair four times
            // (once per channel) so that every vector covers two palette entries.
            Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();

            // c0 * inverseWeight + c1 * weight per channel, as 16-bit values; two palette entries per vector.
            Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
            Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));

            for (int i = 0; i < values.Length; i++)
            {
                uint c = values[i] | alphaMask;

                // The colour is replicated so both halves of the vector hold the same four channels.
                Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                Vector128<short> delta1 = Sse2.Subtract(color, pal1);

                // Squared differences, summed in adjacent pairs.
                Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);

                // One squared error per palette entry.
                Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);

                Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);

                // Element 0 of the result is the smallest error.
                Vector128<ushort> min = Sse41.MinHorizontal(delta);

                ushort error = min.GetElement(0);

                errorSum += error;
            }

            return errorSum;
        }

        /// <summary>
        /// SSE4.1 error evaluation for an 8-entry (3-bit index) palette.
        /// Per-colour errors are saturated to 16 bits before being summed.
        /// </summary>
        private static unsafe int Select3BitIndicesSse41(
            ReadOnlySpan<uint> values,
            uint endPoint0,
            uint endPoint1,
            int pBit0,
            int pBit1,
            int indexBitCount,
            int indexCount,
            int colorDepth,
            int alphaDepth,
            uint alphaMask)
        {
            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            int errorSum = 0;

            RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
            RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

            Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
            Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

            // Interleave the endpoints channel by channel: (c0.ch, c1.ch) byte pairs.
            Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

            Vector128<byte> rWeights;
            Vector128<byte> lWeights;

            fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
            {
                // Eight one-byte weights per table.
                rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
                lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
            }

            // Build (inverse weight, weight) byte pairs, then replicate each pair four times
            // (once per channel) so that every vector covers two palette entries.
            Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
            Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
            Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();

            Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
            Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
            Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
            Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));

            for (int i = 0; i < values.Length; i++)
            {
                uint c = values[i] | alphaMask;

                Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                Vector128<short> delta1 = Sse2.Subtract(color, pal1);
                Vector128<short> delta2 = Sse2.Subtract(color, pal2);
                Vector128<short> delta3 = Sse2.Subtract(color, pal3);

                Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
                Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
                Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);

                Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
                Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);

                // Eight saturated 16-bit errors, one per palette entry.
                Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);

                Vector128<ushort> min = Sse41.MinHorizontal(delta);

                ushort error = min.GetElement(0);

                errorSum += error;
            }

            return errorSum;
        }

        /// <summary>
        /// SSE4.1 error evaluation for a 16-entry (4-bit index) palette.
        /// Per-colour errors are saturated to 16 bits before being summed.
        /// </summary>
        private static unsafe int Select4BitIndicesOneSubsetSse41(
            ReadOnlySpan<uint> values,
            uint endPoint0,
            uint endPoint1,
            int pBit0,
            int pBit1,
            int indexBitCount,
            int indexCount,
            int colorDepth,
            int alphaDepth,
            uint alphaMask)
        {
            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            int errorSum = 0;

            RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
            RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

            Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
            Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

            // Interleave the endpoints channel by channel: (c0.ch, c1.ch) byte pairs.
            Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

            Vector128<byte> rWeights;
            Vector128<byte> lWeights;

            fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
            {
                // Sixteen one-byte weights per table.
                rWeights = Sse2.LoadVector128(pWeights);
                lWeights = Sse2.LoadVector128(pInvWeights);
            }

            // Build (inverse weight, weight) byte pairs, then replicate each pair four times
            // (once per channel) so that every vector covers two palette entries.
            Vector128<byte> iWeightsLow = Sse2.UnpackLow(lWeights, rWeights);
            Vector128<byte> iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights);
            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte();
            Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte();
            Vector128<byte> iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte();
            Vector128<byte> iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte();
            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
            Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
            Vector128<byte> iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte();
            Vector128<byte> iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte();
            Vector128<byte> iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte();
            Vector128<byte> iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte();

            Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
            Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
            Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
            Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));
            Vector128<short> pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte()));
            Vector128<short> pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte()));
            Vector128<short> pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte()));
            Vector128<short> pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte()));

            for (int i = 0; i < values.Length; i++)
            {
uint c = values[i] | alphaMask;

                // The colour is replicated so both halves of the vector hold the same four channels.
                Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                Vector128<short> delta1 = Sse2.Subtract(color, pal1);
                Vector128<short> delta2 = Sse2.Subtract(color, pal2);
                Vector128<short> delta3 = Sse2.Subtract(color, pal3);
                Vector128<short> delta4 = Sse2.Subtract(color, pal4);
                Vector128<short> delta5 = Sse2.Subtract(color, pal5);
                Vector128<short> delta6 = Sse2.Subtract(color, pal6);
                Vector128<short> delta7 = Sse2.Subtract(color, pal7);

                Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
                Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
                Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);
                Vector128<int> deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4);
                Vector128<int> deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5);
                Vector128<int> deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6);
                Vector128<int> deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7);

                Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
                Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);
                Vector128<int> deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5);
                Vector128<int> deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7);

                // Sixteen saturated 16-bit errors, split over two vectors of eight palette entries.
                Vector128<ushort> delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);
                Vector128<ushort> delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, deltaSum67);

                Vector128<ushort> min0123 = Sse41.MinHorizontal(delta0123);
                Vector128<ushort> min4567 = Sse41.MinHorizontal(delta4567);

                // Element 0 of a MinHorizontal result is the smallest value of that vector.
                ushort minError0123 = min0123.GetElement(0);
                ushort minError4567 = min4567.GetElement(0);

                errorSum += Math.Min(minError0123, minError4567);
            }

            return errorSum;
        }

        /// <summary>
        /// Scalar error evaluation used when no SSE4.1 implementation applies.
        /// Builds the palette explicitly and sums, for every input colour, the squared
        /// difference to its closest palette entry.
        /// </summary>
        /// <param name="values">Packed colours to match</param>
        /// <param name="endPoint0">First endpoint, quantized before use</param>
        /// <param name="endPoint1">Second endpoint, quantized before use</param>
        /// <param name="pBit0">Value forwarded to Quantize for the first endpoint</param>
        /// <param name="pBit1">Value forwarded to Quantize for the second endpoint</param>
        /// <param name="indexBitCount">Bits per index, forwarded to Interpolate</param>
        /// <param name="indexCount">Number of palette entries</param>
        /// <param name="colorDepth">Colour component depth forwarded to Quantize</param>
        /// <param name="alphaDepth">Alpha depth forwarded to Quantize; 0 forces palette alpha to 255</param>
        /// <param name="alphaMask">Bits ORed into every input colour and palette endpoint</param>
        /// <returns>Sum of the per-colour minimum squared errors</returns>
        private static int SelectIndicesFallback(
            ReadOnlySpan<uint> values,
            uint endPoint0,
            uint endPoint1,
            int pBit0,
            int pBit1,
            int indexBitCount,
            int indexCount,
            int colorDepth,
            int alphaDepth,
            uint alphaMask)
        {
            int errorSum = 0;

            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            Span<uint> palette = stackalloc uint[indexCount];

            RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
            RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

            // Apply the mask in place on the packed representation of both endpoints.
            Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette;
            Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette;

            palette[0] = c0.ToUInt32();
            palette[indexCount - 1] = c1.ToUInt32();

            for (int j = 1; j < indexCount - 1; j++)
            {
                palette[j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32();
            }

            for (int i = 0; i < values.Length; i++)
            {
                uint color = values[i] | alphaMask;

                // Only the error is reported by this overload, so the winning index is not tracked.
                int bestMatchScore = int.MaxValue;

                for (int j = 0; j < indexCount; j++)
                {
                    int score = SquaredDifference(
                        RgbaColor8.FromUInt32(color).GetColor32(),
                        RgbaColor8.FromUInt32(palette[j]).GetColor32());

                    if (score < bestMatchScore)
                    {
                        bestMatchScore = score;
                    }
                }

                errorSum += bestMatchScore;
            }

            return errorSum;
        }

        /// <summary>
        /// Selects a palette index for every texel of a tile and returns the resulting total error.
        /// Uses an SSE4.1 implementation for 2-, 3- and 4-bit indices when available, and a
        /// scalar fallback otherwise. The 4-bit path asserts a single subset.
        /// </summary>
        /// <param name="tile">Packed texels of the tile</param>
        /// <param name="w">Tile width</param>
        /// <param name="h">Tile height</param>
        /// <param name="endPoints0">First endpoint of each subset</param>
        /// <param name="endPoints1">Second endpoint of each subset</param>
        /// <param name="pBitValues">P-bit values forwarded to the selected implementation</param>
        /// <param name="indices">Receives the selected index of each texel</param>
        /// <param name="subsetCount">Number of subsets</param>
        /// <param name="partition">Partition number</param>
        /// <param name="indexBitCount">Bits per index</param>
        /// <param name="indexCount">Number of palette entries (only forwarded to the scalar fallback)</param>
        /// <param name="colorDepth">Colour component depth</param>
        /// <param name="alphaDepth">Alpha component depth</param>
        /// <param name="pBits">Number of p-bits</param>
        /// <param name="alphaMask">Alpha mask forwarded to the selected implementation</param>
        /// <returns>The error sum returned by the selected implementation</returns>
        public static int SelectIndices(
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            ReadOnlySpan<uint> endPoints0,
            ReadOnlySpan<uint> endPoints1,
            ReadOnlySpan<int> pBitValues,
            Span<byte> indices,
            int subsetCount,
            int partition,
            int indexBitCount,
            int indexCount,
            int colorDepth,
            int alphaDepth,
            int pBits,
            uint alphaMask)
        {
if (Sse41.IsSupported)
            {
                if (indexBitCount == 2)
                {
                    return Select2BitIndicesSse41(
                        tile,
                        w,
                        h,
                        endPoints0,
                        endPoints1,
                        pBitValues,
                        indices,
                        subsetCount,
                        partition,
                        colorDepth,
                        alphaDepth,
                        pBits,
                        alphaMask);
                }
                else if (indexBitCount == 3)
                {
                    return Select3BitIndicesSse41(
                        tile,
                        w,
                        h,
                        endPoints0,
                        endPoints1,
                        pBitValues,
                        indices,
                        subsetCount,
                        partition,
                        colorDepth,
                        alphaDepth,
                        pBits,
                        alphaMask);
                }
                else if (indexBitCount == 4)
                {
                    // The 4-bit implementation handles a single pair of endpoints only.
                    Debug.Assert(subsetCount == 1);

                    return Select4BitIndicesOneSubsetSse41(
                        tile,
                        w,
                        h,
                        endPoints0[0],
                        endPoints1[0],
                        pBitValues,
                        indices,
                        partition,
                        colorDepth,
                        alphaDepth,
                        pBits,
                        alphaMask);
                }
            }

            return SelectIndicesFallback(
                tile,
                w,
                h,
                endPoints0,
                endPoints1,
                pBitValues,
                indices,
                subsetCount,
                partition,
                indexBitCount,
                indexCount,
                colorDepth,
                alphaDepth,
                pBits,
                alphaMask);
        }

        /// <summary>
        /// SSE4.1 index selection for 4-entry (2-bit index) palettes on a partitioned tile.
        /// For every texel, writes the index of the closest palette entry of its subset to
        /// <paramref name="indices"/> (addressed as ty * 4 + tx) and accumulates the error.
        /// Per-texel errors are saturated to 16 bits before being summed.
        /// </summary>
        /// <returns>Sum of the per-texel minimum squared errors</returns>
        private static unsafe int Select2BitIndicesSse41(
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            ReadOnlySpan<uint> endPoints0,
            ReadOnlySpan<uint> endPoints1,
            ReadOnlySpan<int> pBitValues,
            Span<byte> indices,
            int subsetCount,
            int partition,
            int colorDepth,
            int alphaDepth,
            int pBits,
            uint alphaMask)
        {
            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            // The interpolation weights are the same for every subset, so they are loaded and
            // expanded once instead of once per subset.
            Vector128<byte> rWeights;
            Vector128<byte> lWeights;

            fixed (byte* pWeights = BC67Tables.Weights[0], pInvWeights = BC67Tables.InverseWeights[0])
            {
                rWeights = Sse2.LoadScalarVector128((uint*)pWeights).AsByte();
                lWeights = Sse2.LoadScalarVector128((uint*)pInvWeights).AsByte();
            }

            // (inverse weight, weight) byte pairs, each replicated four times (once per channel).
            Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();

            int errorSum = 0;

            for (int subset = 0; subset < subsetCount; subset++)
            {
                int pBit0 = -1, pBit1 = -1;

                if (pBits == subsetCount)
                {
                    // One p-bit per subset, shared by both endpoints.
                    pBit0 = pBit1 = pBitValues[subset];
                }
                else if (pBits != 0)
                {
                    // One p-bit per endpoint.
                    pBit0 = pBitValues[subset * 2];
                    pBit1 = pBitValues[subset * 2 + 1];
                }

                RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
                RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);

                Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
                Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

                Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

                // Two palette entries per vector, four 16-bit channels each.
                Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
                Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));

                int i = 0;
                for (int ty = 0; ty < h; ty++)
                {
                    for (int tx = 0; tx < w; tx++, i++)
                    {
                        // The tile is packed with stride w; the partition table and the output use stride 4.
                        int tileOffset = ty * 4 + tx;
                        if (partitionTable[tileOffset] != subset)
                        {
                            continue;
                        }

                        uint c = tile[i] | alphaMask;

                        Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                        Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                        Vector128<short> delta1 = Sse2.Subtract(color, pal1);

                        Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                        Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);

                        Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);

                        Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum01);

                        Vector128<ushort> min = Sse41.MinHorizontal(delta);

                        // Low 16 bits: smallest error. High 16 bits: position of that error.
                        uint minPos = min.AsUInt32().GetElement(0);
                        ushort error = (ushort)minPos;
                        uint index = minPos >> 16;

                        indices[tileOffset] = (byte)index;
                        errorSum += error;
                    }
                }
            }

            return errorSum;
        }

        /// <summary>
        /// SSE4.1 index selection for 8-entry (3-bit index) palettes on a partitioned tile.
        /// For every texel, writes the index of the closest palette entry of its subset to
        /// <paramref name="indices"/> (addressed as ty * 4 + tx) and accumulates the error.
        /// Per-texel errors are saturated to 16 bits before being summed.
        /// </summary>
        /// <returns>Sum of the per-texel minimum squared errors</returns>
        private static unsafe int Select3BitIndicesSse41(
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            ReadOnlySpan<uint> endPoints0,
            ReadOnlySpan<uint> endPoints1,
            ReadOnlySpan<int> pBitValues,
            Span<byte> indices,
            int subsetCount,
            int partition,
            int colorDepth,
            int alphaDepth,
            int pBits,
            uint alphaMask)
        {
            byte[] partitionTable = BC67Tables.PartitionTable[subsetCount - 1][partition];

            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            // The interpolation weights are the same for every subset, so they are loaded and
            // expanded once instead of once per subset.
            Vector128<byte> rWeights;
            Vector128<byte> lWeights;

            fixed (byte* pWeights = BC67Tables.Weights[1], pInvWeights = BC67Tables.InverseWeights[1])
            {
                rWeights = Sse2.LoadScalarVector128((ulong*)pWeights).AsByte();
                lWeights = Sse2.LoadScalarVector128((ulong*)pInvWeights).AsByte();
            }

            // (inverse weight, weight) byte pairs, each replicated four times (once per channel).
            Vector128<byte> iWeights = Sse2.UnpackLow(lWeights, rWeights);
            Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
            Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeights.AsInt16(), iWeights.AsInt16()).AsByte();
            Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte();
            Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();
            Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte();

            int errorSum = 0;

            for (int subset = 0; subset < subsetCount; subset++)
            {
                int pBit0 = -1, pBit1 = -1;

                if (pBits == subsetCount)
                {
                    // One p-bit per subset, shared by both endpoints.
                    pBit0 = pBit1 = pBitValues[subset];
                }
                else if (pBits != 0)
                {
                    // One p-bit per endpoint.
                    pBit0 = pBitValues[subset * 2];
                    pBit1 = pBitValues[subset * 2 + 1];
                }

                RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0);
                RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, alphaDepth, pBit1);

                Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
                Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

                Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

                // Two palette entries per vector, four 16-bit channels each.
                Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte()));
                Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte()));
                Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte()));
                Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte()));

                int i = 0;
                for (int ty = 0; ty < h; ty++)
                {
                    for (int tx = 0; tx < w; tx++, i++)
                    {
                        // The tile is packed with stride w; the partition table and the output use stride 4.
                        int tileOffset = ty * 4 + tx;
                        if (partitionTable[tileOffset] != subset)
                        {
                            continue;
                        }

                        uint c = tile[i] | alphaMask;

                        Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte());

                        Vector128<short> delta0 = Sse2.Subtract(color, pal0);
                        Vector128<short> delta1 = Sse2.Subtract(color, pal1);
                        Vector128<short> delta2 = Sse2.Subtract(color, pal2);
                        Vector128<short> delta3 = Sse2.Subtract(color, pal3);

                        Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0);
                        Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1);
                        Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2);
                        Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3);

                        Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1);
                        Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3);

                        Vector128<ushort> delta = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23);

                        Vector128<ushort> min = Sse41.MinHorizontal(delta);

                        // Low 16 bits: smallest error. High 16 bits: position of that error.
                        uint minPos = min.AsUInt32().GetElement(0);
                        ushort error = (ushort)minPos;
                        uint index = minPos >> 16;

                        indices[tileOffset] = (byte)index;
                        errorSum += error;
                    }
                }
            }

            return errorSum;
        }

        // SSE4.1 index selection for a 16-entry (4-bit index) palette with a single pair of endpoints.
        private static unsafe int Select4BitIndicesOneSubsetSse41(
            ReadOnlySpan<uint> tile,
            int w,
            int h,
            uint endPoint0,
            uint endPoint1,
            ReadOnlySpan<int> pBitValues,
            Span<byte> indices,
            int partition,
            int colorDepth,
            int alphaDepth,
            int pBits,
            uint alphaMask)
        {
            uint alphaMaskForPalette = alphaMask;

            if (alphaDepth == 0)
            {
                // No alpha is stored, so the palette alpha is forced to fully opaque.
                alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32();
            }

            int errorSum = 0;

            int pBit0 = -1, pBit1 = -1;

            if (pBits != 0)
            {
                pBit0 = pBitValues[0];
                pBit1 = pBitValues[1];
            }

            RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoint0), colorDepth, alphaDepth, pBit0);
            RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoint1), colorDepth, alphaDepth, pBit1);

            Vector128<byte> c0Rep = Vector128.Create(c0.ToUInt32() | alphaMaskForPalette).AsByte();
            Vector128<byte> c1Rep = Vector128.Create(c1.ToUInt32() | alphaMaskForPalette).AsByte();

            Vector128<byte> c0c1 = Sse2.UnpackLow(c0Rep, c1Rep);

            Vector128<byte> rWeights;
            Vector128<byte> lWeights;

            fixed (byte* pWeights = BC67Tables.Weights[2], pInvWeights = BC67Tables.InverseWeights[2])
            {
                rWeights = Sse2.LoadVector128(pWeights);
                lWeights = Sse2.LoadVector128(pInvWeights);
            }

Vector128<byte> iWeightsLow = Sse2.UnpackLow(lWeights, rWeights); 983 Vector128<byte> iWeightsHigh = Sse2.UnpackHigh(lWeights, rWeights); 984 Vector128<byte> iWeights01 = Sse2.UnpackLow(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); 985 Vector128<byte> iWeights23 = Sse2.UnpackHigh(iWeightsLow.AsInt16(), iWeightsLow.AsInt16()).AsByte(); 986 Vector128<byte> iWeights45 = Sse2.UnpackLow(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); 987 Vector128<byte> iWeights67 = Sse2.UnpackHigh(iWeightsHigh.AsInt16(), iWeightsHigh.AsInt16()).AsByte(); 988 Vector128<byte> iWeights0 = Sse2.UnpackLow(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); 989 Vector128<byte> iWeights1 = Sse2.UnpackHigh(iWeights01.AsInt16(), iWeights01.AsInt16()).AsByte(); 990 Vector128<byte> iWeights2 = Sse2.UnpackLow(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); 991 Vector128<byte> iWeights3 = Sse2.UnpackHigh(iWeights23.AsInt16(), iWeights23.AsInt16()).AsByte(); 992 Vector128<byte> iWeights4 = Sse2.UnpackLow(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); 993 Vector128<byte> iWeights5 = Sse2.UnpackHigh(iWeights45.AsInt16(), iWeights45.AsInt16()).AsByte(); 994 Vector128<byte> iWeights6 = Sse2.UnpackLow(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); 995 Vector128<byte> iWeights7 = Sse2.UnpackHigh(iWeights67.AsInt16(), iWeights67.AsInt16()).AsByte(); 996 997 Vector128<short> pal0 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights0.AsSByte())); 998 Vector128<short> pal1 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights1.AsSByte())); 999 Vector128<short> pal2 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights2.AsSByte())); 1000 Vector128<short> pal3 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights3.AsSByte())); 1001 Vector128<short> pal4 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights4.AsSByte())); 1002 Vector128<short> pal5 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights5.AsSByte())); 
1003 Vector128<short> pal6 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights6.AsSByte())); 1004 Vector128<short> pal7 = ShiftRoundToNearest(Ssse3.MultiplyAddAdjacent(c0c1, iWeights7.AsSByte())); 1005 1006 int i = 0; 1007 for (int ty = 0; ty < h; ty++) 1008 { 1009 for (int tx = 0; tx < w; tx++, i++) 1010 { 1011 uint c = tile[i] | alphaMask; 1012 1013 Vector128<short> color = Sse41.ConvertToVector128Int16(Vector128.Create(c).AsByte()); 1014 1015 Vector128<short> delta0 = Sse2.Subtract(color, pal0); 1016 Vector128<short> delta1 = Sse2.Subtract(color, pal1); 1017 Vector128<short> delta2 = Sse2.Subtract(color, pal2); 1018 Vector128<short> delta3 = Sse2.Subtract(color, pal3); 1019 Vector128<short> delta4 = Sse2.Subtract(color, pal4); 1020 Vector128<short> delta5 = Sse2.Subtract(color, pal5); 1021 Vector128<short> delta6 = Sse2.Subtract(color, pal6); 1022 Vector128<short> delta7 = Sse2.Subtract(color, pal7); 1023 1024 Vector128<int> deltaSum0 = Sse2.MultiplyAddAdjacent(delta0, delta0); 1025 Vector128<int> deltaSum1 = Sse2.MultiplyAddAdjacent(delta1, delta1); 1026 Vector128<int> deltaSum2 = Sse2.MultiplyAddAdjacent(delta2, delta2); 1027 Vector128<int> deltaSum3 = Sse2.MultiplyAddAdjacent(delta3, delta3); 1028 Vector128<int> deltaSum4 = Sse2.MultiplyAddAdjacent(delta4, delta4); 1029 Vector128<int> deltaSum5 = Sse2.MultiplyAddAdjacent(delta5, delta5); 1030 Vector128<int> deltaSum6 = Sse2.MultiplyAddAdjacent(delta6, delta6); 1031 Vector128<int> deltaSum7 = Sse2.MultiplyAddAdjacent(delta7, delta7); 1032 1033 Vector128<int> deltaSum01 = Ssse3.HorizontalAdd(deltaSum0, deltaSum1); 1034 Vector128<int> deltaSum23 = Ssse3.HorizontalAdd(deltaSum2, deltaSum3); 1035 Vector128<int> deltaSum45 = Ssse3.HorizontalAdd(deltaSum4, deltaSum5); 1036 Vector128<int> deltaSum67 = Ssse3.HorizontalAdd(deltaSum6, deltaSum7); 1037 1038 Vector128<ushort> delta0123 = Sse41.PackUnsignedSaturate(deltaSum01, deltaSum23); 1039 Vector128<ushort> delta4567 = Sse41.PackUnsignedSaturate(deltaSum45, 
deltaSum67); 1040 1041 Vector128<ushort> min0123 = Sse41.MinHorizontal(delta0123); 1042 Vector128<ushort> min4567 = Sse41.MinHorizontal(delta4567); 1043 1044 uint minPos0123 = min0123.AsUInt32().GetElement(0); 1045 uint minPos4567 = min4567.AsUInt32().GetElement(0); 1046 1047 if ((ushort)minPos4567 < (ushort)minPos0123) 1048 { 1049 errorSum += (ushort)minPos4567; 1050 indices[ty * 4 + tx] = (byte)(8 + (minPos4567 >> 16)); 1051 } 1052 else 1053 { 1054 errorSum += (ushort)minPos0123; 1055 indices[ty * 4 + tx] = (byte)(minPos0123 >> 16); 1056 } 1057 } 1058 } 1059 1060 return errorSum; 1061 } 1062 1063 private static Vector128<short> ShiftRoundToNearest(Vector128<short> x) 1064 { 1065 return Sse2.ShiftRightLogical(Sse2.Add(x, Vector128.Create((short)32)), 6); 1066 } 1067 1068 private static int SelectIndicesFallback( 1069 ReadOnlySpan<uint> tile, 1070 int w, 1071 int h, 1072 ReadOnlySpan<uint> endPoints0, 1073 ReadOnlySpan<uint> endPoints1, 1074 ReadOnlySpan<int> pBitValues, 1075 Span<byte> indices, 1076 int subsetCount, 1077 int partition, 1078 int indexBitCount, 1079 int indexCount, 1080 int colorDepth, 1081 int alphaDepth, 1082 int pBits, 1083 uint alphaMask) 1084 { 1085 int errorSum = 0; 1086 1087 uint alphaMaskForPalette = alphaMask; 1088 1089 if (alphaDepth == 0) 1090 { 1091 alphaMaskForPalette |= new RgbaColor8(0, 0, 0, 255).ToUInt32(); 1092 } 1093 1094 Span<uint> palette = stackalloc uint[subsetCount * indexCount]; 1095 1096 for (int subset = 0; subset < subsetCount; subset++) 1097 { 1098 int palBase = subset * indexCount; 1099 1100 int pBit0 = -1, pBit1 = -1; 1101 1102 if (pBits == subsetCount) 1103 { 1104 pBit0 = pBit1 = pBitValues[subset]; 1105 } 1106 else if (pBits != 0) 1107 { 1108 pBit0 = pBitValues[subset * 2]; 1109 pBit1 = pBitValues[subset * 2 + 1]; 1110 } 1111 1112 RgbaColor8 c0 = Quantize(RgbaColor8.FromUInt32(endPoints0[subset]), colorDepth, alphaDepth, pBit0); 1113 RgbaColor8 c1 = Quantize(RgbaColor8.FromUInt32(endPoints1[subset]), colorDepth, 
alphaDepth, pBit1); 1114 1115 Unsafe.As<RgbaColor8, uint>(ref c0) |= alphaMaskForPalette; 1116 Unsafe.As<RgbaColor8, uint>(ref c1) |= alphaMaskForPalette; 1117 1118 palette[palBase + 0] = c0.ToUInt32(); 1119 palette[palBase + indexCount - 1] = c1.ToUInt32(); 1120 1121 for (int j = 1; j < indexCount - 1; j++) 1122 { 1123 palette[palBase + j] = Interpolate(c0, c1, j, indexBitCount).ToUInt32(); 1124 } 1125 } 1126 1127 int i = 0; 1128 for (int ty = 0; ty < h; ty++) 1129 { 1130 for (int tx = 0; tx < w; tx++) 1131 { 1132 int subset = BC67Tables.PartitionTable[subsetCount - 1][partition][ty * 4 + tx]; 1133 uint color = tile[i++] | alphaMask; 1134 1135 int bestMatchScore = int.MaxValue; 1136 int bestMatchIndex = 0; 1137 1138 for (int j = 0; j < indexCount; j++) 1139 { 1140 int score = SquaredDifference( 1141 RgbaColor8.FromUInt32(color).GetColor32(), 1142 RgbaColor8.FromUInt32(palette[subset * indexCount + j]).GetColor32()); 1143 1144 if (score < bestMatchScore) 1145 { 1146 bestMatchScore = score; 1147 bestMatchIndex = j; 1148 } 1149 } 1150 1151 indices[ty * 4 + tx] = (byte)bestMatchIndex; 1152 errorSum += bestMatchScore; 1153 } 1154 } 1155 1156 return errorSum; 1157 } 1158 1159 [MethodImpl(MethodImplOptions.AggressiveInlining)] 1160 public static int SquaredDifference(RgbaColor32 color1, RgbaColor32 color2) 1161 { 1162 RgbaColor32 delta = color1 - color2; 1163 return RgbaColor32.Dot(delta, delta); 1164 } 1165 1166 [MethodImpl(MethodImplOptions.AggressiveInlining)] 1167 public static RgbaColor8 Interpolate(RgbaColor8 color1, RgbaColor8 color2, int weightIndex, int indexBitCount) 1168 { 1169 return Interpolate(color1.GetColor32(), color2.GetColor32(), weightIndex, indexBitCount).GetColor8(); 1170 } 1171 1172 [MethodImpl(MethodImplOptions.AggressiveInlining)] 1173 public static RgbaColor32 Interpolate(RgbaColor32 color1, RgbaColor32 color2, int weightIndex, int indexBitCount) 1174 { 1175 Debug.Assert(indexBitCount >= 2 && indexBitCount <= 4); 1176 1177 int weight = 
(((weightIndex << 7) / ((1 << indexBitCount) - 1)) + 1) >> 1; 1178 1179 RgbaColor32 weightV = new(weight); 1180 RgbaColor32 invWeightV = new(64 - weight); 1181 1182 return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6; 1183 } 1184 1185 [MethodImpl(MethodImplOptions.AggressiveInlining)] 1186 public static RgbaColor32 Interpolate( 1187 RgbaColor32 color1, 1188 RgbaColor32 color2, 1189 int colorWeightIndex, 1190 int alphaWeightIndex, 1191 int colorIndexBitCount, 1192 int alphaIndexBitCount) 1193 { 1194 Debug.Assert(colorIndexBitCount >= 2 && colorIndexBitCount <= 4); 1195 Debug.Assert(alphaIndexBitCount >= 2 && alphaIndexBitCount <= 4); 1196 1197 int colorWeight = BC67Tables.Weights[colorIndexBitCount - 2][colorWeightIndex]; 1198 int alphaWeight = BC67Tables.Weights[alphaIndexBitCount - 2][alphaWeightIndex]; 1199 1200 RgbaColor32 weightV = new(colorWeight) 1201 { 1202 A = alphaWeight, 1203 }; 1204 RgbaColor32 invWeightV = new RgbaColor32(64) - weightV; 1205 1206 return (color1 * invWeightV + color2 * weightV + new RgbaColor32(32)) >> 6; 1207 } 1208 1209 public static RgbaColor8 Quantize(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1) 1210 { 1211 if (alphaBits == 0) 1212 { 1213 int colorShift = 8 - colorBits; 1214 1215 uint c; 1216 1217 if (pBit >= 0) 1218 { 1219 byte[] lutColor = _quantizationLut[colorBits - 4]; 1220 1221 Debug.Assert(pBit <= 1); 1222 int high = pBit << 8; 1223 uint mask = (0xffu >> (colorBits + 1)) * 0x10101; 1224 1225 c = lutColor[color.R | high]; 1226 c |= (uint)lutColor[color.G | high] << 8; 1227 c |= (uint)lutColor[color.B | high] << 16; 1228 1229 c <<= colorShift; 1230 c |= (c >> (colorBits + 1)) & mask; 1231 c |= ((uint)pBit * 0x10101) << (colorShift - 1); 1232 } 1233 else 1234 { 1235 byte[] lutColor = _quantizationLutNoPBit[colorBits - 4]; 1236 1237 uint mask = (0xffu >> colorBits) * 0x10101; 1238 1239 c = lutColor[color.R]; 1240 c |= (uint)lutColor[color.G] << 8; 1241 c |= (uint)lutColor[color.B] << 16; 
1242 1243 c <<= colorShift; 1244 c |= (c >> colorBits) & mask; 1245 } 1246 1247 c |= (uint)color.A << 24; 1248 1249 return RgbaColor8.FromUInt32(c); 1250 } 1251 1252 return QuantizeFallback(color, colorBits, alphaBits, pBit); 1253 } 1254 1255 private static RgbaColor8 QuantizeFallback(RgbaColor8 color, int colorBits, int alphaBits, int pBit = -1) 1256 { 1257 byte r = UnquantizeComponent(QuantizeComponent(color.R, colorBits, pBit), colorBits, pBit); 1258 byte g = UnquantizeComponent(QuantizeComponent(color.G, colorBits, pBit), colorBits, pBit); 1259 byte b = UnquantizeComponent(QuantizeComponent(color.B, colorBits, pBit), colorBits, pBit); 1260 byte a = alphaBits == 0 ? color.A : UnquantizeComponent(QuantizeComponent(color.A, alphaBits, pBit), alphaBits, pBit); 1261 return new RgbaColor8(r, g, b, a); 1262 } 1263 1264 public static byte QuantizeComponent(byte component, int bits, int pBit = -1) 1265 { 1266 return pBit >= 0 ? _quantizationLut[bits - 4][component | (pBit << 8)] : _quantizationLutNoPBit[bits - 4][component]; 1267 } 1268 1269 private static byte QuantizeComponentForLut(byte component, int bits, int pBit = -1) 1270 { 1271 int shift = 8 - bits; 1272 int fill = component >> bits; 1273 1274 if (pBit >= 0) 1275 { 1276 Debug.Assert(pBit <= 1); 1277 fill >>= 1; 1278 fill |= pBit << (shift - 1); 1279 } 1280 1281 int q1 = component >> shift; 1282 int q2 = Math.Max(q1 - 1, 0); 1283 int q3 = Math.Min(q1 + 1, (1 << bits) - 1); 1284 1285 int delta1 = FastAbs(((q1 << shift) | fill) - component); 1286 int delta2 = component - ((q2 << shift) | fill); 1287 int delta3 = ((q3 << shift) | fill) - component; 1288 1289 if (delta1 < delta2 && delta1 < delta3) 1290 { 1291 return (byte)q1; 1292 } 1293 else if (delta2 < delta3) 1294 { 1295 return (byte)q2; 1296 } 1297 else 1298 { 1299 return (byte)q3; 1300 } 1301 } 1302 1303 [MethodImpl(MethodImplOptions.AggressiveInlining)] 1304 private static int FastAbs(int x) 1305 { 1306 int sign = x >> 31; 1307 return (x + sign) ^ sign; 1308 
} 1309 1310 private static byte UnquantizeComponent(byte component, int bits, int pBit) 1311 { 1312 int shift = 8 - bits; 1313 int value = component << shift; 1314 1315 if (pBit >= 0) 1316 { 1317 Debug.Assert(pBit <= 1); 1318 value |= value >> (bits + 1); 1319 value |= pBit << (shift - 1); 1320 } 1321 else 1322 { 1323 value |= value >> bits; 1324 } 1325 1326 return (byte)value; 1327 } 1328 } 1329 }