BCnDecoder.cs
using Ryujinx.Common;
using Ryujinx.Common.Memory;
using System;
using System.Buffers.Binary;
using System.Runtime.InteropServices;
using System.Runtime.Intrinsics;
using System.Runtime.Intrinsics.X86;

namespace Ryujinx.Graphics.Texture
{
    /// <summary>
    /// Software decoders that expand BC1-BC7 block-compressed texture data into linear pixel data.
    /// </summary>
    /// <remarks>
    /// Every decoder walks the input in the same order: mip level, then layer, then depth slice,
    /// then rows of 4x4 blocks. The output is written in that same order, one image after another.
    /// </remarks>
    public static class BCnDecoder
    {
        private const int BlockWidth = 4;
        private const int BlockHeight = 4;

        /// <summary>
        /// Decodes BC1 data (8 bytes per 4x4 block) into 4 bytes per pixel.
        /// </summary>
        /// <param name="data">Block-compressed input, consumed 8 bytes per block</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 4);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);

            // Scratch buffer for one decoded 4x4 tile, viewed as pixels and as whole rows.
            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];

            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
            Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output.Span);

            Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);

            Span<Vector128<byte>> outputLine0 = default;
            Span<Vector128<byte>> outputLine1 = default;
            Span<Vector128<byte>> outputLine2 = default;
            Span<Vector128<byte>> outputLine3 = default;

            int imageBaseOOffs = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        for (int y = 0; y < h; y++)
                        {
                            int baseY = y * BlockHeight;
                            int copyHeight = Math.Min(BlockHeight, height - baseY);
                            int lineBaseOOffs = imageBaseOOffs + baseY * width;

                            // The four row views are only needed (and only valid) when the block row is fully inside the image.
                            if (copyHeight == 4)
                            {
                                outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[lineBaseOOffs..]);
                                outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width)..]);
                                outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 2)..]);
                                outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 3)..]);
                            }

                            for (int x = 0; x < w; x++)
                            {
                                int baseX = x * BlockWidth;
                                int copyWidth = Math.Min(BlockWidth, width - baseX);

                                BC1DecodeTileRgb(tile, data);

                                // Both values are in [1, 4], so the OR equals 4 only when both are exactly 4.
                                if ((copyWidth | copyHeight) == 4)
                                {
                                    outputLine0[x] = tileAsVector128[0];
                                    outputLine1[x] = tileAsVector128[1];
                                    outputLine2[x] = tileAsVector128[2];
                                    outputLine3[x] = tileAsVector128[3];
                                }
                                else
                                {
                                    CopyPartialTile<uint>(tileAsUint, outputAsUint, lineBaseOOffs + baseX, width, copyWidth, copyHeight);
                                }

                                data = data[8..];
                            }
                        }

                        imageBaseOOffs += width * height;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);
            }

            return output;
        }

        /// <summary>
        /// Decodes BC2 data (16 bytes per 4x4 block: 8 bytes of 4-bit alpha followed by 8 bytes of color) into 4 bytes per pixel.
        /// </summary>
        /// <param name="data">Block-compressed input, consumed 16 bytes per block</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 4);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);

            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];

            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
            Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output.Span);

            Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);

            Span<Vector128<byte>> outputLine0 = default;
            Span<Vector128<byte>> outputLine1 = default;
            Span<Vector128<byte>> outputLine2 = default;
            Span<Vector128<byte>> outputLine3 = default;

            int imageBaseOOffs = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        for (int y = 0; y < h; y++)
                        {
                            int baseY = y * BlockHeight;
                            int copyHeight = Math.Min(BlockHeight, height - baseY);
                            int lineBaseOOffs = imageBaseOOffs + baseY * width;

                            if (copyHeight == 4)
                            {
                                outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[lineBaseOOffs..]);
                                outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width)..]);
                                outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 2)..]);
                                outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 3)..]);
                            }

                            for (int x = 0; x < w; x++)
                            {
                                int baseX = x * BlockWidth;
                                int copyWidth = Math.Min(BlockWidth, width - baseX);

                                // The color half of the block starts 8 bytes in.
                                BC23DecodeTileRgb(tile, data[8..]);

                                ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);

                                // One 4-bit alpha per texel; replicate the nibble into both halves of the alpha byte.
                                for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4)
                                {
                                    tile[i] = (byte)((block & 0xf) | (block << 4));
                                }

                                // Both values are in [1, 4], so the OR equals 4 only when both are exactly 4.
                                if ((copyWidth | copyHeight) == 4)
                                {
                                    outputLine0[x] = tileAsVector128[0];
                                    outputLine1[x] = tileAsVector128[1];
                                    outputLine2[x] = tileAsVector128[2];
                                    outputLine3[x] = tileAsVector128[3];
                                }
                                else
                                {
                                    CopyPartialTile<uint>(tileAsUint, outputAsUint, lineBaseOOffs + baseX, width, copyWidth, copyHeight);
                                }

                                data = data[16..];
                            }
                        }

                        imageBaseOOffs += width * height;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);
            }

            return output;
        }

        /// <summary>
        /// Decodes BC3 data (16 bytes per 4x4 block: 8 bytes of interpolated alpha followed by 8 bytes of color) into 4 bytes per pixel.
        /// </summary>
        /// <param name="data">Block-compressed input, consumed 16 bytes per block</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 4);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);

            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
            Span<byte> rPal = stackalloc byte[8];

            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
            Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output.Span);

            Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);

            Span<Vector128<byte>> outputLine0 = default;
            Span<Vector128<byte>> outputLine1 = default;
            Span<Vector128<byte>> outputLine2 = default;
            Span<Vector128<byte>> outputLine3 = default;

            int imageBaseOOffs = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        for (int y = 0; y < h; y++)
                        {
                            int baseY = y * BlockHeight;
                            int copyHeight = Math.Min(BlockHeight, height - baseY);
                            int lineBaseOOffs = imageBaseOOffs + baseY * width;

                            if (copyHeight == 4)
                            {
                                outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[lineBaseOOffs..]);
                                outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width)..]);
                                outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 2)..]);
                                outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 3)..]);
                            }

                            for (int x = 0; x < w; x++)
                            {
                                int baseX = x * BlockWidth;
                                int copyWidth = Math.Min(BlockWidth, width - baseX);

                                // The color half of the block starts 8 bytes in; it leaves the alpha byte of each texel at zero.
                                BC23DecodeTileRgb(tile, data[8..]);

                                ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);

                                // First two bytes are the alpha endpoints, the remaining 48 bits are 3-bit palette indices.
                                rPal[0] = (byte)block;
                                rPal[1] = (byte)(block >> 8);

                                BCnLerpAlphaUnorm(rPal);
                                BCnDecodeTileAlphaRgba(tile, rPal, block >> 16);

                                // Both values are in [1, 4], so the OR equals 4 only when both are exactly 4.
                                if ((copyWidth | copyHeight) == 4)
                                {
                                    outputLine0[x] = tileAsVector128[0];
                                    outputLine1[x] = tileAsVector128[1];
                                    outputLine2[x] = tileAsVector128[2];
                                    outputLine3[x] = tileAsVector128[3];
                                }
                                else
                                {
                                    CopyPartialTile<uint>(tileAsUint, outputAsUint, lineBaseOOffs + baseX, width, copyWidth, copyHeight);
                                }

                                data = data[16..];
                            }
                        }

                        imageBaseOOffs += width * height;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);
            }

            return output;
        }

        /// <summary>
        /// Decodes BC4 data (8 bytes per 4x4 block) into 1 byte per pixel, with each output row padded to a multiple of 4 pixels.
        /// </summary>
        /// <param name="data">Block-compressed input, consumed 8 bytes per block</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <param name="signed">True to interpolate the palette from signed endpoints, false for unsigned endpoints</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 1, 4);

            // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
            int alignedWidth = BitUtils.AlignUp(width, 4);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
            Span<byte> outputSpan = output.Span;

            ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);

            Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight];
            Span<byte> rPal = stackalloc byte[8];

            // Each uint of the tile is one full 4-pixel row.
            Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);

            Span<uint> outputLine0 = default;
            Span<uint> outputLine1 = default;
            Span<uint> outputLine2 = default;
            Span<uint> outputLine3 = default;

            int imageBaseOOffs = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        for (int y = 0; y < h; y++)
                        {
                            int baseY = y * BlockHeight;
                            int copyHeight = Math.Min(BlockHeight, height - baseY);
                            int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;

                            if (copyHeight == 4)
                            {
                                outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan[lineBaseOOffs..]);
                                outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan[(lineBaseOOffs + alignedWidth)..]);
                                outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan[(lineBaseOOffs + alignedWidth * 2)..]);
                                outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan[(lineBaseOOffs + alignedWidth * 3)..]);
                            }

                            for (int x = 0; x < w; x++)
                            {
                                int baseX = x * BlockWidth;
                                int copyWidth = Math.Min(BlockWidth, width - baseX);

                                ulong block = data64[0];

                                // First two bytes are the endpoints, the remaining 48 bits are 3-bit palette indices.
                                rPal[0] = (byte)block;
                                rPal[1] = (byte)(block >> 8);

                                if (signed)
                                {
                                    BCnLerpAlphaSnorm(rPal);
                                }
                                else
                                {
                                    BCnLerpAlphaUnorm(rPal);
                                }

                                BCnDecodeTileAlpha(tile, rPal, block >> 16);

                                // Both values are in [1, 4], so the OR equals 4 only when both are exactly 4.
                                if ((copyWidth | copyHeight) == 4)
                                {
                                    outputLine0[x] = tileAsUint[0];
                                    outputLine1[x] = tileAsUint[1];
                                    outputLine2[x] = tileAsUint[2];
                                    outputLine3[x] = tileAsUint[3];
                                }
                                else
                                {
                                    CopyPartialTile<byte>(tile, outputSpan, lineBaseOOffs + baseX, alignedWidth, copyWidth, copyHeight);
                                }

                                data64 = data64[1..];
                            }
                        }

                        imageBaseOOffs += alignedWidth * height;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);

                alignedWidth = BitUtils.AlignUp(width, 4);
            }

            return output;
        }

        /// <summary>
        /// Decodes BC5 data (16 bytes per 4x4 block, two 8-byte channels) into 2 bytes per pixel,
        /// with each output row padded to a multiple of 2 pixels.
        /// </summary>
        /// <param name="data">Block-compressed input, consumed 16 bytes per block</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <param name="signed">True to interpolate the palettes from signed endpoints, false for unsigned endpoints</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC5(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 2, 2);

            // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
            // Pixels are 2 bytes each here, so aligning the width to 2 pixels gives a 4 byte aligned stride.
            int alignedWidth = BitUtils.AlignUp(width, 2);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);

            ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);

            Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
            Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
            Span<byte> rPal = stackalloc byte[8];
            Span<byte> gPal = stackalloc byte[8];

            Span<ushort> outputAsUshort = MemoryMarshal.Cast<byte, ushort>(output.Span);

            Span<uint> rTileAsUint = MemoryMarshal.Cast<byte, uint>(rTile);
            Span<uint> gTileAsUint = MemoryMarshal.Cast<byte, uint>(gTile);

            // Each ulong covers one full 4-pixel row of 2-byte pixels.
            Span<ulong> outputLine0 = default;
            Span<ulong> outputLine1 = default;
            Span<ulong> outputLine2 = default;
            Span<ulong> outputLine3 = default;

            int imageBaseOOffs = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        for (int y = 0; y < h; y++)
                        {
                            int baseY = y * BlockHeight;
                            int copyHeight = Math.Min(BlockHeight, height - baseY);
                            int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;

                            if (copyHeight == 4)
                            {
                                outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[lineBaseOOffs..]);
                                outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[(lineBaseOOffs + alignedWidth)..]);
                                outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[(lineBaseOOffs + alignedWidth * 2)..]);
                                outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[(lineBaseOOffs + alignedWidth * 3)..]);
                            }

                            for (int x = 0; x < w; x++)
                            {
                                int baseX = x * BlockWidth;
                                int copyWidth = Math.Min(BlockWidth, width - baseX);

                                // First 8 bytes feed the low output byte, the next 8 bytes feed the high output byte.
                                ulong blockL = data64[0];
                                ulong blockH = data64[1];

                                rPal[0] = (byte)blockL;
                                rPal[1] = (byte)(blockL >> 8);
                                gPal[0] = (byte)blockH;
                                gPal[1] = (byte)(blockH >> 8);

                                if (signed)
                                {
                                    BCnLerpAlphaSnorm(rPal);
                                    BCnLerpAlphaSnorm(gPal);
                                }
                                else
                                {
                                    BCnLerpAlphaUnorm(rPal);
                                    BCnLerpAlphaUnorm(gPal);
                                }

                                BCnDecodeTileAlpha(rTile, rPal, blockL >> 16);
                                BCnDecodeTileAlpha(gTile, gPal, blockH >> 16);

                                // Both values are in [1, 4], so the OR equals 4 only when both are exactly 4.
                                if ((copyWidth | copyHeight) == 4)
                                {
                                    outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]);
                                    outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]);
                                    outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]);
                                    outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]);
                                }
                                else
                                {
                                    int pixelBaseOOffs = lineBaseOOffs + baseX;

                                    for (int tY = 0; tY < copyHeight; tY++)
                                    {
                                        int line = pixelBaseOOffs + alignedWidth * tY;

                                        for (int tX = 0; tX < copyWidth; tX++)
                                        {
                                            int texel = tY * BlockWidth + tX;

                                            outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
                                        }
                                    }
                                }

                                data64 = data64[2..];
                            }
                        }

                        imageBaseOOffs += alignedWidth * height;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);

                alignedWidth = BitUtils.AlignUp(width, 2);
            }

            return output;
        }

        /// <summary>
        /// Decodes BC6 data (16 bytes per 4x4 block) into 8 bytes per pixel, one slice at a time, using <see cref="BC6Decoder"/>.
        /// </summary>
        /// <param name="data">Block-compressed input</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <param name="signed">Forwarded unchanged to the block decoder</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 8);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
            Span<byte> outputSpan = output.Span;

            int inputOffset = 0;
            int outputOffset = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        BC6Decoder.Decode(outputSpan[outputOffset..], data[inputOffset..], width, height, signed);

                        inputOffset += w * h * 16;
                        outputOffset += width * height * 8;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);
            }

            return output;
        }

        /// <summary>
        /// Decodes BC7 data (16 bytes per 4x4 block) into 4 bytes per pixel, one slice at a time, using <see cref="BC7Decoder"/>.
        /// </summary>
        /// <param name="data">Block-compressed input</param>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels to decode</param>
        /// <param name="layers">Number of layers to decode per level</param>
        /// <returns>A rented buffer holding the decoded pixels</returns>
        public static MemoryOwner<byte> DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
        {
            int size = CalculateOutputSize(width, height, depth, levels, layers, 4);

            MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
            Span<byte> outputSpan = output.Span;

            int inputOffset = 0;
            int outputOffset = 0;

            for (int l = 0; l < levels; l++)
            {
                int w = BitUtils.DivRoundUp(width, BlockWidth);
                int h = BitUtils.DivRoundUp(height, BlockHeight);

                for (int l2 = 0; l2 < layers; l2++)
                {
                    for (int z = 0; z < depth; z++)
                    {
                        BC7Decoder.Decode(outputSpan[outputOffset..], data[inputOffset..], width, height);

                        inputOffset += w * h * 16;
                        outputOffset += width * height * 4;
                    }
                }

                width = Math.Max(1, width >> 1);
                height = Math.Max(1, height >> 1);
                depth = Math.Max(1, depth >> 1);
            }

            return output;
        }

        /// <summary>
        /// Computes the total decoded size, in bytes, of all requested levels and layers.
        /// </summary>
        /// <param name="width">Width of the first level, in pixels</param>
        /// <param name="height">Height of the first level, in pixels</param>
        /// <param name="depth">Depth of the first level, in slices</param>
        /// <param name="levels">Number of mip levels</param>
        /// <param name="layers">Number of layers per level</param>
        /// <param name="bytesPerPixel">Size of one decoded pixel, in bytes</param>
        /// <param name="widthAlignment">Alignment, in pixels, applied to the width of every level; 1 means no padding</param>
        /// <returns>Sum of the sizes of every level, each dimension halved per level and clamped to at least 1</returns>
        private static int CalculateOutputSize(int width, int height, int depth, int levels, int layers, int bytesPerPixel, int widthAlignment = 1)
        {
            int size = 0;

            for (int l = 0; l < levels; l++)
            {
                int levelWidth = Math.Max(1, width >> l);

                if (widthAlignment > 1)
                {
                    levelWidth = BitUtils.AlignUp(levelWidth, widthAlignment);
                }

                size += levelWidth * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * bytesPerPixel;
            }

            return size;
        }

        /// <summary>
        /// Copies the top-left <paramref name="copyWidth"/> x <paramref name="copyHeight"/> region of a decoded
        /// 4x4 tile to the output. Used for blocks that are only partially inside the image.
        /// </summary>
        /// <param name="tile">Decoded tile, <see cref="BlockWidth"/> elements per row</param>
        /// <param name="output">Destination image data</param>
        /// <param name="baseOffset">Element offset of the tile's top-left pixel inside <paramref name="output"/></param>
        /// <param name="stride">Number of elements between two consecutive output rows</param>
        /// <param name="copyWidth">Number of elements to copy per row</param>
        /// <param name="copyHeight">Number of rows to copy</param>
        private static void CopyPartialTile<T>(ReadOnlySpan<T> tile, Span<T> output, int baseOffset, int stride, int copyWidth, int copyHeight)
        {
            for (int tY = 0; tY < copyHeight; tY++)
            {
                tile.Slice(tY * BlockWidth, copyWidth).CopyTo(output.Slice(baseOffset + stride * tY, copyWidth));
            }
        }

        /// <summary>
        /// Interleaves the bytes of two values: bytes of <paramref name="left"/> land on even byte positions,
        /// bytes of <paramref name="right"/> on odd byte positions.
        /// </summary>
        private static ulong InterleaveBytes(uint left, uint right)
        {
            return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8);
        }

        /// <summary>
        /// Spreads the 4 bytes of <paramref name="value"/> over the even byte positions of a 64-bit value,
        /// leaving the odd byte positions zero.
        /// </summary>
        private static ulong InterleaveBytesWithZeros(uint value)
        {
            ulong output = value;

            // Move the upper 16 bits up to bit 32, then move the upper byte of each 16-bit half up by one byte.
            output = (output ^ (output << 16)) & 0xffff0000ffffUL;
            output = (output ^ (output << 8)) & 0xff00ff00ff00ffUL;

            return output;
        }

        /// <summary>
        /// Fills entries 2 to 7 of an 8-entry palette from the unsigned endpoints stored in entries 0 and 1.
        /// </summary>
        /// <param name="alpha">Palette; entries 0 and 1 are read, entries 2 to 7 are written</param>
        private static void BCnLerpAlphaUnorm(Span<byte> alpha)
        {
            byte a0 = alpha[0];
            byte a1 = alpha[1];

            if (a0 > a1)
            {
                // Six interpolated values between the endpoints.
                alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
                alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
                alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
                alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
                alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
                alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
            }
            else
            {
                // Four interpolated values, plus the fixed minimum and maximum.
                alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
                alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
                alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
                alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
                alpha[6] = 0;
                alpha[7] = 0xff;
            }
        }

        /// <summary>
        /// Fills entries 2 to 7 of an 8-entry palette from the endpoints stored in entries 0 and 1,
        /// treating the endpoints as signed bytes.
        /// </summary>
        /// <param name="alpha">Palette; entries 0 and 1 are read, entries 2 to 7 are written</param>
        private static void BCnLerpAlphaSnorm(Span<byte> alpha)
        {
            sbyte a0 = (sbyte)alpha[0];
            sbyte a1 = (sbyte)alpha[1];

            if (a0 > a1)
            {
                // Six interpolated values between the endpoints.
                alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
                alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
                alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
                alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
                alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
                alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
            }
            else
            {
                // Four interpolated values, plus the fixed signed minimum (0x80) and maximum (0x7f).
                alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
                alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
                alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
                alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
                alpha[6] = 0x80;
                alpha[7] = 0x7f;
            }
        }

        /// <summary>
        /// Expands sixteen 3-bit palette indices into a 16-byte single-channel tile.
        /// </summary>
        /// <param name="output">Destination tile; the first 16 bytes are written</param>
        /// <param name="rPal">8-entry palette</param>
        /// <param name="rI">Packed 3-bit indices, texel 0 in the lowest bits</param>
        private static unsafe void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
        {
            if (Avx2.IsSupported)
            {
                Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);

                Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
                Vector128<uint> masks = Vector128.Create(7u);

                Vector128<byte> vClut;

                fixed (byte* pRPal = rPal)
                {
                    vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
                }

                // indices0 holds the indices for texels 0-7 (low 24 bits), indices1 for texels 8-15.
                Vector128<uint> indices0 = Vector128.Create((uint)rI);
                Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));

                // Each vector extracts 4 indices; the "x1" variants are 4 texels (12 bits) further along.
                Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
                Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
                Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
                Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);
                indices00 = Sse2.And(indices00, masks);
                indices10 = Sse2.And(indices10, masks);
                indices01 = Sse2.And(indices01, masks);
                indices11 = Sse2.And(indices11, masks);

                // Narrow the sixteen 32-bit indices down to sixteen bytes, in texel order.
                Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
                Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());

                Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());

                // Byte shuffle performs the palette lookup for all 16 texels at once.
                outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
            }
            else
            {
                for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
                {
                    output[i] = rPal[(int)(rI & 7)];
                }
            }
        }

        /// <summary>
        /// Expands sixteen 3-bit palette indices into the 4th byte of every texel of a 4 bytes per pixel tile.
        /// </summary>
        /// <remarks>
        /// The AVX2 path ORs the value into the existing tile contents, while the fallback path overwrites the byte.
        /// Both give the same result when the 4th byte of every texel is zero on entry, as it is after
        /// <see cref="BC23DecodeTileRgb"/>.
        /// </remarks>
        /// <param name="output">Destination tile, 64 bytes</param>
        /// <param name="rPal">8-entry palette</param>
        /// <param name="rI">Packed 3-bit indices, texel 0 in the lowest bits</param>
        private static unsafe void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI)
        {
            if (Avx2.IsSupported)
            {
                Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

                Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);

                Vector128<uint> vClut128;

                fixed (byte* pRPal = rPal)
                {
                    vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32();
                }

                // Widen each palette byte to 32 bits and move it into the top byte of its element.
                Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32();
                vClut = Avx2.ShiftLeftLogical(vClut, 24);

                // indices0 holds the indices for texels 0-7 (low 24 bits), indices1 for texels 8-15.
                Vector256<uint> indices0 = Vector256.Create((uint)rI);
                Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24));

                // No masking here: the permute below is given the shifted values as-is.
                indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
                indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts);

                outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0));
                outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1));
            }
            else
            {
                for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3)
                {
                    output[i] = rPal[(int)(rI & 7)];
                }
            }
        }

        /// <summary>
        /// Decodes the 8-byte color part of a BC1 block into a 4 bytes per pixel tile, with alpha included.
        /// </summary>
        /// <param name="output">Destination tile, 64 bytes</param>
        /// <param name="input">Block data: two 16-bit colors followed by 32 bits of 2-bit indices</param>
        private static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
        {
            Span<uint> clut = stackalloc uint[4];

            uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input);
            uint c0 = (ushort)c0c1;
            uint c1 = (ushort)(c0c1 >> 16);

            clut[0] = ConvertRgb565ToRgb888(c0) | 0xff000000;
            clut[1] = ConvertRgb565ToRgb888(c1) | 0xff000000;
            clut[2] = BC1LerpRgb2(clut[0], clut[1], c0, c1);
            clut[3] = BC1LerpRgb3(clut[0], clut[1], c0, c1);

            BCnDecodeTileRgb(clut, output, input);
        }

        /// <summary>
        /// Decodes the 8-byte color part of a BC2 or BC3 block into a 4 bytes per pixel tile.
        /// The top (alpha) byte of every texel is left at zero.
        /// </summary>
        /// <param name="output">Destination tile, 64 bytes</param>
        /// <param name="input">Block data: two 16-bit colors followed by 32 bits of 2-bit indices</param>
        private static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
        {
            Span<uint> clut = stackalloc uint[4];

            uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input);
            uint c0 = (ushort)c0c1;
            uint c1 = (ushort)(c0c1 >> 16);

            clut[0] = ConvertRgb565ToRgb888(c0);
            clut[1] = ConvertRgb565ToRgb888(c1);
            clut[2] = BC23LerpRgb2(clut[0], clut[1]);
            clut[3] = BC23LerpRgb3(clut[0], clut[1]);

            BCnDecodeTileRgb(clut, output, input);
        }

        /// <summary>
        /// Writes a 16-texel tile by looking up each 2-bit index in a 4-entry color table.
        /// </summary>
        /// <param name="clut">4-entry color table</param>
        /// <param name="output">Destination tile, 64 bytes</param>
        /// <param name="input">Block data; the 32 bits of indices are read from byte offset 4</param>
        private static unsafe void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
        {
            if (Avx2.IsSupported)
            {
                Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);

                Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
                Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
                Vector256<uint> masks = Vector256.Create(3u);

                Vector256<uint> vClut;

                fixed (uint* pClut = &clut[0])
                {
                    // Only the lower 4 elements are loaded; indices are masked to 0-3 below so the rest is never selected.
                    vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe();
                }

                Vector256<uint> indices0;

                fixed (byte* pInput = input)
                {
                    indices0 = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4));
                }

                Vector256<uint> indices1 = indices0;

                // indices0 selects texels 0-7, indices1 selects texels 8-15.
                indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts0);
                indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts1);
                indices0 = Avx2.And(indices0, masks);
                indices1 = Avx2.And(indices1, masks);

                outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0);
                outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1);
            }
            else
            {
                Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);

                uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input[4..]);

                for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2)
                {
                    outputAsUint[i] = clut[(int)(indices & 3)];
                }
            }
        }

        /// <summary>
        /// Computes BC1 color table entry 2.
        /// </summary>
        /// <param name="color0">First endpoint, expanded, with alpha set</param>
        /// <param name="color1">Second endpoint, expanded, with alpha set</param>
        /// <param name="c0">First endpoint as stored in the block</param>
        /// <param name="c1">Second endpoint as stored in the block</param>
        /// <returns>Opaque 2:1 blend when <paramref name="c0"/> is greater than <paramref name="c1"/>, opaque average otherwise</returns>
        private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
        {
            if (c0 > c1)
            {
                return BC23LerpRgb2(color0, color1) | 0xff000000;
            }

            // Per-byte average without unpacking: (a & b) + ((a ^ b) >> 1), with the mask
            // stopping bits shifted out of one channel from leaking into the next.
            uint carry = color0 & color1;
            uint addHalve = ((color0 ^ color1) >> 1) & 0x7f7f7f;

            return (addHalve + carry) | 0xff000000;
        }

        /// <summary>
        /// Blends two packed colors per channel as (2 * color0 + color1) / 3. The top byte of the result is zero.
        /// </summary>
        private static uint BC23LerpRgb2(uint color0, uint color1)
        {
            // Green and blue are blended in place (still shifted); the fractional bits that
            // end up below each channel are masked off on return.
            uint r0 = (byte)color0;
            uint g0 = color0 & 0xff00;
            uint b0 = color0 & 0xff0000;

            uint r1 = (byte)color1;
            uint g1 = color1 & 0xff00;
            uint b1 = color1 & 0xff0000;

            uint mixR = (2 * r0 + r1) / 3;
            uint mixG = (2 * g0 + g1) / 3;
            uint mixB = (2 * b0 + b1) / 3;

            return mixR | (mixG & 0xff00) | (mixB & 0xff0000);
        }

        /// <summary>
        /// Computes BC1 color table entry 3.
        /// </summary>
        /// <param name="color0">First endpoint, expanded, with alpha set</param>
        /// <param name="color1">Second endpoint, expanded, with alpha set</param>
        /// <param name="c0">First endpoint as stored in the block</param>
        /// <param name="c1">Second endpoint as stored in the block</param>
        /// <returns>Opaque 1:2 blend when <paramref name="c0"/> is greater than <paramref name="c1"/>, zero (all channels and alpha) otherwise</returns>
        private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
        {
            if (c0 > c1)
            {
                return BC23LerpRgb3(color0, color1) | 0xff000000;
            }

            return 0;
        }

        /// <summary>
        /// Blends two packed colors per channel as (color0 + 2 * color1) / 3. The top byte of the result is zero.
        /// </summary>
        private static uint BC23LerpRgb3(uint color0, uint color1)
        {
            uint r0 = (byte)color0;
            uint g0 = color0 & 0xff00;
            uint b0 = color0 & 0xff0000;

            uint r1 = (byte)color1;
            uint g1 = color1 & 0xff00;
            uint b1 = color1 & 0xff0000;

            uint mixR = (2 * r1 + r0) / 3;
            uint mixG = (2 * g1 + g0) / 3;
            uint mixB = (2 * b1 + b0) / 3;

            return mixR | (mixG & 0xff00) | (mixB & 0xff0000);
        }

        /// <summary>
        /// Expands a 16-bit 5:6:5 color into a packed 8 bits per channel color.
        /// </summary>
        /// <param name="value">Color with the 5-bit field in bits 11-15 treated as red, bits 5-10 as green and bits 0-4 as blue</param>
        /// <returns>Packed color with red in bits 0-7, green in bits 8-15, blue in bits 16-23 and the top byte zero</returns>
        private static uint ConvertRgb565ToRgb888(uint value)
        {
            // Place each field at the top of its destination byte.
            uint b = (value & 0x1f) << 19;
            uint g = (value << 5) & 0xfc00;
            uint r = (value >> 8) & 0xf8;

            // Replicate the high bits of each field into the low bits of its byte to fill the full 8-bit range.
            b |= b >> 5;
            g |= g >> 6;
            r |= r >> 5;

            return r | (g & 0xff00) | (b & 0xff0000);
        }
    }
}