/ src / Ryujinx.Graphics.Texture / BCnDecoder.cs
BCnDecoder.cs
  1  using Ryujinx.Common;
  2  using Ryujinx.Common.Memory;
  3  using System;
  4  using System.Buffers.Binary;
  5  using System.Runtime.InteropServices;
  6  using System.Runtime.Intrinsics;
  7  using System.Runtime.Intrinsics.X86;
  8  
  9  namespace Ryujinx.Graphics.Texture
 10  {
 11      public static class BCnDecoder
 12      {
 13          private const int BlockWidth = 4;
 14          private const int BlockHeight = 4;
 15  
 16          public static MemoryOwner<byte> DecodeBC1(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
 17          {
 18              int size = 0;
 19  
 20              for (int l = 0; l < levels; l++)
 21              {
 22                  size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
 23              }
 24  
 25              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
 26  
 27              Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
 28  
 29              Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
 30              Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output.Span);
 31  
 32              Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
 33  
 34              Span<Vector128<byte>> outputLine0 = default;
 35              Span<Vector128<byte>> outputLine1 = default;
 36              Span<Vector128<byte>> outputLine2 = default;
 37              Span<Vector128<byte>> outputLine3 = default;
 38  
 39              int imageBaseOOffs = 0;
 40  
 41              for (int l = 0; l < levels; l++)
 42              {
 43                  int w = BitUtils.DivRoundUp(width, BlockWidth);
 44                  int h = BitUtils.DivRoundUp(height, BlockHeight);
 45  
 46                  for (int l2 = 0; l2 < layers; l2++)
 47                  {
 48                      for (int z = 0; z < depth; z++)
 49                      {
 50                          for (int y = 0; y < h; y++)
 51                          {
 52                              int baseY = y * BlockHeight;
 53                              int copyHeight = Math.Min(BlockHeight, height - baseY);
 54                              int lineBaseOOffs = imageBaseOOffs + baseY * width;
 55  
 56                              if (copyHeight == 4)
 57                              {
 58                                  outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[lineBaseOOffs..]);
 59                                  outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width)..]);
 60                                  outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 2)..]);
 61                                  outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 3)..]);
 62                              }
 63  
 64                              for (int x = 0; x < w; x++)
 65                              {
 66                                  int baseX = x * BlockWidth;
 67                                  int copyWidth = Math.Min(BlockWidth, width - baseX);
 68  
 69                                  BC1DecodeTileRgb(tile, data);
 70  
 71                                  if ((copyWidth | copyHeight) == 4)
 72                                  {
 73                                      outputLine0[x] = tileAsVector128[0];
 74                                      outputLine1[x] = tileAsVector128[1];
 75                                      outputLine2[x] = tileAsVector128[2];
 76                                      outputLine3[x] = tileAsVector128[3];
 77                                  }
 78                                  else
 79                                  {
 80                                      int pixelBaseOOffs = lineBaseOOffs + baseX;
 81  
 82                                      for (int tY = 0; tY < copyHeight; tY++)
 83                                      {
 84                                          tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
 85                                      }
 86                                  }
 87  
 88                                  data = data[8..];
 89                              }
 90                          }
 91  
 92                          imageBaseOOffs += width * height;
 93                      }
 94                  }
 95  
 96                  width = Math.Max(1, width >> 1);
 97                  height = Math.Max(1, height >> 1);
 98                  depth = Math.Max(1, depth >> 1);
 99              }
100  
101              return output;
102          }
103  
104          public static MemoryOwner<byte> DecodeBC2(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
105          {
106              int size = 0;
107  
108              for (int l = 0; l < levels; l++)
109              {
110                  size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
111              }
112  
113              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
114  
115              Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
116  
117              Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
118              Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output.Span);
119  
120              Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
121  
122              Span<Vector128<byte>> outputLine0 = default;
123              Span<Vector128<byte>> outputLine1 = default;
124              Span<Vector128<byte>> outputLine2 = default;
125              Span<Vector128<byte>> outputLine3 = default;
126  
127              int imageBaseOOffs = 0;
128  
129              for (int l = 0; l < levels; l++)
130              {
131                  int w = BitUtils.DivRoundUp(width, BlockWidth);
132                  int h = BitUtils.DivRoundUp(height, BlockHeight);
133  
134                  for (int l2 = 0; l2 < layers; l2++)
135                  {
136                      for (int z = 0; z < depth; z++)
137                      {
138                          for (int y = 0; y < h; y++)
139                          {
140                              int baseY = y * BlockHeight;
141                              int copyHeight = Math.Min(BlockHeight, height - baseY);
142                              int lineBaseOOffs = imageBaseOOffs + baseY * width;
143  
144                              if (copyHeight == 4)
145                              {
146                                  outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[lineBaseOOffs..]);
147                                  outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width)..]);
148                                  outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 2)..]);
149                                  outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 3)..]);
150                              }
151  
152                              for (int x = 0; x < w; x++)
153                              {
154                                  int baseX = x * BlockWidth;
155                                  int copyWidth = Math.Min(BlockWidth, width - baseX);
156  
157                                  BC23DecodeTileRgb(tile, data[8..]);
158  
159                                  ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);
160  
161                                  for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, block >>= 4)
162                                  {
163                                      tile[i] = (byte)((block & 0xf) | (block << 4));
164                                  }
165  
166                                  if ((copyWidth | copyHeight) == 4)
167                                  {
168                                      outputLine0[x] = tileAsVector128[0];
169                                      outputLine1[x] = tileAsVector128[1];
170                                      outputLine2[x] = tileAsVector128[2];
171                                      outputLine3[x] = tileAsVector128[3];
172                                  }
173                                  else
174                                  {
175                                      int pixelBaseOOffs = lineBaseOOffs + baseX;
176  
177                                      for (int tY = 0; tY < copyHeight; tY++)
178                                      {
179                                          tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
180                                      }
181                                  }
182  
183                                  data = data[16..];
184                              }
185                          }
186  
187                          imageBaseOOffs += width * height;
188                      }
189                  }
190  
191                  width = Math.Max(1, width >> 1);
192                  height = Math.Max(1, height >> 1);
193                  depth = Math.Max(1, depth >> 1);
194              }
195  
196              return output;
197          }
198  
199          public static MemoryOwner<byte> DecodeBC3(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
200          {
201              int size = 0;
202  
203              for (int l = 0; l < levels; l++)
204              {
205                  size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
206              }
207  
208              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
209  
210              Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight * 4];
211              Span<byte> rPal = stackalloc byte[8];
212  
213              Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
214              Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output.Span);
215  
216              Span<Vector128<byte>> tileAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(tile);
217  
218              Span<Vector128<byte>> outputLine0 = default;
219              Span<Vector128<byte>> outputLine1 = default;
220              Span<Vector128<byte>> outputLine2 = default;
221              Span<Vector128<byte>> outputLine3 = default;
222  
223              int imageBaseOOffs = 0;
224  
225              for (int l = 0; l < levels; l++)
226              {
227                  int w = BitUtils.DivRoundUp(width, BlockWidth);
228                  int h = BitUtils.DivRoundUp(height, BlockHeight);
229  
230                  for (int l2 = 0; l2 < layers; l2++)
231                  {
232                      for (int z = 0; z < depth; z++)
233                      {
234                          for (int y = 0; y < h; y++)
235                          {
236                              int baseY = y * BlockHeight;
237                              int copyHeight = Math.Min(BlockHeight, height - baseY);
238                              int lineBaseOOffs = imageBaseOOffs + baseY * width;
239  
240                              if (copyHeight == 4)
241                              {
242                                  outputLine0 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[lineBaseOOffs..]);
243                                  outputLine1 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width)..]);
244                                  outputLine2 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 2)..]);
245                                  outputLine3 = MemoryMarshal.Cast<uint, Vector128<byte>>(outputAsUint[(lineBaseOOffs + width * 3)..]);
246                              }
247  
248                              for (int x = 0; x < w; x++)
249                              {
250                                  int baseX = x * BlockWidth;
251                                  int copyWidth = Math.Min(BlockWidth, width - baseX);
252  
253                                  BC23DecodeTileRgb(tile, data[8..]);
254  
255                                  ulong block = BinaryPrimitives.ReadUInt64LittleEndian(data);
256  
257                                  rPal[0] = (byte)block;
258                                  rPal[1] = (byte)(block >> 8);
259  
260                                  BCnLerpAlphaUnorm(rPal);
261                                  BCnDecodeTileAlphaRgba(tile, rPal, block >> 16);
262  
263                                  if ((copyWidth | copyHeight) == 4)
264                                  {
265                                      outputLine0[x] = tileAsVector128[0];
266                                      outputLine1[x] = tileAsVector128[1];
267                                      outputLine2[x] = tileAsVector128[2];
268                                      outputLine3[x] = tileAsVector128[3];
269                                  }
270                                  else
271                                  {
272                                      int pixelBaseOOffs = lineBaseOOffs + baseX;
273  
274                                      for (int tY = 0; tY < copyHeight; tY++)
275                                      {
276                                          tileAsUint.Slice(tY * 4, copyWidth).CopyTo(outputAsUint.Slice(pixelBaseOOffs + width * tY, copyWidth));
277                                      }
278                                  }
279  
280                                  data = data[16..];
281                              }
282                          }
283  
284                          imageBaseOOffs += width * height;
285                      }
286                  }
287  
288                  width = Math.Max(1, width >> 1);
289                  height = Math.Max(1, height >> 1);
290                  depth = Math.Max(1, depth >> 1);
291              }
292  
293              return output;
294          }
295  
296          public static MemoryOwner<byte> DecodeBC4(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
297          {
298              int size = 0;
299  
300              for (int l = 0; l < levels; l++)
301              {
302                  size += BitUtils.AlignUp(Math.Max(1, width >> l), 4) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers;
303              }
304  
305              // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
306              int alignedWidth = BitUtils.AlignUp(width, 4);
307  
308              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
309              Span<byte> outputSpan = output.Span;
310  
311              ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
312  
313              Span<byte> tile = stackalloc byte[BlockWidth * BlockHeight];
314              Span<byte> rPal = stackalloc byte[8];
315  
316              Span<uint> tileAsUint = MemoryMarshal.Cast<byte, uint>(tile);
317  
318              Span<uint> outputLine0 = default;
319              Span<uint> outputLine1 = default;
320              Span<uint> outputLine2 = default;
321              Span<uint> outputLine3 = default;
322  
323              int imageBaseOOffs = 0;
324  
325              for (int l = 0; l < levels; l++)
326              {
327                  int w = BitUtils.DivRoundUp(width, BlockWidth);
328                  int h = BitUtils.DivRoundUp(height, BlockHeight);
329  
330                  for (int l2 = 0; l2 < layers; l2++)
331                  {
332                      for (int z = 0; z < depth; z++)
333                      {
334                          for (int y = 0; y < h; y++)
335                          {
336                              int baseY = y * BlockHeight;
337                              int copyHeight = Math.Min(BlockHeight, height - baseY);
338                              int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;
339  
340                              if (copyHeight == 4)
341                              {
342                                  outputLine0 = MemoryMarshal.Cast<byte, uint>(outputSpan[lineBaseOOffs..]);
343                                  outputLine1 = MemoryMarshal.Cast<byte, uint>(outputSpan[(lineBaseOOffs + alignedWidth)..]);
344                                  outputLine2 = MemoryMarshal.Cast<byte, uint>(outputSpan[(lineBaseOOffs + alignedWidth * 2)..]);
345                                  outputLine3 = MemoryMarshal.Cast<byte, uint>(outputSpan[(lineBaseOOffs + alignedWidth * 3)..]);
346                              }
347  
348                              for (int x = 0; x < w; x++)
349                              {
350                                  int baseX = x * BlockWidth;
351                                  int copyWidth = Math.Min(BlockWidth, width - baseX);
352  
353                                  ulong block = data64[0];
354  
355                                  rPal[0] = (byte)block;
356                                  rPal[1] = (byte)(block >> 8);
357  
358                                  if (signed)
359                                  {
360                                      BCnLerpAlphaSnorm(rPal);
361                                  }
362                                  else
363                                  {
364                                      BCnLerpAlphaUnorm(rPal);
365                                  }
366  
367                                  BCnDecodeTileAlpha(tile, rPal, block >> 16);
368  
369                                  if ((copyWidth | copyHeight) == 4)
370                                  {
371                                      outputLine0[x] = tileAsUint[0];
372                                      outputLine1[x] = tileAsUint[1];
373                                      outputLine2[x] = tileAsUint[2];
374                                      outputLine3[x] = tileAsUint[3];
375                                  }
376                                  else
377                                  {
378                                      int pixelBaseOOffs = lineBaseOOffs + baseX;
379  
380                                      for (int tY = 0; tY < copyHeight; tY++)
381                                      {
382                                          tile.Slice(tY * 4, copyWidth).CopyTo(outputSpan.Slice(pixelBaseOOffs + alignedWidth * tY, copyWidth));
383                                      }
384                                  }
385  
386                                  data64 = data64[1..];
387                              }
388                          }
389  
390                          imageBaseOOffs += alignedWidth * height;
391                      }
392                  }
393  
394                  width = Math.Max(1, width >> 1);
395                  height = Math.Max(1, height >> 1);
396                  depth = Math.Max(1, depth >> 1);
397  
398                  alignedWidth = BitUtils.AlignUp(width, 4);
399              }
400  
401              return output;
402          }
403  
404          public static MemoryOwner<byte> DecodeBC5(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
405          {
406              int size = 0;
407  
408              for (int l = 0; l < levels; l++)
409              {
410                  size += BitUtils.AlignUp(Math.Max(1, width >> l), 2) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 2;
411              }
412  
413              // Backends currently expect a stride alignment of 4 bytes, so output width must be aligned.
414              int alignedWidth = BitUtils.AlignUp(width, 2);
415  
416              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
417  
418              ReadOnlySpan<ulong> data64 = MemoryMarshal.Cast<byte, ulong>(data);
419  
420              Span<byte> rTile = stackalloc byte[BlockWidth * BlockHeight * 2];
421              Span<byte> gTile = stackalloc byte[BlockWidth * BlockHeight * 2];
422              Span<byte> rPal = stackalloc byte[8];
423              Span<byte> gPal = stackalloc byte[8];
424  
425              Span<ushort> outputAsUshort = MemoryMarshal.Cast<byte, ushort>(output.Span);
426  
427              Span<uint> rTileAsUint = MemoryMarshal.Cast<byte, uint>(rTile);
428              Span<uint> gTileAsUint = MemoryMarshal.Cast<byte, uint>(gTile);
429  
430              Span<ulong> outputLine0 = default;
431              Span<ulong> outputLine1 = default;
432              Span<ulong> outputLine2 = default;
433              Span<ulong> outputLine3 = default;
434  
435              int imageBaseOOffs = 0;
436  
437              for (int l = 0; l < levels; l++)
438              {
439                  int w = BitUtils.DivRoundUp(width, BlockWidth);
440                  int h = BitUtils.DivRoundUp(height, BlockHeight);
441  
442                  for (int l2 = 0; l2 < layers; l2++)
443                  {
444                      for (int z = 0; z < depth; z++)
445                      {
446                          for (int y = 0; y < h; y++)
447                          {
448                              int baseY = y * BlockHeight;
449                              int copyHeight = Math.Min(BlockHeight, height - baseY);
450                              int lineBaseOOffs = imageBaseOOffs + baseY * alignedWidth;
451  
452                              if (copyHeight == 4)
453                              {
454                                  outputLine0 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[lineBaseOOffs..]);
455                                  outputLine1 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[(lineBaseOOffs + alignedWidth)..]);
456                                  outputLine2 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[(lineBaseOOffs + alignedWidth * 2)..]);
457                                  outputLine3 = MemoryMarshal.Cast<ushort, ulong>(outputAsUshort[(lineBaseOOffs + alignedWidth * 3)..]);
458                              }
459  
460                              for (int x = 0; x < w; x++)
461                              {
462                                  int baseX = x * BlockWidth;
463                                  int copyWidth = Math.Min(BlockWidth, width - baseX);
464  
465                                  ulong blockL = data64[0];
466                                  ulong blockH = data64[1];
467  
468                                  rPal[0] = (byte)blockL;
469                                  rPal[1] = (byte)(blockL >> 8);
470                                  gPal[0] = (byte)blockH;
471                                  gPal[1] = (byte)(blockH >> 8);
472  
473                                  if (signed)
474                                  {
475                                      BCnLerpAlphaSnorm(rPal);
476                                      BCnLerpAlphaSnorm(gPal);
477                                  }
478                                  else
479                                  {
480                                      BCnLerpAlphaUnorm(rPal);
481                                      BCnLerpAlphaUnorm(gPal);
482                                  }
483  
484                                  BCnDecodeTileAlpha(rTile, rPal, blockL >> 16);
485                                  BCnDecodeTileAlpha(gTile, gPal, blockH >> 16);
486  
487                                  if ((copyWidth | copyHeight) == 4)
488                                  {
489                                      outputLine0[x] = InterleaveBytes(rTileAsUint[0], gTileAsUint[0]);
490                                      outputLine1[x] = InterleaveBytes(rTileAsUint[1], gTileAsUint[1]);
491                                      outputLine2[x] = InterleaveBytes(rTileAsUint[2], gTileAsUint[2]);
492                                      outputLine3[x] = InterleaveBytes(rTileAsUint[3], gTileAsUint[3]);
493                                  }
494                                  else
495                                  {
496                                      int pixelBaseOOffs = lineBaseOOffs + baseX;
497  
498                                      for (int tY = 0; tY < copyHeight; tY++)
499                                      {
500                                          int line = pixelBaseOOffs + alignedWidth * tY;
501  
502                                          for (int tX = 0; tX < copyWidth; tX++)
503                                          {
504                                              int texel = tY * BlockWidth + tX;
505  
506                                              outputAsUshort[line + tX] = (ushort)(rTile[texel] | (gTile[texel] << 8));
507                                          }
508                                      }
509                                  }
510  
511                                  data64 = data64[2..];
512                              }
513                          }
514  
515                          imageBaseOOffs += alignedWidth * height;
516                      }
517                  }
518  
519                  width = Math.Max(1, width >> 1);
520                  height = Math.Max(1, height >> 1);
521                  depth = Math.Max(1, depth >> 1);
522  
523                  alignedWidth = BitUtils.AlignUp(width, 2);
524              }
525  
526              return output;
527          }
528  
529          public static MemoryOwner<byte> DecodeBC6(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers, bool signed)
530          {
531              int size = 0;
532  
533              for (int l = 0; l < levels; l++)
534              {
535                  size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 8;
536              }
537  
538              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
539              Span<byte> outputSpan = output.Span;
540  
541              int inputOffset = 0;
542              int outputOffset = 0;
543  
544              for (int l = 0; l < levels; l++)
545              {
546                  int w = BitUtils.DivRoundUp(width, BlockWidth);
547                  int h = BitUtils.DivRoundUp(height, BlockHeight);
548  
549                  for (int l2 = 0; l2 < layers; l2++)
550                  {
551                      for (int z = 0; z < depth; z++)
552                      {
553                          BC6Decoder.Decode(outputSpan[outputOffset..], data[inputOffset..], width, height, signed);
554  
555                          inputOffset += w * h * 16;
556                          outputOffset += width * height * 8;
557                      }
558                  }
559  
560                  width = Math.Max(1, width >> 1);
561                  height = Math.Max(1, height >> 1);
562                  depth = Math.Max(1, depth >> 1);
563              }
564  
565              return output;
566          }
567  
568          public static MemoryOwner<byte> DecodeBC7(ReadOnlySpan<byte> data, int width, int height, int depth, int levels, int layers)
569          {
570              int size = 0;
571  
572              for (int l = 0; l < levels; l++)
573              {
574                  size += Math.Max(1, width >> l) * Math.Max(1, height >> l) * Math.Max(1, depth >> l) * layers * 4;
575              }
576  
577              MemoryOwner<byte> output = MemoryOwner<byte>.Rent(size);
578              Span<byte> outputSpan = output.Span;
579  
580              int inputOffset = 0;
581              int outputOffset = 0;
582  
583              for (int l = 0; l < levels; l++)
584              {
585                  int w = BitUtils.DivRoundUp(width, BlockWidth);
586                  int h = BitUtils.DivRoundUp(height, BlockHeight);
587  
588                  for (int l2 = 0; l2 < layers; l2++)
589                  {
590                      for (int z = 0; z < depth; z++)
591                      {
592                          BC7Decoder.Decode(outputSpan[outputOffset..], data[inputOffset..], width, height);
593  
594                          inputOffset += w * h * 16;
595                          outputOffset += width * height * 4;
596                      }
597                  }
598  
599                  width = Math.Max(1, width >> 1);
600                  height = Math.Max(1, height >> 1);
601                  depth = Math.Max(1, depth >> 1);
602              }
603  
604              return output;
605          }
606  
607          private static ulong InterleaveBytes(uint left, uint right)
608          {
609              return InterleaveBytesWithZeros(left) | (InterleaveBytesWithZeros(right) << 8);
610          }
611  
612          private static ulong InterleaveBytesWithZeros(uint value)
613          {
614              ulong output = value;
615              output = (output ^ (output << 16)) & 0xffff0000ffffUL;
616              output = (output ^ (output << 8)) & 0xff00ff00ff00ffUL;
617              return output;
618          }
619  
620          private static void BCnLerpAlphaUnorm(Span<byte> alpha)
621          {
622              byte a0 = alpha[0];
623              byte a1 = alpha[1];
624  
625              if (a0 > a1)
626              {
627                  alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
628                  alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
629                  alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
630                  alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
631                  alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
632                  alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
633              }
634              else
635              {
636                  alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
637                  alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
638                  alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
639                  alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
640                  alpha[6] = 0;
641                  alpha[7] = 0xff;
642              }
643          }
644  
645          private static void BCnLerpAlphaSnorm(Span<byte> alpha)
646          {
647              sbyte a0 = (sbyte)alpha[0];
648              sbyte a1 = (sbyte)alpha[1];
649  
650              if (a0 > a1)
651              {
652                  alpha[2] = (byte)((6 * a0 + 1 * a1) / 7);
653                  alpha[3] = (byte)((5 * a0 + 2 * a1) / 7);
654                  alpha[4] = (byte)((4 * a0 + 3 * a1) / 7);
655                  alpha[5] = (byte)((3 * a0 + 4 * a1) / 7);
656                  alpha[6] = (byte)((2 * a0 + 5 * a1) / 7);
657                  alpha[7] = (byte)((1 * a0 + 6 * a1) / 7);
658              }
659              else
660              {
661                  alpha[2] = (byte)((4 * a0 + 1 * a1) / 5);
662                  alpha[3] = (byte)((3 * a0 + 2 * a1) / 5);
663                  alpha[4] = (byte)((2 * a0 + 3 * a1) / 5);
664                  alpha[5] = (byte)((1 * a0 + 4 * a1) / 5);
665                  alpha[6] = 0x80;
666                  alpha[7] = 0x7f;
667              }
668          }
669  
670          private unsafe static void BCnDecodeTileAlpha(Span<byte> output, Span<byte> rPal, ulong rI)
671          {
672              if (Avx2.IsSupported)
673              {
674                  Span<Vector128<byte>> outputAsVector128 = MemoryMarshal.Cast<byte, Vector128<byte>>(output);
675  
676                  Vector128<uint> shifts = Vector128.Create(0u, 3u, 6u, 9u);
677                  Vector128<uint> masks = Vector128.Create(7u);
678  
679                  Vector128<byte> vClut;
680  
681                  fixed (byte* pRPal = rPal)
682                  {
683                      vClut = Sse2.LoadScalarVector128((ulong*)pRPal).AsByte();
684                  }
685  
686                  Vector128<uint> indices0 = Vector128.Create((uint)rI);
687                  Vector128<uint> indices1 = Vector128.Create((uint)(rI >> 24));
688                  Vector128<uint> indices00 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
689                  Vector128<uint> indices10 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
690                  Vector128<uint> indices01 = Sse2.ShiftRightLogical(indices00, 12);
691                  Vector128<uint> indices11 = Sse2.ShiftRightLogical(indices10, 12);
692                  indices00 = Sse2.And(indices00, masks);
693                  indices10 = Sse2.And(indices10, masks);
694                  indices01 = Sse2.And(indices01, masks);
695                  indices11 = Sse2.And(indices11, masks);
696  
697                  Vector128<ushort> indicesW0 = Sse41.PackUnsignedSaturate(indices00.AsInt32(), indices01.AsInt32());
698                  Vector128<ushort> indicesW1 = Sse41.PackUnsignedSaturate(indices10.AsInt32(), indices11.AsInt32());
699  
700                  Vector128<byte> indices = Sse2.PackUnsignedSaturate(indicesW0.AsInt16(), indicesW1.AsInt16());
701  
702                  outputAsVector128[0] = Ssse3.Shuffle(vClut, indices);
703              }
704              else
705              {
706                  for (int i = 0; i < BlockWidth * BlockHeight; i++, rI >>= 3)
707                  {
708                      output[i] = rPal[(int)(rI & 7)];
709                  }
710              }
711          }
712  
713          private unsafe static void BCnDecodeTileAlphaRgba(Span<byte> output, Span<byte> rPal, ulong rI)
714          {
715              if (Avx2.IsSupported)
716              {
717                  Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
718  
719                  Vector256<uint> shifts = Vector256.Create(0u, 3u, 6u, 9u, 12u, 15u, 18u, 21u);
720  
721                  Vector128<uint> vClut128;
722  
723                  fixed (byte* pRPal = rPal)
724                  {
725                      vClut128 = Sse2.LoadScalarVector128((ulong*)pRPal).AsUInt32();
726                  }
727  
728                  Vector256<uint> vClut = Avx2.ConvertToVector256Int32(vClut128.AsByte()).AsUInt32();
729                  vClut = Avx2.ShiftLeftLogical(vClut, 24);
730  
731                  Vector256<uint> indices0 = Vector256.Create((uint)rI);
732                  Vector256<uint> indices1 = Vector256.Create((uint)(rI >> 24));
733  
734                  indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts);
735                  indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts);
736  
737                  outputAsVector256[0] = Avx2.Or(outputAsVector256[0], Avx2.PermuteVar8x32(vClut, indices0));
738                  outputAsVector256[1] = Avx2.Or(outputAsVector256[1], Avx2.PermuteVar8x32(vClut, indices1));
739              }
740              else
741              {
742                  for (int i = 3; i < BlockWidth * BlockHeight * 4; i += 4, rI >>= 3)
743                  {
744                      output[i] = rPal[(int)(rI & 7)];
745                  }
746              }
747          }
748  
749          private unsafe static void BC1DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
750          {
751              Span<uint> clut = stackalloc uint[4];
752  
753              uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input);
754              uint c0 = (ushort)c0c1;
755              uint c1 = (ushort)(c0c1 >> 16);
756  
757              clut[0] = ConvertRgb565ToRgb888(c0) | 0xff000000;
758              clut[1] = ConvertRgb565ToRgb888(c1) | 0xff000000;
759              clut[2] = BC1LerpRgb2(clut[0], clut[1], c0, c1);
760              clut[3] = BC1LerpRgb3(clut[0], clut[1], c0, c1);
761  
762              BCnDecodeTileRgb(clut, output, input);
763          }
764  
765          private unsafe static void BC23DecodeTileRgb(Span<byte> output, ReadOnlySpan<byte> input)
766          {
767              Span<uint> clut = stackalloc uint[4];
768  
769              uint c0c1 = BinaryPrimitives.ReadUInt32LittleEndian(input);
770              uint c0 = (ushort)c0c1;
771              uint c1 = (ushort)(c0c1 >> 16);
772  
773              clut[0] = ConvertRgb565ToRgb888(c0);
774              clut[1] = ConvertRgb565ToRgb888(c1);
775              clut[2] = BC23LerpRgb2(clut[0], clut[1]);
776              clut[3] = BC23LerpRgb3(clut[0], clut[1]);
777  
778              BCnDecodeTileRgb(clut, output, input);
779          }
780  
781          private unsafe static void BCnDecodeTileRgb(Span<uint> clut, Span<byte> output, ReadOnlySpan<byte> input)
782          {
783              if (Avx2.IsSupported)
784              {
785                  Span<Vector256<uint>> outputAsVector256 = MemoryMarshal.Cast<byte, Vector256<uint>>(output);
786  
787                  Vector256<uint> shifts0 = Vector256.Create(0u, 2u, 4u, 6u, 8u, 10u, 12u, 14u);
788                  Vector256<uint> shifts1 = Vector256.Create(16u, 18u, 20u, 22u, 24u, 26u, 28u, 30u);
789                  Vector256<uint> masks = Vector256.Create(3u);
790  
791                  Vector256<uint> vClut;
792  
793                  fixed (uint* pClut = &clut[0])
794                  {
795                      vClut = Sse2.LoadVector128(pClut).ToVector256Unsafe();
796                  }
797  
798                  Vector256<uint> indices0;
799  
800                  fixed (byte* pInput = input)
801                  {
802                      indices0 = Avx2.BroadcastScalarToVector256((uint*)(pInput + 4));
803                  }
804  
805                  Vector256<uint> indices1 = indices0;
806  
807                  indices0 = Avx2.ShiftRightLogicalVariable(indices0, shifts0);
808                  indices1 = Avx2.ShiftRightLogicalVariable(indices1, shifts1);
809                  indices0 = Avx2.And(indices0, masks);
810                  indices1 = Avx2.And(indices1, masks);
811  
812                  outputAsVector256[0] = Avx2.PermuteVar8x32(vClut, indices0);
813                  outputAsVector256[1] = Avx2.PermuteVar8x32(vClut, indices1);
814              }
815              else
816              {
817                  Span<uint> outputAsUint = MemoryMarshal.Cast<byte, uint>(output);
818  
819                  uint indices = BinaryPrimitives.ReadUInt32LittleEndian(input[4..]);
820  
821                  for (int i = 0; i < BlockWidth * BlockHeight; i++, indices >>= 2)
822                  {
823                      outputAsUint[i] = clut[(int)(indices & 3)];
824                  }
825              }
826          }
827  
828          private static uint BC1LerpRgb2(uint color0, uint color1, uint c0, uint c1)
829          {
830              if (c0 > c1)
831              {
832                  return BC23LerpRgb2(color0, color1) | 0xff000000;
833              }
834  
835              uint carry = color0 & color1;
836              uint addHalve = ((color0 ^ color1) >> 1) & 0x7f7f7f;
837              return (addHalve + carry) | 0xff000000;
838          }
839  
840          private static uint BC23LerpRgb2(uint color0, uint color1)
841          {
842              uint r0 = (byte)color0;
843              uint g0 = color0 & 0xff00;
844              uint b0 = color0 & 0xff0000;
845  
846              uint r1 = (byte)color1;
847              uint g1 = color1 & 0xff00;
848              uint b1 = color1 & 0xff0000;
849  
850              uint mixR = (2 * r0 + r1) / 3;
851              uint mixG = (2 * g0 + g1) / 3;
852              uint mixB = (2 * b0 + b1) / 3;
853  
854              return mixR | (mixG & 0xff00) | (mixB & 0xff0000);
855          }
856  
857          private static uint BC1LerpRgb3(uint color0, uint color1, uint c0, uint c1)
858          {
859              if (c0 > c1)
860              {
861                  return BC23LerpRgb3(color0, color1) | 0xff000000;
862              }
863  
864              return 0;
865          }
866  
867          private static uint BC23LerpRgb3(uint color0, uint color1)
868          {
869              uint r0 = (byte)color0;
870              uint g0 = color0 & 0xff00;
871              uint b0 = color0 & 0xff0000;
872  
873              uint r1 = (byte)color1;
874              uint g1 = color1 & 0xff00;
875              uint b1 = color1 & 0xff0000;
876  
877              uint mixR = (2 * r1 + r0) / 3;
878              uint mixG = (2 * g1 + g0) / 3;
879              uint mixB = (2 * b1 + b0) / 3;
880  
881              return mixR | (mixG & 0xff00) | (mixB & 0xff0000);
882          }
883  
884          private static uint ConvertRgb565ToRgb888(uint value)
885          {
886              uint b = (value & 0x1f) << 19;
887              uint g = (value << 5) & 0xfc00;
888              uint r = (value >> 8) & 0xf8;
889  
890              b |= b >> 5;
891              g |= g >> 6;
892              r |= r >> 5;
893  
894              return r | (g & 0xff00) | (b & 0xff0000);
895          }
896      }
897  }