/ src / Ryujinx.Graphics.Nvdec / Image / SurfaceReader.cs
SurfaceReader.cs
  1  using Ryujinx.Common;
  2  using Ryujinx.Graphics.Device;
  3  using Ryujinx.Graphics.Texture;
  4  using Ryujinx.Graphics.Video;
  5  using System;
  6  using System.Runtime.Intrinsics;
  7  using System.Runtime.Intrinsics.X86;
  8  using static Ryujinx.Graphics.Nvdec.Image.SurfaceCommon;
  9  
 10  namespace Ryujinx.Graphics.Nvdec.Image
 11  {
 12      static class SurfaceReader
 13      {
 14          public static void Read(DeviceMemoryManager mm, ISurface surface, uint lumaOffset, uint chromaOffset)
 15          {
 16              int width = surface.Width;
 17              int height = surface.Height;
 18              int stride = surface.Stride;
 19  
 20              ReadOnlySpan<byte> luma = mm.DeviceGetSpan(lumaOffset, GetBlockLinearSize(width, height, 1));
 21  
 22              ReadLuma(surface.YPlane.AsSpan(), luma, stride, width, height);
 23  
 24              int uvWidth = surface.UvWidth;
 25              int uvHeight = surface.UvHeight;
 26              int uvStride = surface.UvStride;
 27  
 28              ReadOnlySpan<byte> chroma = mm.DeviceGetSpan(chromaOffset, GetBlockLinearSize(uvWidth, uvHeight, 2));
 29  
 30              ReadChroma(surface.UPlane.AsSpan(), surface.VPlane.AsSpan(), chroma, uvStride, uvWidth, uvHeight);
 31          }
 32  
 33          private static void ReadLuma(Span<byte> dst, ReadOnlySpan<byte> src, int dstStride, int width, int height)
 34          {
 35              LayoutConverter.ConvertBlockLinearToLinear(dst, width, height, dstStride, 1, 2, src);
 36          }
 37  
 38          private unsafe static void ReadChroma(
 39              Span<byte> dstU,
 40              Span<byte> dstV,
 41              ReadOnlySpan<byte> src,
 42              int dstStride,
 43              int width,
 44              int height)
 45          {
 46              OffsetCalculator calc = new(width, height, 0, false, 2, 2);
 47  
 48              if (Sse2.IsSupported)
 49              {
 50                  int strideTrunc64 = BitUtils.AlignDown(width * 2, 64);
 51  
 52                  int outStrideGap = dstStride - width;
 53  
 54                  fixed (byte* dstUPtr = dstU, dstVPtr = dstV, dataPtr = src)
 55                  {
 56                      byte* uPtr = dstUPtr;
 57                      byte* vPtr = dstVPtr;
 58  
 59                      for (int y = 0; y < height; y++)
 60                      {
 61                          calc.SetY(y);
 62  
 63                          for (int x = 0; x < strideTrunc64; x += 64, uPtr += 32, vPtr += 32)
 64                          {
 65                              byte* offset = dataPtr + calc.GetOffsetWithLineOffset64(x);
 66                              byte* offset2 = offset + 0x20;
 67                              byte* offset3 = offset + 0x100;
 68                              byte* offset4 = offset + 0x120;
 69  
 70                              Vector128<byte> value = *(Vector128<byte>*)offset;
 71                              Vector128<byte> value2 = *(Vector128<byte>*)offset2;
 72                              Vector128<byte> value3 = *(Vector128<byte>*)offset3;
 73                              Vector128<byte> value4 = *(Vector128<byte>*)offset4;
 74  
 75                              Vector128<byte> u00 = Sse2.UnpackLow(value, value2);
 76                              Vector128<byte> v00 = Sse2.UnpackHigh(value, value2);
 77                              Vector128<byte> u01 = Sse2.UnpackLow(value3, value4);
 78                              Vector128<byte> v01 = Sse2.UnpackHigh(value3, value4);
 79  
 80                              Vector128<byte> u10 = Sse2.UnpackLow(u00, v00);
 81                              Vector128<byte> v10 = Sse2.UnpackHigh(u00, v00);
 82                              Vector128<byte> u11 = Sse2.UnpackLow(u01, v01);
 83                              Vector128<byte> v11 = Sse2.UnpackHigh(u01, v01);
 84  
 85                              Vector128<byte> u20 = Sse2.UnpackLow(u10, v10);
 86                              Vector128<byte> v20 = Sse2.UnpackHigh(u10, v10);
 87                              Vector128<byte> u21 = Sse2.UnpackLow(u11, v11);
 88                              Vector128<byte> v21 = Sse2.UnpackHigh(u11, v11);
 89  
 90                              Vector128<byte> u30 = Sse2.UnpackLow(u20, v20);
 91                              Vector128<byte> v30 = Sse2.UnpackHigh(u20, v20);
 92                              Vector128<byte> u31 = Sse2.UnpackLow(u21, v21);
 93                              Vector128<byte> v31 = Sse2.UnpackHigh(u21, v21);
 94  
 95                              *(Vector128<byte>*)uPtr = u30;
 96                              *(Vector128<byte>*)(uPtr + 16) = u31;
 97                              *(Vector128<byte>*)vPtr = v30;
 98                              *(Vector128<byte>*)(vPtr + 16) = v31;
 99                          }
100  
101                          for (int x = strideTrunc64 / 2; x < width; x++, uPtr++, vPtr++)
102                          {
103                              byte* offset = dataPtr + calc.GetOffset(x);
104  
105                              *uPtr = *offset;
106                              *vPtr = *(offset + 1);
107                          }
108  
109                          uPtr += outStrideGap;
110                          vPtr += outStrideGap;
111                      }
112                  }
113              }
114              else
115              {
116                  for (int y = 0; y < height; y++)
117                  {
118                      int dstBaseOffset = y * dstStride;
119  
120                      calc.SetY(y);
121  
122                      for (int x = 0; x < width; x++)
123                      {
124                          int srcOffset = calc.GetOffset(x);
125  
126                          dstU[dstBaseOffset + x] = src[srcOffset];
127                          dstV[dstBaseOffset + x] = src[srcOffset + 1];
128                      }
129                  }
130              }
131          }
132      }
133  }