/ src / video_core / renderer_software / sw_rasterizer.cpp
sw_rasterizer.cpp
  1  // Copyright 2015 Citra Emulator Project
  2  // Licensed under GPLv2 or any later version
  3  // Refer to the license.txt file included.
  4  
  5  #include <boost/container/static_vector.hpp>
  6  #include "common/logging/log.h"
  7  #include "common/microprofile.h"
  8  #include "common/quaternion.h"
  9  #include "common/vector_math.h"
 10  #include "core/memory.h"
 11  #include "video_core/pica/output_vertex.h"
 12  #include "video_core/pica/pica_core.h"
 13  #include "video_core/renderer_software/sw_framebuffer.h"
 14  #include "video_core/renderer_software/sw_lighting.h"
 15  #include "video_core/renderer_software/sw_proctex.h"
 16  #include "video_core/renderer_software/sw_rasterizer.h"
 17  #include "video_core/renderer_software/sw_texturing.h"
 18  #include "video_core/texture/texture_decode.h"
 19  
 20  namespace SwRenderer {
 21  
 22  using Pica::f24;
 23  using Pica::FramebufferRegs;
 24  using Pica::RasterizerRegs;
 25  using Pica::TexturingRegs;
 26  using Pica::Texture::LookupTexture;
 27  using Pica::Texture::TextureInfo;
 28  
// Certain games render 2D elements very close to clip plane 0, resulting in very tiny
// negative/positive z values when computing with f32 precision and causing some vertices to
// get erroneously clipped. To work around this problem, a very small epsilon value is used as
// tolerance for the clip plane comparison (see ClippingEdge::IsInside, which applies it to every
// clipping edge).
constexpr f32 EPSILON_Z = 0.00000001f;
 34  
 35  struct Vertex : Pica::OutputVertex {
 36      Vertex(const OutputVertex& v) : OutputVertex(v) {}
 37  
 38      /// Attributes used to store intermediate results position after perspective divide.
 39      Common::Vec3<f24> screenpos;
 40  
 41      /**
 42       * Linear interpolation
 43       * factor: 0=this, 1=vtx
 44       * Note: This function cannot be called after perspective divide.
 45       **/
 46      void Lerp(f24 factor, const Vertex& vtx) {
 47          pos = pos * factor + vtx.pos * (f24::One() - factor);
 48          quat = quat * factor + vtx.quat * (f24::One() - factor);
 49          color = color * factor + vtx.color * (f24::One() - factor);
 50          tc0 = tc0 * factor + vtx.tc0 * (f24::One() - factor);
 51          tc1 = tc1 * factor + vtx.tc1 * (f24::One() - factor);
 52          tc0_w = tc0_w * factor + vtx.tc0_w * (f24::One() - factor);
 53          view = view * factor + vtx.view * (f24::One() - factor);
 54          tc2 = tc2 * factor + vtx.tc2 * (f24::One() - factor);
 55      }
 56  
 57      /**
 58       * Linear interpolation
 59       * factor: 0=v0, 1=v1
 60       * Note: This function cannot be called after perspective divide.
 61       **/
 62      static Vertex Lerp(f24 factor, const Vertex& v0, const Vertex& v1) {
 63          Vertex ret = v0;
 64          ret.Lerp(factor, v1);
 65          return ret;
 66      }
 67  };
 68  
 69  namespace {
 70  
 71  MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240));
 72  
 73  struct ClippingEdge {
 74  public:
 75      constexpr ClippingEdge(Common::Vec4<f24> coeffs,
 76                             Common::Vec4<f24> bias = Common::Vec4<f24>(f24::Zero(), f24::Zero(),
 77                                                                        f24::Zero(), f24::Zero()))
 78          : pos(f24::Zero()), coeffs(coeffs), bias(bias) {}
 79  
 80      bool IsInside(const Vertex& vertex) const {
 81          return Common::Dot(vertex.pos + bias, coeffs) >= f24::FromFloat32(-EPSILON_Z);
 82      }
 83  
 84      bool IsOutSide(const Vertex& vertex) const {
 85          return !IsInside(vertex);
 86      }
 87  
 88      Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const {
 89          const f24 dp = Common::Dot(v0.pos + bias, coeffs);
 90          const f24 dp_prev = Common::Dot(v1.pos + bias, coeffs);
 91          const f24 factor = dp_prev / (dp_prev - dp);
 92          return Vertex::Lerp(factor, v0, v1);
 93      }
 94  
 95  private:
 96      [[maybe_unused]] f24 pos;
 97      Common::Vec4<f24> coeffs;
 98      Common::Vec4<f24> bias;
 99  };
100  
101  } // Anonymous namespace
102  
/**
 * Constructs the software rasterizer on top of the given memory system and PICA core.
 * The worker pool is sized to std::thread::hardware_concurrency(), but never below 2 (the
 * standard allows hardware_concurrency() to return 0 when the value is not computable).
 **/
RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_, Pica::PicaCore& pica_)
    : memory{memory_}, pica{pica_}, regs{pica.regs.internal},
      num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)},
      sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {}
107  
108  void RasterizerSoftware::AddTriangle(const Pica::OutputVertex& v0, const Pica::OutputVertex& v1,
109                                       const Pica::OutputVertex& v2) {
110      /**
111       * Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at
112       * the new edge (or less in degenerate cases). As such, we can say that each clipping plane
113       * introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a
114       * fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9.
115       **/
116      static constexpr std::size_t MAX_VERTICES = 9;
117  
118      boost::container::static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2};
119      boost::container::static_vector<Vertex, MAX_VERTICES> buffer_b;
120  
121      FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat);
122      FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat);
123  
124      auto* output_list = &buffer_a;
125      auto* input_list = &buffer_b;
126  
127      // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value.
128      // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest
129      //       epsilon possible within f24 accuracy.
130      static constexpr f24 EPSILON = f24::FromFloat32(0.00001f);
131      static constexpr f24 f0 = f24::Zero();
132      static constexpr f24 f1 = f24::One();
133      static constexpr std::array<ClippingEdge, 7> clipping_edges = {{
134          {Common::MakeVec(-f1, f0, f0, f1)},                                        // x = +w
135          {Common::MakeVec(f1, f0, f0, f1)},                                         // x = -w
136          {Common::MakeVec(f0, -f1, f0, f1)},                                        // y = +w
137          {Common::MakeVec(f0, f1, f0, f1)},                                         // y = -w
138          {Common::MakeVec(f0, f0, -f1, f0)},                                        // z =  0
139          {Common::MakeVec(f0, f0, f1, f1)},                                         // z = -w
140          {Common::MakeVec(f0, f0, f0, f1), Common::Vec4<f24>(f0, f0, f0, EPSILON)}, // w = EPSILON
141      }};
142  
143      // Simple implementation of the Sutherland-Hodgman clipping algorithm.
144      // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here)
145      const auto clip = [&](const ClippingEdge& edge) {
146          std::swap(input_list, output_list);
147          output_list->clear();
148  
149          const Vertex* reference_vertex = &input_list->back();
150          for (const auto& vertex : *input_list) {
151              // NOTE: This algorithm changes vertex order in some cases!
152              if (edge.IsInside(vertex)) {
153                  if (edge.IsOutSide(*reference_vertex)) {
154                      output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
155                  }
156                  output_list->push_back(vertex);
157              } else if (edge.IsInside(*reference_vertex)) {
158                  output_list->push_back(edge.GetIntersection(vertex, *reference_vertex));
159              }
160              reference_vertex = &vertex;
161          }
162      };
163  
164      for (const ClippingEdge& edge : clipping_edges) {
165          clip(edge);
166          if (output_list->size() < 3) {
167              return;
168          }
169      }
170  
171      if (regs.rasterizer.clip_enable) {
172          const ClippingEdge custom_edge{regs.rasterizer.GetClipCoef()};
173          clip(custom_edge);
174          if (output_list->size() < 3) {
175              return;
176          }
177      }
178  
179      MakeScreenCoords((*output_list)[0]);
180      MakeScreenCoords((*output_list)[1]);
181  
182      for (std::size_t i = 0; i < output_list->size() - 2; i++) {
183          Vertex& vtx0 = (*output_list)[0];
184          Vertex& vtx1 = (*output_list)[i + 1];
185          Vertex& vtx2 = (*output_list)[i + 2];
186  
187          MakeScreenCoords(vtx2);
188  
189          LOG_TRACE(
190              Render_Software,
191              "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), "
192              "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and "
193              "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})",
194              i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(),
195              vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(),
196              vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(),
197              vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(),
198              vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(),
199              vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(),
200              vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(),
201              vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(),
202              vtx2.screenpos.z.ToFloat32());
203  
204          ProcessTriangle(vtx0, vtx1, vtx2);
205      }
206  }
207  
208  void RasterizerSoftware::MakeScreenCoords(Vertex& vtx) {
209      Viewport viewport{};
210      viewport.halfsize_x = f24::FromRaw(regs.rasterizer.viewport_size_x);
211      viewport.halfsize_y = f24::FromRaw(regs.rasterizer.viewport_size_y);
212      viewport.offset_x = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.x));
213      viewport.offset_y = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.y));
214  
215      f24 inv_w = f24::One() / vtx.pos.w;
216      vtx.pos.w = inv_w;
217      vtx.quat *= inv_w;
218      vtx.color *= inv_w;
219      vtx.tc0 *= inv_w;
220      vtx.tc1 *= inv_w;
221      vtx.tc0_w *= inv_w;
222      vtx.view *= inv_w;
223      vtx.tc2 *= inv_w;
224  
225      vtx.screenpos[0] = (vtx.pos.x * inv_w + f24::One()) * viewport.halfsize_x + viewport.offset_x;
226      vtx.screenpos[1] = (vtx.pos.y * inv_w + f24::One()) * viewport.halfsize_y + viewport.offset_y;
227      vtx.screenpos[2] = vtx.pos.z * inv_w;
228  }
229  
/**
 * Rasterizes one triangle whose vertices have already been passed through MakeScreenCoords.
 * Applies the configured cull mode (re-invoking itself once with swapped vertices where needed),
 * computes the scissored bounding box and then queues one job per scanline on sw_workers. The
 * call only returns after sw_workers.WaitForRequests(), so the by-reference captures of the
 * scanline jobs stay valid for as long as the jobs are pending.
 * @param v0 First triangle vertex.
 * @param v1 Second triangle vertex.
 * @param v2 Third triangle vertex.
 * @param reversed True when this is the nested call made with swapped vertex order; it prevents
 *                 the vertices from being swapped a second time.
 **/
void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2,
                                         bool reversed) {
    MICROPROFILE_SCOPE(GPU_Rasterization);

    // Vertex positions in rasterizer coordinates
    static auto screen_to_rasterizer_coords = [](const Common::Vec3<f24>& vec) {
        return Common::Vec3{Fix12P4::FromFloat24(vec.x), Fix12P4::FromFloat24(vec.y),
                            Fix12P4::FromFloat24(vec.z)};
    };

    const std::array<Common::Vec3<Fix12P4>, 3> vtxpos = {
        screen_to_rasterizer_coords(v0.screenpos),
        screen_to_rasterizer_coords(v1.screenpos),
        screen_to_rasterizer_coords(v2.screenpos),
    };

    if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) {
        // Make sure we always end up with a triangle wound counter-clockwise
        if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
            ProcessTriangle(v0, v2, v1, true);
            return;
        }
    } else {
        if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) {
            // Reverse vertex order and use the CCW code path.
            ProcessTriangle(v0, v2, v1, true);
            return;
        }
        // Cull away triangles which are wound clockwise.
        if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) {
            return;
        }
    }

    // Bounding box of the triangle in 12.4 fixed point.
    u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
    u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});
    u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x});
    u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y});

    // Convert the scissor box coordinates to 12.4 fixed point
    const u16 scissor_x1 = static_cast<u16>(regs.rasterizer.scissor_test.x1 << 4);
    const u16 scissor_y1 = static_cast<u16>(regs.rasterizer.scissor_test.y1 << 4);
    // x2,y2 have +1 added to cover the entire sub-pixel area
    const u16 scissor_x2 = static_cast<u16>((regs.rasterizer.scissor_test.x2 + 1) << 4);
    const u16 scissor_y2 = static_cast<u16>((regs.rasterizer.scissor_test.y2 + 1) << 4);

    if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) {
        // Calculate the new bounds
        min_x = std::max(min_x, scissor_x1);
        min_y = std::max(min_y, scissor_y1);
        max_x = std::min(max_x, scissor_x2);
        max_y = std::min(max_y, scissor_y2);
    }

    // Snap the bounds to whole pixels: the minimum is rounded down, the maximum is rounded up.
    min_x &= Fix12P4::IntMask();
    min_y &= Fix12P4::IntMask();
    max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask());
    max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask());

    // Per-edge bias added to the edge functions below: -1 for edges reported by
    // IsRightSideOrFlatBottomEdge, 0 otherwise. A pixel lying exactly on a biased edge therefore
    // ends up with a negative weight and is rejected by the coverage test.
    const int bias0 =
        IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0;
    const int bias1 =
        IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0;
    const int bias2 =
        IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? -1 : 0;

    // pos.w holds 1/w at this point (see MakeScreenCoords).
    const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w);

    const auto textures = regs.texturing.GetTextures();
    const auto tev_stages = regs.texturing.GetTevStages();

    fb.Bind();

    // Enter rasterization loop, starting at the center of the topleft bounding box corner.
    // TODO: Not sure if looping through x first might be faster
    for (u16 y = min_y + 8; y < max_y; y += 0x10) {
        // One job per scanline. y is captured by value because it changes on every iteration;
        // everything else is captured by reference and outlives the job (see WaitForRequests
        // below).
        const auto process_scanline = [&, y] {
            for (u16 x = min_x + 8; x < max_x; x += 0x10) {
                // Do not process the pixel if it's inside the scissor box and the scissor mode is
                // set to Exclude.
                if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
                    if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
                        continue;
                    }
                }

                // Calculate the barycentric coordinates w0, w1 and w2
                const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
                const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
                const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
                const s32 wsum = w0 + w1 + w2;

                // If current pixel is not covered by the current primitive
                if (w0 < 0 || w1 < 0 || w2 < 0) {
                    continue;
                }

                const auto baricentric_coordinates = Common::MakeVec(
                    f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
                    f24::FromFloat32(static_cast<f32>(w2)));
                // Reciprocal of the (unnormalized) interpolated 1/w. The missing division by
                // wsum cancels out against the equally unnormalized attribute sums below.
                const f24 interpolated_w_inverse =
                    f24::One() / Common::Dot(w_inverse, baricentric_coordinates);

                // interpolated_z = z / w
                const float interpolated_z_over_w =
                    (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
                     v2.screenpos[2].ToFloat32() * w2) /
                    wsum;

                // Not fully accurate. About 3 bits in precision are missing.
                // Z-Buffer (z / w * scale + offset)
                const float depth_scale =
                    f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
                const float depth_offset =
                    f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
                float depth = interpolated_z_over_w * depth_scale + depth_offset;

                // Potentially switch to W-Buffer
                if (regs.rasterizer.depthmap_enable ==
                    Pica::RasterizerRegs::DepthBuffering::WBuffering) {
                    // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
                    depth *= interpolated_w_inverse.ToFloat32() * wsum;
                }

                // Clamp the result
                depth = std::clamp(depth, 0.0f, 1.0f);

                /**
                 * Perspective correct attribute interpolation:
                 * Attribute values cannot be calculated by simple linear interpolation since
                 * they are not linear in screen space. For example, when interpolating a
                 * texture coordinate across two vertices, something simple like
                 *     u = (u0*w0 + u1*w1)/(w0+w1)
                 * will not work. However, the attribute value divided by the
                 * clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
                 * in screenspace. Hence, we can linearly interpolate these two independently and
                 * calculate the interpolated attribute by dividing the results.
                 * I.e.
                 *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
                 *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
                 *     u = u_over_w / one_over_w
                 *
                 * The generalization to three vertices is straightforward in barycentric
                 * coordinates.
                 **/
                const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
                    auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
                    f24 interpolated_attr_over_w =
                        Common::Dot(attr_over_w, baricentric_coordinates);
                    return interpolated_attr_over_w * interpolated_w_inverse;
                };

                // Interpolated vertex color, converted from the float value to 8 bits per channel.
                const Common::Vec4<u8> primary_color{
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
                                  .ToFloat32() *
                              255)),
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
                                  .ToFloat32() *
                              255)),
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
                                  .ToFloat32() *
                              255)),
                    static_cast<u8>(
                        round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
                                  .ToFloat32() *
                              255)),
                };

                std::array<Common::Vec2<f24>, 3> uv;
                uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
                uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
                uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
                uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
                uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
                uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());

                // Sample bound texture units.
                const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
                const auto texture_color = TextureColor(uv, textures, tc0_w);

                // Fragment lighting results; they stay zero when lighting is disabled.
                Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
                Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};

                if (!regs.lighting.disable) {
                    const auto normquat =
                        Common::Quaternion<f32>{
                            {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x)
                                 .ToFloat32(),
                             get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y)
                                 .ToFloat32(),
                             get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z)
                                 .ToFloat32()},
                            get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
                        }
                            .Normalized();

                    const Common::Vec3f view{
                        get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
                        get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
                        get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
                    };
                    std::tie(primary_fragment_color, secondary_fragment_color) =
                        ComputeFragmentsColors(regs.lighting, pica.lighting, normquat, view,
                                               texture_color);
                }

                // Write the TEV stages.
                auto combiner_output =
                    WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
                                   secondary_fragment_color);

                const auto& output_merger = regs.framebuffer.output_merger;
                if (output_merger.fragment_operation_mode ==
                    FramebufferRegs::FragmentOperationMode::Shadow) {
                    const u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
                    // Use green color as the shadow intensity
                    const u8 stencil = combiner_output.y;
                    fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
                    // Skip the normal output merger pipeline if it is in shadow mode
                    continue;
                }

                // Does alpha testing happen before or after stencil?
                if (!DoAlphaTest(combiner_output.a())) {
                    continue;
                }
                WriteFog(depth, combiner_output);
                if (!DoDepthStencilTest(x, y, depth)) {
                    continue;
                }
                const auto result = PixelColor(x, y, combiner_output);
                if (regs.framebuffer.framebuffer.allow_color_write != 0) {
                    // x and y are 12.4 fixed point; shift out the fraction to get the pixel index.
                    fb.DrawPixel(x >> 4, y >> 4, result);
                }
            }
        };
        sw_workers.QueueWork(std::move(process_scanline));
    }
    // Do not return before all queued scanlines are done: the jobs reference locals of this call.
    sw_workers.WaitForRequests();
}
473  
474  std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
475      std::span<const Common::Vec2<f24>, 3> uv,
476      std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const {
477      std::array<Common::Vec4<u8>, 4> texture_color{};
478      for (u32 i = 0; i < 3; ++i) {
479          const auto& texture = textures[i];
480          if (!texture.enabled) [[unlikely]] {
481              continue;
482          }
483          if (texture.config.address == 0) [[unlikely]] {
484              texture_color[i] = {0, 0, 0, 255};
485              continue;
486          }
487  
488          const s32 coordinate_i = (i == 2 && regs.texturing.main_config.texture2_use_coord1) ? 1 : i;
489          f24 u = uv[coordinate_i].u();
490          f24 v = uv[coordinate_i].v();
491  
492          // Only unit 0 respects the texturing type (according to 3DBrew)
493          PAddr texture_address = texture.config.GetPhysicalAddress();
494          f24 shadow_z;
495          if (i == 0) {
496              switch (texture.config.type) {
497              case TexturingRegs::TextureConfig::Texture2D:
498                  break;
499              case TexturingRegs::TextureConfig::ShadowCube:
500              case TexturingRegs::TextureConfig::TextureCube: {
501                  std::tie(u, v, shadow_z, texture_address) =
502                      ConvertCubeCoord(u, v, tc0_w, regs.texturing);
503                  break;
504              }
505              case TexturingRegs::TextureConfig::Projection2D: {
506                  u /= tc0_w;
507                  v /= tc0_w;
508                  break;
509              }
510              case TexturingRegs::TextureConfig::Shadow2D: {
511                  if (!regs.texturing.shadow.orthographic) {
512                      u /= tc0_w;
513                      v /= tc0_w;
514                  }
515                  shadow_z = f24::FromFloat32(std::abs(tc0_w.ToFloat32()));
516                  break;
517              }
518              case TexturingRegs::TextureConfig::Disabled:
519                  continue; // skip this unit and continue to the next unit
520              default:
521                  LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type);
522                  UNIMPLEMENTED();
523                  break;
524              }
525          }
526  
527          const f24 width = f24::FromFloat32(static_cast<f32>(texture.config.width));
528          const f24 height = f24::FromFloat32(static_cast<f32>(texture.config.height));
529          s32 s = static_cast<s32>((u * width).ToFloat32());
530          s32 t = static_cast<s32>((v * height).ToFloat32());
531  
532          bool use_border_s = false;
533          bool use_border_t = false;
534  
535          if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder) {
536              use_border_s = s < 0 || s >= static_cast<s32>(texture.config.width);
537          } else if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder2) {
538              use_border_s = s >= static_cast<s32>(texture.config.width);
539          }
540  
541          if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder) {
542              use_border_t = t < 0 || t >= static_cast<s32>(texture.config.height);
543          } else if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder2) {
544              use_border_t = t >= static_cast<s32>(texture.config.height);
545          }
546  
547          if (use_border_s || use_border_t) {
548              const auto border_color = texture.config.border_color;
549              texture_color[i] = Common::MakeVec(border_color.r.Value(), border_color.g.Value(),
550                                                 border_color.b.Value(), border_color.a.Value())
551                                     .Cast<u8>();
552          } else {
553              // Textures are laid out from bottom to top, hence we invert the t coordinate.
554              // NOTE: This may not be the right place for the inversion.
555              // TODO: Check if this applies to ETC textures, too.
556              s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width);
557              t = texture.config.height - 1 -
558                  GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height);
559  
560              const u8* texture_data = memory.GetPhysicalPointer(texture_address);
561              const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format);
562  
563              // TODO: Apply the min and mag filters to the texture
564              texture_color[i] = LookupTexture(texture_data, s, t, info);
565          }
566  
567          if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D ||
568                         texture.config.type == TexturingRegs::TextureConfig::ShadowCube)) {
569  
570              s32 z_int = static_cast<s32>(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF);
571              z_int -= regs.texturing.shadow.bias << 1;
572              const auto& color = texture_color[i];
573              const s32 z_ref = (color.w << 16) | (color.z << 8) | color.y;
574              u8 density;
575              if (z_ref >= z_int) {
576                  density = color.x;
577              } else {
578                  density = 0;
579              }
580              texture_color[i] = {density, density, density, density};
581          }
582      }
583  
584      // Sample procedural texture
585      if (regs.texturing.main_config.texture3_enable) {
586          const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates];
587          texture_color[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(),
588                                     regs.texturing, pica.proctex);
589      }
590  
591      return texture_color;
592  }
593  
594  Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
595                                                  Common::Vec4<u8> combiner_output) const {
596      const auto dest = fb.GetPixel(x >> 4, y >> 4);
597      Common::Vec4<u8> blend_output = combiner_output;
598  
599      const auto& output_merger = regs.framebuffer.output_merger;
600      if (output_merger.alphablend_enable) {
601          const auto params = output_merger.alpha_blending;
602          const auto lookup_factor = [&](u32 channel, FramebufferRegs::BlendFactor factor) -> u8 {
603              DEBUG_ASSERT(channel < 4);
604  
605              const Common::Vec4<u8> blend_const =
606                  Common::MakeVec(
607                      output_merger.blend_const.r.Value(), output_merger.blend_const.g.Value(),
608                      output_merger.blend_const.b.Value(), output_merger.blend_const.a.Value())
609                      .Cast<u8>();
610  
611              switch (factor) {
612              case FramebufferRegs::BlendFactor::Zero:
613                  return 0;
614              case FramebufferRegs::BlendFactor::One:
615                  return 255;
616              case FramebufferRegs::BlendFactor::SourceColor:
617                  return combiner_output[channel];
618              case FramebufferRegs::BlendFactor::OneMinusSourceColor:
619                  return 255 - combiner_output[channel];
620              case FramebufferRegs::BlendFactor::DestColor:
621                  return dest[channel];
622              case FramebufferRegs::BlendFactor::OneMinusDestColor:
623                  return 255 - dest[channel];
624              case FramebufferRegs::BlendFactor::SourceAlpha:
625                  return combiner_output.a();
626              case FramebufferRegs::BlendFactor::OneMinusSourceAlpha:
627                  return 255 - combiner_output.a();
628              case FramebufferRegs::BlendFactor::DestAlpha:
629                  return dest.a();
630              case FramebufferRegs::BlendFactor::OneMinusDestAlpha:
631                  return 255 - dest.a();
632              case FramebufferRegs::BlendFactor::ConstantColor:
633                  return blend_const[channel];
634              case FramebufferRegs::BlendFactor::OneMinusConstantColor:
635                  return 255 - blend_const[channel];
636              case FramebufferRegs::BlendFactor::ConstantAlpha:
637                  return blend_const.a();
638              case FramebufferRegs::BlendFactor::OneMinusConstantAlpha:
639                  return 255 - blend_const.a();
640              case FramebufferRegs::BlendFactor::SourceAlphaSaturate:
641                  // Returns 1.0 for the alpha channel
642                  if (channel == 3) {
643                      return 255;
644                  }
645                  return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a()));
646              default:
647                  LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor);
648                  UNIMPLEMENTED();
649                  break;
650              }
651              return combiner_output[channel];
652          };
653  
654          const auto srcfactor = Common::MakeVec(
655              lookup_factor(0, params.factor_source_rgb), lookup_factor(1, params.factor_source_rgb),
656              lookup_factor(2, params.factor_source_rgb), lookup_factor(3, params.factor_source_a));
657  
658          const auto dstfactor = Common::MakeVec(
659              lookup_factor(0, params.factor_dest_rgb), lookup_factor(1, params.factor_dest_rgb),
660              lookup_factor(2, params.factor_dest_rgb), lookup_factor(3, params.factor_dest_a));
661  
662          blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor,
663                                               params.blend_equation_rgb);
664          blend_output.a() = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor,
665                                                   params.blend_equation_a)
666                                 .a();
667      } else {
668          blend_output =
669              Common::MakeVec(LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op),
670                              LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op),
671                              LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op),
672                              LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op));
673      }
674  
675      const Common::Vec4<u8> result = {
676          output_merger.red_enable ? blend_output.r() : dest.r(),
677          output_merger.green_enable ? blend_output.g() : dest.g(),
678          output_merger.blue_enable ? blend_output.b() : dest.b(),
679          output_merger.alpha_enable ? blend_output.a() : dest.a(),
680      };
681  
682      return result;
683  }
684  
685  Common::Vec4<u8> RasterizerSoftware::WriteTevConfig(
686      std::span<const Common::Vec4<u8>, 4> texture_color,
687      std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages,
688      Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color,
689      Common::Vec4<u8> secondary_fragment_color) {
690      /**
691       * Texture environment - consists of 6 stages of color and alpha combining.
692       * Color combiners take three input color values from some source (e.g. interpolated
693       * vertex color, texture color, previous stage, etc), perform some very simple
694       * operations on each of them (e.g. inversion) and then calculate the output color
695       * with some basic arithmetic. Alpha combiners can be configured separately but work
696       * analogously.
697       **/
698      Common::Vec4<u8> combiner_output = {0, 0, 0, 0};
699      Common::Vec4<u8> combiner_buffer = {0, 0, 0, 0};
700      Common::Vec4<u8> next_combiner_buffer =
701          Common::MakeVec(regs.texturing.tev_combiner_buffer_color.r.Value(),
702                          regs.texturing.tev_combiner_buffer_color.g.Value(),
703                          regs.texturing.tev_combiner_buffer_color.b.Value(),
704                          regs.texturing.tev_combiner_buffer_color.a.Value())
705              .Cast<u8>();
706  
707      for (u32 tev_stage_index = 0; tev_stage_index < tev_stages.size(); ++tev_stage_index) {
708          const auto& tev_stage = tev_stages[tev_stage_index];
709          using Source = TexturingRegs::TevStageConfig::Source;
710  
711          auto get_source = [&](Source source) -> Common::Vec4<u8> {
712              switch (source) {
713              case Source::PrimaryColor:
714                  return primary_color;
715              case Source::PrimaryFragmentColor:
716                  return primary_fragment_color;
717              case Source::SecondaryFragmentColor:
718                  return secondary_fragment_color;
719              case Source::Texture0:
720                  return texture_color[0];
721              case Source::Texture1:
722                  return texture_color[1];
723              case Source::Texture2:
724                  return texture_color[2];
725              case Source::Texture3:
726                  return texture_color[3];
727              case Source::PreviousBuffer:
728                  return combiner_buffer;
729              case Source::Constant:
730                  return Common::MakeVec(tev_stage.const_r.Value(), tev_stage.const_g.Value(),
731                                         tev_stage.const_b.Value(), tev_stage.const_a.Value())
732                      .Cast<u8>();
733              case Source::Previous:
734                  return combiner_output;
735              default:
736                  LOG_ERROR(HW_GPU, "Unknown color combiner source {}", (int)source);
737                  UNIMPLEMENTED();
738                  return {0, 0, 0, 0};
739              }
740          };
741  
742          /**
743           * Color combiner
744           * NOTE: Not sure if the alpha combiner might use the color output of the previous
745           *       stage as input. Hence, we currently don't directly write the result to
746           *       combiner_output.rgb(), but instead store it in a temporary variable until
747           *       alpha combining has been done.
748           **/
749          const auto source1 = tev_stage_index == 0 && tev_stage.color_source1 == Source::Previous
750                                   ? tev_stage.color_source3.Value()
751                                   : tev_stage.color_source1.Value();
752          const auto source2 = tev_stage_index == 0 && tev_stage.color_source2 == Source::Previous
753                                   ? tev_stage.color_source3.Value()
754                                   : tev_stage.color_source2.Value();
755          const std::array<Common::Vec3<u8>, 3> color_result = {
756              GetColorModifier(tev_stage.color_modifier1, get_source(source1)),
757              GetColorModifier(tev_stage.color_modifier2, get_source(source2)),
758              GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)),
759          };
760          const Common::Vec3<u8> color_output = ColorCombine(tev_stage.color_op, color_result);
761  
762          u8 alpha_output;
763          if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) {
764              // result of Dot3_RGBA operation is also placed to the alpha component
765              alpha_output = color_output.x;
766          } else {
767              // alpha combiner
768              const std::array<u8, 3> alpha_result = {{
769                  GetAlphaModifier(tev_stage.alpha_modifier1, get_source(tev_stage.alpha_source1)),
770                  GetAlphaModifier(tev_stage.alpha_modifier2, get_source(tev_stage.alpha_source2)),
771                  GetAlphaModifier(tev_stage.alpha_modifier3, get_source(tev_stage.alpha_source3)),
772              }};
773              alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);
774          }
775  
776          combiner_output[0] = std::min(255U, color_output.r() * tev_stage.GetColorMultiplier());
777          combiner_output[1] = std::min(255U, color_output.g() * tev_stage.GetColorMultiplier());
778          combiner_output[2] = std::min(255U, color_output.b() * tev_stage.GetColorMultiplier());
779          combiner_output[3] = std::min(255U, alpha_output * tev_stage.GetAlphaMultiplier());
780  
781          combiner_buffer = next_combiner_buffer;
782  
783          if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferColor(
784                  tev_stage_index)) {
785              next_combiner_buffer.r() = combiner_output.r();
786              next_combiner_buffer.g() = combiner_output.g();
787              next_combiner_buffer.b() = combiner_output.b();
788          }
789  
790          if (regs.texturing.tev_combiner_buffer_input.TevStageUpdatesCombinerBufferAlpha(
791                  tev_stage_index)) {
792              next_combiner_buffer.a() = combiner_output.a();
793          }
794      }
795  
796      return combiner_output;
797  }
798  
799  void RasterizerSoftware::WriteFog(float depth, Common::Vec4<u8>& combiner_output) const {
800      /**
801       * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to
802       * store the depth etc. Using float for now until we know more about Pica datatypes.
803       **/
804      if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) {
805          const Common::Vec3<u8> fog_color =
806              Common::MakeVec(regs.texturing.fog_color.r.Value(), regs.texturing.fog_color.g.Value(),
807                              regs.texturing.fog_color.b.Value())
808                  .Cast<u8>();
809  
810          float fog_index;
811          if (regs.texturing.fog_flip) {
812              fog_index = (1.0f - depth) * 128.0f;
813          } else {
814              fog_index = depth * 128.0f;
815          }
816  
817          // Generate clamped fog factor from LUT for given fog index
818          const f32 fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f);
819          const f32 fog_f = fog_index - fog_i;
820          const auto& fog_lut_entry = pica.fog.lut[static_cast<u32>(fog_i)];
821          f32 fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f;
822          fog_factor = std::clamp(fog_factor, 0.0f, 1.0f);
823          for (u32 i = 0; i < 3; i++) {
824              combiner_output[i] = static_cast<u8>(fog_factor * combiner_output[i] +
825                                                   (1.0f - fog_factor) * fog_color[i]);
826          }
827      }
828  }
829  
830  bool RasterizerSoftware::DoAlphaTest(u8 alpha) const {
831      const auto& output_merger = regs.framebuffer.output_merger;
832      if (!output_merger.alpha_test.enable) {
833          return true;
834      }
835      switch (output_merger.alpha_test.func) {
836      case FramebufferRegs::CompareFunc::Never:
837          return false;
838      case FramebufferRegs::CompareFunc::Always:
839          return true;
840      case FramebufferRegs::CompareFunc::Equal:
841          return alpha == output_merger.alpha_test.ref;
842      case FramebufferRegs::CompareFunc::NotEqual:
843          return alpha != output_merger.alpha_test.ref;
844      case FramebufferRegs::CompareFunc::LessThan:
845          return alpha < output_merger.alpha_test.ref;
846      case FramebufferRegs::CompareFunc::LessThanOrEqual:
847          return alpha <= output_merger.alpha_test.ref;
848      case FramebufferRegs::CompareFunc::GreaterThan:
849          return alpha > output_merger.alpha_test.ref;
850      case FramebufferRegs::CompareFunc::GreaterThanOrEqual:
851          return alpha >= output_merger.alpha_test.ref;
852      default:
853          LOG_CRITICAL(Render_Software, "Unknown alpha test condition {}",
854                       output_merger.alpha_test.func.Value());
855          return false;
856      }
857  }
858  
859  bool RasterizerSoftware::DoDepthStencilTest(u16 x, u16 y, float depth) const {
860      const auto& framebuffer = regs.framebuffer.framebuffer;
861      const auto stencil_test = regs.framebuffer.output_merger.stencil_test;
862      u8 old_stencil = 0;
863  
864      const auto update_stencil = [&](Pica::FramebufferRegs::StencilAction action) {
865          const u8 new_stencil =
866              PerformStencilAction(action, old_stencil, stencil_test.reference_value);
867          if (framebuffer.allow_depth_stencil_write != 0) {
868              const u8 stencil =
869                  (new_stencil & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask);
870              fb.SetStencil(x >> 4, y >> 4, stencil);
871          }
872      };
873  
874      const bool stencil_action_enable =
875          regs.framebuffer.output_merger.stencil_test.enable &&
876          regs.framebuffer.framebuffer.depth_format == FramebufferRegs::DepthFormat::D24S8;
877  
878      if (stencil_action_enable) {
879          old_stencil = fb.GetStencil(x >> 4, y >> 4);
880          const u8 dest = old_stencil & stencil_test.input_mask;
881          const u8 ref = stencil_test.reference_value & stencil_test.input_mask;
882          bool pass = false;
883          switch (stencil_test.func) {
884          case FramebufferRegs::CompareFunc::Never:
885              pass = false;
886              break;
887          case FramebufferRegs::CompareFunc::Always:
888              pass = true;
889              break;
890          case FramebufferRegs::CompareFunc::Equal:
891              pass = (ref == dest);
892              break;
893          case FramebufferRegs::CompareFunc::NotEqual:
894              pass = (ref != dest);
895              break;
896          case FramebufferRegs::CompareFunc::LessThan:
897              pass = (ref < dest);
898              break;
899          case FramebufferRegs::CompareFunc::LessThanOrEqual:
900              pass = (ref <= dest);
901              break;
902          case FramebufferRegs::CompareFunc::GreaterThan:
903              pass = (ref > dest);
904              break;
905          case FramebufferRegs::CompareFunc::GreaterThanOrEqual:
906              pass = (ref >= dest);
907              break;
908          }
909          if (!pass) {
910              update_stencil(stencil_test.action_stencil_fail);
911              return false;
912          }
913      }
914  
915      const u32 num_bits = FramebufferRegs::DepthBitsPerPixel(framebuffer.depth_format);
916      const u32 z = static_cast<u32>(depth * ((1 << num_bits) - 1));
917  
918      const auto& output_merger = regs.framebuffer.output_merger;
919      if (output_merger.depth_test_enable) {
920          const u32 ref_z = fb.GetDepth(x >> 4, y >> 4);
921          bool pass = false;
922          switch (output_merger.depth_test_func) {
923          case FramebufferRegs::CompareFunc::Never:
924              pass = false;
925              break;
926          case FramebufferRegs::CompareFunc::Always:
927              pass = true;
928              break;
929          case FramebufferRegs::CompareFunc::Equal:
930              pass = z == ref_z;
931              break;
932          case FramebufferRegs::CompareFunc::NotEqual:
933              pass = z != ref_z;
934              break;
935          case FramebufferRegs::CompareFunc::LessThan:
936              pass = z < ref_z;
937              break;
938          case FramebufferRegs::CompareFunc::LessThanOrEqual:
939              pass = z <= ref_z;
940              break;
941          case FramebufferRegs::CompareFunc::GreaterThan:
942              pass = z > ref_z;
943              break;
944          case FramebufferRegs::CompareFunc::GreaterThanOrEqual:
945              pass = z >= ref_z;
946              break;
947          }
948          if (!pass) {
949              if (stencil_action_enable) {
950                  update_stencil(stencil_test.action_depth_fail);
951              }
952              return false;
953          }
954      }
955      if (framebuffer.allow_depth_stencil_write != 0 && output_merger.depth_write_enable) {
956          fb.SetDepth(x >> 4, y >> 4, z);
957      }
958      // The stencil depth_pass action is executed even if depth testing is disabled
959      if (stencil_action_enable) {
960          update_stencil(stencil_test.action_depth_pass);
961      }
962  
963      return true;
964  }
965  
966  } // namespace SwRenderer