sw_rasterizer.cpp
1 // Copyright 2015 Citra Emulator Project 2 // Licensed under GPLv2 or any later version 3 // Refer to the license.txt file included. 4 5 #include <boost/container/static_vector.hpp> 6 #include "common/logging/log.h" 7 #include "common/microprofile.h" 8 #include "common/quaternion.h" 9 #include "common/vector_math.h" 10 #include "core/memory.h" 11 #include "video_core/pica/output_vertex.h" 12 #include "video_core/pica/pica_core.h" 13 #include "video_core/renderer_software/sw_framebuffer.h" 14 #include "video_core/renderer_software/sw_lighting.h" 15 #include "video_core/renderer_software/sw_proctex.h" 16 #include "video_core/renderer_software/sw_rasterizer.h" 17 #include "video_core/renderer_software/sw_texturing.h" 18 #include "video_core/texture/texture_decode.h" 19 20 namespace SwRenderer { 21 22 using Pica::f24; 23 using Pica::FramebufferRegs; 24 using Pica::RasterizerRegs; 25 using Pica::TexturingRegs; 26 using Pica::Texture::LookupTexture; 27 using Pica::Texture::TextureInfo; 28 29 // Certain games render 2D elements very close to clip plane 0 resulting in very tiny 30 // negative/positive z values when computing with f32 precision, 31 // causing some vertices to get erroneously clipped. To workaround this problem, 32 // we can use a very small epsilon value for clip plane comparison. 33 constexpr f32 EPSILON_Z = 0.00000001f; 34 35 struct Vertex : Pica::OutputVertex { 36 Vertex(const OutputVertex& v) : OutputVertex(v) {} 37 38 /// Attributes used to store intermediate results position after perspective divide. 39 Common::Vec3<f24> screenpos; 40 41 /** 42 * Linear interpolation 43 * factor: 0=this, 1=vtx 44 * Note: This function cannot be called after perspective divide. 
45 **/ 46 void Lerp(f24 factor, const Vertex& vtx) { 47 pos = pos * factor + vtx.pos * (f24::One() - factor); 48 quat = quat * factor + vtx.quat * (f24::One() - factor); 49 color = color * factor + vtx.color * (f24::One() - factor); 50 tc0 = tc0 * factor + vtx.tc0 * (f24::One() - factor); 51 tc1 = tc1 * factor + vtx.tc1 * (f24::One() - factor); 52 tc0_w = tc0_w * factor + vtx.tc0_w * (f24::One() - factor); 53 view = view * factor + vtx.view * (f24::One() - factor); 54 tc2 = tc2 * factor + vtx.tc2 * (f24::One() - factor); 55 } 56 57 /** 58 * Linear interpolation 59 * factor: 0=v0, 1=v1 60 * Note: This function cannot be called after perspective divide. 61 **/ 62 static Vertex Lerp(f24 factor, const Vertex& v0, const Vertex& v1) { 63 Vertex ret = v0; 64 ret.Lerp(factor, v1); 65 return ret; 66 } 67 }; 68 69 namespace { 70 71 MICROPROFILE_DEFINE(GPU_Rasterization, "GPU", "Rasterization", MP_RGB(50, 50, 240)); 72 73 struct ClippingEdge { 74 public: 75 constexpr ClippingEdge(Common::Vec4<f24> coeffs, 76 Common::Vec4<f24> bias = Common::Vec4<f24>(f24::Zero(), f24::Zero(), 77 f24::Zero(), f24::Zero())) 78 : pos(f24::Zero()), coeffs(coeffs), bias(bias) {} 79 80 bool IsInside(const Vertex& vertex) const { 81 return Common::Dot(vertex.pos + bias, coeffs) >= f24::FromFloat32(-EPSILON_Z); 82 } 83 84 bool IsOutSide(const Vertex& vertex) const { 85 return !IsInside(vertex); 86 } 87 88 Vertex GetIntersection(const Vertex& v0, const Vertex& v1) const { 89 const f24 dp = Common::Dot(v0.pos + bias, coeffs); 90 const f24 dp_prev = Common::Dot(v1.pos + bias, coeffs); 91 const f24 factor = dp_prev / (dp_prev - dp); 92 return Vertex::Lerp(factor, v0, v1); 93 } 94 95 private: 96 [[maybe_unused]] f24 pos; 97 Common::Vec4<f24> coeffs; 98 Common::Vec4<f24> bias; 99 }; 100 101 } // Anonymous namespace 102 103 RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_, Pica::PicaCore& pica_) 104 : memory{memory_}, pica{pica_}, regs{pica.regs.internal}, 105 
num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)}, 106 sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {} 107 108 void RasterizerSoftware::AddTriangle(const Pica::OutputVertex& v0, const Pica::OutputVertex& v1, 109 const Pica::OutputVertex& v2) { 110 /** 111 * Clipping a planar n-gon against a plane will remove at least 1 vertex and introduces 2 at 112 * the new edge (or less in degenerate cases). As such, we can say that each clipping plane 113 * introduces at most 1 new vertex to the polygon. Since we start with a triangle and have a 114 * fixed 6 clipping planes, the maximum number of vertices of the clipped polygon is 3 + 6 = 9. 115 **/ 116 static constexpr std::size_t MAX_VERTICES = 9; 117 118 boost::container::static_vector<Vertex, MAX_VERTICES> buffer_a = {v0, v1, v2}; 119 boost::container::static_vector<Vertex, MAX_VERTICES> buffer_b; 120 121 FlipQuaternionIfOpposite(buffer_a[1].quat, buffer_a[0].quat); 122 FlipQuaternionIfOpposite(buffer_a[2].quat, buffer_a[0].quat); 123 124 auto* output_list = &buffer_a; 125 auto* input_list = &buffer_b; 126 127 // NOTE: We clip against a w=epsilon plane to guarantee that the output has a positive w value. 128 // TODO: Not sure if this is a valid approach. Also should probably instead use the smallest 129 // epsilon possible within f24 accuracy. 
130 static constexpr f24 EPSILON = f24::FromFloat32(0.00001f); 131 static constexpr f24 f0 = f24::Zero(); 132 static constexpr f24 f1 = f24::One(); 133 static constexpr std::array<ClippingEdge, 7> clipping_edges = {{ 134 {Common::MakeVec(-f1, f0, f0, f1)}, // x = +w 135 {Common::MakeVec(f1, f0, f0, f1)}, // x = -w 136 {Common::MakeVec(f0, -f1, f0, f1)}, // y = +w 137 {Common::MakeVec(f0, f1, f0, f1)}, // y = -w 138 {Common::MakeVec(f0, f0, -f1, f0)}, // z = 0 139 {Common::MakeVec(f0, f0, f1, f1)}, // z = -w 140 {Common::MakeVec(f0, f0, f0, f1), Common::Vec4<f24>(f0, f0, f0, EPSILON)}, // w = EPSILON 141 }}; 142 143 // Simple implementation of the Sutherland-Hodgman clipping algorithm. 144 // TODO: Make this less inefficient (currently lots of useless buffering overhead happens here) 145 const auto clip = [&](const ClippingEdge& edge) { 146 std::swap(input_list, output_list); 147 output_list->clear(); 148 149 const Vertex* reference_vertex = &input_list->back(); 150 for (const auto& vertex : *input_list) { 151 // NOTE: This algorithm changes vertex order in some cases! 
152 if (edge.IsInside(vertex)) { 153 if (edge.IsOutSide(*reference_vertex)) { 154 output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); 155 } 156 output_list->push_back(vertex); 157 } else if (edge.IsInside(*reference_vertex)) { 158 output_list->push_back(edge.GetIntersection(vertex, *reference_vertex)); 159 } 160 reference_vertex = &vertex; 161 } 162 }; 163 164 for (const ClippingEdge& edge : clipping_edges) { 165 clip(edge); 166 if (output_list->size() < 3) { 167 return; 168 } 169 } 170 171 if (regs.rasterizer.clip_enable) { 172 const ClippingEdge custom_edge{regs.rasterizer.GetClipCoef()}; 173 clip(custom_edge); 174 if (output_list->size() < 3) { 175 return; 176 } 177 } 178 179 MakeScreenCoords((*output_list)[0]); 180 MakeScreenCoords((*output_list)[1]); 181 182 for (std::size_t i = 0; i < output_list->size() - 2; i++) { 183 Vertex& vtx0 = (*output_list)[0]; 184 Vertex& vtx1 = (*output_list)[i + 1]; 185 Vertex& vtx2 = (*output_list)[i + 2]; 186 187 MakeScreenCoords(vtx2); 188 189 LOG_TRACE( 190 Render_Software, 191 "Triangle {}/{} at position ({:.3}, {:.3}, {:.3}, {:.3f}), " 192 "({:.3}, {:.3}, {:.3}, {:.3}), ({:.3}, {:.3}, {:.3}, {:.3}) and " 193 "screen position ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2}), ({:.2}, {:.2}, {:.2})", 194 i + 1, output_list->size() - 2, vtx0.pos.x.ToFloat32(), vtx0.pos.y.ToFloat32(), 195 vtx0.pos.z.ToFloat32(), vtx0.pos.w.ToFloat32(), vtx1.pos.x.ToFloat32(), 196 vtx1.pos.y.ToFloat32(), vtx1.pos.z.ToFloat32(), vtx1.pos.w.ToFloat32(), 197 vtx2.pos.x.ToFloat32(), vtx2.pos.y.ToFloat32(), vtx2.pos.z.ToFloat32(), 198 vtx2.pos.w.ToFloat32(), vtx0.screenpos.x.ToFloat32(), vtx0.screenpos.y.ToFloat32(), 199 vtx0.screenpos.z.ToFloat32(), vtx1.screenpos.x.ToFloat32(), 200 vtx1.screenpos.y.ToFloat32(), vtx1.screenpos.z.ToFloat32(), 201 vtx2.screenpos.x.ToFloat32(), vtx2.screenpos.y.ToFloat32(), 202 vtx2.screenpos.z.ToFloat32()); 203 204 ProcessTriangle(vtx0, vtx1, vtx2); 205 } 206 } 207 208 void 
RasterizerSoftware::MakeScreenCoords(Vertex& vtx) { 209 Viewport viewport{}; 210 viewport.halfsize_x = f24::FromRaw(regs.rasterizer.viewport_size_x); 211 viewport.halfsize_y = f24::FromRaw(regs.rasterizer.viewport_size_y); 212 viewport.offset_x = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.x)); 213 viewport.offset_y = f24::FromFloat32(static_cast<f32>(regs.rasterizer.viewport_corner.y)); 214 215 f24 inv_w = f24::One() / vtx.pos.w; 216 vtx.pos.w = inv_w; 217 vtx.quat *= inv_w; 218 vtx.color *= inv_w; 219 vtx.tc0 *= inv_w; 220 vtx.tc1 *= inv_w; 221 vtx.tc0_w *= inv_w; 222 vtx.view *= inv_w; 223 vtx.tc2 *= inv_w; 224 225 vtx.screenpos[0] = (vtx.pos.x * inv_w + f24::One()) * viewport.halfsize_x + viewport.offset_x; 226 vtx.screenpos[1] = (vtx.pos.y * inv_w + f24::One()) * viewport.halfsize_y + viewport.offset_y; 227 vtx.screenpos[2] = vtx.pos.z * inv_w; 228 } 229 230 void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, const Vertex& v2, 231 bool reversed) { 232 MICROPROFILE_SCOPE(GPU_Rasterization); 233 234 // Vertex positions in rasterizer coordinates 235 static auto screen_to_rasterizer_coords = [](const Common::Vec3<f24>& vec) { 236 return Common::Vec3{Fix12P4::FromFloat24(vec.x), Fix12P4::FromFloat24(vec.y), 237 Fix12P4::FromFloat24(vec.z)}; 238 }; 239 240 const std::array<Common::Vec3<Fix12P4>, 3> vtxpos = { 241 screen_to_rasterizer_coords(v0.screenpos), 242 screen_to_rasterizer_coords(v1.screenpos), 243 screen_to_rasterizer_coords(v2.screenpos), 244 }; 245 246 if (regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepAll) { 247 // Make sure we always end up with a triangle wound counter-clockwise 248 if (!reversed && SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { 249 ProcessTriangle(v0, v2, v1, true); 250 return; 251 } 252 } else { 253 if (!reversed && regs.rasterizer.cull_mode == RasterizerRegs::CullMode::KeepClockWise) { 254 // Reverse vertex order and use the CCW code path. 
255 ProcessTriangle(v0, v2, v1, true); 256 return; 257 } 258 // Cull away triangles which are wound clockwise. 259 if (SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) <= 0) { 260 return; 261 } 262 } 263 264 u16 min_x = std::min({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); 265 u16 min_y = std::min({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); 266 u16 max_x = std::max({vtxpos[0].x, vtxpos[1].x, vtxpos[2].x}); 267 u16 max_y = std::max({vtxpos[0].y, vtxpos[1].y, vtxpos[2].y}); 268 269 // Convert the scissor box coordinates to 12.4 fixed point 270 const u16 scissor_x1 = static_cast<u16>(regs.rasterizer.scissor_test.x1 << 4); 271 const u16 scissor_y1 = static_cast<u16>(regs.rasterizer.scissor_test.y1 << 4); 272 // x2,y2 have +1 added to cover the entire sub-pixel area 273 const u16 scissor_x2 = static_cast<u16>((regs.rasterizer.scissor_test.x2 + 1) << 4); 274 const u16 scissor_y2 = static_cast<u16>((regs.rasterizer.scissor_test.y2 + 1) << 4); 275 276 if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Include) { 277 // Calculate the new bounds 278 min_x = std::max(min_x, scissor_x1); 279 min_y = std::max(min_y, scissor_y1); 280 max_x = std::min(max_x, scissor_x2); 281 max_y = std::min(max_y, scissor_y2); 282 } 283 284 min_x &= Fix12P4::IntMask(); 285 min_y &= Fix12P4::IntMask(); 286 max_x = ((max_x + Fix12P4::FracMask()) & Fix12P4::IntMask()); 287 max_y = ((max_y + Fix12P4::FracMask()) & Fix12P4::IntMask()); 288 289 const int bias0 = 290 IsRightSideOrFlatBottomEdge(vtxpos[0].xy(), vtxpos[1].xy(), vtxpos[2].xy()) ? -1 : 0; 291 const int bias1 = 292 IsRightSideOrFlatBottomEdge(vtxpos[1].xy(), vtxpos[2].xy(), vtxpos[0].xy()) ? -1 : 0; 293 const int bias2 = 294 IsRightSideOrFlatBottomEdge(vtxpos[2].xy(), vtxpos[0].xy(), vtxpos[1].xy()) ? 
-1 : 0; 295 296 const auto w_inverse = Common::MakeVec(v0.pos.w, v1.pos.w, v2.pos.w); 297 298 const auto textures = regs.texturing.GetTextures(); 299 const auto tev_stages = regs.texturing.GetTevStages(); 300 301 fb.Bind(); 302 303 // Enter rasterization loop, starting at the center of the topleft bounding box corner. 304 // TODO: Not sure if looping through x first might be faster 305 for (u16 y = min_y + 8; y < max_y; y += 0x10) { 306 const auto process_scanline = [&, y] { 307 for (u16 x = min_x + 8; x < max_x; x += 0x10) { 308 // Do not process the pixel if it's inside the scissor box and the scissor mode is 309 // set to Exclude. 310 if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { 311 if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { 312 continue; 313 } 314 } 315 316 // Calculate the barycentric coordinates w0, w1 and w2 317 const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y}); 318 const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y}); 319 const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y}); 320 const s32 wsum = w0 + w1 + w2; 321 322 // If current pixel is not covered by the current primitive 323 if (w0 < 0 || w1 < 0 || w2 < 0) { 324 continue; 325 } 326 327 const auto baricentric_coordinates = Common::MakeVec( 328 f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)), 329 f24::FromFloat32(static_cast<f32>(w2))); 330 const f24 interpolated_w_inverse = 331 f24::One() / Common::Dot(w_inverse, baricentric_coordinates); 332 333 // interpolated_z = z / w 334 const float interpolated_z_over_w = 335 (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 + 336 v2.screenpos[2].ToFloat32() * w2) / 337 wsum; 338 339 // Not fully accurate. About 3 bits in precision are missing. 
340 // Z-Buffer (z / w * scale + offset) 341 const float depth_scale = 342 f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32(); 343 const float depth_offset = 344 f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32(); 345 float depth = interpolated_z_over_w * depth_scale + depth_offset; 346 347 // Potentially switch to W-Buffer 348 if (regs.rasterizer.depthmap_enable == 349 Pica::RasterizerRegs::DepthBuffering::WBuffering) { 350 // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w) 351 depth *= interpolated_w_inverse.ToFloat32() * wsum; 352 } 353 354 // Clamp the result 355 depth = std::clamp(depth, 0.0f, 1.0f); 356 357 /** 358 * Perspective correct attribute interpolation: 359 * Attribute values cannot be calculated by simple linear interpolation since 360 * they are not linear in screen space. For example, when interpolating a 361 * texture coordinate across two vertices, something simple like 362 * u = (u0*w0 + u1*w1)/(w0+w1) 363 * will not work. However, the attribute value divided by the 364 * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear 365 * in screenspace. Hence, we can linearly interpolate these two independently and 366 * calculate the interpolated attribute by dividing the results. 367 * I.e. 368 * u_over_w = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1) 369 * one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) 370 * u = u_over_w / one_over_w 371 * 372 * The generalization to three vertices is straightforward in baricentric 373 *coordinates. 
374 **/ 375 const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { 376 auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); 377 f24 interpolated_attr_over_w = 378 Common::Dot(attr_over_w, baricentric_coordinates); 379 return interpolated_attr_over_w * interpolated_w_inverse; 380 }; 381 382 const Common::Vec4<u8> primary_color{ 383 static_cast<u8>( 384 round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r()) 385 .ToFloat32() * 386 255)), 387 static_cast<u8>( 388 round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g()) 389 .ToFloat32() * 390 255)), 391 static_cast<u8>( 392 round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b()) 393 .ToFloat32() * 394 255)), 395 static_cast<u8>( 396 round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a()) 397 .ToFloat32() * 398 255)), 399 }; 400 401 std::array<Common::Vec2<f24>, 3> uv; 402 uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u()); 403 uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v()); 404 uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u()); 405 uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v()); 406 uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u()); 407 uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v()); 408 409 // Sample bound texture units. 
410 const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w); 411 const auto texture_color = TextureColor(uv, textures, tc0_w); 412 413 Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0}; 414 Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0}; 415 416 if (!regs.lighting.disable) { 417 const auto normquat = 418 Common::Quaternion<f32>{ 419 {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x) 420 .ToFloat32(), 421 get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y) 422 .ToFloat32(), 423 get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z) 424 .ToFloat32()}, 425 get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), 426 } 427 .Normalized(); 428 429 const Common::Vec3f view{ 430 get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(), 431 get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), 432 get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), 433 }; 434 std::tie(primary_fragment_color, secondary_fragment_color) = 435 ComputeFragmentsColors(regs.lighting, pica.lighting, normquat, view, 436 texture_color); 437 } 438 439 // Write the TEV stages. 440 auto combiner_output = 441 WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color, 442 secondary_fragment_color); 443 444 const auto& output_merger = regs.framebuffer.output_merger; 445 if (output_merger.fragment_operation_mode == 446 FramebufferRegs::FragmentOperationMode::Shadow) { 447 const u32 depth_int = static_cast<u32>(depth * 0xFFFFFF); 448 // Use green color as the shadow intensity 449 const u8 stencil = combiner_output.y; 450 fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil); 451 // Skip the normal output merger pipeline if it is in shadow mode 452 continue; 453 } 454 455 // Does alpha testing happen before or after stencil? 
456 if (!DoAlphaTest(combiner_output.a())) { 457 continue; 458 } 459 WriteFog(depth, combiner_output); 460 if (!DoDepthStencilTest(x, y, depth)) { 461 continue; 462 } 463 const auto result = PixelColor(x, y, combiner_output); 464 if (regs.framebuffer.framebuffer.allow_color_write != 0) { 465 fb.DrawPixel(x >> 4, y >> 4, result); 466 } 467 } 468 }; 469 sw_workers.QueueWork(std::move(process_scanline)); 470 } 471 sw_workers.WaitForRequests(); 472 } 473 474 std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( 475 std::span<const Common::Vec2<f24>, 3> uv, 476 std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const { 477 std::array<Common::Vec4<u8>, 4> texture_color{}; 478 for (u32 i = 0; i < 3; ++i) { 479 const auto& texture = textures[i]; 480 if (!texture.enabled) [[unlikely]] { 481 continue; 482 } 483 if (texture.config.address == 0) [[unlikely]] { 484 texture_color[i] = {0, 0, 0, 255}; 485 continue; 486 } 487 488 const s32 coordinate_i = (i == 2 && regs.texturing.main_config.texture2_use_coord1) ? 
1 : i; 489 f24 u = uv[coordinate_i].u(); 490 f24 v = uv[coordinate_i].v(); 491 492 // Only unit 0 respects the texturing type (according to 3DBrew) 493 PAddr texture_address = texture.config.GetPhysicalAddress(); 494 f24 shadow_z; 495 if (i == 0) { 496 switch (texture.config.type) { 497 case TexturingRegs::TextureConfig::Texture2D: 498 break; 499 case TexturingRegs::TextureConfig::ShadowCube: 500 case TexturingRegs::TextureConfig::TextureCube: { 501 std::tie(u, v, shadow_z, texture_address) = 502 ConvertCubeCoord(u, v, tc0_w, regs.texturing); 503 break; 504 } 505 case TexturingRegs::TextureConfig::Projection2D: { 506 u /= tc0_w; 507 v /= tc0_w; 508 break; 509 } 510 case TexturingRegs::TextureConfig::Shadow2D: { 511 if (!regs.texturing.shadow.orthographic) { 512 u /= tc0_w; 513 v /= tc0_w; 514 } 515 shadow_z = f24::FromFloat32(std::abs(tc0_w.ToFloat32())); 516 break; 517 } 518 case TexturingRegs::TextureConfig::Disabled: 519 continue; // skip this unit and continue to the next unit 520 default: 521 LOG_ERROR(HW_GPU, "Unhandled texture type {:x}", (int)texture.config.type); 522 UNIMPLEMENTED(); 523 break; 524 } 525 } 526 527 const f24 width = f24::FromFloat32(static_cast<f32>(texture.config.width)); 528 const f24 height = f24::FromFloat32(static_cast<f32>(texture.config.height)); 529 s32 s = static_cast<s32>((u * width).ToFloat32()); 530 s32 t = static_cast<s32>((v * height).ToFloat32()); 531 532 bool use_border_s = false; 533 bool use_border_t = false; 534 535 if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder) { 536 use_border_s = s < 0 || s >= static_cast<s32>(texture.config.width); 537 } else if (texture.config.wrap_s == TexturingRegs::TextureConfig::ClampToBorder2) { 538 use_border_s = s >= static_cast<s32>(texture.config.width); 539 } 540 541 if (texture.config.wrap_t == TexturingRegs::TextureConfig::ClampToBorder) { 542 use_border_t = t < 0 || t >= static_cast<s32>(texture.config.height); 543 } else if (texture.config.wrap_t == 
TexturingRegs::TextureConfig::ClampToBorder2) { 544 use_border_t = t >= static_cast<s32>(texture.config.height); 545 } 546 547 if (use_border_s || use_border_t) { 548 const auto border_color = texture.config.border_color; 549 texture_color[i] = Common::MakeVec(border_color.r.Value(), border_color.g.Value(), 550 border_color.b.Value(), border_color.a.Value()) 551 .Cast<u8>(); 552 } else { 553 // Textures are laid out from bottom to top, hence we invert the t coordinate. 554 // NOTE: This may not be the right place for the inversion. 555 // TODO: Check if this applies to ETC textures, too. 556 s = GetWrappedTexCoord(texture.config.wrap_s, s, texture.config.width); 557 t = texture.config.height - 1 - 558 GetWrappedTexCoord(texture.config.wrap_t, t, texture.config.height); 559 560 const u8* texture_data = memory.GetPhysicalPointer(texture_address); 561 const auto info = TextureInfo::FromPicaRegister(texture.config, texture.format); 562 563 // TODO: Apply the min and mag filters to the texture 564 texture_color[i] = LookupTexture(texture_data, s, t, info); 565 } 566 567 if (i == 0 && (texture.config.type == TexturingRegs::TextureConfig::Shadow2D || 568 texture.config.type == TexturingRegs::TextureConfig::ShadowCube)) { 569 570 s32 z_int = static_cast<s32>(std::min(shadow_z.ToFloat32(), 1.0f) * 0xFFFFFF); 571 z_int -= regs.texturing.shadow.bias << 1; 572 const auto& color = texture_color[i]; 573 const s32 z_ref = (color.w << 16) | (color.z << 8) | color.y; 574 u8 density; 575 if (z_ref >= z_int) { 576 density = color.x; 577 } else { 578 density = 0; 579 } 580 texture_color[i] = {density, density, density, density}; 581 } 582 } 583 584 // Sample procedural texture 585 if (regs.texturing.main_config.texture3_enable) { 586 const auto& proctex_uv = uv[regs.texturing.main_config.texture3_coordinates]; 587 texture_color[3] = ProcTex(proctex_uv.u().ToFloat32(), proctex_uv.v().ToFloat32(), 588 regs.texturing, pica.proctex); 589 } 590 591 return texture_color; 592 } 593 594 
/**
 * Runs the color part of the output merger for one pixel.
 * Blends (or, when alpha blending is disabled, logic-ops) the combiner output with the pixel
 * currently stored in the framebuffer and applies the per-channel write enables.
 *
 * @param x, y            Pixel position in 12.4 fixed point.
 * @param combiner_output Color produced by the TEV stages.
 * @return The color to store; channels whose write enable is off keep the framebuffer value.
 **/
Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
                                                Common::Vec4<u8> combiner_output) const {
    const auto dest = fb.GetPixel(x >> 4, y >> 4);
    const auto& output_merger = regs.framebuffer.output_merger;

    Common::Vec4<u8> blend_output = combiner_output;

    if (!output_merger.alphablend_enable) {
        // Without alpha blending every channel goes through the configured logic op instead.
        blend_output =
            Common::MakeVec(LogicOp(combiner_output.r(), dest.r(), output_merger.logic_op),
                            LogicOp(combiner_output.g(), dest.g(), output_merger.logic_op),
                            LogicOp(combiner_output.b(), dest.b(), output_merger.logic_op),
                            LogicOp(combiner_output.a(), dest.a(), output_merger.logic_op));
    } else {
        const auto params = output_merger.alpha_blending;

        // The blend constant is the same for every channel lookup of this pixel.
        const Common::Vec4<u8> blend_const =
            Common::MakeVec(output_merger.blend_const.r.Value(),
                            output_merger.blend_const.g.Value(),
                            output_merger.blend_const.b.Value(),
                            output_merger.blend_const.a.Value())
                .Cast<u8>();

        // Resolves a blend factor selector to its 8-bit value for the given channel.
        const auto lookup_factor = [&](u32 channel, FramebufferRegs::BlendFactor factor) -> u8 {
            DEBUG_ASSERT(channel < 4);

            switch (factor) {
            case FramebufferRegs::BlendFactor::Zero:
                return 0;
            case FramebufferRegs::BlendFactor::One:
                return 255;
            case FramebufferRegs::BlendFactor::SourceColor:
                return combiner_output[channel];
            case FramebufferRegs::BlendFactor::OneMinusSourceColor:
                return 255 - combiner_output[channel];
            case FramebufferRegs::BlendFactor::DestColor:
                return dest[channel];
            case FramebufferRegs::BlendFactor::OneMinusDestColor:
                return 255 - dest[channel];
            case FramebufferRegs::BlendFactor::SourceAlpha:
                return combiner_output.a();
            case FramebufferRegs::BlendFactor::OneMinusSourceAlpha:
                return 255 - combiner_output.a();
            case FramebufferRegs::BlendFactor::DestAlpha:
                return dest.a();
            case FramebufferRegs::BlendFactor::OneMinusDestAlpha:
                return 255 - dest.a();
            case FramebufferRegs::BlendFactor::ConstantColor:
                return blend_const[channel];
            case FramebufferRegs::BlendFactor::OneMinusConstantColor:
                return 255 - blend_const[channel];
            case FramebufferRegs::BlendFactor::ConstantAlpha:
                return blend_const.a();
            case FramebufferRegs::BlendFactor::OneMinusConstantAlpha:
                return 255 - blend_const.a();
            case FramebufferRegs::BlendFactor::SourceAlphaSaturate:
                // Returns 1.0 for the alpha channel
                if (channel == 3) {
                    return 255;
                }
                return std::min(combiner_output.a(), static_cast<u8>(255 - dest.a()));
            default:
                LOG_CRITICAL(HW_GPU, "Unknown blend factor {:x}", factor);
                UNIMPLEMENTED();
                break;
            }
            // Only reached for an unknown factor.
            return combiner_output[channel];
        };

        // RGB channels share one factor selector, alpha has its own.
        const auto src_rgb = params.factor_source_rgb;
        const auto srcfactor =
            Common::MakeVec(lookup_factor(0, src_rgb), lookup_factor(1, src_rgb),
                            lookup_factor(2, src_rgb), lookup_factor(3, params.factor_source_a));

        const auto dst_rgb = params.factor_dest_rgb;
        const auto dstfactor =
            Common::MakeVec(lookup_factor(0, dst_rgb), lookup_factor(1, dst_rgb),
                            lookup_factor(2, dst_rgb), lookup_factor(3, params.factor_dest_a));

        // Evaluate with the RGB equation first, then overwrite alpha using the alpha equation.
        blend_output = EvaluateBlendEquation(combiner_output, srcfactor, dest, dstfactor,
                                             params.blend_equation_rgb);
        const auto alpha_blend = EvaluateBlendEquation(combiner_output, srcfactor, dest,
                                                       dstfactor, params.blend_equation_a);
        blend_output.a() = alpha_blend.a();
    }

    const Common::Vec4<u8> result = {
        output_merger.red_enable ? blend_output.r() : dest.r(),
        output_merger.green_enable ? blend_output.g() : dest.g(),
        output_merger.blue_enable ? blend_output.b() : dest.b(),
        output_merger.alpha_enable ? blend_output.a() : dest.a(),
    };

    return result;
}

/**
 * Evaluates all texture environment (TEV) stages for one fragment and returns the output of the
 * last stage.
 *
 * @param texture_color            Sampled colors of texture units 0-3.
 * @param tev_stages               Configuration of the TEV stages, evaluated in order.
 * @param primary_color            Interpolated vertex color.
 * @param primary_fragment_color   Primary color from fragment lighting.
 * @param secondary_fragment_color Secondary color from fragment lighting.
 **/
Common::Vec4<u8> RasterizerSoftware::WriteTevConfig(
    std::span<const Common::Vec4<u8>, 4> texture_color,
    std::span<const Pica::TexturingRegs::TevStageConfig, 6> tev_stages,
    Common::Vec4<u8> primary_color, Common::Vec4<u8> primary_fragment_color,
    Common::Vec4<u8> secondary_fragment_color) {
    /**
     * Texture environment - consists of 6 stages of color and alpha combining.
     * Color combiners take three input color values from some source (e.g. interpolated
     * vertex color, texture color, previous stage, etc), perform some very simple
     * operations on each of them (e.g. inversion) and then calculate the output color
     * with some basic arithmetic. Alpha combiners can be configured separately but work
     * analogously.
     **/
    using Source = TexturingRegs::TevStageConfig::Source;

    Common::Vec4<u8> combiner_output = {0, 0, 0, 0};
    Common::Vec4<u8> combiner_buffer = {0, 0, 0, 0};
    // The buffer value visible to a stage lags one stage behind the value written by the
    // previous stage, hence the separate "next" copy.
    Common::Vec4<u8> next_combiner_buffer =
        Common::MakeVec(regs.texturing.tev_combiner_buffer_color.r.Value(),
                        regs.texturing.tev_combiner_buffer_color.g.Value(),
                        regs.texturing.tev_combiner_buffer_color.b.Value(),
                        regs.texturing.tev_combiner_buffer_color.a.Value())
            .Cast<u8>();

    for (u32 stage_index = 0; stage_index < tev_stages.size(); ++stage_index) {
        const auto& tev_stage = tev_stages[stage_index];

        // Maps a source selector to the color it currently refers to.
        auto get_source = [&](Source source) -> Common::Vec4<u8> {
            switch (source) {
            case Source::PrimaryColor:
                return primary_color;
            case Source::PrimaryFragmentColor:
                return primary_fragment_color;
            case Source::SecondaryFragmentColor:
                return secondary_fragment_color;
            case Source::Texture0:
                return texture_color[0];
            case Source::Texture1:
                return texture_color[1];
            case Source::Texture2:
                return texture_color[2];
            case Source::Texture3:
                return texture_color[3];
            case Source::PreviousBuffer:
                return combiner_buffer;
            case Source::Constant:
                return Common::MakeVec(tev_stage.const_r.Value(), tev_stage.const_g.Value(),
                                       tev_stage.const_b.Value(), tev_stage.const_a.Value())
                    .Cast<u8>();
            case Source::Previous:
                return combiner_output;
            default:
                LOG_ERROR(HW_GPU, "Unknown color combiner source {}", static_cast<int>(source));
                UNIMPLEMENTED();
                return {0, 0, 0, 0};
            }
        };

        /**
         * Color combiner
         * NOTE: Not sure if the alpha combiner might use the color output of the previous
         *       stage as input. Hence, we currently don't directly write the result to
         *       combiner_output.rgb(), but instead store it in a temporary variable until
         *       alpha combining has been done.
         **/
        // On the first stage, color sources 1 and 2 selecting Previous are replaced by
        // color source 3.
        const bool is_first_stage = stage_index == 0;
        const auto source1 = is_first_stage && tev_stage.color_source1 == Source::Previous
                                 ? tev_stage.color_source3.Value()
                                 : tev_stage.color_source1.Value();
        const auto source2 = is_first_stage && tev_stage.color_source2 == Source::Previous
                                 ? tev_stage.color_source3.Value()
                                 : tev_stage.color_source2.Value();
        const std::array<Common::Vec3<u8>, 3> color_result = {
            GetColorModifier(tev_stage.color_modifier1, get_source(source1)),
            GetColorModifier(tev_stage.color_modifier2, get_source(source2)),
            GetColorModifier(tev_stage.color_modifier3, get_source(tev_stage.color_source3)),
        };
        const Common::Vec3<u8> color_output = ColorCombine(tev_stage.color_op, color_result);

        u8 alpha_output;
        if (tev_stage.color_op == TexturingRegs::TevStageConfig::Operation::Dot3_RGBA) {
            // result of Dot3_RGBA operation is also placed to the alpha component
            alpha_output = color_output.x;
        } else {
            // alpha combiner
            const std::array<u8, 3> alpha_result = {{
                GetAlphaModifier(tev_stage.alpha_modifier1, get_source(tev_stage.alpha_source1)),
                GetAlphaModifier(tev_stage.alpha_modifier2, get_source(tev_stage.alpha_source2)),
                GetAlphaModifier(tev_stage.alpha_modifier3, get_source(tev_stage.alpha_source3)),
            }};
            alpha_output = AlphaCombine(tev_stage.alpha_op, alpha_result);
        }

        // Apply the stage multipliers, saturating at 255.
        combiner_output[0] = std::min(255U, color_output.r() * tev_stage.GetColorMultiplier());
        combiner_output[1] = std::min(255U, color_output.g() * tev_stage.GetColorMultiplier());
        combiner_output[2] = std::min(255U, color_output.b() * tev_stage.GetColorMultiplier());
        combiner_output[3] = std::min(255U, alpha_output * tev_stage.GetAlphaMultiplier());

        combiner_buffer = next_combiner_buffer;

        const auto& buffer_input = regs.texturing.tev_combiner_buffer_input;
        if (buffer_input.TevStageUpdatesCombinerBufferColor(stage_index)) {
            next_combiner_buffer.r() = combiner_output.r();
            next_combiner_buffer.g() = combiner_output.g();
            next_combiner_buffer.b() = combiner_output.b();
        }

        if (buffer_input.TevStageUpdatesCombinerBufferAlpha(stage_index)) {
            next_combiner_buffer.a() = combiner_output.a();
        }
    }

    return combiner_output;
}

798 799 void RasterizerSoftware::WriteFog(float depth, Common::Vec4<u8>& combiner_output) const { 800 /** 801 * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to 802 * store the depth etc. Using float for now until we know more about Pica datatypes. 803 **/ 804 if (regs.texturing.fog_mode == TexturingRegs::FogMode::Fog) { 805 const Common::Vec3<u8> fog_color = 806 Common::MakeVec(regs.texturing.fog_color.r.Value(), regs.texturing.fog_color.g.Value(), 807 regs.texturing.fog_color.b.Value()) 808 .Cast<u8>(); 809 810 float fog_index; 811 if (regs.texturing.fog_flip) { 812 fog_index = (1.0f - depth) * 128.0f; 813 } else { 814 fog_index = depth * 128.0f; 815 } 816 817 // Generate clamped fog factor from LUT for given fog index 818 const f32 fog_i = std::clamp(floorf(fog_index), 0.0f, 127.0f); 819 const f32 fog_f = fog_index - fog_i; 820 const auto& fog_lut_entry = pica.fog.lut[static_cast<u32>(fog_i)]; 821 f32 fog_factor = fog_lut_entry.ToFloat() + fog_lut_entry.DiffToFloat() * fog_f; 822 fog_factor = std::clamp(fog_factor, 0.0f, 1.0f); 823 for (u32 i = 0; i < 3; i++) { 824 combiner_output[i] = static_cast<u8>(fog_factor * combiner_output[i] + 825 (1.0f - fog_factor) * fog_color[i]); 826 } 827 } 828 } 829 830 bool RasterizerSoftware::DoAlphaTest(u8 alpha) const { 831 const auto& output_merger = regs.framebuffer.output_merger; 832 if (!output_merger.alpha_test.enable) { 833 return true; 834 } 835 switch (output_merger.alpha_test.func) { 836 case FramebufferRegs::CompareFunc::Never: 837 return false; 838 case FramebufferRegs::CompareFunc::Always: 839 return true; 840 case FramebufferRegs::CompareFunc::Equal: 841 return alpha == output_merger.alpha_test.ref; 842 case FramebufferRegs::CompareFunc::NotEqual: 843 return alpha != output_merger.alpha_test.ref; 844 case FramebufferRegs::CompareFunc::LessThan: 845 return alpha < output_merger.alpha_test.ref; 846 case FramebufferRegs::CompareFunc::LessThanOrEqual: 847 return alpha <= 
output_merger.alpha_test.ref; 848 case FramebufferRegs::CompareFunc::GreaterThan: 849 return alpha > output_merger.alpha_test.ref; 850 case FramebufferRegs::CompareFunc::GreaterThanOrEqual: 851 return alpha >= output_merger.alpha_test.ref; 852 default: 853 LOG_CRITICAL(Render_Software, "Unknown alpha test condition {}", 854 output_merger.alpha_test.func.Value()); 855 return false; 856 } 857 } 858 859 bool RasterizerSoftware::DoDepthStencilTest(u16 x, u16 y, float depth) const { 860 const auto& framebuffer = regs.framebuffer.framebuffer; 861 const auto stencil_test = regs.framebuffer.output_merger.stencil_test; 862 u8 old_stencil = 0; 863 864 const auto update_stencil = [&](Pica::FramebufferRegs::StencilAction action) { 865 const u8 new_stencil = 866 PerformStencilAction(action, old_stencil, stencil_test.reference_value); 867 if (framebuffer.allow_depth_stencil_write != 0) { 868 const u8 stencil = 869 (new_stencil & stencil_test.write_mask) | (old_stencil & ~stencil_test.write_mask); 870 fb.SetStencil(x >> 4, y >> 4, stencil); 871 } 872 }; 873 874 const bool stencil_action_enable = 875 regs.framebuffer.output_merger.stencil_test.enable && 876 regs.framebuffer.framebuffer.depth_format == FramebufferRegs::DepthFormat::D24S8; 877 878 if (stencil_action_enable) { 879 old_stencil = fb.GetStencil(x >> 4, y >> 4); 880 const u8 dest = old_stencil & stencil_test.input_mask; 881 const u8 ref = stencil_test.reference_value & stencil_test.input_mask; 882 bool pass = false; 883 switch (stencil_test.func) { 884 case FramebufferRegs::CompareFunc::Never: 885 pass = false; 886 break; 887 case FramebufferRegs::CompareFunc::Always: 888 pass = true; 889 break; 890 case FramebufferRegs::CompareFunc::Equal: 891 pass = (ref == dest); 892 break; 893 case FramebufferRegs::CompareFunc::NotEqual: 894 pass = (ref != dest); 895 break; 896 case FramebufferRegs::CompareFunc::LessThan: 897 pass = (ref < dest); 898 break; 899 case FramebufferRegs::CompareFunc::LessThanOrEqual: 900 pass = (ref <= 
dest); 901 break; 902 case FramebufferRegs::CompareFunc::GreaterThan: 903 pass = (ref > dest); 904 break; 905 case FramebufferRegs::CompareFunc::GreaterThanOrEqual: 906 pass = (ref >= dest); 907 break; 908 } 909 if (!pass) { 910 update_stencil(stencil_test.action_stencil_fail); 911 return false; 912 } 913 } 914 915 const u32 num_bits = FramebufferRegs::DepthBitsPerPixel(framebuffer.depth_format); 916 const u32 z = static_cast<u32>(depth * ((1 << num_bits) - 1)); 917 918 const auto& output_merger = regs.framebuffer.output_merger; 919 if (output_merger.depth_test_enable) { 920 const u32 ref_z = fb.GetDepth(x >> 4, y >> 4); 921 bool pass = false; 922 switch (output_merger.depth_test_func) { 923 case FramebufferRegs::CompareFunc::Never: 924 pass = false; 925 break; 926 case FramebufferRegs::CompareFunc::Always: 927 pass = true; 928 break; 929 case FramebufferRegs::CompareFunc::Equal: 930 pass = z == ref_z; 931 break; 932 case FramebufferRegs::CompareFunc::NotEqual: 933 pass = z != ref_z; 934 break; 935 case FramebufferRegs::CompareFunc::LessThan: 936 pass = z < ref_z; 937 break; 938 case FramebufferRegs::CompareFunc::LessThanOrEqual: 939 pass = z <= ref_z; 940 break; 941 case FramebufferRegs::CompareFunc::GreaterThan: 942 pass = z > ref_z; 943 break; 944 case FramebufferRegs::CompareFunc::GreaterThanOrEqual: 945 pass = z >= ref_z; 946 break; 947 } 948 if (!pass) { 949 if (stencil_action_enable) { 950 update_stencil(stencil_test.action_depth_fail); 951 } 952 return false; 953 } 954 } 955 if (framebuffer.allow_depth_stencil_write != 0 && output_merger.depth_write_enable) { 956 fb.SetDepth(x >> 4, y >> 4, z); 957 } 958 // The stencil depth_pass action is executed even if depth testing is disabled 959 if (stencil_action_enable) { 960 update_stencil(stencil_test.action_depth_pass); 961 } 962 963 return true; 964 } 965 966 } // namespace SwRenderer