From 16f431877510bdab1f69b2363ef63e25a9a540d1 Mon Sep 17 00:00:00 2001 From: Wunkolo Date: Mon, 19 Aug 2024 14:55:41 +0200 Subject: [PATCH] shader_jit: Fix/optimize conditional evaluation --- src/tests/video_core/shader.cpp | 88 +++++++++++++++++++ src/video_core/pica/shader_unit.cpp | 5 +- src/video_core/pica/shader_unit.h | 10 +-- src/video_core/shader/shader_interpreter.cpp | 3 - .../shader/shader_jit_a64_compiler.cpp | 46 +++++++--- .../shader/shader_jit_x64_compiler.cpp | 32 +++---- 6 files changed, 144 insertions(+), 40 deletions(-) diff --git a/src/tests/video_core/shader.cpp b/src/tests/video_core/shader.cpp index 0b94e7bebf..4f4dc0067c 100644 --- a/src/tests/video_core/shader.cpp +++ b/src/tests/video_core/shader.cpp @@ -676,6 +676,94 @@ TEMPLATE_TEST_CASE("Nested Loop", "[video_core][shader]", ShaderJitTest) { } } +SHADER_TEST_CASE("Conditional", "[video_core][shader]") { + const auto sh_input = SourceRegister::MakeInput(0); + const auto sh_temp = SourceRegister::MakeTemporary(0); + const auto sh_output = DestRegister::MakeOutput(0); + + const std::initializer_list assembly_template = { + // IFC configured later + {OpCode::Id::NOP}, + // True + {OpCode::Id::MOV, sh_output, sh_input}, + {OpCode::Id::END}, + // False + {OpCode::Id::MOV, sh_output, sh_temp}, + {OpCode::Id::END}, + }; + + const bool ref_x = GENERATE(0, 1); + const bool cmp_x = GENERATE(0, 1); + const bool result_x = (cmp_x == ref_x); + + const bool ref_y = GENERATE(0, 1); + const bool cmp_y = GENERATE(0, 1); + const bool result_y = (cmp_y == ref_y); + + nihstro::Instruction IFC = {}; + IFC.opcode = nihstro::OpCode::Id::IFC; + IFC.flow_control.num_instructions = 2; + IFC.flow_control.dest_offset = 3; + IFC.flow_control.refx = ref_x; + IFC.flow_control.refy = ref_y; + + Pica::ShaderUnit shader_unit; + shader_unit.conditional_code[0] = cmp_x; + shader_unit.conditional_code[1] = cmp_y; + + // JustX + { + auto shader_setup = CompileShaderSetup(assembly_template); + IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::JustX; + shader_setup->program_code[0] = IFC.hex; + const float result = result_x ? 1.0f : 0.0f; + + auto shader_test = TestType(std::move(shader_setup)); + shader_test.Run(shader_unit, 1.0f); + + REQUIRE(shader_unit.output[0].x.ToFloat32() == result); + } + + // JustY + { + auto shader_setup = CompileShaderSetup(assembly_template); + IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::JustY; + shader_setup->program_code[0] = IFC.hex; + const float result = result_y ? 1.0f : 0.0f; + + auto shader_test = TestType(std::move(shader_setup)); + shader_test.Run(shader_unit, 1.0f); + + REQUIRE(shader_unit.output[0].x.ToFloat32() == result); + } + + // OR + { + auto shader_setup = CompileShaderSetup(assembly_template); + IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::Or; + shader_setup->program_code[0] = IFC.hex; + const float result = (result_x || result_y) ? 1.0f : 0.0f; + + auto shader_test = TestType(std::move(shader_setup)); + shader_test.Run(shader_unit, 1.0f); + + REQUIRE(shader_unit.output[0].x.ToFloat32() == result); + } + + // AND + { + auto shader_setup = CompileShaderSetup(assembly_template); + IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::And; + shader_setup->program_code[0] = IFC.hex; + const float result = (result_x && result_y) ? 1.0f : 0.0f; + + auto shader_test = TestType(std::move(shader_setup)); + shader_test.Run(shader_unit, 1.0f); + + REQUIRE(shader_unit.output[0].x.ToFloat32() == result); + } +} + SHADER_TEST_CASE("Source Swizzle", "[video_core][shader]") { const auto sh_input = SourceRegister::MakeInput(0); const auto sh_output = DestRegister::MakeOutput(0); diff --git a/src/video_core/pica/shader_unit.cpp b/src/video_core/pica/shader_unit.cpp index cac3ecd4ad..5d81f857ab 100644 --- a/src/video_core/pica/shader_unit.cpp +++ b/src/video_core/pica/shader_unit.cpp @@ -9,10 +9,7 @@ namespace Pica { -ShaderUnit::ShaderUnit(GeometryEmitter* emitter) : emitter_ptr{emitter} { - const Common::Vec4 temp_vec{f24::Zero(), f24::Zero(), f24::Zero(), f24::One()}; - temporary.fill(temp_vec); -} +ShaderUnit::ShaderUnit(GeometryEmitter* emitter) : emitter_ptr{emitter} {} ShaderUnit::~ShaderUnit() = default; diff --git a/src/video_core/pica/shader_unit.h b/src/video_core/pica/shader_unit.h index 12d533d16e..2f9c1843f6 100644 --- a/src/video_core/pica/shader_unit.h +++ b/src/video_core/pica/shader_unit.h @@ -46,11 +46,11 @@ struct ShaderUnit { } public: - s32 address_registers[3]; - bool conditional_code[2]; - alignas(16) std::array, 16> input; - alignas(16) std::array, 16> temporary; - alignas(16) std::array, 16> output; + s32 address_registers[3] = {}; + bool conditional_code[2] = {}; + alignas(16) std::array, 16> input = {}; + alignas(16) std::array, 16> temporary = {}; + alignas(16) std::array, 16> output = {}; GeometryEmitter* emitter_ptr; private: diff --git a/src/video_core/shader/shader_interpreter.cpp b/src/video_core/shader/shader_interpreter.cpp index 6a4efdd82b..cf2cb7836f 100644 --- a/src/video_core/shader/shader_interpreter.cpp +++ b/src/video_core/shader/shader_interpreter.cpp @@ -52,9 +52,6 @@ static void RunInterpreter(const ShaderSetup& setup, ShaderUnit& state, boost::circular_buffer loop_stack(4); u32 program_counter = entry_point; - state.conditional_code[0] = false; - state.conditional_code[1] = false; - const auto do_if = [&](Instruction instr, bool condition) { if (condition) { if_stack.push_back({ diff --git a/src/video_core/shader/shader_jit_a64_compiler.cpp b/src/video_core/shader/shader_jit_a64_compiler.cpp index c65f988522..d8393279fe 100644 --- a/src/video_core/shader/shader_jit_a64_compiler.cpp +++ b/src/video_core/shader/shader_jit_a64_compiler.cpp @@ -386,28 +386,50 @@ void JitShader::Compile_SanitizedMul(QReg src1, QReg src2, QReg scratch0) { } void JitShader::Compile_EvaluateCondition(Instruction instr) { - const u8 refx = instr.flow_control.refx.Value(); - const u8 refy = instr.flow_control.refy.Value(); + const bool refx = instr.flow_control.refx.Value(); + const bool refy = instr.flow_control.refy.Value(); switch (instr.flow_control.op) { // Note: NXOR is used below to check for equality - case Instruction::FlowControlType::Or: - EOR(XSCRATCH0, COND0, refx ^ 1); - EOR(XSCRATCH1, COND1, refy ^ 1); - ORR(XSCRATCH0, XSCRATCH0, XSCRATCH1); + case Instruction::FlowControlType::Or: { + XReg OpX = XSCRATCH0; + if (!refx) { + EOR(OpX, COND0, u8(refx) ^ 1); + } else { + OpX = COND0; + } + XReg OpY = XSCRATCH1; + if (!refy) { + EOR(OpY, COND1, u8(refy) ^ 1); + } else { + OpY = COND1; + } + ORR(XSCRATCH0, OpX, OpY); CMP(XSCRATCH0, 0); break; + } // Note: TST will AND two registers and set the EQ/NE flags on the result - case Instruction::FlowControlType::And: - EOR(XSCRATCH0, COND0, refx ^ 1); - EOR(XSCRATCH1, COND1, refy ^ 1); - TST(XSCRATCH0, XSCRATCH1); + case Instruction::FlowControlType::And: { + XReg OpX = XSCRATCH0; + if (!refx) { + EOR(OpX, COND0, u8(refx) ^ 1); + } else { + OpX = COND0; + } + XReg OpY = XSCRATCH1; + if (!refy) { + EOR(OpY, COND1, u8(refy) ^ 1); + } else { + OpY = COND1; + } + TST(OpX, OpY); break; + } case Instruction::FlowControlType::JustX: - CMP(COND0, refx); + CMP(COND0, u8(refx) ^ 1); break; case Instruction::FlowControlType::JustY: - CMP(COND1, refy); + CMP(COND1, u8(refy) ^ 1); break; default: UNREACHABLE(); diff --git a/src/video_core/shader/shader_jit_x64_compiler.cpp b/src/video_core/shader/shader_jit_x64_compiler.cpp index d101976666..70f59a3d9e 100644 --- a/src/video_core/shader/shader_jit_x64_compiler.cpp +++ b/src/video_core/shader/shader_jit_x64_compiler.cpp @@ -401,29 +401,29 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) { // Note: NXOR is used below to check for equality switch (instr.flow_control.op) { case Instruction::FlowControlType::Or: - mov(eax, COND0); - mov(ebx, COND1); - xor_(eax, (instr.flow_control.refx.Value() ^ 1)); - xor_(ebx, (instr.flow_control.refy.Value() ^ 1)); - or_(eax, ebx); + mov(al, COND0.cvt8()); + mov(bl, COND1.cvt8()); + xor_(al, (instr.flow_control.refx.Value() ^ 1)); + xor_(bl, (instr.flow_control.refy.Value() ^ 1)); + or_(al, bl); break; case Instruction::FlowControlType::And: - mov(eax, COND0); - mov(ebx, COND1); - xor_(eax, (instr.flow_control.refx.Value() ^ 1)); - xor_(ebx, (instr.flow_control.refy.Value() ^ 1)); - and_(eax, ebx); + mov(al, COND0); + mov(bl, COND1); + xor_(al, (instr.flow_control.refx.Value() ^ 1)); + xor_(bl, (instr.flow_control.refy.Value() ^ 1)); + and_(al, bl); break; case Instruction::FlowControlType::JustX: - mov(eax, COND0); - xor_(eax, (instr.flow_control.refx.Value() ^ 1)); + mov(al, COND0); + xor_(al, (instr.flow_control.refx.Value() ^ 1)); break; case Instruction::FlowControlType::JustY: - mov(eax, COND1); - xor_(eax, (instr.flow_control.refy.Value() ^ 1)); + mov(al, COND1); + xor_(al, (instr.flow_control.refy.Value() ^ 1)); break; } } @@ -1002,8 +1002,8 @@ void JitShader::Compile(const std::array* program_ mov(LOOPCOUNT_REG, dword[STATE + offsetof(ShaderUnit, address_registers[2])]); // Load conditional code - mov(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]); - mov(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]); + movzx(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]); + movzx(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]); // Used to set a register to one static const __m128 one = {1.f, 1.f, 1.f, 1.f};