Skip to content

Commit

Permalink
shader_jit: Fix/optimize conditional evaluation
Browse files Browse the repository at this point in the history
  • Loading branch information
Wunkolo authored and Gamer64ytb committed Aug 19, 2024
1 parent 8d88676 commit 16f4318
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 40 deletions.
88 changes: 88 additions & 0 deletions src/tests/video_core/shader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -676,6 +676,94 @@ TEMPLATE_TEST_CASE("Nested Loop", "[video_core][shader]", ShaderJitTest) {
}
}

SHADER_TEST_CASE("Conditional", "[video_core][shader]") {
const auto sh_input = SourceRegister::MakeInput(0);
const auto sh_temp = SourceRegister::MakeTemporary(0);
const auto sh_output = DestRegister::MakeOutput(0);

const std::initializer_list<nihstro::InlineAsm> assembly_template = {
// IFC configured later
{OpCode::Id::NOP},
// True
{OpCode::Id::MOV, sh_output, sh_input},
{OpCode::Id::END},
// False
{OpCode::Id::MOV, sh_output, sh_temp},
{OpCode::Id::END},
};

const bool ref_x = GENERATE(0, 1);
const bool cmp_x = GENERATE(0, 1);
const bool result_x = (cmp_x == ref_x);

const bool ref_y = GENERATE(0, 1);
const bool cmp_y = GENERATE(0, 1);
const bool result_y = (cmp_y == ref_y);

nihstro::Instruction IFC = {};
IFC.opcode = nihstro::OpCode::Id::IFC;
IFC.flow_control.num_instructions = 2;
IFC.flow_control.dest_offset = 3;
IFC.flow_control.refx = ref_x;
IFC.flow_control.refy = ref_y;

Pica::ShaderUnit shader_unit;
shader_unit.conditional_code[0] = cmp_x;
shader_unit.conditional_code[1] = cmp_y;

// JustX
{
auto shader_setup = CompileShaderSetup(assembly_template);
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::JustX;
shader_setup->program_code[0] = IFC.hex;
const float result = result_x ? 1.0f : 0.0f;

auto shader_test = TestType(std::move(shader_setup));
shader_test.Run(shader_unit, 1.0f);

REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
}

// JustY
{
auto shader_setup = CompileShaderSetup(assembly_template);
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::JustY;
shader_setup->program_code[0] = IFC.hex;
const float result = result_y ? 1.0f : 0.0f;

auto shader_test = TestType(std::move(shader_setup));
shader_test.Run(shader_unit, 1.0f);

REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
}

// OR
{
auto shader_setup = CompileShaderSetup(assembly_template);
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::Or;
shader_setup->program_code[0] = IFC.hex;
const float result = (result_x || result_y) ? 1.0f : 0.0f;

auto shader_test = TestType(std::move(shader_setup));
shader_test.Run(shader_unit, 1.0f);

REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
}

// AND
{
auto shader_setup = CompileShaderSetup(assembly_template);
IFC.flow_control.op = nihstro::Instruction::FlowControlType::Op::And;
shader_setup->program_code[0] = IFC.hex;
const float result = (result_x && result_y) ? 1.0f : 0.0f;

auto shader_test = TestType(std::move(shader_setup));
shader_test.Run(shader_unit, 1.0f);

REQUIRE(shader_unit.output[0].x.ToFloat32() == result);
}
}

SHADER_TEST_CASE("Source Swizzle", "[video_core][shader]") {
const auto sh_input = SourceRegister::MakeInput(0);
const auto sh_output = DestRegister::MakeOutput(0);
Expand Down
5 changes: 1 addition & 4 deletions src/video_core/pica/shader_unit.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -9,10 +9,7 @@

namespace Pica {

ShaderUnit::ShaderUnit(GeometryEmitter* emitter) : emitter_ptr{emitter} {
const Common::Vec4<f24> temp_vec{f24::Zero(), f24::Zero(), f24::Zero(), f24::One()};
temporary.fill(temp_vec);
}
ShaderUnit::ShaderUnit(GeometryEmitter* emitter) : emitter_ptr{emitter} {}

ShaderUnit::~ShaderUnit() = default;

Expand Down
10 changes: 5 additions & 5 deletions src/video_core/pica/shader_unit.h
Original file line number Diff line number Diff line change
Expand Up @@ -46,11 +46,11 @@ struct ShaderUnit {
}

public:
s32 address_registers[3];
bool conditional_code[2];
alignas(16) std::array<Common::Vec4<f24>, 16> input;
alignas(16) std::array<Common::Vec4<f24>, 16> temporary;
alignas(16) std::array<Common::Vec4<f24>, 16> output;
s32 address_registers[3] = {};
bool conditional_code[2] = {};
alignas(16) std::array<Common::Vec4<f24>, 16> input = {};
alignas(16) std::array<Common::Vec4<f24>, 16> temporary = {};
alignas(16) std::array<Common::Vec4<f24>, 16> output = {};
GeometryEmitter* emitter_ptr;

private:
Expand Down
3 changes: 0 additions & 3 deletions src/video_core/shader/shader_interpreter.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,9 +52,6 @@ static void RunInterpreter(const ShaderSetup& setup, ShaderUnit& state,
boost::circular_buffer<LoopStackElement> loop_stack(4);
u32 program_counter = entry_point;

state.conditional_code[0] = false;
state.conditional_code[1] = false;

const auto do_if = [&](Instruction instr, bool condition) {
if (condition) {
if_stack.push_back({
Expand Down
46 changes: 34 additions & 12 deletions src/video_core/shader/shader_jit_a64_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -386,28 +386,50 @@ void JitShader::Compile_SanitizedMul(QReg src1, QReg src2, QReg scratch0) {
}

void JitShader::Compile_EvaluateCondition(Instruction instr) {
const u8 refx = instr.flow_control.refx.Value();
const u8 refy = instr.flow_control.refy.Value();
const bool refx = instr.flow_control.refx.Value();
const bool refy = instr.flow_control.refy.Value();

switch (instr.flow_control.op) {
// Note: NXOR is used below to check for equality
case Instruction::FlowControlType::Or:
EOR(XSCRATCH0, COND0, refx ^ 1);
EOR(XSCRATCH1, COND1, refy ^ 1);
ORR(XSCRATCH0, XSCRATCH0, XSCRATCH1);
case Instruction::FlowControlType::Or: {
XReg OpX = XSCRATCH0;
if (!refx) {
EOR(OpX, COND0, u8(refx) ^ 1);
} else {
OpX = COND0;
}
XReg OpY = XSCRATCH1;
if (!refy) {
EOR(OpY, COND1, u8(refy) ^ 1);
} else {
OpY = COND1;
}
ORR(XSCRATCH0, OpX, OpY);
CMP(XSCRATCH0, 0);
break;
}
// Note: TST will AND two registers and set the EQ/NE flags on the result
case Instruction::FlowControlType::And:
EOR(XSCRATCH0, COND0, refx ^ 1);
EOR(XSCRATCH1, COND1, refy ^ 1);
TST(XSCRATCH0, XSCRATCH1);
case Instruction::FlowControlType::And: {
XReg OpX = XSCRATCH0;
if (!refx) {
EOR(OpX, COND0, u8(refx) ^ 1);
} else {
OpX = COND0;
}
XReg OpY = XSCRATCH1;
if (!refy) {
EOR(OpY, COND1, u8(refy) ^ 1);
} else {
OpY = COND1;
}
TST(OpX, OpY);
break;
}
case Instruction::FlowControlType::JustX:
CMP(COND0, refx);
CMP(COND0, u8(refx) ^ 1);
break;
case Instruction::FlowControlType::JustY:
CMP(COND1, refy);
CMP(COND1, u8(refy) ^ 1);
break;
default:
UNREACHABLE();
Expand Down
32 changes: 16 additions & 16 deletions src/video_core/shader/shader_jit_x64_compiler.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -401,29 +401,29 @@ void JitShader::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality
switch (instr.flow_control.op) {
case Instruction::FlowControlType::Or:
mov(eax, COND0);
mov(ebx, COND1);
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
or_(eax, ebx);
mov(al, COND0.cvt8());
mov(bl, COND1.cvt8());
xor_(al, (instr.flow_control.refx.Value() ^ 1));
xor_(bl, (instr.flow_control.refy.Value() ^ 1));
or_(al, bl);
break;

case Instruction::FlowControlType::And:
mov(eax, COND0);
mov(ebx, COND1);
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
xor_(ebx, (instr.flow_control.refy.Value() ^ 1));
and_(eax, ebx);
mov(al, COND0);
mov(bl, COND1);
xor_(al, (instr.flow_control.refx.Value() ^ 1));
xor_(bl, (instr.flow_control.refy.Value() ^ 1));
and_(al, bl);
break;

case Instruction::FlowControlType::JustX:
mov(eax, COND0);
xor_(eax, (instr.flow_control.refx.Value() ^ 1));
mov(al, COND0);
xor_(al, (instr.flow_control.refx.Value() ^ 1));
break;

case Instruction::FlowControlType::JustY:
mov(eax, COND1);
xor_(eax, (instr.flow_control.refy.Value() ^ 1));
mov(al, COND1);
xor_(al, (instr.flow_control.refy.Value() ^ 1));
break;
}
}
Expand Down Expand Up @@ -1002,8 +1002,8 @@ void JitShader::Compile(const std::array<u32, MAX_PROGRAM_CODE_LENGTH>* program_
mov(LOOPCOUNT_REG, dword[STATE + offsetof(ShaderUnit, address_registers[2])]);

// Load conditional code
mov(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]);
mov(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]);
movzx(COND0, byte[STATE + offsetof(ShaderUnit, conditional_code[0])]);
movzx(COND1, byte[STATE + offsetof(ShaderUnit, conditional_code[1])]);

// Used to set a register to one
static const __m128 one = {1.f, 1.f, 1.f, 1.f};
Expand Down

0 comments on commit 16f4318

Please sign in to comment.