Mesa (master): aco: move s_andn2_b64 instructions out of the p_discard_if
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Wed Oct 9 17:16:50 UTC 2019
Module: Mesa
Branch: master
Commit: 2ea9e59e8d976ec77800d2a20645087b96d1e241
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=2ea9e59e8d976ec77800d2a20645087b96d1e241
Author: Rhys Perry <pendingchaos02 at gmail.com>
Date: Tue Oct 8 13:40:17 2019 +0100
aco: move s_andn2_b64 instructions out of the p_discard_if
And use a new p_discard_early_exit instruction. This fixes some cases
where a definition that shares a register with one of its operands causes issues.
v2: rename instruction to p_exit_early_if
v2: modify the existing instruction instead of creating a new one
v3: merge the "i == num - 1" IFs
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
---
src/amd/compiler/aco_insert_exec_mask.cpp | 72 +++++++++++++-------------
src/amd/compiler/aco_instruction_selection.cpp | 1 +
src/amd/compiler/aco_lower_to_hw_instr.cpp | 35 ++++---------
src/amd/compiler/aco_opcodes.py | 1 +
src/amd/compiler/aco_scheduler.cpp | 6 +++
5 files changed, 54 insertions(+), 61 deletions(-)
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index 155c21a5aa4..3f4b48e661f 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -657,22 +657,23 @@ void process_instructions(exec_ctx& ctx, Block* block,
transition_to_WQM(ctx, bld, block->index);
ctx.info[block->index].exec.back().second &= ~mask_type_global;
}
- unsigned num = ctx.info[block->index].exec.size();
+ int num = ctx.info[block->index].exec.size();
assert(num);
Operand cond = instr->operands[0];
- instr.reset(create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1));
- for (unsigned i = 0; i < num; i++) {
- instr->operands[i] = Operand(ctx.info[block->index].exec[i].first);
- if (i == num - 1)
- instr->operands[i].setFixed(exec);
- Temp new_mask = bld.tmp(s2);
- instr->definitions[i] = Definition(new_mask);
- ctx.info[block->index].exec[i].first = new_mask;
+ for (int i = num - 1; i >= 0; i--) {
+ Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+ ctx.info[block->index].exec[i].first, cond);
+ if (i == num - 1) {
+ andn2->operands[0].setFixed(exec);
+ andn2->definitions[0].setFixed(exec);
+ }
+ if (i == 0) {
+ instr->opcode = aco_opcode::p_exit_early_if;
+ instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
+ }
+ ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
}
- assert((ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
- instr->definitions[num - 1].setFixed(exec);
- instr->operands[num] = cond;
- instr->definitions[num] = bld.def(s1, scc);
+ assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
} else if (needs == WQM && state != WQM) {
transition_to_WQM(ctx, bld, block->index);
@@ -738,24 +739,24 @@ void process_instructions(exec_ctx& ctx, Block* block,
num = 1;
}
- for (unsigned i = 0; i < ctx.info[block->index].exec.size() - 1; i++)
- num += ctx.info[block->index].exec[i].second & mask_type_exact ? 1 : 0;
- instr.reset(create_instruction<Instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1));
- int k = 0;
- for (unsigned i = 0; k < num; i++) {
+ num += ctx.info[block->index].exec.size() - 1;
+ for (int i = num - 1; i >= 0; i--) {
if (ctx.info[block->index].exec[i].second & mask_type_exact) {
- instr->operands[k] = Operand(ctx.info[block->index].exec[i].first);
- Temp new_mask = bld.tmp(s2);
- instr->definitions[k] = Definition(new_mask);
- if (i == ctx.info[block->index].exec.size() - 1)
- instr->definitions[k].setFixed(exec);
- k++;
- ctx.info[block->index].exec[i].first = new_mask;
+ Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+ ctx.info[block->index].exec[i].first, cond);
+ if (i == num - 1) {
+ andn2->operands[0].setFixed(exec);
+ andn2->definitions[0].setFixed(exec);
+ }
+ if (i == 0) {
+ instr->opcode = aco_opcode::p_exit_early_if;
+ instr->operands[0] = bld.scc(andn2->definitions[1].getTemp());
+ }
+ ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
+ } else {
+ assert(i != 0);
}
}
- assert(k == num);
- instr->definitions[num] = bld.def(s1, scc);
- instr->operands[num] = Operand(cond);
state = Exact;
} else if (instr->opcode == aco_opcode::p_fs_buffer_store_smem) {
@@ -878,18 +879,15 @@ void add_branch_code(exec_ctx& ctx, Block* block)
bld.exec(Definition(new_exec)), Operand(0u), bld.exec(old_exec));
ctx.info[idx].exec.back().first = new_exec;
- aco_ptr<Pseudo_instruction> discard{create_instruction<Pseudo_instruction>(aco_opcode::p_discard_if, Format::PSEUDO, num + 1, num + 1)};
- for (unsigned i = 0; i < num; i++) {
- discard->operands[i] = Operand(ctx.info[block->index].exec[i].first);
- Temp new_mask = bld.tmp(s2);
- discard->definitions[i] = Definition(new_mask);
- ctx.info[block->index].exec[i].first = new_mask;
+ for (int i = num - 1; i >= 0; i--) {
+ Instruction *andn2 = bld.sop2(aco_opcode::s_andn2_b64, bld.def(s2), bld.def(s1, scc),
+ ctx.info[block->index].exec[i].first, cond);
+ if (i == 0)
+ bld.pseudo(aco_opcode::p_exit_early_if, bld.scc(andn2->definitions[1].getTemp()));
+ ctx.info[block->index].exec[i].first = andn2->definitions[0].getTemp();
}
assert(!ctx.handle_wqm || (ctx.info[block->index].exec[0].second & mask_type_wqm) == 0);
- discard->operands[num] = Operand(cond);
- discard->definitions[num] = bld.def(s1, scc);
- bld.insert(std::move(discard));
if ((block->kind & (block_kind_break | block_kind_uniform)) == block_kind_break)
ctx.info[idx].exec.back().first = cond;
bld.insert(std::move(branch));
diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp
index bba091fd74b..d1849d7b92b 100644
--- a/src/amd/compiler/aco_instruction_selection.cpp
+++ b/src/amd/compiler/aco_instruction_selection.cpp
@@ -3266,6 +3266,7 @@ void visit_discard_if(isel_context *ctx, nir_intrinsic_instr *instr)
ctx->program->needs_exact = true;
+ // TODO: optimize uniform conditions
Builder bld(ctx->program, ctx->block);
Temp src = as_divergent_bool(ctx, get_ssa_temp(ctx, instr->src[0].ssa), false);
src = bld.sop2(aco_opcode::s_and_b64, bld.def(s2), bld.def(s1, scc), src, Operand(exec, s2));
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 8fd33e47d92..39585111954 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -606,15 +606,15 @@ void lower_to_hw_instr(Program* program)
handle_operands(copy_operations, &ctx, program->chip_class, pi);
break;
}
- case aco_opcode::p_discard_if:
+ case aco_opcode::p_exit_early_if:
{
- bool early_exit = false;
- if (block->instructions[j + 1]->opcode != aco_opcode::p_logical_end ||
- block->instructions[j + 2]->opcode != aco_opcode::s_endpgm) {
- early_exit = true;
+ /* don't bother with an early exit at the end of the program */
+ if (block->instructions[j + 1]->opcode == aco_opcode::p_logical_end &&
+ block->instructions[j + 2]->opcode == aco_opcode::s_endpgm) {
+ break;
}
- if (early_exit && !discard_block) {
+ if (!discard_block) {
discard_block = program->create_and_insert_block();
block = &program->blocks[i];
@@ -628,26 +628,13 @@ void lower_to_hw_instr(Program* program)
bld.reset(&ctx.instructions);
}
- // TODO: optimize uniform conditions
- Definition branch_cond = instr->definitions.back();
- Operand discard_cond = instr->operands.back();
- aco_ptr<Instruction> sop2;
- /* backwards, to finally branch on the global exec mask */
- for (int i = instr->operands.size() - 2; i >= 0; i--) {
- bld.sop2(aco_opcode::s_andn2_b64,
- instr->definitions[i], /* new mask */
- branch_cond, /* scc */
- instr->operands[i], /* old mask */
- discard_cond);
- }
-
- if (early_exit) {
- bld.sopp(aco_opcode::s_cbranch_scc0, bld.scc(branch_cond.getTemp()), discard_block->index);
+ //TODO: exec can be zero here with block_kind_discard
- discard_block->linear_preds.push_back(block->index);
- block->linear_succs.push_back(discard_block->index);
- }
+ assert(instr->operands[0].physReg() == scc);
+ bld.sopp(aco_opcode::s_cbranch_scc0, instr->operands[0], discard_block->index);
+ discard_block->linear_preds.push_back(block->index);
+ block->linear_succs.push_back(discard_block->index);
break;
}
case aco_opcode::p_spill:
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index a5b4eb9a54e..a358527e60b 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -236,6 +236,7 @@ opcode("p_discard_if")
opcode("p_load_helper")
opcode("p_demote_to_helper")
opcode("p_is_helper")
+opcode("p_exit_early_if")
opcode("p_fs_buffer_store_smem", format=Format.SMEM)
diff --git a/src/amd/compiler/aco_scheduler.cpp b/src/amd/compiler/aco_scheduler.cpp
index 0cd67a979e0..09076a9a71f 100644
--- a/src/amd/compiler/aco_scheduler.cpp
+++ b/src/amd/compiler/aco_scheduler.cpp
@@ -220,6 +220,8 @@ void schedule_SMEM(sched_ctx& ctx, Block* block,
break;
if (candidate->opcode == aco_opcode::p_logical_start)
break;
+ if (candidate->opcode == aco_opcode::p_exit_early_if)
+ break;
if (!can_move_instr(candidate, current, moving_interaction))
break;
register_pressure.update(register_demand[candidate_idx]);
@@ -445,6 +447,8 @@ void schedule_VMEM(sched_ctx& ctx, Block* block,
break;
if (candidate->opcode == aco_opcode::p_logical_start)
break;
+ if (candidate->opcode == aco_opcode::p_exit_early_if)
+ break;
if (!can_move_instr(candidate, current, moving_interaction))
break;
@@ -665,6 +669,8 @@ void schedule_position_export(sched_ctx& ctx, Block* block,
/* break when encountering logical_start or barriers */
if (candidate->opcode == aco_opcode::p_logical_start)
break;
+ if (candidate->opcode == aco_opcode::p_exit_early_if)
+ break;
if (candidate->isVMEM() || candidate->format == Format::SMEM)
break;
if (!can_move_instr(candidate, current, moving_interaction))
More information about the mesa-commit
mailing list