<div dir="ltr"> <div class="gmail_extra"> <div class="gmail_quote">On Mon, Feb 2, 2015 at 10:00 PM, Kenneth Graunke <<a href="mailto:kenneth@whitecape.org" target="_blank">kenneth@whitecape.org</a>> wrote: <blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"><div class="HOEnZb"><div class="h5">On Thursday, January 29, 2015 01:40:20 PM Jason Ekstrand wrote: > Originally, get_alu_src was supposed to handle resolving swizzles and > things like that. However, now that basically every instruction we have > only takes scalar sources, we don't really need it anymore. The only case > where it's still marginally useful is for the mov and vecN operations that > are left over from SSA form. We can handle those cases as a special case > easily enough. As a side-effect, we don't need the vec_to_movs pass > anymore. > --- > src/mesa/drivers/dri/i965/brw_fs.h | 1 - > src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 153 +++++++++++++++++++------------ > 2 files changed, 95 insertions(+), 59 deletions(-) > > diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h > index 25197cd..b95e2c0 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs.h > +++ b/src/mesa/drivers/dri/i965/brw_fs.h > @@ -589,7 +589,6 @@ public: > void nir_emit_texture(nir_tex_instr *instr); > void nir_emit_jump(nir_jump_instr *instr); > fs_reg get_nir_src(nir_src src); > - fs_reg get_nir_alu_src(nir_alu_instr *instr, unsigned src); > fs_reg get_nir_dest(nir_dest dest); > void emit_percomp(fs_inst *inst, unsigned wr_mask); > > diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > index 37ccd24..fbb1622 100644 > --- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > +++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp > @@ -140,8 +140,6 @@ fs_visitor::emit_nir_code() > > nir_convert_from_ssa(nir); > nir_validate_shader(nir); > - nir_lower_vec_to_movs(nir); > - nir_validate_shader(nir); > > /* emit the arrays used for inputs and outputs - load/store intrinsics will > * be converted to reads/writes of these arrays > @@ -419,6 +417,7 @@ fs_visitor::nir_emit_impl(nir_function_impl *impl) > void > fs_visitor::nir_emit_cf_list(exec_list *list) > { > + exec_list_validate(list); > foreach_list_typed(nir_cf_node, node, node, list) { > switch (node->type) { > case nir_cf_node_if: > @@ -541,34 +540,117 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr) > struct brw_wm_prog_key *fs_key = (struct brw_wm_prog_key *) this->key; > fs_inst *inst; > > - fs_reg op[3]; > fs_reg result = get_nir_dest(instr->dest.dest); > result.type = brw_type_for_nir_type(nir_op_infos[instr->op].output_type); > > - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) > - op[i] = get_nir_alu_src(instr, i); > + fs_reg op[4]; > + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { > + op[i] = get_nir_src(instr->src[i].src); > + op[i].type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[i]); > + op[i].abs = instr->src[i].abs; > + op[i].negate = instr->src[i].negate; > + } > + > + /* We get a bunch of mov's out of the from_ssa pass and they may still > + * be vectorized. We'll handle them as a special-case. We'll also > + * handle vecN here because it's basically the same thing. > + */ > + bool need_extra_copy = false; > + switch (instr->op) { > + case nir_op_vec4: > + if (!instr->src[3].src.is_ssa && > + instr->dest.dest.reg.reg == instr->src[3].src.reg.reg) > + need_extra_copy = true; > + /* fall through */ > + case nir_op_vec3: > + if (!instr->src[2].src.is_ssa && > + instr->dest.dest.reg.reg == instr->src[2].src.reg.reg) > + need_extra_copy = true; > + /* fall through */ > + case nir_op_vec2: > + if (!instr->src[1].src.is_ssa && > + instr->dest.dest.reg.reg == instr->src[1].src.reg.reg) > + need_extra_copy = true; > + /* fall through */ > + case nir_op_imov: > + case nir_op_fmov: { > + if (!instr->src[0].src.is_ssa && > + instr->dest.dest.reg.reg == instr->src[0].src.reg.reg) > + need_extra_copy = true; </div></div>How about: switch (instr->op) { case nir_op_vec4: case nir_op_vec3: case nir_op_vec2: case nir_op_imov: case nir_op_fmov: { fs_reg temp = result; bool need_extra_copy = false; for (int i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { if (!instr->src[i].src.is_ssa && instr->dest.dest.reg.reg == instr->src[i].src.reg.reg) { need_extra_copy = true; temp = retype(vgrf(4), result.type); break; } } ... } That duplicates less code. </blockquote><div> </div><div>Yeah, the original is horrible. That's much better. I'll do that. </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> <div><div class="h5"> > + > + fs_reg temp; > + if (need_extra_copy) { > + temp = retype(vgrf(4), result.type); > + } else { > + temp = result; > + } > + > + if (instr->op == nir_op_imov || instr->op == nir_op_fmov) { > + for (unsigned i = 0; i < 4; i++) { > + if (!(instr->dest.write_mask & (1 << i))) > + continue; > + > + inst = emit(MOV(offset(temp, i), > + offset(op[0], instr->src[0].swizzle[i]))); > + inst->saturate = instr->dest.saturate; > + } > + } else { > + for (unsigned i = 0; i < 4; i++) { > + if (!(instr->dest.write_mask & (1 << i))) > + continue; > + > + inst = emit(MOV(offset(temp, i), > + offset(op[i], instr->src[i].swizzle[0]))); > + inst->saturate = instr->dest.saturate; > + } > + } > + > + /* In this case the source and destination registers were the same, > + * so we need to insert an extra set of moves in order to deal with > + * any swizzling. > + */ > + if (need_extra_copy) { > + for (unsigned i = 0; i < 4; i++) { > + if (!(instr->dest.write_mask & (1 << i))) > + continue; > + > + emit(MOV(offset(result, i), offset(temp, i))); > + } > + } </div></div>I might just combine all four loops, and move the if-conditions inside. It'd probably be less code. Your call. </blockquote><div> </div><div>We can't. We have to fill temp and then dump to result. Otherwise, the swizzles will get messed up. We can probably combine the first two though. </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> > + return; > + } > + default: > + break; > + } > > + /* Bonus blank line here - should be /* At this point </blockquote><div> </div><div>Sure. </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> Otherwise, this looks good to me. </blockquote><div> </div><div>Do you want a v2? </div><div> </div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex"> <div class="HOEnZb"><div class="h5"> > + * At this point, we have dealt with any instruction that operates on > + * more than a single channel. Therefore, we can just adjust the source > + * and destination registers for that channel and emit the instruction. > + */ > + unsigned channel = 0; > if (nir_op_infos[instr->op].output_size == 0) { > - /* We've already scalarized, so we know that we only have one > - * channel. The only question is which channel. > + /* Since NIR is doing the scalarizing for us, we should only ever see > + * vectorized operations with a single channel. > */ > assert(_mesa_bitcount(instr->dest.write_mask) == 1); > - unsigned off = ffs(instr->dest.write_mask) - 1; > - result = offset(result, off); > + channel = ffs(instr->dest.write_mask) - 1; > + > + result = offset(result, channel); > + } > > - for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) > - op[i] = offset(op[i], off); > + for (unsigned i = 0; i < nir_op_infos[instr->op].num_inputs; i++) { > + assert(nir_op_infos[instr->op].input_sizes[i] < 2); > + op[i] = offset(op[i], instr->src[i].swizzle[channel]); > } > > switch (instr->op) { > - case nir_op_fmov: > case nir_op_i2f: > case nir_op_u2f: > inst = emit(MOV(result, op[0])); > inst->saturate = instr->dest.saturate; > break; > > - case nir_op_imov: > case nir_op_f2i: > case nir_op_f2u: > emit(MOV(result, op[0])); > @@ -823,11 +905,6 @@ fs_visitor::nir_emit_alu(nir_alu_instr *instr) > case nir_op_fnoise4_4: > unreachable("not reached: should be handled by lower_noise"); > > - case nir_op_vec2: > - case nir_op_vec3: > - case nir_op_vec4: > - unreachable("not reached: should be handled by lower_quadop_vector"); > - > case nir_op_ldexp: > unreachable("not reached: should be handled by ldexp_to_arith()"); > > @@ -1048,46 +1125,6 @@ fs_visitor::get_nir_src(nir_src src) > } > > fs_reg > -fs_visitor::get_nir_alu_src(nir_alu_instr *instr, unsigned src) > -{ > - fs_reg reg = get_nir_src(instr->src[src].src); > - > - reg.type = brw_type_for_nir_type(nir_op_infos[instr->op].input_types[src]); > - reg.abs = instr->src[src].abs; > - reg.negate = instr->src[src].negate; > - > - bool needs_swizzle = false; > - unsigned num_components = 0; > - for (unsigned i = 0; i < 4; i++) { > - if (!nir_alu_instr_channel_used(instr, src, i)) > - continue; > - > - if (instr->src[src].swizzle[i] != i) > - needs_swizzle = true; > - > - num_components = i + 1; > - } > - > - if (needs_swizzle) { > - /* resolve the swizzle through MOV's */ > - fs_reg new_reg = vgrf(num_components); > - new_reg.type = reg.type; > - > - for (unsigned i = 0; i < 4; i++) { > - if (!nir_alu_instr_channel_used(instr, src, i)) > - continue; > - > - emit(MOV(offset(new_reg, i), > - offset(reg, instr->src[src].swizzle[i]))); > - } > - > - return new_reg; > - } > - > - return reg; > -} > - > -fs_reg > fs_visitor::get_nir_dest(nir_dest dest) > { > fs_reg reg; > </div></div></blockquote></div> </div></div>