<div dir="ltr">On 2 December 2013 11:31, Francisco Jerez <span dir="ltr"><<a href="mailto:currojerez@riseup.net" target="_blank">currojerez@riseup.net</a>></span> wrote:<br><div class="gmail_extra"><div class="gmail_quote">
<blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">---<br>
src/mesa/drivers/dri/i965/brw_fs.cpp | 29 +++++++++++++++++++---<br>
src/mesa/drivers/dri/i965/brw_fs.h | 3 +++<br>
.../drivers/dri/i965/brw_fs_copy_propagation.cpp | 7 +++++-<br>
src/mesa/drivers/dri/i965/brw_fs_generator.cpp | 8 +++---<br>
.../drivers/dri/i965/brw_fs_live_variables.cpp | 2 +-<br>
src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp | 4 +--<br>
6 files changed, 43 insertions(+), 10 deletions(-)<br>
<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
index 0caae2d..e4cee33 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp<br>
@@ -380,6 +380,7 @@ fs_reg::init()<br>
{<br>
memset(this, 0, sizeof(*this));<br>
this->smear = -1;<br>
+ stride = 1;<br>
}<br>
<br>
/** Generic unset register constructor. */<br>
@@ -445,6 +446,7 @@ fs_reg::equals(const fs_reg &r) const<br>
memcmp(&fixed_hw_reg, &r.fixed_hw_reg,<br>
sizeof(fixed_hw_reg)) == 0 &&<br>
smear == r.smear &&<br>
+ stride == r.stride &&<br>
imm.u == r.imm.u);<br>
}<br>
<br>
@@ -456,6 +458,22 @@ fs_reg::retype(uint32_t type)<br>
return result;<br>
}<br>
<br>
+fs_reg &<br>
+fs_reg::apply_stride(unsigned stride)<br>
+{<br>
+ assert((this->stride * stride) <= 4 &&<br>
+ is_power_of_two(stride) &&<br>
+ file != HW_REG && file != IMM); <br></blockquote><div><br></div><div>This function relies on the (IMO surprising) fact that is_power_of_two(0) == true. Can we add a comment to clarify that we're taking advantage of this to allow a stride of 0? Otherwise a reader might reasonably assume that strides of 0 were not allowed.<br>
</div><div> </div><blockquote class="gmail_quote" style="margin:0px 0px 0px 0.8ex;border-left:1px solid rgb(204,204,204);padding-left:1ex">
+ this->stride *= stride;<br>
+ return *this;<br>
+}<br>
+<br>
+bool<br>
+fs_reg::is_contiguous() const<br>
+{<br>
+ return stride == 1;<br>
+}<br>
+<br>
bool<br>
fs_reg::is_valid_3src() const<br>
{<br>
@@ -686,7 +704,7 @@ fs_inst::is_partial_write()<br>
{<br>
return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||<br>
this->force_uncompressed ||<br>
- this->force_sechalf);<br>
+ this->force_sechalf || !this->dst.is_contiguous());<br>
}<br>
<br>
int<br>
@@ -2246,6 +2264,7 @@ fs_visitor::register_coalesce_2()<br>
inst->src[0].negate ||<br>
inst->src[0].abs ||<br>
inst->src[0].smear != -1 ||<br>
+ !inst->src[0].is_contiguous() ||<br>
inst->dst.file != GRF ||<br>
inst->dst.type != inst->src[0].type ||<br>
virtual_grf_sizes[inst->src[0].reg] != 1) {<br>
@@ -2338,6 +2357,7 @@ fs_visitor::register_coalesce()<br>
bool has_source_modifiers = (inst->src[0].abs ||<br>
inst->src[0].negate ||<br>
inst->src[0].smear != -1 ||<br>
+ !inst->src[0].is_contiguous() ||<br>
inst->src[0].file == UNIFORM);<br>
<br>
/* Found a move of a GRF to a GRF. Let's see if we can coalesce<br>
@@ -2422,7 +2442,9 @@ fs_visitor::register_coalesce()<br>
}<br>
new_src.negate ^= scan_inst->src[i].negate;<br>
new_src.sechalf = scan_inst->src[i].sechalf;<br>
- new_src.subreg_offset += scan_inst->src[i].subreg_offset;<br>
+ new_src.subreg_offset +=<br>
+ scan_inst->src[i].subreg_offset * new_src.stride;<br>
+ new_src.stride *= scan_inst->src[i].stride;<br>
scan_inst->src[i] = new_src;<br>
}<br>
}<br>
@@ -2458,7 +2480,8 @@ fs_visitor::compute_to_mrf()<br>
inst->dst.file != MRF || inst->src[0].file != GRF ||<br>
inst->dst.type != inst->src[0].type ||<br>
inst->src[0].abs || inst->src[0].negate ||<br>
- inst->src[0].smear != -1 || inst->src[0].subreg_offset)<br>
+ inst->src[0].smear != -1 || !inst->src[0].is_contiguous() ||<br>
+ inst->src[0].subreg_offset)<br>
continue;<br>
<br>
/* Work out which hardware MRF registers are written by this<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h<br>
index 93a393d..b0ce812 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs.h<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs.h<br>
@@ -77,13 +77,16 @@ public:<br>
<br>
bool equals(const fs_reg &r) const;<br>
bool is_valid_3src() const;<br>
+ bool is_contiguous() const;<br>
fs_reg retype(uint32_t type);<br>
+ fs_reg &apply_stride(unsigned stride);<br>
<br>
bool negate;<br>
bool abs;<br>
bool sechalf;<br>
int smear; /* -1, or a channel of the reg to smear to all channels. */<br>
int subreg_offset; /**< Offset in bytes from the start of the register. */<br>
+ int stride; /**< Register region horizontal stride */<br>
<br>
fs_reg *reladdr;<br>
};<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp<br>
index f3f44c6..2f2d6b6 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_copy_propagation.cpp<br>
@@ -298,7 +298,11 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)<br>
bool has_source_modifiers = entry->src.abs || entry->src.negate;<br>
<br>
if ((has_source_modifiers || entry->src.file == UNIFORM ||<br>
- entry->src.smear != -1) && !can_do_source_mods(inst))<br>
+ entry->src.smear != -1 || !entry->src.is_contiguous()) &&<br>
+ !can_do_source_mods(inst))<br>
+ return false;<br>
+<br>
+ if (entry->src.stride * inst->src[arg].stride > 4)<br>
return false;<br>
<br>
if (has_source_modifiers && entry->dst.type != inst->src[arg].type)<br>
@@ -310,6 +314,7 @@ fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)<br>
if (entry->src.smear != -1)<br>
inst->src[arg].smear = entry->src.smear;<br>
inst->src[arg].subreg_offset = entry->src.subreg_offset;<br>
+ inst->src[arg].stride *= entry->src.stride;<br>
<br>
if (!inst->src[arg].abs) {<br>
inst->src[arg].abs = entry->src.abs;<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
index a0b6135..66321bd 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_generator.cpp<br>
@@ -988,11 +988,13 @@ brw_reg_from_fs_reg(fs_reg *reg)<br>
switch (reg->file) {<br>
case GRF:<br>
case MRF:<br>
- if (reg->smear == -1) {<br>
- brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);<br>
+ if (reg->stride == 0 || reg->smear >= 0) {<br>
+ brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);<br>
} else {<br>
- brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);<br>
+ brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);<br>
+ brw_reg = stride(brw_reg, 8 * reg->stride, 8, reg->stride);<br>
}<br>
+<br>
brw_reg = retype(brw_reg, reg->type);<br>
if (reg->sechalf)<br>
brw_reg = sechalf(brw_reg);<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp<br>
index 21b2618..1a2d398 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_live_variables.cpp<br>
@@ -84,7 +84,7 @@ fs_live_variables::setup_one_read(bblock_t *block, fs_inst *inst,<br>
* would get stomped by the first decode as well.<br>
*/<br>
int end_ip = ip;<br>
- if (v->dispatch_width == 16 && (reg.smear != -1 ||<br>
+ if (v->dispatch_width == 16 && (reg.smear != -1 || reg.stride == 0 ||<br>
(v->pixel_x.reg == reg.reg ||<br>
v->pixel_y.reg == reg.reg))) {<br>
end_ip++;<br>
diff --git a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp<br>
index 2d35909..3290004 100644<br>
--- a/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp<br>
+++ b/src/mesa/drivers/dri/i965/brw_fs_reg_allocate.cpp<br>
@@ -591,7 +591,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)<br>
* loading pull constants, so spilling them is unlikely to reduce<br>
* register pressure anyhow.<br>
*/<br>
- if (inst->src[i].smear >= 0) {<br>
+ if (inst->src[i].smear >= 0 || !inst->src[i].is_contiguous()) {<br>
no_spill[inst->src[i].reg] = true;<br>
}<br>
}<br>
@@ -600,7 +600,7 @@ fs_visitor::choose_spill_reg(struct ra_graph *g)<br>
if (inst->dst.file == GRF) {<br>
spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;<br>
<br>
- if (inst->dst.smear >= 0) {<br>
+ if (inst->dst.smear >= 0 || !inst->dst.is_contiguous()) {<br>
no_spill[inst->dst.reg] = true;<br>
}<br>
}<br></blockquote><div><br></div><div>In v2 of this patch, you add the following code to fs_visitor::try_copy_propagate():<br><br>+ /* Bail if the result of composing both strides cannot be expressed<br>+ * as another stride.<br>
+ */<br>+ if (entry->src.stride != 1 &&<br>+ (inst->src[arg].stride *<br>+ type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)<br> return false;<br><br></div><div>I don't understand. It seems like this is trying to make sure that (src_stride * src_type_sz) / entry_type_sz is an integer so we can later use the factor (src_type_sz / entry_type_sz) as a multiplicative factor to correct the stride without creating a fractional result. But I don't see us applying this correction factor anywhere. Is there some code missing?<br>
</div></div></div></div>