[Mesa-dev] [PATCH 15/23] i965/fs: add SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA helper
Samuel Iglesias Gonsálvez
siglesias at igalia.com
Tue May 3 12:22:04 UTC 2016
From: Iago Toral Quiroga <itoral at igalia.com>
There are a few places where we need to shuffle the result of a 32-bit load
into valid 64-bit data, so extract this logic into a separate helper that we
can reuse.
Also, the shuffling needs to operate with WE_all set, which we were missing
before, because we are changing the layout of the data across the various
channels. Otherwise we will run into problems in non-uniform control-flow
scenarios.
---
src/mesa/drivers/dri/i965/brw_fs.cpp | 95 +++++++++++++++++++++-----------
src/mesa/drivers/dri/i965/brw_fs.h | 5 ++
src/mesa/drivers/dri/i965/brw_fs_nir.cpp | 46 ++--------------
3 files changed, 73 insertions(+), 73 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/brw_fs.cpp b/src/mesa/drivers/dri/i965/brw_fs.cpp
index dff13ea..709e4b8 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs.cpp
@@ -216,39 +216,8 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
vec4_result.type = dst.type;
- /* Our VARYING_PULL_CONSTANT_LOAD reads a vector of 32-bit elements. If we
- * are reading doubles this means that we get this:
- *
- * r0: x0 x0 x0 x0 x0 x0 x0 x0
- * r1: x1 x1 x1 x1 x1 x1 x1 x1
- * r2: y0 y0 y0 y0 y0 y0 y0 y0
- * r3: y1 y1 y1 y1 y1 y1 y1 y1
- *
- * Fix this up so we return valid double elements:
- *
- * r0: x0 x1 x0 x1 x0 x1 x0 x1
- * r1: x0 x1 x0 x1 x0 x1 x0 x1
- * r2: y0 y1 y0 y1 y0 y1 y0 y1
- * r3: y0 y1 y0 y1 y0 y1 y0 y1
- */
- if (type_sz(dst.type) == 8) {
- int multiplier = bld.dispatch_width() / 8;
- fs_reg fixed_res =
- fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
- /* We only have 2 doubles in a 32-bit vec4 */
- for (int i = 0; i < 2; i++) {
- fs_reg vec4_float =
- horiz_offset(retype(vec4_result, BRW_REGISTER_TYPE_F),
- multiplier * 16 * i);
-
- bld.MOV(stride(fixed_res, 2), vec4_float);
- bld.MOV(stride(horiz_offset(fixed_res, 1), 2),
- horiz_offset(vec4_float, 8 * multiplier));
-
- bld.MOV(horiz_offset(vec4_result, multiplier * 8 * i),
- retype(fixed_res, BRW_REGISTER_TYPE_DF));
- }
- }
+ if (type_sz(dst.type) == 8)
+ SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld, vec4_result, vec4_result, 2);
int type_slots = MAX2(type_sz(dst.type) / 4, 1);
bld.MOV(dst, offset(vec4_result, bld,
@@ -256,6 +225,66 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
}
/**
+ * This helper takes the result of a load operation that reads 32-bit elements
+ * in this format:
+ *
+ * x x x x x x x x
+ * y y y y y y y y
+ * z z z z z z z z
+ * w w w w w w w w
+ *
+ * and shuffles the data to get this:
+ *
+ * x y x y x y x y
+ * x y x y x y x y
+ * z w z w z w z w
+ * z w z w z w z w
+ *
+ * Which is exactly what we want if the load is reading 64-bit components
+ * like doubles, where x represents the low 32-bit of the x double component
+ * and y represents the high 32-bit of the x double component (likewise with
+ * z and w for double component y). The parameter @components represents
+ * the number of 64-bit components present in @src. This would typically be
+ * 2 at most, since we can only fit 2 double elements in the result of a
+ * vec4 load.
+ *
+ * Notice that @dst and @src can be the same register.
+ */
+void
+fs_visitor::SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const fs_builder &bld,
+ const fs_reg dst,
+ const fs_reg src,
+ uint32_t components)
+{
+ int multiplier = bld.dispatch_width() / 8;
+
+ /* A temporary that we will use to shuffle the 32-bit data of each
+ * component in the vector into valid 64-bit data
+ */
+ fs_reg tmp =
+ fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
+
+ /* We are going to manipulate the data in elements of 32-bit */
+ fs_reg src_data = retype(src, BRW_REGISTER_TYPE_F);
+
+ /* We are going to manipulate the dst in elements of 64-bit */
+ fs_reg dst_data = retype(dst, BRW_REGISTER_TYPE_DF);
+
+ /* Shuffle the data */
+ for (unsigned i = 0; i < components; i++) {
+ fs_reg component_i = horiz_offset(src_data, multiplier * 16 * i);
+
+ bld.MOV(stride(tmp, 2), component_i)->force_writemask_all = true;
+ bld.MOV(stride(horiz_offset(tmp, 1), 2),
+ horiz_offset(component_i, 8 * multiplier))
+ ->force_writemask_all = true;
+
+ bld.MOV(horiz_offset(dst_data, multiplier * 8 * i),
+ retype(tmp, BRW_REGISTER_TYPE_DF))->force_writemask_all = true;
+ }
+}
+
+/**
* A helper for MOV generation for fixing up broken hardware SEND dependency
* handling.
*/
diff --git a/src/mesa/drivers/dri/i965/brw_fs.h b/src/mesa/drivers/dri/i965/brw_fs.h
index 1f18112..c95b30a 100644
--- a/src/mesa/drivers/dri/i965/brw_fs.h
+++ b/src/mesa/drivers/dri/i965/brw_fs.h
@@ -106,6 +106,11 @@ public:
uint32_t const_offset);
void DEP_RESOLVE_MOV(const brw::fs_builder &bld, int grf);
+ void SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(const brw::fs_builder &bld,
+ const fs_reg dst,
+ const fs_reg src,
+ uint32_t components);
+
bool run_fs(bool do_rep_send);
bool run_vs(gl_clip_plane *clip_planes);
bool run_tes();
diff --git a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
index 5a12d63..9cd751b 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_nir.cpp
@@ -3082,44 +3082,19 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
for (int i = 0; i < instr->num_components; i++)
bld.MOV(offset(dest, bld, i), offset(read_result, bld, i));
} else {
- /* Reading a dvec, then the untyped read below gives us this:
+ /* Reading a dvec, so we need to:
*
- * x0 x0 x0 x0 x0 x0 x0 x0
- * x1 x1 x1 x1 x1 x1 x1 x1
- * y0 y0 y0 y0 y0 y0 y0 y0
- * y1 y1 y1 y1 y1 y1 y1 y1
- *
- * But that is not valid 64-bit data, instead we want:
- *
- * x0 x1 x0 x1 x0 x1 x0 x1
- * x0 x1 x0 x1 x0 x1 x0 x1
- * y0 y1 y0 y1 y0 y1 y0 y1
- * y0 y1 y0 y1 y0 y1 y0 y1
- *
- * Also, that would only load half of a dvec4. So, we have to:
- *
- * 1. Multiply num_components by 2, to account for the fact that we need
- * to read 64-bit components.
+ * 1. Multiply num_components by 2, to account for the fact that we
+ * need to read 64-bit components.
* 2. Shuffle the result of the load to form valid 64-bit elements
* 3. Emit a second load (for components z/w) if needed.
- *
- * FIXME: extract the shuffling logic and share it with
- * varying_pull_constant_load
*/
- int multiplier = bld.dispatch_width() / 8;
-
fs_reg read_offset = vgrf(glsl_type::uint_type);
bld.MOV(read_offset, offset_reg);
int num_components = instr->num_components;
int iters = num_components <= 2 ? 1 : 2;
- /* A temporary that we will use to shuffle the 32-bit data of each
- * component in the vector into valid 64-bit data
- */
- fs_reg tmp =
- fs_reg(VGRF, alloc.allocate(2 * multiplier), BRW_REGISTER_TYPE_F);
-
/* Load the dvec, the first iteration loads components x/y, the second
* iteration, if needed, loads components z/w
*/
@@ -3138,18 +3113,9 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
read_result.type = BRW_REGISTER_TYPE_F;
/* Shuffle the 32-bit load result into valid 64-bit data */
- int multiplier = bld.dispatch_width() / 8;
- for (int i = 0; i < iter_components; i++) {
- fs_reg component_i =
- horiz_offset(read_result, multiplier * 16 * i);
-
- bld.MOV(stride(tmp, 2), component_i);
- bld.MOV(stride(horiz_offset(tmp, 1), 2),
- horiz_offset(component_i, 8 * multiplier));
-
- bld.MOV(horiz_offset(dest, multiplier * 8 * (i + it * 2)),
- retype(tmp, BRW_REGISTER_TYPE_DF));
- }
+ SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA(bld,
+ offset(dest, bld, it * 2),
+ read_result, iter_components);
bld.ADD(read_offset, read_offset, brw_imm_ud(16));
}
--
2.5.0
More information about the mesa-dev
mailing list