[Mesa-dev] [PATCH 11/95] i965/vec4: add double/float conversion pseudo-opcodes

Tue Jul 19 10:40:08 UTC 2016

These need to be emitted as align1 MOV's, since they need to have a
stride of 2 on the float register (whether src or dest) so that data
from another thread doesn't cross the middle of a SIMD8 register.

v2 (Iago):
- The float-to-double needs to align 32-bit data to 64-bit before doing the
conversion. This was doable in align16 when we tried to use an execsize
of 4, but with an execsize of 8 we would need another align1 opcode to do
that (since we need data to cross the middle of a SIMD register). Just
making the opcode handle this internally seems more practical that adding
another opcode just for this purpose and having the caller know about this
before converting.
- The double-to-float conversion produces 32-bit elements aligned to 64-bit
so we make the opcode re-pack the result to 32-bit and fit in one register,
as expected by SIMD4x2 operation. This still requires that callers reserve
two registers for the float data destination because we need to produce
64-bit aligned data first, and repack it later on the same destination
register, but it saves the need for a re-pack opcode only to achieve this
making the operation complete in a single opcode. Hopefully that is worth
the weirdness of the double register allocation...

Signed-off-by: Connor Abbott <connor.w.abbott at intel.com>
Signed-off-by: Iago Toral Quiroga <itoral at igalia.com>
---
 src/mesa/drivers/dri/i965/brw_defines.h          |  2 ++
 src/mesa/drivers/dri/i965/brw_shader.cpp         |  4 +++
 src/mesa/drivers/dri/i965/brw_vec4.cpp           |  8 +++++
 src/mesa/drivers/dri/i965/brw_vec4_generator.cpp | 44 ++++++++++++++++++++++++
 4 files changed, 58 insertions(+)

diff --git a/src/mesa/drivers/dri/i965/brw_defines.h b/src/mesa/drivers/dri/i965/brw_defines.h
index b5a259e..10db5f0 100644
--- a/src/mesa/drivers/dri/i965/brw_defines.h
+++ b/src/mesa/drivers/dri/i965/brw_defines.h
@@ -1094,6 +1094,8 @@ enum opcode {
    VEC4_OPCODE_MOV_BYTES,
    VEC4_OPCODE_PACK_BYTES,
    VEC4_OPCODE_UNPACK_UNIFORM,
+   VEC4_OPCODE_DOUBLE_TO_FLOAT,
+   VEC4_OPCODE_FLOAT_TO_DOUBLE,
 
    FS_OPCODE_DDX_COARSE,
    FS_OPCODE_DDX_FINE,
diff --git a/src/mesa/drivers/dri/i965/brw_shader.cpp b/src/mesa/drivers/dri/i965/brw_shader.cpp
index f3b5487..5fb1d68 100644
--- a/src/mesa/drivers/dri/i965/brw_shader.cpp
+++ b/src/mesa/drivers/dri/i965/brw_shader.cpp
@@ -313,6 +313,10 @@ brw_instruction_name(const struct brw_device_info *devinfo, enum opcode op)
       return "pack_bytes";
    case VEC4_OPCODE_UNPACK_UNIFORM:
       return "unpack_uniform";
+   case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+      return "double_to_float";
+   case VEC4_OPCODE_FLOAT_TO_DOUBLE:
+      return "float_to_double";
 
    case FS_OPCODE_DDX_COARSE:
       return "ddx_coarse";
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index bf6701e..d4b1e9c 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -243,6 +243,8 @@ vec4_instruction::can_do_writemask(const struct brw_device_info *devinfo)
 {
    switch (opcode) {
    case SHADER_OPCODE_GEN4_SCRATCH_READ:
+   case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+   case VEC4_OPCODE_FLOAT_TO_DOUBLE:
    case VS_OPCODE_PULL_CONSTANT_LOAD:
    case VS_OPCODE_PULL_CONSTANT_LOAD_GEN7:
    case VS_OPCODE_SET_SIMD4X2_HEADER_GEN9:
@@ -493,6 +495,12 @@ vec4_visitor::opt_reduce_swizzle()
       case BRW_OPCODE_DP2:
          swizzle = brw_swizzle_for_size(2);
          break;
+
+      case VEC4_OPCODE_FLOAT_TO_DOUBLE:
+      case VEC4_OPCODE_DOUBLE_TO_FLOAT:
+         swizzle = brw_swizzle_for_size(4);
+         break;
+
       default:
          swizzle = brw_swizzle_for_mask(inst->dst.writemask);
          break;
diff --git a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
index 3878c4a..d47b489 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4_generator.cpp
@@ -1910,6 +1910,50 @@ generate_code(struct brw_codegen *p,
          break;
       }
 
+      case VEC4_OPCODE_DOUBLE_TO_FLOAT: {
+         assert(src[0].type == BRW_REGISTER_TYPE_DF);
+         assert(dst.type == BRW_REGISTER_TYPE_F);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         dst.hstride = BRW_HORIZONTAL_STRIDE_2;
+         dst.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, dst, src[0]);
+
+         struct brw_reg dst_as_src = dst;
+         dst.hstride = BRW_HORIZONTAL_STRIDE_1;
+         dst.width = BRW_WIDTH_8;
+         brw_MOV(p, dst, dst_as_src);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
+      case VEC4_OPCODE_FLOAT_TO_DOUBLE: {
+         assert(src[0].type == BRW_REGISTER_TYPE_F);
+         assert(dst.type == BRW_REGISTER_TYPE_DF);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_1);
+
+         struct brw_reg tmp = retype(dst, src[0].type);
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         src[0].vstride = BRW_VERTICAL_STRIDE_4;
+         src[0].hstride = BRW_HORIZONTAL_STRIDE_1;
+         src[0].width = BRW_WIDTH_4;
+         brw_MOV(p, tmp, src[0]);
+
+         tmp.vstride = BRW_VERTICAL_STRIDE_8;
+         tmp.hstride = BRW_HORIZONTAL_STRIDE_2;
+         tmp.width = BRW_WIDTH_4;
+         brw_MOV(p, dst, tmp);
+
+         brw_set_default_access_mode(p, BRW_ALIGN_16);
+         break;
+      }
+
       case VEC4_OPCODE_PACK_BYTES: {
          /* Is effectively:
           *
-- 
2.7.4