[Mesa-dev] [PATCH v3 30/43] i965/fs: Support 16-bit types at load_input and store_output
Jose Maria Casanova Crespo
jmcasanova at igalia.com
Thu Oct 12 18:38:19 UTC 2017
Enables the support of 16-bit types on load_input and
store_outputs intrinsics intra-stages.
The approach was based on re-using the 32-bit URB read
and writes between stages, shuffling pairs of 16-bit values into
32-bit values at load_store intrinsic and un-shuffling the values
at load_inputs.
shuffle_32bit_load_result_to_16bit_data and
shuffle_32bit_load_result_to_16bit_data are implemented in a similar
way than the analogous functions for handling 64-bit types.
---
src/intel/compiler/brw_fs.h | 11 ++++
src/intel/compiler/brw_fs_nir.cpp | 119 +++++++++++++++++++++++++++++++++++++-
2 files changed, 129 insertions(+), 1 deletion(-)
diff --git a/src/intel/compiler/brw_fs.h b/src/intel/compiler/brw_fs.h
index b9476e69ed..90ada3ef4b 100644
--- a/src/intel/compiler/brw_fs.h
+++ b/src/intel/compiler/brw_fs.h
@@ -498,6 +498,17 @@ void shuffle_64bit_data_for_32bit_write(const brw::fs_builder &bld,
const fs_reg &dst,
const fs_reg &src,
uint32_t components);
+
+void shuffle_32bit_load_result_to_16bit_data(const brw::fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t components);
+
+void shuffle_16bit_data_for_32bit_write(const brw::fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t components);
+
fs_reg setup_imm_df(const brw::fs_builder &bld,
double v);
diff --git a/src/intel/compiler/brw_fs_nir.cpp b/src/intel/compiler/brw_fs_nir.cpp
index 83ff0607a7..9c694a1c53 100644
--- a/src/intel/compiler/brw_fs_nir.cpp
+++ b/src/intel/compiler/brw_fs_nir.cpp
@@ -2124,12 +2124,17 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
first_component = first_component / 2;
}
+ if (type_sz(dst.type) == 2) {
+ num_components = DIV_ROUND_UP(num_components, 2);
+ tmp_dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
+ }
+
for (unsigned iter = 0; iter < num_iterations; iter++) {
if (offset_const) {
/* Constant indexing - use global offset. */
if (first_component != 0) {
unsigned read_components = num_components + first_component;
- fs_reg tmp = bld.vgrf(dst.type, read_components);
+ fs_reg tmp = bld.vgrf(tmp_dst.type, read_components);
inst = bld.emit(SHADER_OPCODE_URB_READ_SIMD8, tmp, icp_handle);
inst->size_written = read_components *
tmp.component_size(inst->exec_size);
@@ -2179,6 +2184,11 @@ fs_visitor::emit_gs_input_load(const fs_reg &dst,
bld.MOV(offset(dst, bld, iter * 2 + c), offset(tmp_dst, bld, c));
}
+ if (type_sz(dst.type) == 2) {
+ shuffle_32bit_load_result_to_16bit_data(
+ bld, dst, retype(tmp_dst, BRW_REGISTER_TYPE_F), orig_num_components);
+ }
+
if (num_iterations > 1) {
num_components = orig_num_components - 2;
if(offset_const) {
@@ -2484,6 +2494,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
dst = tmp;
}
+ if (type_sz(dst.type) == 2) {
+ num_components = DIV_ROUND_UP(num_components, 2);
+ dst = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
+ }
+
for (unsigned iter = 0; iter < num_iterations; iter++) {
if (indirect_offset.file == BAD_FILE) {
/* Constant indexing - use global offset. */
@@ -2539,6 +2554,11 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
}
}
+ if (type_sz(orig_dst.type) == 2) {
+ shuffle_32bit_load_result_to_16bit_data(
+ bld, orig_dst, dst, instr->num_components);
+ }
+
/* Copy the temporary to the destination to deal with writemasking.
*
* Also attempt to deal with gl_PointSize being in the .w component.
@@ -2629,6 +2649,8 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
fs_reg value = get_nir_src(instr->src[0]);
bool is_64bit = (instr->src[0].is_ssa ?
instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 64;
+ bool is_16bit = (instr->src[0].is_ssa ?
+ instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size) == 16;
fs_reg indirect_offset = get_indirect_offset(instr);
unsigned imm_offset = instr->const_index[0];
unsigned swiz = BRW_SWIZZLE_XYZW;
@@ -2659,6 +2681,10 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
num_iterations = 2;
iter_components = 2;
}
+ } else {
+ if (is_16bit) {
+ iter_components = DIV_ROUND_UP(num_components, 2);
+ }
}
/* 64-bit data needs to me shuffled before we can write it to the URB.
@@ -2711,6 +2737,12 @@ fs_visitor::nir_emit_tcs_intrinsic(const fs_builder &bld,
continue;
if (!is_64bit) {
+ if (is_16bit) {
+ shuffle_16bit_data_for_32bit_write(bld,
+ retype(offset(value,bld,BRW_GET_SWZ(swiz, i)), BRW_REGISTER_TYPE_F),
+ retype(offset(value,bld,BRW_GET_SWZ(swiz, i)), BRW_REGISTER_TYPE_HF),
+ 2);
+ }
srcs[header_regs + i + first_component] =
offset(value, bld, BRW_GET_SWZ(swiz, i));
} else {
@@ -2862,6 +2894,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
dest = tmp;
}
+ if (type_sz(dest.type) == 2) {
+ num_components = DIV_ROUND_UP(num_components, 2);
+ dest = bld.vgrf(BRW_REGISTER_TYPE_F, num_components);
+ }
+
for (unsigned iter = 0; iter < num_iterations; iter++) {
const fs_reg srcs[] = {
retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD),
@@ -2904,6 +2941,11 @@ fs_visitor::nir_emit_tes_intrinsic(const fs_builder &bld,
}
}
+ if (type_sz(dest.type) == 2) {
+ shuffle_32bit_load_result_to_16bit_data(
+ bld, orig_dest, retype(dest, BRW_REGISTER_TYPE_F), num_components);
+ }
+
/* If we are loading double data and we need a second read message
* adjust the offset
*/
@@ -3257,6 +3299,13 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
num_components *= 2;
}
+ fs_reg orig_dest = dest;
+ if (nir_dest_bit_size(instr->dest) == 16) {
+ type = BRW_REGISTER_TYPE_F;
+ num_components = DIV_ROUND_UP(num_components, 2);
+ dest = bld.vgrf(type, num_components);
+ }
+
for (unsigned int i = 0; i < num_components; i++) {
struct brw_reg interp = interp_reg(base, component + i);
interp = suboffset(interp, 3);
@@ -3270,6 +3319,12 @@ fs_visitor::nir_emit_fs_intrinsic(const fs_builder &bld,
retype(dest, type),
instr->num_components);
}
+ if (nir_dest_bit_size(instr->dest) == 16) {
+ shuffle_32bit_load_result_to_16bit_data(bld,
+ orig_dest,
+ retype(dest, type),
+ instr->num_components);
+ }
break;
}
@@ -4182,6 +4237,7 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
unsigned first_component = nir_intrinsic_component(instr);
unsigned bit_size = instr->src[0].is_ssa ?
instr->src[0].ssa->bit_size : instr->src[0].reg.reg->bit_size;
+
if (bit_size == 64) {
fs_reg tmp =
fs_reg(VGRF, alloc.allocate(2 * num_components),
@@ -4192,6 +4248,16 @@ fs_visitor::nir_emit_intrinsic(const fs_builder &bld, nir_intrinsic_instr *instr
num_components *= 2;
}
+ if (bit_size == 16) {
+ fs_reg tmp =
+ fs_reg(VGRF, alloc.allocate(DIV_ROUND_UP(num_components, 2)),
+ BRW_REGISTER_TYPE_F);
+ shuffle_16bit_data_for_32bit_write(
+ bld, tmp, retype(src, BRW_REGISTER_TYPE_HF), num_components);
+ src = retype(tmp, src.type);
+ num_components = DIV_ROUND_UP(num_components, 2);
+ }
+
for (unsigned j = 0; j < num_components; j++) {
bld.MOV(offset(new_dest, bld, j + first_component),
offset(src, bld, j));
@@ -4815,6 +4881,33 @@ shuffle_32bit_load_result_to_64bit_data(const fs_builder &bld,
}
}
+void
+shuffle_32bit_load_result_to_16bit_data(const fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t components)
+{
+ assert(type_sz(src.type) == 4);
+ assert(type_sz(dst.type) == 2);
+
+ fs_reg tmp = retype(bld.vgrf(src.type), dst.type);
+
+ for (unsigned i = 0; i < components; i++) {
+ const fs_reg component_i = subscript(offset(src, bld, i / 2), dst.type, i % 2);
+
+ bld.MOV(offset(tmp, bld, i % 2), component_i);
+
+ if (i % 2) {
+ bld.MOV(offset(dst, bld, i -1), offset(tmp, bld, 0));
+ bld.MOV(offset(dst, bld, i), offset(tmp, bld, 1));
+ }
+ }
+ if (components % 2) {
+ bld.MOV(offset(dst, bld, components - 1), tmp);
+ }
+}
+
+
/**
* This helper does the inverse operation of
* SHUFFLE_32BIT_LOAD_RESULT_TO_64BIT_DATA.
@@ -4849,6 +4942,30 @@ shuffle_64bit_data_for_32bit_write(const fs_builder &bld,
}
}
+void
+shuffle_16bit_data_for_32bit_write(const fs_builder &bld,
+ const fs_reg &dst,
+ const fs_reg &src,
+ uint32_t components)
+{
+ assert(type_sz(src.type) == 2);
+ assert(type_sz(dst.type) == 4);
+
+ fs_reg tmp = bld.vgrf(dst.type);
+
+ for (unsigned i = 0; i < components; i++) {
+ const fs_reg component_i = offset(src, bld, i);
+ bld.MOV(subscript(tmp, src.type, i % 2), component_i);
+ if (i % 2) {
+ bld.MOV(offset(dst, bld, i / 2), tmp);
+ }
+ if (components % 2) {
+ bld.MOV(offset(dst, bld, components / 2), tmp);
+ }
+ }
+}
+
+
fs_reg
setup_imm_df(const fs_builder &bld, double v)
{
--
2.13.6
More information about the mesa-dev
mailing list