[Mesa-dev] [PATCH v4 23/44] i965/fs: Enables 16-bit load_ubo with sampler
Jose Maria Casanova Crespo
jmcasanova at igalia.com
Thu Nov 30 02:50:54 UTC 2017
load_ubo is using 32-bit loads as uniforms surfaces have a 32-bit
surface format defined. So when reading 16-bit components with the
sampler we need to unshuffle two 16-bit components from each 32-bit
component.
Using the sampler avoids the use of the byte_scattered_read message
that needs one message for each component and is supposed to be
slower.
In the case of SKL+ we take advantage of a hardware feature that
automatically defines a channel mask based on the rlen value, so on
SKL+ we only use half of the registers without using a header in the
payload.
---
src/intel/compiler/brw_fs.cpp | 31 +++++++++++++++++++++++++++----
src/intel/compiler/brw_fs_generator.cpp | 10 ++++++++--
2 files changed, 35 insertions(+), 6 deletions(-)
diff --git a/src/intel/compiler/brw_fs.cpp b/src/intel/compiler/brw_fs.cpp
index 1ca4d416b2..9c543496ba 100644
--- a/src/intel/compiler/brw_fs.cpp
+++ b/src/intel/compiler/brw_fs.cpp
@@ -184,9 +184,17 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
* a double this means we are only loading 2 elements worth of data.
* We also want to use a 32-bit data type for the dst of the load operation
* so other parts of the driver don't get confused about the size of the
- * result.
+ * result. On the case of 16-bit data we only need half of the 32-bit
+ * components on SKL+ as we take advance of using message return size to
+ * define an xy channel mask.
*/
- fs_reg vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+ fs_reg vec4_result;
+ if (type_sz(dst.type) == 2 && (devinfo->gen >= 9)) {
+ vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 2);
+ vec4_result = retype(vec4_result, BRW_REGISTER_TYPE_HF);
+ } else {
+ vec4_result = bld.vgrf(BRW_REGISTER_TYPE_F, 4);
+ }
fs_inst *inst = bld.emit(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_LOGICAL,
vec4_result, surf_index, vec4_offset);
inst->size_written = 4 * vec4_result.component_size(inst->exec_size);
@@ -197,8 +205,23 @@ fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_builder &bld,
}
vec4_result.type = dst.type;
- bld.MOV(dst, offset(vec4_result, bld,
- (const_offset & 0xf) / type_sz(vec4_result.type)));
+
+ if (type_sz(dst.type) == 2) {
+ /* 16-bit types need to be unshuffled as each pair of 16-bit components
+ * is packed on a 32-bit compoment because we are using a 32-bit format
+ * in the surface of uniform that is read by the sampler.
+ * TODO: On BDW+ mark when an uniform has 16-bit type so we could setup a
+ * surface format of 16-bit and use the 16-bit return format at the
+ * sampler.
+ */
+ vec4_result.stride = 2;
+ bld.MOV(dst, byte_offset(offset(vec4_result, bld,
+ (const_offset & 0x7) / 4),
+ (const_offset & 0x7) / 2 % 2 * 2));
+ } else {
+ bld.MOV(dst, offset(vec4_result, bld,
+ (const_offset & 0xf) / type_sz(vec4_result.type)));
+ }
}
/**
diff --git a/src/intel/compiler/brw_fs_generator.cpp b/src/intel/compiler/brw_fs_generator.cpp
index a3861cd68e..00a4e29147 100644
--- a/src/intel/compiler/brw_fs_generator.cpp
+++ b/src/intel/compiler/brw_fs_generator.cpp
@@ -1381,12 +1381,18 @@ fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
uint32_t simd_mode, rlen, mlen;
if (inst->exec_size == 16) {
mlen = 2;
- rlen = 8;
+ if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+ rlen = 4;
+ else
+ rlen = 8;
simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
} else {
assert(inst->exec_size == 8);
mlen = 1;
- rlen = 4;
+ if (type_sz(dst.type) == 2 && (devinfo->gen >= 9))
+ rlen = 2;
+ else
+ rlen = 4;
simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
}
--
2.14.3
More information about the mesa-dev
mailing list