<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<style type="text/css" style="display:none;"> P {margin-top:0;margin-bottom:0;} </style>
</head>
<body dir="ltr">
<div style="font-family: Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
LGTM.</div>
<div style="font-family: Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
<br>
</div>
<div style="font-family: Calibri, Helvetica, sans-serif; font-size: 12pt; color: rgb(0, 0, 0);">
Reviewed-by: Jose Fonseca <jfonseca@vmware.com></div>
<div>
<div id="appendonsend"></div>
<div style="font-family:Calibri,Helvetica,sans-serif; font-size:12pt; color:rgb(0,0,0)">
<br>
</div>
<hr tabindex="-1" style="display:inline-block; width:98%">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" color="#000000" style="font-size:11pt"><b>From:</b> sroland@vmware.com <sroland@vmware.com><br>
<b>Sent:</b> Tuesday, May 7, 2019 03:12<br>
<b>To:</b> Jose Fonseca; Brian Paul; mesa-dev@lists.freedesktop.org<br>
<b>Cc:</b> Roland Scheidegger<br>
<b>Subject:</b> [PATCH] gallivm: fix broken 8-wide s3tc decoding</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt">
<div class="PlainText">From: Roland Scheidegger <sroland@vmware.com><br>
<br>
Brian noticed there was an uninitialized var for the 8-wide case and 128<br>
bit blocks, which made it always crash. Likewise, the 64bit block case<br>
had another crash bug due to type mismatch.<br>
Color decode (used for all s3tc formats) also had a bogus shuffle for<br>
this case, leading to decode artifacts.<br>
Fix these all up, which makes the code actually work 8-wide. Note that<br>
it's still not used - I've verified it works, and the generated assembly<br>
does look quite a bit simpler actually (20-30% less instructions for the<br>
s3tc decode part with avx2), however in practice it still seems to be<br>
sligthly slower for some unknown reason (tested with openarena) on my<br>
haswell box, so for now continue to split things into 4-wide vectors<br>
before decoding.<br>
---<br>
.../auxiliary/gallivm/lp_bld_format_s3tc.c | 33 +++++++++----------<br>
1 file changed, 16 insertions(+), 17 deletions(-)<br>
<br>
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c<br>
index 9561c349dad..8f6e9bec18a 100644<br>
--- a/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c<br>
+++ b/src/gallium/auxiliary/gallivm/lp_bld_format_s3tc.c<br>
@@ -77,24 +77,17 @@ lp_build_uninterleave2_half(struct gallivm_state *gallivm,<br>
unsigned lo_hi)<br>
{<br>
LLVMValueRef shuffle, elems[LP_MAX_VECTOR_LENGTH];<br>
- unsigned i, j;<br>
+ unsigned i;<br>
<br>
assert(type.length <= LP_MAX_VECTOR_LENGTH);<br>
assert(lo_hi < 2);<br>
<br>
if (type.length * type.width == 256) {<br>
- assert(type.length >= 4);<br>
- for (i = 0, j = 0; i < type.length; ++i) {<br>
- if (i == type.length / 4) {<br>
- j = type.length;<br>
- } else if (i == type.length / 2) {<br>
- j = type.length / 2;<br>
- } else if (i == 3 * type.length / 4) {<br>
- j = 3 * type.length / 4;<br>
- } else {<br>
- j += 2;<br>
- }<br>
- elems[i] = lp_build_const_int32(gallivm, j + lo_hi);<br>
+ assert(type.length == 8);<br>
+ assert(type.width == 32);<br>
+ const unsigned shufvals[8] = {0, 2, 8, 10, 4, 6, 12, 14};<br>
+ for (i = 0; i < type.length; ++i) {<br>
+ elems[i] = lp_build_const_int32(gallivm, shufvals[i] + lo_hi);<br>
}<br>
} else {<br>
for (i = 0; i < type.length; ++i) {<br>
@@ -277,7 +270,7 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,<br>
}<br>
else {<br>
LLVMValueRef tmp[4], cc01, cc23;<br>
- struct lp_type lp_type32, lp_type64, lp_type32dxt;<br>
+ struct lp_type lp_type32, lp_type64;<br>
memset(&lp_type32, 0, sizeof lp_type32);<br>
lp_type32.width = 32;<br>
lp_type32.length = length;<br>
@@ -309,10 +302,14 @@ lp_build_gather_s3tc(struct gallivm_state *gallivm,<br>
lp_build_const_extend_shuffle(gallivm, 2, 4), "");<br>
}<br>
if (length == 8) {<br>
+ struct lp_type lp_type32_4;<br>
+ memset(&lp_type32_4, 0, sizeof lp_type32_4);<br>
+ lp_type32_4.width = 32;<br>
+ lp_type32_4.length = 4;<br>
for (i = 0; i < 4; ++i) {<br>
tmp[0] = elems[i];<br>
tmp[1] = elems[i+4];<br>
- elems[i] = lp_build_concat(gallivm, tmp, lp_type32, 2);<br>
+ elems[i] = lp_build_concat(gallivm, tmp, lp_type32_4, 2);<br>
}<br>
}<br>
cc01 = lp_build_interleave2_half(gallivm, lp_type32, elems[0], elems[1], 0);<br>
@@ -811,7 +808,7 @@ s3tc_dxt3_to_rgba_aos(struct gallivm_state *gallivm,<br>
tmp = lp_build_select(&bld, sel_mask, alpha_low, alpha_hi);<br>
bit_pos = LLVMBuildAnd(builder, bit_pos,<br>
lp_build_const_int_vec(gallivm, type, 0xffffffdf), "");<br>
- /* Warning: slow shift with per element count */<br>
+ /* Warning: slow shift with per element count (without avx2) */<br>
/*<br>
* Could do pshufb here as well - just use appropriate 2 bits in bit_pos<br>
* to select the right byte with pshufb. Then for the remaining one bit<br>
@@ -1640,7 +1637,6 @@ s3tc_decode_block_dxt5(struct gallivm_state *gallivm,<br>
lp_build_const_int_vec(gallivm, type16, 8), "");<br>
alpha = LLVMBuildBitCast(builder, alpha, i64t, "");<br>
shuffle1 = lp_build_const_shuffle1(gallivm, 0, 8);<br>
- /* XXX this shuffle broken with LLVM 2.8 */<br>
alpha0 = LLVMBuildShuffleVector(builder, alpha0, alpha0, shuffle1, "");<br>
alpha1 = LLVMBuildShuffleVector(builder, alpha1, alpha1, shuffle1, "");<br>
<br>
@@ -2176,6 +2172,9 @@ lp_build_fetch_s3tc_rgba_aos(struct gallivm_state *gallivm,<br>
return rgba;<br>
}<br>
<br>
+ /*<br>
+ * Could use n > 8 here with avx2, but doesn't seem faster.<br>
+ */<br>
if (n > 4) {<br>
unsigned count;<br>
LLVMTypeRef i8_vectype = LLVMVectorType(i8t, 4 * n);<br>
-- <br>
2.17.1<br>
<br>
</div>
</span></font></div>
</div>
</body>
</html>