[Mesa-dev] [PATCH 2/2] gallivm: do per-element lod for lod bias and explicit derivs too
sroland at vmware.com
sroland at vmware.com
Wed Aug 21 18:30:24 PDT 2013
From: Roland Scheidegger <sroland at vmware.com>
This excludes explicit derivs with cube maps, which are very bogus anyway.
Just like explicit lod, this is only used if no_quad_lod is set in the
GALLIVM_DEBUG env var.
Minification is terribly slow on CPUs which don't support true vector shifts
(but it should still work correctly). The min/mag filter decision (if the two
filters are different) cannot be made per pixel, though; only selecting
different mip levels per pixel works.
---
src/gallium/auxiliary/gallivm/lp_bld_sample.c | 92 +++++++++++++++------
src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c | 12 ++-
2 files changed, 73 insertions(+), 31 deletions(-)
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample.c b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
index 6e5c4a1..2fa3221 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample.c
@@ -200,7 +200,7 @@ lp_sampler_static_sampler_state(struct lp_static_sampler_state *state,
* Generate code to compute coordinate gradient (rho).
* \param derivs partial derivatives of (s, t, r, q) with respect to X and Y
*
- * The resulting rho is scalar per quad.
+ * The resulting rho has bld->levelf format (per quad or per element).
*/
static LLVMValueRef
lp_build_rho(struct lp_build_sample_context *bld,
@@ -236,7 +236,10 @@ lp_build_rho(struct lp_build_sample_context *bld,
/* Note that all simplified calculations will only work for isotropic filtering */
- assert(bld->num_lods != length);
+ /*
+ * rho calcs are always per quad except for explicit derivs (excluding
+ * the messy cube maps for now) when requested.
+ */
first_level = bld->dynamic_state->first_level(bld->dynamic_state,
bld->gallivm, texture_unit);
@@ -247,11 +250,18 @@ lp_build_rho(struct lp_build_sample_context *bld,
if (cube_rho) {
LLVMValueRef cubesize;
LLVMValueRef index0 = lp_build_const_int32(gallivm, 0);
+
/*
* Cube map code did already everything except size mul and per-quad extraction.
+ * Luckily cube maps are always quadratic!
*/
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, cube_rho, 0);
+ if (levelf_bld->type.length != length) {
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, cube_rho, 0);
+ }
+ else {
+ rho = lp_build_swizzle_scalar_aos(coord_bld, cube_rho, 0, 4);
+ }
if (gallivm_debug & GALLIVM_DEBUG_NO_RHO_APPROX) {
rho = lp_build_sqrt(levelf_bld, rho);
}
@@ -290,29 +300,35 @@ lp_build_rho(struct lp_build_sample_context *bld,
rho_xvec = lp_build_add(coord_bld, rho_xvec, ddx[2]);
rho_yvec = lp_build_add(coord_bld, rho_yvec, ddy[2]);
}
- rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho_vec, 0);
- /*
- * note that as long as we don't care about per-pixel lod could reduce math
- * more (at some shuffle cost), but for now only do sqrt after packing.
- */
+ rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+
+ if (levelf_bld->type.length != length) {
+ /*
+ * note for this case without per-pixel lod could reduce math more
+ * (at some shuffle cost), but for now only do sqrt after packing,
+ * otherwise would also need different code to per-pixel lod case.
+ */
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
rho = lp_build_sqrt(levelf_bld, rho);
+
}
else {
- rho_vec = ddmax[0];
+ rho = ddmax[0];
if (dims > 1) {
- rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[1]);
+ rho = lp_build_max(coord_bld, rho, ddmax[1]);
if (dims > 2) {
- rho_vec = lp_build_max(coord_bld, rho_vec, ddmax[2]);
+ rho = lp_build_max(coord_bld, rho, ddmax[2]);
}
}
- /*
- * rho_vec now still contains per-pixel rho, convert to scalar per quad
- * since we can't handle per-pixel rho/lod from now on (TODO).
- */
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho_vec, 0);
+ if (levelf_bld->type.length != length) {
+ /*
+ * rho_vec contains per-pixel rho, convert to scalar per quad.
+ */
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
}
}
else {
@@ -379,12 +395,25 @@ lp_build_rho(struct lp_build_sample_context *bld,
ddx_ddy[1] = lp_build_swizzle_aos(coord_bld, ddx_ddy[1], swizzle02);
rho_vec = lp_build_add(coord_bld, rho_vec, ddx_ddy[1]);
}
+
rho_xvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle0);
rho_yvec = lp_build_swizzle_aos(coord_bld, rho_vec, swizzle1);
- rho_vec = lp_build_max(coord_bld, rho_xvec, rho_yvec);
+ rho = lp_build_max(coord_bld, rho_xvec, rho_yvec);
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho_vec, 0);
+ if (levelf_bld->type.length != length) {
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
+ else {
+ /*
+ * on some cpus with half-speed 8-wide sqrt (e.g. SNB but not IVB)
+ * doing pack/sqrt/unpack/swizzle might be better for 8-wide case,
+ * same is true for cpus having faster scalars than 4-wide vecs
+ * for 4-wide case (where pack/unpack would be no-ops anyway).
+ * (Same is true really for cube_rho case above.)
+ */
+ rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
+ }
rho = lp_build_sqrt(levelf_bld, rho);
}
else {
@@ -464,8 +493,13 @@ lp_build_rho(struct lp_build_sample_context *bld,
}
}
}
- rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
- levelf_bld->type, rho, 0);
+ if (levelf_bld->type.length != length) {
+ rho = lp_build_pack_aos_scalars(bld->gallivm, coord_bld->type,
+ levelf_bld->type, rho, 0);
+ }
+ else {
+ rho = lp_build_swizzle_scalar_aos(coord_bld, rho, 0, 4);
+ }
}
else {
if (dims <= 1) {
@@ -491,6 +525,9 @@ lp_build_rho(struct lp_build_sample_context *bld,
}
}
}
+ if (levelf_bld->type.length == length) {
+ rho = lp_build_broadcast_scalar(levelf_bld, rho);
+ }
}
}
}
@@ -729,8 +766,9 @@ lp_build_lod_selector(struct lp_build_sample_context *bld,
/* add shader lod bias */
if (lod_bias) {
- lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
- levelf_bld->type, lod_bias, 0);
+ if (bld->num_lods != bld->coord_type.length)
+ lod_bias = lp_build_pack_aos_scalars(bld->gallivm, bld->coord_bld.type,
+ levelf_bld->type, lod_bias, 0);
lod = LLVMBuildFAdd(builder, lod, lod_bias, "shader_lod_bias");
}
}
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
index 34ab414..d79d94d 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_sample_soa.c
@@ -1941,7 +1941,9 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
* There are other situations where at least the multiple int lods could be
* avoided like min and max lod being equal.
*/
- if (explicit_lod && lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+ if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+ (explicit_lod || lod_bias ||
+ (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE)) &&
((is_fetch && target != PIPE_BUFFER) ||
(!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
bld.num_lods = type.length;
@@ -2139,9 +2141,11 @@ lp_build_sample_soa(struct gallivm_state *gallivm,
bld4.levelf_type.length = 1;
bld4.leveli_type = lp_int_type(bld4.levelf_type);
- if (explicit_lod && lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
- ((is_fetch && target != PIPE_BUFFER) ||
- (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
+ if (lod_property == LP_SAMPLER_LOD_PER_ELEMENT &&
+ (explicit_lod || lod_bias ||
+ (derivs && static_texture_state->target != PIPE_TEXTURE_CUBE)) &&
+ ((is_fetch && target != PIPE_BUFFER) ||
+ (!is_fetch && mip_filter != PIPE_TEX_MIPFILTER_NONE)))
bld4.num_lods = type4.length;
else
bld4.num_lods = 1;
--
1.7.9.5
More information about the mesa-dev
mailing list