[Mesa-dev] [PATCH 2/2] [v3] i965: Use IVB specific formula for depthbuffer
Ben Widawsky
benjamin.widawsky at intel.com
Mon Sep 30 12:35:42 PDT 2013
After the last patch, we can replace the region allocated in the miptree
creation with a more straightforward (and hopefully smaller resulting)
buffer based on the bspec's allocation formula.
Since I am relatively new to this part of the bspec, I would very much
appreciate scrutiny during review of this. There were some ambiguities
to me which are likely obvious to others.
To prove the reduced [GPU] memory usage I created a simple script which
polls the memory usage of the process through debugfs ever .1 seconds.
The following results show the memory usage difference over 5 runs of
xonotic-glx with ultra settings.
The data suggests a 10MB savings on average. I've not measured the
savings on the CPU side, but I imagine some amount of savings would be
present there as well.
x master/mem_usage.txt
+ mine/mem_usage.txt
N Min Max Median Avg Stddev
x 17121 98959360 7.3394995e+08 7.2782234e+08 7.2209615e+08 43633222
+ 17166 1.2538266e+08 7.2241562e+08 7.16288e+08 7.1071472e+08 42964578
Below is the FPS data over those same 5 tests. I'm not sure if the
decrease is statistically significant to y'all. I don't have any
theories about it.
x master/xonotic.fps
+ mine/xonotic.fps
N Min Max Median Avg Stddev
x 5 27.430746 27.524985 27.50568 27.487017 0.039439874
+ 5 27.409173 27.461715 27.441207 27.440883 0.021086805
NOTE: There were a couple of places in the arithmetic where I could have
taken some shortcuts. In order to make the code match with the spec as
much as possible, I've decided not to do this. One shortcut I did make
was the tiling type. Digging through the code it looks like you always
want Y-tiled, except when it won't fit, in which case you want X-tiled.
I wasn't a fan of the existing helper function that's there since it has
a few irrelevant parameters for this operation. I suspect people
reviewing this might ask me to change this, which is fine; I just wanted
to explain the motivation.
v2: copy-paste fix where I used I915_TILING_Y where I meant _X. (Topi)
v3:
Updated to directly use the bo/stride instead of intel_region. (Ken, Chad)
Fix the reference count leak on the hiz buffer (Chad)
Don't allow fallback to old mt allocation. It should never happen. (Ben)
Break out hz_depth/width calculation to separete functions. (Ben)
Use cpp = 1, since the calculation takes cpp into account (Ben)
x head/xonotic
+ mine/xonotic
N Min Max Median Avg Stddev
x 5 25.683336 25.898164 25.872499 25.842426 0.089829019
+ 5 25.841368 25.934931 25.869051 25.877494 0.039885576
x head/memusage
+ mine/memusage
N Min Max Median Avg Stddev
x 18036 89432064 8.6380954e+08 7.9515648e+08 7.930405e+08 42774265
+ 18030 86548480 8.6262989e+08 7.8178714e+08 7.7978462e+08 42099587
CC: Chad Versace <chad.versace at linux.intel.com>
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=67564
Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
---
src/mesa/drivers/dri/i965/intel_mipmap_tree.c | 194 +++++++++++++++++++++++---
src/mesa/drivers/dri/i965/intel_mipmap_tree.h | 2 +-
2 files changed, 176 insertions(+), 20 deletions(-)
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
index e1da9de..9fc4e97 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.c
@@ -793,8 +793,12 @@ intel_miptree_release(struct intel_mipmap_tree **mt)
intel_region_release(&((*mt)->region));
intel_miptree_release(&(*mt)->stencil_mt);
- intel_miptree_release(&(*mt)->hiz_buffer.mt);
- (*mt)->hiz_buffer.bo = NULL;
+ if (&(*mt)->hiz_buffer.mt)
+ intel_miptree_release(&(*mt)->hiz_buffer.mt);
+ else {
+ drm_intel_bo_unreference((*mt)->hiz_buffer.bo);
+ (*mt)->hiz_buffer.bo = NULL;
+ }
intel_miptree_release(&(*mt)->mcs_mt);
intel_miptree_release(&(*mt)->singlesample_mt);
intel_resolve_map_clear(&(*mt)->hiz_map);
@@ -1271,30 +1275,182 @@ intel_miptree_slice_enable_hiz(struct brw_context *brw,
return true;
}
+#define level(x, l) ((x) >> (l) > 0 ? (x) >> (l) : 1)
+static unsigned int caclulate_z_height(const struct intel_mipmap_tree *mt,
+ const int level)
+{
+ unsigned int height = level(mt->logical_height0, level);
+
+ /* [DevIVB+]: If the surface is multisampled and it is a depth or stencil
+ * surface or Multisampled Surface StorageFormat in SURFACE_STATE is
+ * MSFMT_DEPTH_STENCIL, WL and HL must be adjusted as follows before
+ * proceeding: */
+ switch (mt->num_samples) {
+ case 4:
+ height = CEILING(height, 2) * 4;
+ break;
+ case 8:
+ height = CEILING(height, 2) * 4 * mt->level[level].depth;;
+ break;
+ case 16:
+ height = CEILING(height, 2) * 8;
+ break;
+ }
+
+ /* The value of Z_Height and Z_Width must each be multiplied by 2 before
+ * being applied to the table below if Number of Multisamples is set to
+ * NUMSAMPLES_4. The value of Z_Height must be multiplied by 2 and Z_Width
+ * must be multiplied by 4 before being applied to the table below if Number
+ * of Multisamples is set to NUMSAMPLES_8.
+ */
+ if (mt->num_samples == 4 || mt->num_samples == 8)
+ height *= 2;
+
+ return height;
+}
+
+static unsigned int caclulate_z_width(const struct intel_mipmap_tree *mt,
+ const int level)
+{
+ unsigned int width = level(mt->logical_width0, level);
+
+ /* [DevIVB+]: If the surface is multisampled and it is a depth or stencil
+ * surface or Multisampled Surface StorageFormat in SURFACE_STATE is
+ * MSFMT_DEPTH_STENCIL, WL and HL must be adjusted as follows before
+ * proceeding: */
+ switch (mt->num_samples) {
+ case 2:
+ case 4:
+ width = CEILING(width, 2) * 4;
+ break;
+ case 8:
+ case 16:
+ width = CEILING(width, 2) * 8;
+ break;
+ }
+
+ /* The value of Z_Height and Z_Width must each be multiplied by 2 before
+ * being applied to the table below if Number of Multisamples is set to
+ * NUMSAMPLES_4. The value of Z_Height must be multiplied by 2 and Z_Width
+ * must be multiplied by 4 before being applied to the table below if Number
+ * of Multisamples is set to NUMSAMPLES_8.
+ */
+ if (mt->num_samples == 4)
+ width *= 2;
+ else if (mt->num_samples == 8)
+ width *= 4;
+
+ return width;
+}
+#undef level
+
+static void
+gen7_create_hiz_buffer(struct brw_context *brw,
+ struct intel_mipmap_tree *mt)
+{
+ uint32_t q_pitch, w0, h0, h1, h_level, z_depth; /* Inputs to formula */
+ size_t hz_width; /* Number of bytes */
+ unsigned int hz_height; /* Number of rows */
+ unsigned int tiling = I915_TILING_Y;
+
+ z_depth = mt->level[0].depth;
+ w0 = caclulate_z_width(mt, 0);
+ h0 = caclulate_z_height(mt, 0);
+ h1 = caclulate_z_height(mt, 1);
+
+ hz_width = CEILING(w0, 16) * 16;
+
+ /* ... Where, Qpitch is computed using vertical alignment j=8. Please refer
+ * to the GPU overview volume for Qpitch definition. NB: The docs have
+ * multiple formulas for q_pitch on IVB, but the HSW docs only have the
+ * below definition.
+ *
+ * TODO: Compressed textures have a different formula
+ */
+ q_pitch = h0 + h1 + 11 * 8;
+
+ /* The following is directly derived from the "Hierarchical Depth Buffer"
+ * section of the bspec.
+ */
+ switch (mt->target) {
+ case GL_TEXTURE_1D_ARRAY:
+ case GL_TEXTURE_2D_ARRAY:
+ case GL_TEXTURE_2D:
+ hz_height = CEILING((q_pitch * z_depth / 2), 8) * 8;
+ break;
+ case GL_TEXTURE_CUBE_MAP_ARRAY:
+ hz_height = CEILING((q_pitch * z_depth * 6 / 2), 8) * 8;
+ break;
+ case GL_TEXTURE_3D:
+ hz_height = 0;
+ for (int i = 0; i < mt->last_level; i++) {
+ int tmp;
+ h_level = caclulate_z_height(mt, i);
+ tmp = floorf(z_depth / pow(2, i));
+ if (!tmp)
+ tmp++;
+ hz_height += h_level * tmp;
+ }
+ hz_height /= 2;
+ break;
+ default:
+ perf_debug("Unknown depthbuffer texture type (%d).", mt->target);
+ return;
+ }
+ mt->hiz_buffer.bo = drm_intel_bo_alloc_tiled(brw->intelScreen->bufmgr,
+ "hiz_buffer",
+ hz_width, hz_height, 1,
+ &tiling, &mt->hiz_buffer.stride,
+ BO_ALLOC_FOR_RENDER);
+
+ /* We need to do the same check in intel_miptree_create() to make
+ * sure we have a region that can be blitted.
+ */
+ if (mt->hiz_buffer.bo->size >= brw->max_gtt_map_object_size) {
+ perf_debug("%zx%d depthbuffer larger than aperture; falling back to X-tiled\n",
+ hz_width, hz_height);
+
+ tiling = I915_TILING_X;
+ drm_intel_bo_unreference(mt->hiz_buffer.bo);
+
+ mt->hiz_buffer.bo =
+ drm_intel_bo_alloc_tiled(brw->intelScreen->bufmgr,
+ "hiz_buffer",
+ hz_width, hz_height, 1,
+ &tiling, &mt->hiz_buffer.stride,
+ BO_ALLOC_FOR_RENDER);
+ }
+}
bool
intel_miptree_alloc_hiz(struct brw_context *brw,
struct intel_mipmap_tree *mt)
{
- assert(mt->hiz_buffer.mt == NULL);
- mt->hiz_buffer.mt = intel_miptree_create(brw,
- mt->target,
- mt->format,
- mt->first_level,
- mt->last_level,
- mt->logical_width0,
- mt->logical_height0,
- mt->logical_depth0,
- true,
- mt->num_samples,
- INTEL_MIPTREE_TILING_ANY);
-
- if (!mt->hiz_buffer.mt)
- return false;
+ if (brw->gen > 6) {
+ assert(mt->hiz_buffer.bo == NULL);
+ gen7_create_hiz_buffer(brw, mt);
+ } else {
+ assert(mt->hiz_buffer.mt == NULL);
+ mt->hiz_buffer.mt = intel_miptree_create(brw,
+ mt->target,
+ mt->format,
+ mt->first_level,
+ mt->last_level,
+ mt->logical_width0,
+ mt->logical_height0,
+ mt->logical_depth0,
+ true,
+ mt->num_samples,
+ INTEL_MIPTREE_TILING_ANY);
+ if (mt->hiz_buffer.mt) {
+ mt->hiz_buffer.bo = mt->hiz_buffer.mt->region->bo;
+ mt->hiz_buffer.stride = mt->hiz_buffer.mt->region->pitch;
+ }
+ }
- mt->hiz_buffer.bo = mt->hiz_buffer.mt->region->bo;
- mt->hiz_buffer.stride = mt->hiz_buffer.mt->region->pitch;
+ if (!mt->hiz_buffer.bo)
+ return false;
/* Mark that all slices need a HiZ resolve. */
struct intel_resolve_map *head = &mt->hiz_map;
diff --git a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
index 92d26fa..694e5e7 100644
--- a/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
+++ b/src/mesa/drivers/dri/i965/intel_mipmap_tree.h
@@ -417,7 +417,7 @@ struct intel_mipmap_tree
struct {
struct intel_mipmap_tree *mt;
drm_intel_bo *bo;
- uint32_t stride;
+ unsigned long stride;
} hiz_buffer;
/**
--
1.8.4
More information about the mesa-dev
mailing list