[Mesa-dev] [PATCH 08/13] i965/draw: Use the real size for vertex buffers

Thu May 19 07:21:05 UTC 2016

Previously, we were using the size of the BO, which may be substantially
larger than the actual vertex buffer size.
---
 src/mesa/drivers/dri/i965/brw_context.h      |  1 +
 src/mesa/drivers/dri/i965/brw_draw_upload.c  | 52 +++++++++++++++++++++++++++-
 src/mesa/drivers/dri/i965/gen8_draw_upload.c |  2 +-
 3 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_context.h b/src/mesa/drivers/dri/i965/brw_context.h
index 76ed1de..d1d31e0 100644
--- a/src/mesa/drivers/dri/i965/brw_context.h
+++ b/src/mesa/drivers/dri/i965/brw_context.h
@@ -569,6 +569,7 @@ struct brw_vertex_buffer {
    /** Buffer object containing the uploaded vertex data */
    drm_intel_bo *bo;
    uint32_t offset;
+   uint32_t vf_upper_bound;
    /** Byte stride between elements in the uploaded array */
    GLuint stride;
    GLuint step_rate;
diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index b651fd2..2eac385 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -365,6 +365,17 @@ brw_get_vertex_surface_type(struct brw_context *brw,
    }
 }
 
+static unsigned
+attrib_vec4_size(GLenum type)
+{
+   /* _mesa_sizeof_type() is positive for bare GL types, so a vec4 is four of
+    * them; packed formats yield -1 and always have a size of 4. */
+   const int component_bytes = _mesa_sizeof_type(type);
+   if (component_bytes <= 0)
+      return 4;
+   return component_bytes * 4;
+}
+
 static void
 copy_array_to_vbo_array(struct brw_context *brw,
 			struct brw_vertex_element *element,
@@ -373,6 +384,7 @@ copy_array_to_vbo_array(struct brw_context *brw,
 			GLuint dst_stride)
 {
    const int src_stride = element->glarray->StrideB;
+   const unsigned vec4_size = attrib_vec4_size(element->glarray->Type);
 
    /* If the source stride is zero, we just want to upload the current
     * attribute once and set the buffer's stride to 0.  There's no need
@@ -385,6 +397,7 @@ copy_array_to_vbo_array(struct brw_context *brw,
 			&buffer->bo, &buffer->offset);
 
       buffer->stride = 0;
+      buffer->vf_upper_bound = vec4_size;
       return;
    }
 
@@ -404,6 +417,7 @@ copy_array_to_vbo_array(struct brw_context *brw,
       }
    }
    buffer->stride = dst_stride;
+   buffer->vf_upper_bound = size + (vec4_size - dst_stride);
 }
 
 void
@@ -457,6 +471,7 @@ brw_prepare_vertices(struct brw_context *brw)
    struct intel_buffer_object *enabled_buffer[VERT_ATTRIB_MAX];
    uint32_t buffer_range_start[VERT_ATTRIB_MAX];
    uint32_t buffer_range_end[VERT_ATTRIB_MAX];
+   uint32_t buffer_range_vf_end[VERT_ATTRIB_MAX];
 
    for (i = j = 0; i < brw->vb.nr_enabled; i++) {
       struct brw_vertex_element *input = brw->vb.enabled[i];
@@ -486,6 +501,23 @@ brw_prepare_vertices(struct brw_context *brw)
             }
          }
 
+         /* This is ugly.  It's completely undocumented (as far as I can tell)
+          * but based on a little reverse-engineering, it appears that the VF
+          * stage first fetches an entire vec4 and then swizzles components
+          * into the VUE.  Therefore, if any part of the vec4 lies outside of
+          * the buffer's bounds, the entire vec4 is discarded and you get
+          * entirely zeros.
+          *
+          * This means that we can't actually use tight bounds for vertex
+          * buffers.  Instead, we have to pad them out so that, for the last
+          * element, the whole vec4 fits.  Unfortunately, this means there are
+          * a few corner cases where we don't handle ARB_robust_buffer_access
+          * 100% correctly, but they're very hard to hit and it's still safe in
+          * the sense that you shouldn't end up in someone else's buffer.
+          */
+         const unsigned vec4_size = attrib_vec4_size(glarray->Type);
+         const unsigned vf_range = range + (vec4_size - glarray->_ElementSize);
+
 	 /* If we have a VB set to be uploaded for this buffer object
 	  * already, reuse that VB state so that we emit fewer
 	  * relocations.
@@ -503,6 +535,7 @@ brw_prepare_vertices(struct brw_context *brw)
 
                buffer_range_start[k] = MIN2(buffer_range_start[k], start);
                buffer_range_end[k] = MAX2(buffer_range_end[k], start + range);
+               buffer_range_vf_end[k] = MAX2(buffer_range_vf_end[k], start + vf_range);
 	       break;
 	    }
 	 }
@@ -517,6 +550,7 @@ brw_prepare_vertices(struct brw_context *brw)
             enabled_buffer[j] = intel_buffer;
             buffer_range_start[j] = start;
             buffer_range_end[j] = start + range;
+            buffer_range_vf_end[j] = start + vf_range;
 
 	    input->buffer = j++;
 	    input->offset = 0;
@@ -580,6 +614,8 @@ brw_prepare_vertices(struct brw_context *brw)
 
       buffer->bo = intel_bufferobj_buffer(brw, enabled_buffer[i], start, range);
       drm_intel_bo_reference(buffer->bo);
+
+      buffer->vf_upper_bound = buffer_range_vf_end[i];
    }
 
    /* If we need to upload all the arrays, then we can trim those arrays to
@@ -604,12 +640,24 @@ brw_prepare_vertices(struct brw_context *brw)
 				 buffer, interleaved);
 	 buffer->offset -= delta * interleaved;
 
+         /* Because we just pass upload[0] in to copy_array_to_vbo_array
+          * above, it cannot provide us with the correct vf_upper_bound.
+          * Instead, we have to calculate that ourselves.
+          */
+         unsigned elem_vf_size = 0;
+
 	 for (i = 0; i < nr_uploads; i++) {
 	    /* Then, just point upload[i] at upload[0]'s buffer. */
 	    upload[i]->offset =
 	       ((const unsigned char *)upload[i]->glarray->Ptr - ptr);
 	    upload[i]->buffer = j;
+
+            unsigned vec4_size = attrib_vec4_size(upload[i]->glarray->Type);
+            elem_vf_size = MAX2(elem_vf_size, upload[i]->offset + vec4_size);
 	 }
+         buffer->vf_upper_bound =
+            (delta + max_index - min_index) * interleaved + elem_vf_size;
+
 	 j++;
 
 	 nr_uploads = 0;
@@ -632,6 +680,7 @@ brw_prepare_vertices(struct brw_context *brw)
                                  buffer, upload[i]->glarray->_ElementSize);
       }
       buffer->offset -= delta * buffer->stride;
+      buffer->vf_upper_bound += delta * buffer->stride;
       buffer->step_rate = upload[i]->glarray->InstanceDivisor;
       upload[i]->buffer = j++;
       upload[i]->offset = 0;
@@ -773,7 +822,8 @@ brw_emit_vertices(struct brw_context *brw)
       OUT_BATCH((_3DSTATE_VERTEX_BUFFERS << 16) | (4 * nr_buffers - 1));
       for (i = 0; i < brw->vb.nr_buffers; i++) {
 	 struct brw_vertex_buffer *buffer = &brw->vb.buffers[i];
-         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo, buffer->bo->size - 1,
+         EMIT_VERTEX_BUFFER_STATE(brw, i, buffer->bo,
+                                  buffer->offset + buffer->vf_upper_bound - 1,
                                   buffer->offset, buffer->stride,
                                   buffer->step_rate);
 
diff --git a/src/mesa/drivers/dri/i965/gen8_draw_upload.c b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
index dce11dd..722cde6 100644
--- a/src/mesa/drivers/dri/i965/gen8_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/gen8_draw_upload.c
@@ -151,7 +151,7 @@ gen8_emit_vertices(struct brw_context *brw)
 
          OUT_BATCH(dw0);
          OUT_RELOC64(buffer->bo, I915_GEM_DOMAIN_VERTEX, 0, buffer->offset);
-         OUT_BATCH(buffer->bo->size);
+         OUT_BATCH(buffer->vf_upper_bound);
       }
 
       if (uses_draw_params) {
-- 
2.5.0.400.gff86faf