[Mesa-dev] [PATCH] i965: Sort array elements to increase chances of reusing buffer relocation

Mon Dec 1 12:23:32 PST 2014

When submitting the vertex buffers the i965 driver will try to recognise when
multiple attributes are using the same buffer so that it can submit a single
relocation for it and set a different offset in the attribute. Previously
however if the application happens to have the attributes in a struct with an
order that doesn't match the order they are listed in the gl_vert_attrib enum
then the loop would end up processing the attributes with a greater offset
first and the optimisation wouldn't be used.

To make the optmisation more likely to be used this patch makes it always
process the elements in increasing order of offset. This is done copying the
element pointers into a separate array and sorting it with qsort. This only
affects the order that the elements are processed and doesn't change the order
that they are submitted to the hardware.
---

I noticed this problem by inspection but I don't have a good feel for
how important avoiding the buffer relocations is so I don't know
whether the patch makes much sense. However I think the added overhead
to sort the attributes is minimal so I don't think it can do much
harm.

I've run the patch through Piglit on IvyBridge and there are no
regressions. I've also verified that it does end up generating two
buffer relocations if you put the attributes in the wrong order with
gdb and also that the patch fixes it.

 src/mesa/drivers/dri/i965/brw_draw_upload.c | 38 ++++++++++++++++++++++++++---
 1 file changed, 35 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 7bf9163..8b60563 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -396,6 +396,24 @@ copy_array_to_vbo_array(struct brw_context *brw,
    buffer->stride = dst_stride;
 }
 
+static int
+compare_array_ptr(const void *a, const void *b)
+{
+   struct brw_vertex_element *input_a =
+      *(struct brw_vertex_element * const *) a;
+   struct brw_vertex_element *input_b =
+      *(struct brw_vertex_element * const *) b;
+   const GLubyte *ptr_a = input_a->glarray->Ptr;
+   const GLubyte *ptr_b = input_b->glarray->Ptr;
+
+   if (ptr_a < ptr_b)
+      return -1;
+   else if (ptr_a > ptr_b)
+      return 1;
+   else
+      return 0;
+}
+
 void
 brw_prepare_vertices(struct brw_context *brw)
 {
@@ -409,6 +427,7 @@ brw_prepare_vertices(struct brw_context *brw)
    int delta, i, j;
 
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
+   struct brw_vertex_element *sorted[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
 
    /* _NEW_POLYGON
@@ -442,8 +461,21 @@ brw_prepare_vertices(struct brw_context *brw)
    if (brw->vb.nr_buffers)
       return;
 
+   /* In the loop below if it finds an element that is using the same buffer
+    * as a previous element then it will reuse the same buffer relocation.
+    * However it will only work if the offset for the previous element is less
+    * than the offset for the new element and the difference is less than the
+    * stride. In order to increase the chances of hitting this optimisation
+    * the elements will be processed in increasing order of offset by first
+    * sorting the pointers.
+    */
+   memcpy(sorted, brw->vb.enabled,
+          sizeof(sorted[0]) * brw->vb.nr_enabled);
+   qsort(sorted, brw->vb.nr_enabled, sizeof(sorted[0]),
+         compare_array_ptr);
+
    for (i = j = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
+      struct brw_vertex_element *input = sorted[i];
       const struct gl_client_array *glarray = input->glarray;
 
       if (_mesa_is_bufferobj(glarray->BufferObj)) {
@@ -456,13 +488,13 @@ brw_prepare_vertices(struct brw_context *brw)
 	  * relocations.
 	  */
 	 for (k = 0; k < i; k++) {
-	    const struct gl_client_array *other = brw->vb.enabled[k]->glarray;
+	    const struct gl_client_array *other = sorted[k]->glarray;
 	    if (glarray->BufferObj == other->BufferObj &&
 		glarray->StrideB == other->StrideB &&
 		glarray->InstanceDivisor == other->InstanceDivisor &&
 		(uintptr_t)(glarray->Ptr - other->Ptr) < glarray->StrideB)
 	    {
-	       input->buffer = brw->vb.enabled[k]->buffer;
+	       input->buffer = sorted[k]->buffer;
 	       input->offset = glarray->Ptr - other->Ptr;
 	       break;
 	    }
-- 
1.9.3