[Mesa-dev] [PATCH v2] i965: Sort array elements to increase chances of reusing buffer relocation

Tue Dec 2 10:42:09 PST 2014

Neil wrote:

> It might be worth making a simpler hard-coded implementation of
> quicksort because calling qsort is probably not very sensible for
> such a small array and the function call overhead for each
> comparison is probably quite a bit.

Ok, here is a v2 of the patch which has a simple custom quick sort
implementation based on the Wikipedia description. It seems a lot
faster than using qsort so the table is now like this:

attributes are  │  master  with patch  optimization removed  patchv2
────────────────┼──────────────────────────────────────────────────
in order        │   820       560             325             760
out of order    │   325       540             325             760

- Neil

------- >8 --------------- (use git am --scissors to automatically chop here)

When submitting the vertex buffers the i965 driver will try to recognise when
multiple attributes are using the same buffer so that it can submit a single
relocation for it and set a different offset in the attribute. Previously
however if the application happens to have the attributes in a struct with an
order that doesn't match the order they are listed in the gl_vert_attrib enum
then the loop would end up processing the attributes with a greater offset
first and the optimisation wouldn't be used.

To make the optmisation more likely to be used this patch makes it always
process the elements in increasing order of offset. This is done copying the
element pointers into a separate array and sorting it with a simple quick sort
implementation. This only affects the order that the elements are processed
and doesn't change the order that they are submitted to the hardware.
---
 src/mesa/drivers/dri/i965/brw_draw_upload.c | 102 +++++++++++++++++++++++++++-
 1 file changed, 99 insertions(+), 3 deletions(-)

diff --git a/src/mesa/drivers/dri/i965/brw_draw_upload.c b/src/mesa/drivers/dri/i965/brw_draw_upload.c
index 7bf9163..8480043 100644
--- a/src/mesa/drivers/dri/i965/brw_draw_upload.c
+++ b/src/mesa/drivers/dri/i965/brw_draw_upload.c
@@ -396,6 +396,89 @@ copy_array_to_vbo_array(struct brw_context *brw,
    buffer->stride = dst_stride;
 }
 
+static bool
+compare_array_ptr(const struct brw_vertex_element *input_a,
+                  const struct brw_vertex_element *input_b)
+{
+   const GLubyte *ptr_a = input_a->glarray->Ptr;
+   const GLubyte *ptr_b = input_b->glarray->Ptr;
+
+   if (ptr_a < ptr_b)
+      return false;
+   else if (ptr_a > ptr_b)
+      return true;
+   else {
+      /* Resort to comparing the element pointers so that it never returns
+       * equality because apparently that can be bad for quick sort */
+      if (input_a < input_b)
+         return false;
+      else
+         return true;
+   }
+}
+
+static int
+partition_elements(struct brw_vertex_element **elements,
+                   int n_elements)
+{
+   int pivot = n_elements / 2;
+   struct brw_vertex_element *pivot_value = elements[pivot];
+   struct brw_vertex_element *tmp;
+   int i, store_index;
+
+   elements[pivot] = elements[n_elements - 1];
+
+   store_index = 0;
+
+   for (i = 0; i < n_elements - 1; i++) {
+      if (!compare_array_ptr(elements[i], pivot_value)) {
+         tmp = elements[i];
+         elements[i] = elements[store_index];
+         elements[store_index] = tmp;
+         store_index++;
+      }
+   }
+
+   elements[n_elements - 1] = elements[store_index];
+   elements[store_index] = pivot_value;
+
+   return store_index;
+}
+
+struct sort_stack {
+   int16_t start, end;
+};
+
+static void
+sort_elements(struct brw_vertex_element **elements,
+              int n_elements)
+{
+   struct sort_stack stack[VERT_ATTRIB_MAX];
+   int stack_size = 1;
+   int start, end, pivot_point;
+
+   stack[0].start = 0;
+   stack[0].end = n_elements;
+
+   while (stack_size > 0) {
+      stack_size--;
+      start = stack[stack_size].start;
+      end = stack[stack_size].end;
+
+      if (end - start >= 2) {
+         pivot_point = partition_elements(elements + start,
+                                          end - start) + start;
+         assert(stack_size + 2 <= ARRAY_SIZE(stack));
+         stack[stack_size].start = pivot_point + 1;
+         stack[stack_size].end = end;
+         stack_size++;
+         stack[stack_size].start = start;
+         stack[stack_size].end = pivot_point;
+         stack_size++;
+      }
+   }
+}
+
 void
 brw_prepare_vertices(struct brw_context *brw)
 {
@@ -409,6 +492,7 @@ brw_prepare_vertices(struct brw_context *brw)
    int delta, i, j;
 
    struct brw_vertex_element *upload[VERT_ATTRIB_MAX];
+   struct brw_vertex_element *sorted[VERT_ATTRIB_MAX];
    GLuint nr_uploads = 0;
 
    /* _NEW_POLYGON
@@ -442,8 +526,20 @@ brw_prepare_vertices(struct brw_context *brw)
    if (brw->vb.nr_buffers)
       return;
 
+   /* In the loop below if it finds an element that is using the same buffer
+    * as a previous element then it will reuse the same buffer relocation.
+    * However it will only work if the offset for the previous element is less
+    * than the offset for the new element and the difference is less than the
+    * stride. In order to increase the chances of hitting this optimisation
+    * the elements will be processed in increasing order of offset by first
+    * sorting the pointers.
+    */
+   memcpy(sorted, brw->vb.enabled,
+          sizeof(sorted[0]) * brw->vb.nr_enabled);
+   sort_elements(sorted, brw->vb.nr_enabled);
+
    for (i = j = 0; i < brw->vb.nr_enabled; i++) {
-      struct brw_vertex_element *input = brw->vb.enabled[i];
+      struct brw_vertex_element *input = sorted[i];
       const struct gl_client_array *glarray = input->glarray;
 
       if (_mesa_is_bufferobj(glarray->BufferObj)) {
@@ -456,13 +552,13 @@ brw_prepare_vertices(struct brw_context *brw)
 	  * relocations.
 	  */
 	 for (k = 0; k < i; k++) {
-	    const struct gl_client_array *other = brw->vb.enabled[k]->glarray;
+	    const struct gl_client_array *other = sorted[k]->glarray;
 	    if (glarray->BufferObj == other->BufferObj &&
 		glarray->StrideB == other->StrideB &&
 		glarray->InstanceDivisor == other->InstanceDivisor &&
 		(uintptr_t)(glarray->Ptr - other->Ptr) < glarray->StrideB)
 	    {
-	       input->buffer = brw->vb.enabled[k]->buffer;
+	       input->buffer = sorted[k]->buffer;
 	       input->offset = glarray->Ptr - other->Ptr;
 	       break;
 	    }
-- 
1.9.3