[Cogl] [PATCH] cogl-matrix: Use SSE for cogl_matrix_transform/project_points

Neil Roberts neil at linux.intel.com
Wed Jun 5 09:03:59 PDT 2013


The generated assembly for the matrix transform functions on 64-bit
with -O3 only does one multiplication at a time, using only one of
the floats in each SSE register. With some hand-written assembly we
can do slightly better than this by storing the whole matrix in just
4 registers and doing up to four multiplications in parallel with
the mulps instruction. This seems to be consistently faster for all
sizes of input on both 32-bit and 64-bit.
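
For illustration, here is roughly the same per-point arithmetic
written with SSE intrinsics rather than inline assembler. This is
just a sketch with made-up names, not the code in the patch -- the
patch uses hand-written asm so the register allocation is explicit
and the same code serves both x86 and x86-64:

  #include <xmmintrin.h>

  /* Sketch: transform one (x, y, z) point by a 4x4 matrix stored
     in column-major (OpenGL) order, one column per SSE register */
  static void
  transform_point_f3_sketch (const float *matrix, /* 16 floats */
                             const float *p,      /* x, y, z in */
                             float *out)          /* x, y, z out */
  {
    __m128 col0 = _mm_loadu_ps (matrix);
    __m128 col1 = _mm_loadu_ps (matrix + 4);
    __m128 col2 = _mm_loadu_ps (matrix + 8);
    __m128 col3 = _mm_loadu_ps (matrix + 12);

    /* Splat each coordinate across a register and multiply it by
       the matching matrix column -- four multiplies per mulps */
    __m128 res = _mm_mul_ps (_mm_set1_ps (p[0]), col0);
    res = _mm_add_ps (res, _mm_mul_ps (_mm_set1_ps (p[1]), col1));
    res = _mm_add_ps (res, _mm_mul_ps (_mm_set1_ps (p[2]), col2));
    res = _mm_add_ps (res, col3); /* w is implicitly 1 */

    /* Three scalar stores so we don't write past out[2] */
    _mm_store_ss (out, res);
    _mm_store_ss (out + 1, _mm_shuffle_ps (res, res, 0xe5));
    _mm_store_ss (out + 2, _mm_movehl_ps (res, res));
  }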

On 64-bit we could probably do slightly better still because there
are twice as many SSE registers, so we could potentially transform
two vertices at once. However, the current code has the advantage
that it is suitable for both 64-bit and 32-bit because it doesn't
refer to any non-SSE registers directly.
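
For what it's worth, a hypothetical unrolled version of that idea
(not part of this patch) could look like this, reusing the sketch
above and letting the compiler spread two points across the extra
registers on x86-64:

  /* Hypothetical: two points in flight per iteration; assumes
     tightly packed (x, y, z) points in 'in' and 'out' */
  int i;
  for (i = 0; i + 2 <= n_points; i += 2)
    {
      transform_point_f3_sketch (matrix, in + i * 3, out + i * 3);
      transform_point_f3_sketch (matrix, in + (i + 1) * 3,
                                 out + (i + 1) * 3);
    }
  if (i < n_points) /* odd point left over */
    transform_point_f3_sketch (matrix, in + i * 3, out + i * 3);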

This patch has the interesting side effect that
cogl_matrix_project_points is actually faster than
cogl_matrix_transform_points. The extra multiplication comes for
free, i.e., the SSE registers always multiply four numbers at a
time even if, in the transform case, we only want three. When it
comes to storing the result, the project version can just do an
unaligned write of the whole SSE register, whereas the transform
version must write only three of the values so as not to overwrite
the memory after the third float, which takes three separate
writes.
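
In terms of the sketch above, the only difference for the project
version is the final store: the three scalar stores are replaced
with a single unaligned write of all four floats:

  /* Project (x, y, z, w out): one unaligned 16-byte store */
  _mm_storeu_ps (out, res);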

There is a standalone performance test of the various implementations
of the functions here:

https://github.com/bpeel/time-transform
---
 cogl/cogl-bitmap-conversion.c |  16 +--
 cogl/cogl-matrix.c            | 267 ++++++++++++++++++++++++++++++++++++++++++
 cogl/cogl-util.h              |   9 ++
 3 files changed, 280 insertions(+), 12 deletions(-)

diff --git a/cogl/cogl-bitmap-conversion.c b/cogl/cogl-bitmap-conversion.c
index 102bf23..07b118d 100644
--- a/cogl/cogl-bitmap-conversion.c
+++ b/cogl/cogl-bitmap-conversion.c
@@ -123,15 +123,7 @@ _cogl_premult_alpha_first (uint8_t *dst)
 
 #undef MULT
 
-/* Use the SSE optimized version to premult four pixels at once when
-   it is available. The same assembler code works for x86 and x86-64
-   because it doesn't refer to any non-SSE registers directly */
-#if defined(__SSE2__) && defined(__GNUC__) \
-  && (defined(__x86_64) || defined(__i386))
-#define COGL_USE_PREMULT_SSE2
-#endif
-
-#ifdef COGL_USE_PREMULT_SSE2
+#ifdef COGL_USE_INLINE_SSE2
 
 inline static void
 _cogl_premult_alpha_last_four_pixels_sse2 (uint8_t *p)
@@ -206,13 +198,13 @@ _cogl_premult_alpha_last_four_pixels_sse2 (uint8_t *p)
        : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5");
 }
 
-#endif /* COGL_USE_PREMULT_SSE2 */
+#endif /* COGL_USE_INLINE_SSE2 */
 
 static void
 _cogl_bitmap_premult_unpacked_span_8 (uint8_t *data,
                                       int width)
 {
-#ifdef COGL_USE_PREMULT_SSE2
+#ifdef COGL_USE_INLINE_SSE2
 
   /* Process 4 pixels at a time */
   while (width >= 4)
@@ -225,7 +217,7 @@ _cogl_bitmap_premult_unpacked_span_8 (uint8_t *data,
   /* If there are any pixels left we will fall through and
      handle them below */
 
-#endif /* COGL_USE_PREMULT_SSE2 */
+#endif /* COGL_USE_INLINE_SSE2 */
 
   while (width-- > 0)
     {
diff --git a/cogl/cogl-matrix.c b/cogl/cogl-matrix.c
index 278750a..96cf8b5 100644
--- a/cogl/cogl-matrix.c
+++ b/cogl/cogl-matrix.c
@@ -2002,6 +2002,67 @@ _cogl_matrix_transform_points_f2 (const CoglMatrix *matrix,
                                   void *points_out,
                                   int n_points)
 {
+#ifdef COGL_USE_INLINE_SSE2
+
+  /* The 'volatile' here is needed because otherwise GCC thinks that
+     none of the outputs are actually used so it will just no-op the
+     whole thing */
+  asm volatile
+    /* Load the matrix into SSE registers xmm4-xmm6 */
+    ("movdqu (%3), %%xmm4\n"
+     "movdqu 16(%3), %%xmm5\n"
+     "movdqu 48(%3), %%xmm6\n"
+     /* Jump straight to the loop condition */
+     "jmp 0f\n"
+     /* loop... */
+     "1:\n"
+     /* Get next x coordinate into xmm0 */
+     "movss (%1), %%xmm0\n"
+     /* Get y coordinate into xmm1 */
+     "movss 4(%1), %%xmm1\n"
+     /* Expand x coordinate to all four floats in xmm0 */
+     "shufps $0, %%xmm0, %%xmm0\n"
+     /* Expand y coordinate to all four floats in xmm1 */
+     "shufps $0, %%xmm1, %%xmm1\n"
+     /* Multiply the x coordinate by the first column of the matrix */
+     "mulps %%xmm4, %%xmm0\n"
+     /* Multiply the y coordinate by the second column of the matrix */
+     "mulps %%xmm5, %%xmm1\n"
+     /* Add in the fourth column of the matrix */
+     "addps %%xmm6, %%xmm0\n"
+     /* and the results of multiplying the y coordinate */
+     "addps %%xmm1, %%xmm0\n"
+     /* Write the three resulting parts. We need to do this as three
+        separate writes because points_out will only be 32-bit
+        aligned and we don't want to overwrite the space after the
+        third float */
+     /* store first float */
+     "movss %%xmm0, (%2)\n"
+     /* copy second float */
+     "shufps $0xe5, %%xmm0, %%xmm0\n"
+     "movss %%xmm0, 4(%2)\n"
+     /* copy third float */
+     "shufps $0xe6, %%xmm0, %%xmm0\n"
+     "movss %%xmm0, 8(%2)\n"
+     "add %4, %1\n" /* add stride_in to points_in */
+     "add %5, %2\n" /* add stride_out to points_out */
+     /* loop condition */
+     "0:\n"
+     "dec %0\n" /* decrement n_points */
+     "jns 1b\n" /* continue if not negative */
+     : /* these aren't really outputs but GCC doesn't have any other
+          way to specify that we're clobbering them apart from
+          marking them as in-out */
+       "+r" (n_points), /* 0 */
+       "+r" (points_in), /* 1 */
+       "+r" (points_out) /* 2 */
+     : "r" (matrix), /* 3 */
+       "rin" (stride_in), /* 4 */
+       "rin" (stride_out) /* 5 */
+     : "xmm0", "xmm1", "xmm4", "xmm5", "xmm6");
+
+#else /* COGL_USE_INLINE_SSE2 */
+
   int i;
 
   for (i = 0; i < n_points; i++)
@@ -2013,6 +2074,8 @@ _cogl_matrix_transform_points_f2 (const CoglMatrix *matrix,
       o->y = matrix->yx * p.x + matrix->yy * p.y + matrix->yw;
       o->z = matrix->zx * p.x + matrix->zy * p.y + matrix->zw;
     }
+
+#endif /* COGL_USE_INLINE_SSE2 */
 }
 
 static void
@@ -2045,6 +2108,76 @@ _cogl_matrix_transform_points_f3 (const CoglMatrix *matrix,
                                   void *points_out,
                                   int n_points)
 {
+#ifdef COGL_USE_INLINE_SSE2
+
+  /* The 'volatile' here is needed because otherwise GCC thinks that
+     none of the outputs are actually used so it will just no-op the
+     whole thing */
+  asm volatile
+    /* Load the matrix into SSE registers xmm4-xmm7 */
+    ("movdqu (%3), %%xmm4\n"
+     "movdqu 16(%3), %%xmm5\n"
+     "movdqu 32(%3), %%xmm6\n"
+     "movdqu 48(%3), %%xmm7\n"
+     /* Jump straight to the loop condition */
+     "jmp 0f\n"
+     /* loop... */
+     "1:\n"
+     /* Get next x coordinate into xmm0 */
+     "movss (%1), %%xmm0\n"
+     /* Get y coordinate into xmm1 */
+     "movss 4(%1), %%xmm1\n"
+     /* Get z coordinate into xmm2 */
+     "movss 8(%1), %%xmm2\n"
+     /* Expand x coordinate to all four floats in xmm0 */
+     "shufps $0, %%xmm0, %%xmm0\n"
+     /* Expand y coordinate to all four floats in xmm1 */
+     "shufps $0, %%xmm1, %%xmm1\n"
+     /* Expand z coordinate to all four floats in xmm2 */
+     "shufps $0, %%xmm2, %%xmm2\n"
+     /* Multiply the x coordinate by the first column of the matrix */
+     "mulps %%xmm4, %%xmm0\n"
+     /* Multiply the y coordinate by the second column of the matrix */
+     "mulps %%xmm5, %%xmm1\n"
+     /* Multiply the z coordinate by the third column of the matrix */
+     "mulps %%xmm6, %%xmm2\n"
+     /* Add in the fourth column of the matrix */
+     "addps %%xmm7, %%xmm0\n"
+     /* and the results of multiplying the y coordinate */
+     "addps %%xmm1, %%xmm0\n"
+     /* and the results of multiplying the z coordinate */
+     "addps %%xmm2, %%xmm0\n"
+     /* Write the three resulting parts. We need to do this as three
+        separate writes because points_out will only be 32-bit
+        aligned and we don't want to overwrite the space after the
+        third float */
+     /* store first float */
+     "movss %%xmm0, (%2)\n"
+     /* copy second float */
+     "shufps $0xe5, %%xmm0, %%xmm0\n"
+     "movss %%xmm0, 4(%2)\n"
+     /* copy third float */
+     "shufps $0xe6, %%xmm0, %%xmm0\n"
+     "movss %%xmm0, 8(%2)\n"
+     "add %4, %1\n" /* add stride_in to points_in */
+     "add %5, %2\n" /* add stride_out to points_out */
+     /* loop condition */
+     "0:\n"
+     "dec %0\n" /* decrement n_points */
+     "jns 1b\n" /* continue if not negative */
+     : /* these aren't really outputs but GCC doesn't have any other
+          way to specify that we're clobbering them apart from
+          marking them as in-out */
+       "+r" (n_points), /* 0 */
+       "+r" (points_in), /* 1 */
+       "+r" (points_out) /* 2 */
+     : "r" (matrix), /* 3 */
+       "rin" (stride_in), /* 4 */
+       "rin" (stride_out) /* 5 */
+     : "xmm0", "xmm1", "xmm2", "xmm4", "xmm5", "xmm6", "xmm7");
+
+#else /* COGL_USE_INLINE_SSE2 */
+
   int i;
 
   for (i = 0; i < n_points; i++)
@@ -2059,6 +2192,8 @@ _cogl_matrix_transform_points_f3 (const CoglMatrix *matrix,
       o->z = matrix->zx * p.x + matrix->zy * p.y +
              matrix->zz * p.z + matrix->zw;
     }
+
+#endif /* COGL_USE_INLINE_SSE2 */
 }
 
 static void
@@ -2069,6 +2204,67 @@ _cogl_matrix_project_points_f3 (const CoglMatrix *matrix,
                                 void *points_out,
                                 int n_points)
 {
+#ifdef COGL_USE_INLINE_SSE2
+
+  /* The 'volatile' here is needed because otherwise GCC thinks that
+     none of the outputs are actually used so it will just no-op the
+     whole thing */
+  asm volatile
+    /* Load the matrix into SSE registers xmm4-xmm7 */
+    ("movdqu (%3), %%xmm4\n"
+     "movdqu 16(%3), %%xmm5\n"
+     "movdqu 32(%3), %%xmm6\n"
+     "movdqu 48(%3), %%xmm7\n"
+     /* Jump straight to the loop condition */
+     "jmp 0f\n"
+     /* loop... */
+     "1:\n"
+     /* Get next x coordinate into xmm0 */
+     "movss (%1), %%xmm0\n"
+     /* Get y coordinate into xmm1 */
+     "movss 4(%1), %%xmm1\n"
+     /* Get z coordinate into xmm2 */
+     "movss 8(%1), %%xmm2\n"
+     /* Expand x coordinate to all four floats in xmm0 */
+     "shufps $0, %%xmm0, %%xmm0\n"
+     /* Expand y coordinate to all four floats in xmm1 */
+     "shufps $0, %%xmm1, %%xmm1\n"
+     /* Expand z coordinate to all four floats in xmm2 */
+     "shufps $0, %%xmm2, %%xmm2\n"
+     /* Multiply the x coordinate by the first column of the matrix */
+     "mulps %%xmm4, %%xmm0\n"
+     /* Multiply the y coordinate by the second column of the matrix */
+     "mulps %%xmm5, %%xmm1\n"
+     /* Multiply the z coordinate by the third column of the matrix */
+     "mulps %%xmm6, %%xmm2\n"
+     /* Add in the fourth column of the matrix */
+     "addps %%xmm7, %%xmm0\n"
+     /* and the results of multiplying the y coordinate */
+     "addps %%xmm1, %%xmm0\n"
+     /* and the results of multiplying the z coordinate */
+     "addps %%xmm2, %%xmm0\n"
+     /* Write the four resulting parts. We can do this as one
+      * unaligned write */
+     "movdqu %%xmm0, (%2)\n"
+     "add %4, %1\n" /* add stride_in to points_in */
+     "add %5, %2\n" /* add stride_out to points_out */
+     /* loop condition */
+     "0:\n"
+     "dec %0\n" /* decrement n_points */
+     "jns 1b\n" /* continue if not negative */
+     : /* these aren't really outputs but GCC doesn't have any other
+          way to specify that we're clobbering them apart from
+          marking them as in-out */
+       "+r" (n_points), /* 0 */
+       "+r" (points_in), /* 1 */
+       "+r" (points_out) /* 2 */
+     : "r" (matrix), /* 3 */
+       "rin" (stride_in), /* 4 */
+       "rin" (stride_out) /* 5 */
+     : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+
+#else /* COGL_USE_INLINE_SSE2 */
+
   int i;
 
   for (i = 0; i < n_points; i++)
@@ -2085,6 +2281,8 @@ _cogl_matrix_project_points_f3 (const CoglMatrix *matrix,
       o->w = matrix->wx * p.x + matrix->wy * p.y +
              matrix->wz * p.z + matrix->ww;
     }
+
+#endif /* COGL_USE_INLINE_SSE2 */
 }
 
 static void
@@ -2095,6 +2293,73 @@ _cogl_matrix_project_points_f4 (const CoglMatrix *matrix,
                                 void *points_out,
                                 int n_points)
 {
+#ifdef COGL_USE_INLINE_SSE2
+
+  /* The 'volatile' here is needed because otherwise GCC thinks that
+     none of the outputs are actually used so it will just no-op the
+     whole thing */
+  asm volatile
+    /* Load the matrix into SSE registers xmm4-xmm7 */
+    ("movdqu (%3), %%xmm4\n"
+     "movdqu 16(%3), %%xmm5\n"
+     "movdqu 32(%3), %%xmm6\n"
+     "movdqu 48(%3), %%xmm7\n"
+     /* Jump straight to the loop condition */
+     "jmp 0f\n"
+     /* loop... */
+     "1:\n"
+     /* Get next x coordinate into xmm0 */
+     "movss (%1), %%xmm0\n"
+     /* Get y coordinate into xmm1 */
+     "movss 4(%1), %%xmm1\n"
+     /* Get z coordinate into xmm2 */
+     "movss 8(%1), %%xmm2\n"
+     /* Get w coordinate into xmm3 */
+     "movss 12(%1), %%xmm3\n"
+     /* Expand x coordinate to all four floats in xmm0 */
+     "shufps $0, %%xmm0, %%xmm0\n"
+     /* Expand y coordinate to all four floats in xmm1 */
+     "shufps $0, %%xmm1, %%xmm1\n"
+     /* Expand z coordinate to all four floats in xmm2 */
+     "shufps $0, %%xmm2, %%xmm2\n"
+     /* Expand w coordinate to all four floats in xmm3 */
+     "shufps $0, %%xmm3, %%xmm3\n"
+     /* Multiply the x coordinate by the first column of the matrix */
+     "mulps %%xmm4, %%xmm0\n"
+     /* Multiply the y coordinate by the second column of the matrix */
+     "mulps %%xmm5, %%xmm1\n"
+     /* Multiply the z coordinate by the third column of the matrix */
+     "mulps %%xmm6, %%xmm2\n"
+     /* Multiply the w coordinate by the fourth column of the matrix */
+     "mulps %%xmm7, %%xmm3\n"
+     /* Add in the results of multiplying the y coordinate */
+     "addps %%xmm1, %%xmm0\n"
+     /* and the results of multiplying the z coordinate */
+     "addps %%xmm2, %%xmm0\n"
+     /* and the results of multiplying the w coordinate */
+     "addps %%xmm3, %%xmm0\n"
+     /* Write the four resulting parts. We can do this as one
+      * unaligned write */
+     "movdqu %%xmm0, (%2)\n"
+     "add %4, %1\n" /* add stride_in to points_in */
+     "add %5, %2\n" /* add stride_out to points_out */
+     /* loop condition */
+     "0:\n"
+     "dec %0\n" /* decrement n_points */
+     "jns 1b\n" /* continue if not negative */
+     : /* these aren't really outputs but GCC doesn't have any other
+          way to specify that we're clobbering them apart from
+          marking them as in-out */
+       "+r" (n_points), /* 0 */
+       "+r" (points_in), /* 1 */
+       "+r" (points_out) /* 2 */
+     : "r" (matrix), /* 3 */
+       "rin" (stride_in), /* 4 */
+       "rin" (stride_out) /* 5 */
+     : "xmm0", "xmm1", "xmm2", "xmm3", "xmm4", "xmm5", "xmm6", "xmm7");
+
+#else /* COGL_USE_INLINE_SSE2 */
+
   int i;
 
   for (i = 0; i < n_points; i++)
@@ -2111,6 +2376,8 @@ _cogl_matrix_project_points_f4 (const CoglMatrix *matrix,
       o->w = matrix->wx * p.x + matrix->wy * p.y +
              matrix->wz * p.z + matrix->ww * p.w;
     }
+
+#endif /* COGL_USE_INLINE_SSE2 */
 }
 
 void
diff --git a/cogl/cogl-util.h b/cogl/cogl-util.h
index 9f6bfff..5e1b815 100644
--- a/cogl/cogl-util.h
+++ b/cogl/cogl-util.h
@@ -313,4 +313,13 @@ _cogl_util_scissor_intersect (int rect_x0,
   *scissor_y1 = MIN (*scissor_y1, rect_y1);
 }
 
+/* Common macro used to decide whether to build with SSE optimised
+   inline assembler instructions. The same assembler code works for
+   x86 and x86-64 because it doesn't refer to any non-SSE registers
+   directly */
+#if defined(__SSE2__) && defined(__GNUC__) \
+  && (defined(__x86_64) || defined(__i386))
+#define COGL_USE_INLINE_SSE2
+#endif
+
 #endif /* __COGL_UTIL_H */
-- 
1.7.11.3.g3c3efa5


