[Pixman] [PATCH] Add fast paths for separable convolution

Tue Nov 27 00:42:10 PST 2012

From: Søren Sandmann Pedersen <ssp at redhat.com>

Similar to the fetchers for affine bilinear and nearest, add some fast
paths for the separable convolution filter for all combinations of
formats x8r8g8b8, a8r8g8b8, r5g6b5, a8 with the four repeat modes.

On a 1.2GHz AMD Phenom the time to scale down a 3200x1200 image by
9.22 went from 0.2238s to 0.1875s, an improvement of about 16%. For
comparison GdkPixbuf does the same thing in about 0.07s, so there is
still a lot of room for improvement.
---
 pixman/pixman-bits-image.c |  183 +++++++++++++++++++++++++++++++++++++++++++-
 pixman/pixman-image.c      |    3 +
 pixman/pixman-private.h    |    1 +
 3 files changed, 184 insertions(+), 3 deletions(-)

diff --git a/pixman/pixman-bits-image.c b/pixman/pixman-bits-image.c
index e76b78c..86d80c3 100644
--- a/pixman/pixman-bits-image.c
+++ b/pixman/pixman-bits-image.c
@@ -720,11 +720,155 @@ bits_image_fetch_general (pixman_iter_t  *iter,
     return buffer;
 }
 
-static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
-
 typedef uint32_t (* convert_pixel_t) (const uint8_t *row, int x);
 
 static force_inline void
+bits_image_fetch_separable_convolution_affine (pixman_image_t * image,
+					       int              offset,
+					       int              line,
+					       int              width,
+					       uint32_t *       buffer,
+					       const uint32_t * mask,
+
+					       convert_pixel_t	convert_pixel,
+					       pixman_format_code_t	format,
+					       pixman_repeat_t	repeat_mode)
+{
+    bits_image_t *bits = &image->bits;
+    pixman_fixed_t *params = image->common.filter_params;
+    int cwidth = pixman_fixed_to_int (params[0]);
+    int cheight = pixman_fixed_to_int (params[1]);
+    int x_off = ((cwidth << 16) - pixman_fixed_1) >> 1;
+    int y_off = ((cheight << 16) - pixman_fixed_1) >> 1;
+    int x_phase_bits = pixman_fixed_to_int (params[2]);
+    int y_phase_bits = pixman_fixed_to_int (params[3]);
+    int x_phase_shift = 16 - x_phase_bits;
+    int y_phase_shift = 16 - y_phase_bits;
+    pixman_fixed_t vx, vy;
+    pixman_fixed_t ux, uy;
+    pixman_vector_t v;
+    int k;
+
+    /* reference point is the center of the pixel */
+    v.vector[0] = pixman_int_to_fixed (offset) + pixman_fixed_1 / 2;
+    v.vector[1] = pixman_int_to_fixed (line) + pixman_fixed_1 / 2;
+    v.vector[2] = pixman_fixed_1;
+
+    if (!pixman_transform_point_3d (image->common.transform, &v))
+	return;
+
+    ux = image->common.transform->matrix[0][0];
+    uy = image->common.transform->matrix[1][0];
+
+    vx = v.vector[0];
+    vy = v.vector[1];
+
+    for (k = 0; k < width; ++k)
+    {
+	pixman_fixed_t *y_params;
+	int satot, srtot, sgtot, sbtot;
+	pixman_fixed_t x, y;
+	int32_t x1, x2, y1, y2;
+	int32_t px, py;
+	int i, j;
+
+	if (mask && !mask[k])
+	    goto next;
+
+	/* Round x and y to the middle of the closest phase before continuing. This
+	 * ensures that the convolution matrix is aligned right, since it was
+	 * positioned relative to a particular phase (and not relative to whatever
+	 * exact fraction we happen to get here).
+	 */
+	x = ((vx >> x_phase_shift) << x_phase_shift) + ((1 << x_phase_shift) >> 1);
+	y = ((vy >> y_phase_shift) << y_phase_shift) + ((1 << y_phase_shift) >> 1);
+
+	px = (x & 0xffff) >> x_phase_shift;
+	py = (y & 0xffff) >> y_phase_shift;
+
+	x1 = pixman_fixed_to_int (x - pixman_fixed_e - x_off);
+	y1 = pixman_fixed_to_int (y - pixman_fixed_e - y_off);
+	x2 = x1 + cwidth;
+	y2 = y1 + cheight;
+
+	satot = srtot = sgtot = sbtot = 0;
+
+	y_params = params + 4 + (1 << x_phase_bits) * cwidth + py * cheight;
+
+	for (i = y1; i < y2; ++i)
+	{
+	    pixman_fixed_t fy = *y_params++;
+
+	    if (fy)
+	    {
+		pixman_fixed_t *x_params = params + 4 + px * cwidth;
+
+		for (j = x1; j < x2; ++j)
+		{
+		    pixman_fixed_t fx = *x_params++;
+		    int rx = j;
+		    int ry = i;
+		    
+		    if (fx)
+		    {
+			pixman_fixed_t f;
+			uint32_t pixel, mask;
+			uint8_t *row;
+
+			mask = PIXMAN_FORMAT_A (format)? 0 : 0xff000000;
+
+			if (repeat_mode != PIXMAN_REPEAT_NONE)
+			{
+			    repeat (repeat_mode, &rx, bits->width);
+			    repeat (repeat_mode, &ry, bits->height);
+
+			    row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+			    pixel = convert_pixel (row, rx) | mask;
+			}
+			else
+			{
+			    if (rx < 0 || ry < 0 || rx >= bits->width || ry >= bits->height)
+			    {
+				pixel = 0;
+			    }
+			    else
+			    {
+				row = (uint8_t *)bits->bits + bits->rowstride * 4 * ry;
+				pixel = convert_pixel (row, rx) | mask;
+			    }
+			}
+
+			f = ((pixman_fixed_32_32_t)fx * fy + 0x8000) >> 16;
+			srtot += (int)RED_8 (pixel) * f;
+			sgtot += (int)GREEN_8 (pixel) * f;
+			sbtot += (int)BLUE_8 (pixel) * f;
+			satot += (int)ALPHA_8 (pixel) * f;
+		    }
+		}
+	    }
+	}
+
+	satot = (satot + 0x8000) >> 16;
+	srtot = (srtot + 0x8000) >> 16;
+	sgtot = (sgtot + 0x8000) >> 16;
+	sbtot = (sbtot + 0x8000) >> 16;
+
+	satot = CLIP (satot, 0, 0xff);
+	srtot = CLIP (srtot, 0, 0xff);
+	sgtot = CLIP (sgtot, 0, 0xff);
+	sbtot = CLIP (sbtot, 0, 0xff);
+
+	buffer[k] = (satot << 24) | (srtot << 16) | (sgtot << 8) | (sbtot << 0);
+
+    next:
+	vx += ux;
+	vy += uy;
+    }
+}
+
+static const uint8_t zero[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };
+
+static force_inline void
 bits_image_fetch_bilinear_affine (pixman_image_t * image,
 				  int              offset,
 				  int              line,
@@ -973,6 +1117,23 @@ convert_r5g6b5 (const uint8_t *row, int x)
     return CONVERT_0565_TO_0888 (*((uint16_t *)row + x));
 }
 
+#define MAKE_SEPARABLE_CONVOLUTION_FETCHER(name, format, repeat_mode)  \
+    static uint32_t *							\
+    bits_image_fetch_separable_convolution_affine_ ## name (pixman_iter_t   *iter, \
+							    const uint32_t * mask) \
+    {									\
+	bits_image_fetch_separable_convolution_affine (                 \
+	    iter->image,                                                \
+	    iter->x, iter->y++,                                         \
+	    iter->width,                                                \
+	    iter->buffer, mask,                                         \
+	    convert_ ## format,                                         \
+	    PIXMAN_ ## format,                                          \
+	    repeat_mode);                                               \
+									\
+	return iter->buffer;                                            \
+    }
+
 #define MAKE_BILINEAR_FETCHER(name, format, repeat_mode)		\
     static uint32_t *							\
     bits_image_fetch_bilinear_affine_ ## name (pixman_iter_t   *iter,	\
@@ -1005,7 +1166,8 @@ convert_r5g6b5 (const uint8_t *row, int x)
 
 #define MAKE_FETCHERS(name, format, repeat_mode)			\
     MAKE_NEAREST_FETCHER (name, format, repeat_mode)			\
-    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)
+    MAKE_BILINEAR_FETCHER (name, format, repeat_mode)			\
+    MAKE_SEPARABLE_CONVOLUTION_FETCHER (name, format, repeat_mode)
 
 MAKE_FETCHERS (pad_a8r8g8b8,     a8r8g8b8, PIXMAN_REPEAT_PAD)
 MAKE_FETCHERS (none_a8r8g8b8,    a8r8g8b8, PIXMAN_REPEAT_NONE)
@@ -1255,6 +1417,20 @@ static const fetcher_info_t fetcher_info[] =
      FAST_PATH_AFFINE_TRANSFORM		|				\
      FAST_PATH_NEAREST_FILTER)
 
+#define GENERAL_SEPARABLE_CONVOLUTION_FLAGS				\
+    (FAST_PATH_NO_ALPHA_MAP            |				\
+     FAST_PATH_NO_ACCESSORS            |				\
+     FAST_PATH_HAS_TRANSFORM           |				\
+     FAST_PATH_AFFINE_TRANSFORM        |				\
+     FAST_PATH_SEPARABLE_CONVOLUTION_FILTER)
+    
+#define SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)   \
+    { PIXMAN_ ## format,                                               \
+      GENERAL_SEPARABLE_CONVOLUTION_FLAGS | FAST_PATH_ ## repeat ## _REPEAT, \
+      bits_image_fetch_separable_convolution_affine_ ## name,          \
+      _pixman_image_get_scanline_generic_float			       \
+    },
+
 #define BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
     { PIXMAN_ ## format,						\
       GENERAL_BILINEAR_FLAGS | FAST_PATH_ ## repeat ## _REPEAT,		\
@@ -1270,6 +1446,7 @@ static const fetcher_info_t fetcher_info[] =
     },
 
 #define AFFINE_FAST_PATHS(name, format, repeat)				\
+    SEPARABLE_CONVOLUTION_AFFINE_FAST_PATH(name, format, repeat)	\
     BILINEAR_AFFINE_FAST_PATH(name, format, repeat)			\
     NEAREST_AFFINE_FAST_PATH(name, format, repeat)
     
diff --git a/pixman/pixman-image.c b/pixman/pixman-image.c
index 93ed17e..6f076d5 100644
--- a/pixman/pixman-image.c
+++ b/pixman/pixman-image.c
@@ -371,7 +371,10 @@ compute_image_info (pixman_image_t *image)
 	break;
 
     case PIXMAN_FILTER_CONVOLUTION:
+	break;
+
     case PIXMAN_FILTER_SEPARABLE_CONVOLUTION:
+	flags |= FAST_PATH_SEPARABLE_CONVOLUTION_FILTER;
 	break;
 
     default:
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index c0a6bc0..99125a1 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -687,6 +687,7 @@ _pixman_iter_get_scanline_noop (pixman_iter_t *iter, const uint32_t *mask);
 #define FAST_PATH_SAMPLES_COVER_CLIP_NEAREST	(1 << 23)
 #define FAST_PATH_SAMPLES_COVER_CLIP_BILINEAR	(1 << 24)
 #define FAST_PATH_BITS_IMAGE			(1 << 25)
+#define FAST_PATH_SEPARABLE_CONVOLUTION_FILTER  (1 << 26)
 
 #define FAST_PATH_PAD_REPEAT						\
     (FAST_PATH_NO_NONE_REPEAT		|				\
-- 
1.7.4