pixman: Branch 'master' - 7 commits

Siarhei Siamashka siamashka at kemper.freedesktop.org
Sun Jan 27 11:04:34 PST 2013


 configure.ac              |   16 +
 pixman/pixman-fast-path.c |  222 +++++++++++++++++++++----
 pixman/pixman-matrix.c    |  408 ++++++++++++++++++++++++++++++++++++++++------
 pixman/pixman-private.h   |   21 ++
 test/Makefile.sources     |    1 
 test/affine-test.c        |    6 
 test/matrix-test.c        |  186 ++++++++++++++++++++
 7 files changed, 771 insertions(+), 89 deletions(-)

New commits:
commit ed39992564beefe6b12f81e842caba11aff98a9c
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Sat Dec 15 07:18:53 2012 +0200

    Use pixman_transform_point_31_16() from pixman_transform_point()
    
    The old functions pixman_transform_point() and pixman_transform_point_3d()
    are now just wrappers around pixman_transform_point_31_16() and
    pixman_transform_point_31_16_3d(). Eventually their uses should be
    completely eliminated from the pixman code and replaced with their
    extended range counterparts. This is needed in order to correctly
    handle any matrices and parameters that may come to pixman from the
    code responsible for the XRender implementation.

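The overflow handling in these wrappers comes down to a narrow-and-compare
idiom: the 48.16 result is truncated back into the 16.16 vector, and success
is reported only if the truncation did not change the value. A minimal
standalone sketch of that idiom (the names are illustrative, not pixman API;
out-of-range narrowing is implementation-defined in C, the same assumption
the wrappers themselves make):

    #include <stdint.h>
    #include <stdio.h>

    /* Sketch only: narrow a 48.16 value to 16.16 and report whether it fit. */
    static int
    narrow_48_16_to_16_16 (int64_t wide, int32_t *out)
    {
        *out = (int32_t) wide;   /* truncating assignment, as in the wrappers */
        return *out == wide;     /* nonzero only if the value survived intact */
    }

    int
    main (void)
    {
        int32_t v;
        printf ("%d\n", narrow_48_16_to_16_16 ((int64_t) 1 << 20, &v)); /* 1: fits      */
        printf ("%d\n", narrow_48_16_to_16_16 ((int64_t) 1 << 40, &v)); /* 0: overflows */
        return 0;
    }
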
diff --git a/pixman/pixman-matrix.c b/pixman/pixman-matrix.c
index cbc6fbe..89b9682 100644
--- a/pixman/pixman-matrix.c
+++ b/pixman/pixman-matrix.c
@@ -382,69 +382,41 @@ PIXMAN_EXPORT pixman_bool_t
 pixman_transform_point_3d (const struct pixman_transform *transform,
                            struct pixman_vector *         vector)
 {
-    struct pixman_vector result;
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_48_16_t v;
-    int i, j;
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
 
-    for (j = 0; j < 3; j++)
-    {
-	v = 0;
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_48_16_t) transform->matrix[j][i] *
-	               (pixman_fixed_48_16_t) vector->vector[i]);
-	    v += (partial + 0x8000) >> 16;
-	}
-	
-	if (v > pixman_max_fixed_48_16 || v < pixman_min_fixed_48_16)
-	    return FALSE;
-	
-	result.vector[j] = (pixman_fixed_t) v;
-    }
-    
-    *vector = result;
+    pixman_transform_point_31_16_3d (transform, &tmp, &tmp);
 
-    if (!result.vector[2])
-	return FALSE;
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
 
-    return TRUE;
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
 }
 
 PIXMAN_EXPORT pixman_bool_t
 pixman_transform_point (const struct pixman_transform *transform,
                         struct pixman_vector *         vector)
 {
-    pixman_fixed_32_32_t partial;
-    pixman_fixed_34_30_t v[3];
-    pixman_fixed_48_16_t quo;
-    int i, j;
+    pixman_vector_48_16_t tmp;
+    tmp.v[0] = vector->vector[0];
+    tmp.v[1] = vector->vector[1];
+    tmp.v[2] = vector->vector[2];
 
-    for (j = 0; j < 3; j++)
-    {
-	v[j] = 0;
-	
-	for (i = 0; i < 3; i++)
-	{
-	    partial = ((pixman_fixed_32_32_t) transform->matrix[j][i] *
-	               (pixman_fixed_32_32_t) vector->vector[i]);
-	    v[j] += (partial + 2) >> 2;
-	}
-    }
-    
-    if (!((v[2] + 0x8000) >> 16))
-	return FALSE;
+    if (!pixman_transform_point_31_16 (transform, &tmp, &tmp))
+        return FALSE;
 
-    for (j = 0; j < 2; j++)
-    {
-	quo = v[j] / ((v[2] + 0x8000) >> 16);
-	if (quo > pixman_max_fixed_48_16 || quo < pixman_min_fixed_48_16)
-	    return FALSE;
-	vector->vector[j] = (pixman_fixed_t) quo;
-    }
-    
-    vector->vector[2] = pixman_fixed_1;
-    return TRUE;
+    vector->vector[0] = tmp.v[0];
+    vector->vector[1] = tmp.v[1];
+    vector->vector[2] = tmp.v[2];
+
+    return vector->vector[0] == tmp.v[0] &&
+           vector->vector[1] == tmp.v[1] &&
+           vector->vector[2] == tmp.v[2];
 }
 
 PIXMAN_EXPORT pixman_bool_t
diff --git a/test/affine-test.c b/test/affine-test.c
index 678fbe8..2506250 100644
--- a/test/affine-test.c
+++ b/test/affine-test.c
@@ -307,11 +307,11 @@ test_composite (int      testnum,
 }
 
 #if BILINEAR_INTERPOLATION_BITS == 8
-#define CHECKSUM 0x97097336
+#define CHECKSUM 0x2CDF1F07
 #elif BILINEAR_INTERPOLATION_BITS == 7
-#define CHECKSUM 0x31D2DC21
+#define CHECKSUM 0xBC00B1DF
 #elif BILINEAR_INTERPOLATION_BITS == 4
-#define CHECKSUM 0x8B925154
+#define CHECKSUM 0xA227306B
 #else
 #define CHECKSUM 0x00000000
 #endif
commit 5a78d74ccccba2aeb473f04ade44512d2f6c0613
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Sat Dec 15 06:19:21 2012 +0200

    test: Added matrix-test for testing projective transform accuracy
    
    This test uses the __float128 data type, when it is available,
    to implement a "perfect" reference implementation. The output
    from pixman_transform_point_31_16() and
    pixman_transform_point_31_16_affine() is compared with the
    reference implementation to make sure that rounding errors
    show up in no more than a single least significant bit.
    
    Platforms and compilers that do not support the __float128
    data type can rely on a CRC32 checksum of the pseudorandom
    transform results.

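The accuracy criterion amounts to converting both the fixed point result and
the reference value to a common floating point type and requiring the
difference to stay within roughly one least significant bit of the 16-bit
fraction, i.e. 1/65536. A distilled sketch of that check (the 0.51 and 0.71
factors mirror the tolerances used by matrix-test below; the helper names are
illustrative):

    #include <math.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FIXED_48_16_TO_DOUBLE(x) ((double) (x) / 65536.0)

    /* One LSB of a *.16 fixed point value is 1/65536: allow slightly more
     * than half of it for the exactly rounded affine case and about 0.7 of
     * it for the projective case, as the test does. */
    static int
    within_tolerance (int64_t fixed_result, double reference, int is_affine)
    {
        double diff  = fabs (reference - FIXED_48_16_TO_DOUBLE (fixed_result));
        double limit = (is_affine ? 0.51 : 0.71) / 65536.0;
        return diff <= limit;
    }

    int
    main (void)
    {
        /* 1.5 as a 48.16 fixed point value against an exact reference */
        printf ("%d\n", within_tolerance ((int64_t) 3 << 15, 1.5, 1)); /* 1 */
        printf ("%d\n", within_tolerance ((int64_t) 3 << 15, 1.6, 1)); /* 0 */
        return 0;
    }
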
diff --git a/test/Makefile.sources b/test/Makefile.sources
index 8c0b505..e323a8e 100644
--- a/test/Makefile.sources
+++ b/test/Makefile.sources
@@ -17,6 +17,7 @@ TESTPROGRAMS =			\
 	gradient-crash-test	\
 	region-contains-test	\
 	alphamap		\
+	matrix-test		\
 	stress-test		\
 	composite-traps-test	\
 	blitters-test		\
diff --git a/test/matrix-test.c b/test/matrix-test.c
new file mode 100644
index 0000000..8437dd2
--- /dev/null
+++ b/test/matrix-test.c
@@ -0,0 +1,186 @@
+/*
+ * Copyright © 2012 Siarhei Siamashka <siarhei.siamashka at gmail.com>
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
+ * DEALINGS IN THE SOFTWARE.
+ */
+
+#include "utils.h"
+#include <assert.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <math.h>
+
+#ifdef HAVE_FLOAT128
+
+#define pixman_fixed_to_float128(x) (((__float128)(x)) / 65536.0Q)
+
+typedef struct { __float128 v[3]; } pixman_vector_f128_t;
+typedef struct { __float128 m[3][3]; } pixman_transform_f128_t;
+
+pixman_bool_t
+pixman_transform_point_f128 (const pixman_transform_f128_t *t,
+                             const pixman_vector_f128_t    *v,
+                             pixman_vector_f128_t          *result)
+{
+    int i;
+    for (i = 0; i < 3; i++)
+    {
+        result->v[i] = t->m[i][0] * v->v[0] +
+                       t->m[i][1] * v->v[1] +
+                       t->m[i][2] * v->v[2];
+    }
+    if (result->v[2] != 0)
+    {
+        result->v[0] /= result->v[2];
+        result->v[1] /= result->v[2];
+        result->v[2] = 1;
+        return TRUE;
+    }
+    else
+    {
+        return FALSE;
+    }
+}
+
+pixman_bool_t does_it_fit_fixed_48_16 (__float128 x)
+{
+    if (x >= 65536.0Q * 65536.0Q * 32768.0Q)
+        return FALSE;
+    if (x <= -65536.0Q * 65536.0Q * 32768.0Q)
+        return FALSE;
+    return TRUE;
+}
+
+#endif
+
+uint32_t
+test_matrix (int testnum, int verbose)
+{
+    uint32_t crc32 = 0;
+    int i, j, k;
+    pixman_bool_t is_affine;
+
+    prng_srand (testnum);
+
+    for (i = 0; i < 100; i++)
+    {
+        pixman_bool_t           transform_ok;
+        pixman_transform_t      ti;
+        pixman_vector_48_16_t   vi, result_i;
+#ifdef HAVE_FLOAT128
+        pixman_transform_f128_t tf;
+        pixman_vector_f128_t    vf, result_f;
+#endif
+        prng_randmemset (&ti, sizeof(ti), 0);
+        prng_randmemset (&vi, sizeof(vi), 0);
+
+        for (j = 0; j < 3; j++)
+        {
+            /* make sure that "vi" contains 31.16 fixed point data */
+            vi.v[j] >>= 17;
+            /* and apply random shift */
+            if (prng_rand_n (3) == 0)
+                vi.v[j] >>= prng_rand_n (46);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* random shift for the matrix */
+            for (j = 0; j < 3; j++)
+                for (k = 0; k < 3; k++)
+                    ti.matrix[j][k] >>= prng_rand_n (30);
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* affine matrix */
+            ti.matrix[2][0] = 0;
+            ti.matrix[2][1] = 0;
+            ti.matrix[2][2] = pixman_fixed_1;
+        }
+
+        if (prng_rand_n (2))
+        {
+            /* cartesian coordinates */
+            vi.v[2] = pixman_fixed_1;
+        }
+
+        is_affine = (ti.matrix[2][0] == 0 && ti.matrix[2][1] == 0 &&
+                     ti.matrix[2][2] == pixman_fixed_1 &&
+                     vi.v[2] == pixman_fixed_1);
+
+        transform_ok = TRUE;
+        if (is_affine && prng_rand_n (2))
+            pixman_transform_point_31_16_affine (&ti, &vi, &result_i);
+        else
+            transform_ok = pixman_transform_point_31_16 (&ti, &vi, &result_i);
+
+        crc32 = compute_crc32 (crc32, &result_i, sizeof(result_i));
+
+#ifdef HAVE_FLOAT128
+        /* compare with a reference 128-bit floating point implementation */
+        for (j = 0; j < 3; j++)
+        {
+            vf.v[j] = pixman_fixed_to_float128 (vi.v[j]);
+            for (k = 0; k < 3; k++)
+            {
+                tf.m[j][k] = pixman_fixed_to_float128 (ti.matrix[j][k]);
+            }
+        }
+
+        if (pixman_transform_point_f128 (&tf, &vf, &result_f))
+        {
+            if (transform_ok ||
+                (does_it_fit_fixed_48_16 (result_f.v[0]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[1]) &&
+                 does_it_fit_fixed_48_16 (result_f.v[2])))
+            {
+                for (j = 0; j < 3; j++)
+                {
+                    double diff = fabs (result_f.v[j] -
+                                        pixman_fixed_to_float128 (result_i.v[j]));
+
+                    if (is_affine && diff > (0.51 / 65536.0))
+                    {
+                        printf ("%d:%d: bad precision for affine (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                    else if (diff > (0.71 / 65536.0))
+                    {
+                        printf ("%d:%d: bad precision for projective (%.12f)\n",
+                               testnum, i, diff);
+                        abort ();
+                    }
+                }
+            }
+        }
+#endif
+    }
+    return crc32;
+}
+
+int
+main (int argc, const char *argv[])
+{
+    return fuzzer_test_main ("matrix", 20000,
+			     0xBEBF98C3,
+			     test_matrix, argc, argv);
+}
commit 09600ae7e34eb777471c931cd4c3a8cdbda6e84a
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Wed Dec 12 02:41:55 2012 +0200

    configure.ac: Added detection for __float128 support
    
    GCC supports a 128-bit floating point data type on some platforms
    (including but not limited to x86 and x86-64). This may be useful for
    tests that need perfectly accurate reference implementations of certain
    algorithms.

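For reference, a minimal sketch of how test code can sit behind the new
HAVE_FLOAT128 define; the fallback typedef and printing through a cast to
double are assumptions made for this example, not part of the pixman patch:

    #include <stdint.h>
    #include <stdio.h>

    #ifdef HAVE_FLOAT128
    typedef __float128 ref_float_t;               /* exact for 16.16 inputs   */
    #define FIXED_16_16_TO_REF(x) (((ref_float_t) (x)) / 65536.0Q)
    #else
    typedef double ref_float_t;                   /* lower-precision fallback */
    #define FIXED_16_16_TO_REF(x) (((ref_float_t) (x)) / 65536.0)
    #endif

    int
    main (void)
    {
        int32_t fixed = 0x18000;                  /* 1.5 in 16.16 fixed point */
        ref_float_t v = FIXED_16_16_TO_REF (fixed);

        /* printing __float128 directly needs libquadmath, so cast to double */
        printf ("%f\n", (double) v);
        return 0;
    }
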
diff --git a/configure.ac b/configure.ac
index 515e312..a93e290 100644
--- a/configure.ac
+++ b/configure.ac
@@ -968,6 +968,22 @@ fi
 AC_MSG_RESULT($support_for_attribute_constructor)
 AC_SUBST(TOOLCHAIN_SUPPORTS_ATTRIBUTE_CONSTRUCTOR)
 
+dnl =====================================
+dnl __float128
+
+support_for_float128=no
+
+AC_MSG_CHECKING(for __float128)
+AC_LINK_IFELSE([AC_LANG_SOURCE([[
+__float128 a = 1.0Q, b = 2.0Q; int main (void) { return a + b; }
+]])], support_for_float128=yes)
+
+if test x$support_for_float128 = xyes; then
+   AC_DEFINE([HAVE_FLOAT128], [], [Whether the tool chain supports __float128])
+fi
+
+AC_MSG_RESULT($support_for_float128)
+
 dnl ==================
 dnl libpng
 
commit c3deb8334a71998b986a7b8d5b74bedf26cc23aa
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Fri Dec 14 18:43:57 2012 +0200

    Add higher precision "pixman_transform_point_*" functions
    
    The following new functions are added:
    
    pixman_transform_point_31_16_3d() -
        Calculates the product of a matrix and a vector.
    
    pixman_transform_point_31_16() -
        Calculates the product of a matrix and a vector, then converts
        the resulting homogeneous vector [x, y, z] to its cartesian
        variant [x', y', 1], where x' = x / z and y' = y / z.
    
    pixman_transform_point_31_16_affine() -
        A faster sibling of the other two functions, which assumes an
        affine transformation, where the bottom row of the matrix is
        [0, 0, 1] and the last element of the input vector is set to 1.
    
    These functions transform a point with 31.16 fixed point coordinates from
    the destination space to a point with 48.16 fixed point coordinates in
    the source space.
    
    The results are accurate, and rounding errors may only show up in the
    least significant bit. No overflows are possible for affine
    transformations as long as the input data is provided in 31.16 format.
    In the case of projective transformations, some output values may not
    be representable in the 48.16 fixed point format. In that case the
    results are clamped to the maximum or minimum 48.16 values (so that
    the caller can at least handle NONE and PAD repeats correctly).

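The no-overflow claim for the affine case can be checked with a short bound:
a pixman_fixed_t matrix entry has magnitude of at most 2^31 and a 31.16
vector component stays below 2^46, so splitting each product into an
integer-part and a fractional-part multiplication keeps every intermediate
comfortably inside int64_t. A minimal sketch of that split (illustrative
names, not the new pixman functions themselves):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Multiply a 16.16 matrix entry by a 31.16 vector component using the
     * same hi/lo split as the patch: |m| <= 2^31 and |v| < 2^46, so each hi
     * term is at most 2^31 * 2^30 = 2^61 and each lo term is below 2^47;
     * even a sum of three of either still fits in int64_t. */
    static int64_t
    mul_16_16_by_31_16 (int32_t m, int64_t v)
    {
        int64_t hi = (int64_t) m * (v >> 16);      /* integer part of v    */
        int64_t lo = (int64_t) m * (v & 0xFFFF);   /* fractional part of v */

        /* recombine with rounding to nearest, giving a 48.16 result */
        return hi + ((lo + 0x8000) >> 16);
    }

    int
    main (void)
    {
        /* 2.0 (16.16) times 3.5 (31.16) should be exactly 7.0 */
        int64_t r = mul_16_16_by_31_16 (2 << 16, (int64_t) 7 << 15);
        assert (r == (int64_t) 7 << 16);
        printf ("%lld\n", (long long) r);
        return 0;
    }
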
diff --git a/pixman/pixman-matrix.c b/pixman/pixman-matrix.c
index cd2f1b5..cbc6fbe 100644
--- a/pixman/pixman-matrix.c
+++ b/pixman/pixman-matrix.c
@@ -34,6 +34,338 @@
 
 #define F(x)    pixman_int_to_fixed (x)
 
+static force_inline int
+count_leading_zeros (uint32_t x)
+{
+#ifdef __GNUC__
+    return __builtin_clz (x);
+#else
+    int n = 0;
+    while (x)
+    {
+        n++;
+        x >>= 1;
+    }
+    return 32 - n;
+#endif
+}
+
+/*
+ * Large signed/unsigned integer division with rounding for the platforms with
+ * only 64-bit integer data type supported (no 128-bit data type).
+ *
+ * Arguments:
+ *     hi, lo - high and low 64-bit parts of the dividend
+ *     div    - 48-bit divisor
+ *
+ * Returns: lowest 64 bits of the result as a return value and highest 64
+ *          bits of the result to "result_hi" pointer
+ */
+
+/* grade-school unsigned division (128-bit by 48-bit) with rounding to nearest */
+static force_inline uint64_t
+rounded_udiv_128_by_48 (uint64_t  hi,
+                        uint64_t  lo,
+                        uint64_t  div,
+                        uint64_t *result_hi)
+{
+    uint64_t tmp, remainder, result_lo;
+    assert(div < ((uint64_t)1 << 48));
+
+    remainder = hi % div;
+    *result_hi = hi / div;
+
+    tmp = (remainder << 16) + (lo >> 48);
+    result_lo = tmp / div;
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 32) & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + ((lo >> 16) & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    tmp = (remainder << 16) + (lo & 0xFFFF);
+    result_lo = (result_lo << 16) + (tmp / div);
+    remainder = tmp % div;
+
+    /* round to nearest */
+    if (remainder * 2 >= div && ++result_lo == 0)
+        *result_hi += 1;
+
+    return result_lo;
+}
+
+/* signed division (128-bit by 49-bit) with rounding to nearest */
+static inline int64_t
+rounded_sdiv_128_by_49 (int64_t   hi,
+                        uint64_t  lo,
+                        int64_t   div,
+                        int64_t  *signed_result_hi)
+{
+    uint64_t result_lo, result_hi;
+    int sign = 0;
+    if (div < 0)
+    {
+        div = -div;
+        sign ^= 1;
+    }
+    if (hi < 0)
+    {
+        if (lo != 0)
+            hi++;
+        hi = -hi;
+        lo = -lo;
+        sign ^= 1;
+    }
+    result_lo = rounded_udiv_128_by_48 (hi, lo, div, &result_hi);
+    if (sign)
+    {
+        if (result_lo != 0)
+            result_hi++;
+        result_hi = -result_hi;
+        result_lo = -result_lo;
+    }
+    if (signed_result_hi)
+    {
+        *signed_result_hi = result_hi;
+    }
+    return result_lo;
+}
+
+/*
+ * Multiply 64.16 fixed point value by (2^scalebits) and convert
+ * to 128-bit integer.
+ */
+static force_inline void
+fixed_64_16_to_int128 (int64_t  hi,
+                       int64_t  lo,
+                       int64_t *rhi,
+                       int64_t *rlo,
+                       int      scalebits)
+{
+    /* separate integer and fractional parts */
+    hi += lo >> 16;
+    lo &= 0xFFFF;
+
+    if (scalebits <= 0)
+    {
+        *rlo = hi >> (-scalebits);
+        *rhi = *rlo >> 63;
+    }
+    else
+    {
+        *rhi = hi >> (64 - scalebits);
+        *rlo = (uint64_t)hi << scalebits;
+        if (scalebits < 16)
+            *rlo += lo >> (16 - scalebits);
+        else
+            *rlo += lo << (scalebits - 16);
+    }
+}
+
+/*
+ * Convert 112.16 fixed point value to 48.16 with clamping for the out
+ * of range values.
+ */
+static force_inline pixman_fixed_48_16_t
+fixed_112_16_to_fixed_48_16 (int64_t hi, int64_t lo, pixman_bool_t *clampflag)
+{
+    if ((lo >> 63) != hi)
+    {
+        *clampflag = TRUE;
+        return hi >= 0 ? INT64_MAX : INT64_MIN;
+    }
+    else
+    {
+        return lo;
+    }
+}
+
+/*
+ * Transform a point with 31.16 fixed point coordinates from the destination
+ * space to a point with 48.16 fixed point coordinates in the source space.
+ * No overflows are possible for affine transformations and the results are
+ * accurate including the least significant bit. Projective transformations
+ * may overflow, in this case the results are just clamped to return maximum
+ * or minimum 48.16 values (so that the caller can at least handle the NONE
+ * and PAD repeats correctly) and the return value is FALSE to indicate that
+ * such clamping has happened.
+ */
+PIXMAN_EXPORT pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result)
+{
+    pixman_bool_t clampflag = FALSE;
+    int i;
+    int64_t tmp[3][2], divint;
+    uint16_t divfrac;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    /*
+     * separate 64-bit integer and 16-bit fractional parts for the divisor,
+     * which is also scaled by 65536 after fixed point multiplication.
+     */
+    divint  = tmp[2][0] + (tmp[2][1] >> 16);
+    divfrac = tmp[2][1] & 0xFFFF;
+
+    if (divint == pixman_fixed_1 && divfrac == 0)
+    {
+        /*
+         * this is a simple affine transformation
+         */
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+        result->v[2] = pixman_fixed_1;
+    }
+    else if (divint == 0 && divfrac == 0)
+    {
+        /*
+         * handle zero divisor (if the values are non-zero, set the
+         * results to maximum positive or minimum negative)
+         */
+        clampflag = TRUE;
+
+        result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+        result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+
+        if (result->v[0] > 0)
+            result->v[0] = INT64_MAX;
+        else if (result->v[0] < 0)
+            result->v[0] = INT64_MIN;
+
+        if (result->v[1] > 0)
+            result->v[1] = INT64_MAX;
+        else if (result->v[1] < 0)
+            result->v[1] = INT64_MIN;
+    }
+    else
+    {
+        /*
+         * projective transformation, analyze the top 32 bits of the divisor
+         */
+        int32_t hi32divbits = divint >> 32;
+        if (hi32divbits < 0)
+            hi32divbits = ~hi32divbits;
+
+        if (hi32divbits == 0)
+        {
+            /* the divisor is small, we can actually keep all the bits */
+            int64_t hi, rhi, lo, rlo;
+            int64_t div = (divint << 16) + divfrac;
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+        else
+        {
+            /* the divisor needs to be reduced to 48 bits */
+            int64_t hi, rhi, lo, rlo, div;
+            int shift = 32 - count_leading_zeros (hi32divbits);
+            fixed_64_16_to_int128 (divint, divfrac, &hi, &div, 16 - shift);
+
+            fixed_64_16_to_int128 (tmp[0][0], tmp[0][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[0] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+
+            fixed_64_16_to_int128 (tmp[1][0], tmp[1][1], &hi, &lo, 32 - shift);
+            rlo = rounded_sdiv_128_by_49 (hi, lo, div, &rhi);
+            result->v[1] = fixed_112_16_to_fixed_48_16 (rhi, rlo, &clampflag);
+        }
+    }
+    result->v[2] = pixman_fixed_1;
+    return !clampflag;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result)
+{
+    int64_t hi0, lo0, hi1, lo1;
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    hi0  = (int64_t)t->matrix[0][0] * (v->v[0] >> 16);
+    lo0  = (int64_t)t->matrix[0][0] * (v->v[0] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][1] * (v->v[1] >> 16);
+    lo0 += (int64_t)t->matrix[0][1] * (v->v[1] & 0xFFFF);
+    hi0 += (int64_t)t->matrix[0][2];
+
+    hi1  = (int64_t)t->matrix[1][0] * (v->v[0] >> 16);
+    lo1  = (int64_t)t->matrix[1][0] * (v->v[0] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][1] * (v->v[1] >> 16);
+    lo1 += (int64_t)t->matrix[1][1] * (v->v[1] & 0xFFFF);
+    hi1 += (int64_t)t->matrix[1][2];
+
+    result->v[0] = hi0 + ((lo0 + 0x8000) >> 16);
+    result->v[1] = hi1 + ((lo1 + 0x8000) >> 16);
+    result->v[2] = pixman_fixed_1;
+}
+
+PIXMAN_EXPORT void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result)
+{
+    int i;
+    int64_t tmp[3][2];
+
+    /* input vector values must have no more than 31 bits (including sign)
+     * in the integer part */
+    assert (v->v[0] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[0] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[1] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] <   ((pixman_fixed_48_16_t)1 << (30 + 16)));
+    assert (v->v[2] >= -((pixman_fixed_48_16_t)1 << (30 + 16)));
+
+    for (i = 0; i < 3; i++)
+    {
+        tmp[i][0] = (int64_t)t->matrix[i][0] * (v->v[0] >> 16);
+        tmp[i][1] = (int64_t)t->matrix[i][0] * (v->v[0] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][1] * (v->v[1] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][1] * (v->v[1] & 0xFFFF);
+        tmp[i][0] += (int64_t)t->matrix[i][2] * (v->v[2] >> 16);
+        tmp[i][1] += (int64_t)t->matrix[i][2] * (v->v[2] & 0xFFFF);
+    }
+
+    result->v[0] = tmp[0][0] + ((tmp[0][1] + 0x8000) >> 16);
+    result->v[1] = tmp[1][0] + ((tmp[1][1] + 0x8000) >> 16);
+    result->v[2] = tmp[2][0] + ((tmp[2][1] + 0x8000) >> 16);
+}
+
 PIXMAN_EXPORT void
 pixman_transform_init_identity (struct pixman_transform *matrix)
 {
diff --git a/pixman/pixman-private.h b/pixman/pixman-private.h
index 3981873..cb78a2e 100644
--- a/pixman/pixman-private.h
+++ b/pixman/pixman-private.h
@@ -1078,6 +1078,27 @@ _pixman_log_error (const char *function, const char *message);
 #endif
 
 /*
+ * Matrix
+ */
+
+typedef struct { pixman_fixed_48_16_t v[3]; } pixman_vector_48_16_t;
+
+pixman_bool_t
+pixman_transform_point_31_16 (const pixman_transform_t    *t,
+                              const pixman_vector_48_16_t *v,
+                              pixman_vector_48_16_t       *result);
+
+void
+pixman_transform_point_31_16_3d (const pixman_transform_t    *t,
+                                 const pixman_vector_48_16_t *v,
+                                 pixman_vector_48_16_t       *result);
+
+void
+pixman_transform_point_31_16_affine (const pixman_transform_t    *t,
+                                     const pixman_vector_48_16_t *v,
+                                     pixman_vector_48_16_t       *result);
+
+/*
  * Timers
  */
 
commit a47ed2c31180e6c3b332747a1721731e0649b10f
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Dec 3 17:42:21 2012 +0200

    Faster fetch for the C variant of r5g6b5 src/dest iterator
    
    Processing two pixels at once is used to reduce the number of
    arithmetic operations.
    
    The speedup relative to the generic fetch_scanline_r5g6b5() from
    "pixman-access.c" (pixman was compiled with gcc 4.7.2):
    
        MIPS 74K        480MHz  :  20.32 MPix/s ->  26.47 MPix/s
        ARM11           700MHz  :  34.95 MPix/s ->  38.22 MPix/s
        ARM Cortex-A8  1000MHz  :  87.44 MPix/s -> 100.92 MPix/s
        ARM Cortex-A9  1700MHz  : 150.95 MPix/s -> 158.13 MPix/s
        ARM Cortex-A15 1700MHz  : 148.91 MPix/s -> 155.42 MPix/s
        IBM Cell PPU   3200MHz  :  75.29 MPix/s ->  98.33 MPix/s
        Intel Core i7  2800MHz  : 257.02 MPix/s -> 376.93 MPix/s
    
    That's the performance for C code (SIMD and assembly optimizations
    are disabled via the PIXMAN_DISABLE environment variable).

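For context, the single-pixel conversion that this loop performs two at a
time expands each 5- or 6-bit channel by replicating its top bits into the
low bits. The helper below mirrors what pixman's convert_0565_to_8888() is
expected to compute, but it is an illustration rather than the pixman source:

    #include <stdint.h>
    #include <stdio.h>

    static uint32_t
    expand_0565_to_8888 (uint16_t s)
    {
        uint32_t r = (s >> 11) & 0x1F;
        uint32_t g = (s >> 5)  & 0x3F;
        uint32_t b = s & 0x1F;

        /* replicate the top bits into the low bits: 5 -> 8 and 6 -> 8 */
        r = (r << 3) | (r >> 2);
        g = (g << 2) | (g >> 4);
        b = (b << 3) | (b >> 2);

        return 0xFF000000 | (r << 16) | (g << 8) | b;
    }

    int
    main (void)
    {
        printf ("%08X\n", (unsigned) expand_0565_to_8888 (0xFFFF)); /* FFFFFFFF */
        printf ("%08X\n", (unsigned) expand_0565_to_8888 (0xF800)); /* FFFF0000 */
        return 0;
    }
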
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 02a5119..247aea6 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -2170,11 +2170,40 @@ fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
 
     iter->bits += iter->stride;
 
-    while (w > 0)
+    /* Align the source buffer at 4 bytes boundary */
+    if (w > 0 && ((uintptr_t)src & 3))
     {
 	*dst++ = convert_0565_to_8888 (*src++);
 	w--;
     }
+    /* Process two pixels per iteration */
+    while ((w -= 2) >= 0)
+    {
+	uint32_t sr, sb, sg, t0, t1;
+	uint32_t s = *(const uint32_t *)src;
+	src += 2;
+	sr = (s >> 8) & 0x00F800F8;
+	sb = (s << 3) & 0x00F800F8;
+	sg = (s >> 3) & 0x00FC00FC;
+	sr |= sr >> 5;
+	sb |= sb >> 5;
+	sg |= sg >> 6;
+	t0 = ((sr << 16) & 0x00FF0000) | ((sg << 8) & 0x0000FF00) |
+	     (sb & 0xFF) | 0xFF000000;
+	t1 = (sr & 0x00FF0000) | ((sg >> 8) & 0x0000FF00) |
+	     (sb >> 16) | 0xFF000000;
+#ifdef WORDS_BIGENDIAN
+	*dst++ = t1;
+	*dst++ = t0;
+#else
+	*dst++ = t0;
+	*dst++ = t1;
+#endif
+    }
+    if (w & 1)
+    {
+	*dst = convert_0565_to_8888 (*src);
+    }
 
     return iter->buffer;
 }
commit e66fd5ccb6b69dfa1acde36220dc3c3c44026890
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Dec 3 17:07:31 2012 +0200

    Faster write-back for the C variant of r5g6b5 dest iterator
    
    Unrolling the loop improves performance, so it is used here.
    
    Also, GCC does not optimize this code well for RISC processors: it
    fails to keep the 0x1F001F constant allocated in a register. Because
    this constant is too large to be encoded as an immediate operand in
    instructions, GCC inserts some redundant arithmetic. The problem can
    be worked around by explicitly using a variable for the 0x1F001F
    constant and initializing it with a read from a volatile variable.
    GCC is then forced to allocate a register for it, because it no
    longer sees it as a constant.
    
    The speedup relative to the generic store_scanline_r5g6b5() from
    "pixman-access.c" (pixman was compiled with gcc 4.7.2):
    
        MIPS 74K        480MHz  :  33.22 MPix/s ->  43.42 MPix/s
        ARM11           700MHz  :  50.16 MPix/s ->  78.23 MPix/s
        ARM Cortex-A8  1000MHz  : 117.75 MPix/s -> 196.34 MPix/s
        ARM Cortex-A9  1700MHz  : 177.04 MPix/s -> 320.32 MPix/s
        ARM Cortex-A15 1700MHz  : 231.44 MPix/s -> 261.64 MPix/s
        IBM Cell PPU   3200MHz  : 130.25 MPix/s -> 145.61 MPix/s
        Intel Core i7  2800MHz  : 502.21 MPix/s -> 721.73 MPix/s
    
    That's the performance for C code (SIMD and assembly optimizations
    are disabled via the PIXMAN_DISABLE environment variable).

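To make the 0x1F001F trick easier to follow, here is a small standalone
sketch comparing a straightforward x8r8g8b8 -> r5g6b5 packing with the masked
variant used by the patch. The reference function is an assumption about what
pixman's convert_8888_to_0565() computes, not a copy of it:

    #include <assert.h>
    #include <stdint.h>

    static uint16_t
    pack_8888_to_0565_ref (uint32_t s)
    {
        return ((s >> 8) & 0xF800) | ((s >> 5) & 0x07E0) | ((s >> 3) & 0x001F);
    }

    static uint16_t
    pack_8888_to_0565_masked (uint32_t s, uint32_t x1F001F)
    {
        /* (s >> 3) & 0x1F001F picks up red (bits 16..20) and blue (bits 0..4)
         * with a single AND; a >> 5 then drops red into its final 565 slot,
         * and the stray high bits are cut off by the uint16_t return type. */
        uint32_t a = (s >> 3) & x1F001F;
        uint32_t b = s & 0xFC00;
        a |= a >> 5;
        a |= b >> 5;
        return (uint16_t) a;
    }

    int
    main (void)
    {
        uint32_t s;
        for (s = 0; s < 0x1000000; s += 0x10101)  /* sample some pixel values */
            assert (pack_8888_to_0565_ref (s) ==
                    pack_8888_to_0565_masked (s, 0x1F001F));
        return 0;
    }
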
diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index cbe34bb..02a5119 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -2186,17 +2186,49 @@ fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
     return iter->buffer;
 }
 
+/* Helper function for a workaround, which tries to ensure that 0x1F001F
+ * constant is always allocated in a register on RISC architectures.
+ */
+static force_inline uint32_t
+convert_8888_to_0565_workaround (uint32_t s, uint32_t x1F001F)
+{
+    uint32_t a, b;
+    a = (s >> 3) & x1F001F;
+    b = s & 0xFC00;
+    a |= a >> 5;
+    a |= b >> 5;
+    return a;
+}
+
 static void
 fast_write_back_r5g6b5 (pixman_iter_t *iter)
 {
     int32_t w = iter->width;
     uint16_t *dst = (uint16_t *)(iter->bits - iter->stride);
     const uint32_t *src = iter->buffer;
+    /* Workaround to ensure that x1F001F variable is allocated in a register */
+    static volatile uint32_t volatile_x1F001F = 0x1F001F;
+    uint32_t x1F001F = volatile_x1F001F;
 
-    while (w > 0)
+    while ((w -= 4) >= 0)
     {
-	*dst++ = convert_8888_to_0565 (*src++);
-	w--;
+	uint32_t s1 = *src++;
+	uint32_t s2 = *src++;
+	uint32_t s3 = *src++;
+	uint32_t s4 = *src++;
+	*dst++ = convert_8888_to_0565_workaround (s1, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s2, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s3, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (s4, x1F001F);
+    }
+    if (w & 2)
+    {
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+	*dst++ = convert_8888_to_0565_workaround (*src++, x1F001F);
+    }
+    if (w & 1)
+    {
+	*dst = convert_8888_to_0565_workaround (*src, x1F001F);
     }
 }
 
commit a9f66694163da9e8e41a69497acbadd630e0cb51
Author: Siarhei Siamashka <siarhei.siamashka at gmail.com>
Date:   Mon Dec 3 06:32:46 2012 +0200

    Added C variants of r5g6b5 fetch/write-back iterators
    
    Adding specialized iterators for the r5g6b5 color format allows us to
    fine-tune the performance of r5g6b5 fetch/write-back operations in
    pixman's general "fetch -> combine -> store" pipeline.
    
    These iterators also make the "src_x888_0565" fast path redundant, so
    it can be removed.

diff --git a/pixman/pixman-fast-path.c b/pixman/pixman-fast-path.c
index 1ac2d11..cbe34bb 100644
--- a/pixman/pixman-fast-path.c
+++ b/pixman/pixman-fast-path.c
@@ -739,36 +739,6 @@ fast_composite_over_8888_0565 (pixman_implementation_t *imp,
 }
 
 static void
-fast_composite_src_x888_0565 (pixman_implementation_t *imp,
-                              pixman_composite_info_t *info)
-{
-    PIXMAN_COMPOSITE_ARGS (info);
-    uint16_t    *dst_line, *dst;
-    uint32_t    *src_line, *src, s;
-    int dst_stride, src_stride;
-    int32_t w;
-
-    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
-    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
-
-    while (height--)
-    {
-	dst = dst_line;
-	dst_line += dst_stride;
-	src = src_line;
-	src_line += src_stride;
-	w = width;
-
-	while (w--)
-	{
-	    s = *src++;
-	    *dst = convert_8888_to_0565 (s);
-	    dst++;
-	}
-    }
-}
-
-static void
 fast_composite_add_8_8 (pixman_implementation_t *imp,
 			pixman_composite_info_t *info)
 {
@@ -1907,10 +1877,6 @@ static const pixman_fast_path_t c_fast_paths[] =
     PIXMAN_STD_FAST_PATH (SRC, x1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
     PIXMAN_STD_FAST_PATH (SRC, a1r5g5b5, null, x1r5g5b5, fast_composite_src_memcpy),
     PIXMAN_STD_FAST_PATH (SRC, a8, null, a8, fast_composite_src_memcpy),
-    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
-    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, fast_composite_src_x888_0565),
     PIXMAN_STD_FAST_PATH (IN, a8, null, a8, fast_composite_in_8_8),
     PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, fast_composite_in_n_8_8),
 
@@ -2193,12 +2159,139 @@ fast_path_fill (pixman_implementation_t *imp,
     return TRUE;
 }
 
+/*****************************************************************************/
+
+static uint32_t *
+fast_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
+{
+    int32_t w = iter->width;
+    uint32_t *dst = iter->buffer;
+    const uint16_t *src = (const uint16_t *)iter->bits;
+
+    iter->bits += iter->stride;
+
+    while (w > 0)
+    {
+	*dst++ = convert_0565_to_8888 (*src++);
+	w--;
+    }
+
+    return iter->buffer;
+}
+
+static uint32_t *
+fast_dest_fetch_noop (pixman_iter_t *iter, const uint32_t *mask)
+{
+    iter->bits += iter->stride;
+    return iter->buffer;
+}
+
+static void
+fast_write_back_r5g6b5 (pixman_iter_t *iter)
+{
+    int32_t w = iter->width;
+    uint16_t *dst = (uint16_t *)(iter->bits - iter->stride);
+    const uint32_t *src = iter->buffer;
+
+    while (w > 0)
+    {
+	*dst++ = convert_8888_to_0565 (*src++);
+	w--;
+    }
+}
+
+typedef struct
+{
+    pixman_format_code_t	format;
+    pixman_iter_get_scanline_t	get_scanline;
+    pixman_iter_write_back_t	write_back;
+} fetcher_info_t;
+
+static const fetcher_info_t fetchers[] =
+{
+    { PIXMAN_r5g6b5, fast_fetch_r5g6b5, fast_write_back_r5g6b5 },
+    { PIXMAN_null }
+};
+
+static pixman_bool_t
+fast_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+#define FLAGS								\
+    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
+     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
+
+    if ((iter->iter_flags & ITER_NARROW)			&&
+	(iter->image_flags & FLAGS) == FLAGS)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4;
+
+		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
+		iter->stride = s;
+
+		iter->get_scanline = f->get_scanline;
+		return TRUE;
+	    }
+	}
+    }
+
+    return FALSE;
+}
+
+static pixman_bool_t
+fast_dest_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
+{
+    pixman_image_t *image = iter->image;
+
+    if ((iter->iter_flags & ITER_NARROW)		&&
+	(iter->image_flags & FAST_PATH_STD_DEST_FLAGS) == FAST_PATH_STD_DEST_FLAGS)
+    {
+	const fetcher_info_t *f;
+
+	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
+	{
+	    if (image->common.extended_format_code == f->format)
+	    {
+		uint8_t *b = (uint8_t *)image->bits.bits;
+		int s = image->bits.rowstride * 4;
+
+		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
+		iter->stride = s;
+
+		if ((iter->iter_flags & (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA)) ==
+		    (ITER_IGNORE_RGB | ITER_IGNORE_ALPHA))
+		{
+		    iter->get_scanline = fast_dest_fetch_noop;
+		}
+		else
+		{
+		    iter->get_scanline = f->get_scanline;
+		}
+		iter->write_back = f->write_back;
+		return TRUE;
+	    }
+	}
+    }
+    return FALSE;
+}
+
+
 pixman_implementation_t *
 _pixman_implementation_create_fast_path (pixman_implementation_t *fallback)
 {
     pixman_implementation_t *imp = _pixman_implementation_create (fallback, c_fast_paths);
 
     imp->fill = fast_path_fill;
+    imp->src_iter_init = fast_src_iter_init;
+    imp->dest_iter_init = fast_dest_iter_init;
 
     return imp;
 }

