[cairo] [PATCH] pixman: high qualtiy downscaling

Fri Jul 31 13:56:13 PDT 2009

The following patch is a cleaned up version of my previous downscaling
patches. It doesn't currently export any api, but I'd like to get it
into the pixman repo to avoid any duplication of work while the proper
api to expose it is worked out.

-Jeff

diff --git a/pixman/Makefile.am b/pixman/Makefile.am
index e19fa6e..8292bfc 100644
--- a/pixman/Makefile.am
+++ b/pixman/Makefile.am
@@ -33,6 +33,10 @@ libpixman_1_la_SOURCES =			\
 	pixman-edge-imp.h			\
 	pixman-trap.c				\
 	pixman-timer.c				\
+	pixman-rescale.c			\
+	pixman-rescale-box.c			\
+	pixman-rescale-integer-box.c		\
+	pixman-rescale-lanczos.c		\
 	pixman-matrix.c
 
 libpixmanincludedir = $(includedir)/pixman-1/
diff --git a/pixman/pixman-rescale-box.c b/pixman/pixman-rescale-box.c
new file mode 100644
index 0000000..6d9a874
--- /dev/null
+++ b/pixman/pixman-rescale-box.c
@@ -0,0 +1,439 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+/* This implements a box filter that supports non-integer box sizes */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <math.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+/* we work in fixed point where 1. == 1 << 24 */
+#define FIXED_SHIFT 24
+
+#if 0
+static void downsample_row_box_filter_sse2(
+        int start,
+		int width,
+		uint32_t *src, uint32_t *dest,
+                int coverage[], int pixel_coverage)
+{
+    /* we need an array of the pixel contribution of each destination pixel on the boundaries.
+     * we invert the value to get the value on the other size of the box */
+    /*
+
+       value  = a * contribution * 1/box_size
+       value += a * 1/box_size
+       value += a * 1/box_size
+       value += a * 1/box_size
+       value += a * (1 - contribution) * 1/box_size
+                a * (1/box_size - contribution * 1/box_size)
+
+        box size is constant
+
+
+       value = a * contribtion_a * 1/box_size + b * contribution_b * 1/box_size
+               contribution_b = (1 - contribution_a)
+                              = (1 - contribution_a_next)
+    */
+
+    /* box size = ceil(src_width/dest_width) */
+    int x = 0;
+    while (x < start) {
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage = coverage[x];
+        box -= start_coverage;
+        src++;
+        while (box >= pixel_coverage) {
+            src++;
+            box -= pixel_coverage;
+        }
+        x++;
+    }
+    __m128i accum = _mm_setzero_si128();
+    __m128i zero = _mm_setzero_si128();
+    while (x < start + width) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage = coverage[x];
+        __m128i v_src = _mm_cvtsi32_si128(*src);
+        v_src = _mm_unpacklo_epi16(_mm_unpacklo_epi8(v_src, zero), zero);
+        __m128i coeff = _mm_cvtsi32_si128(start_coverage);
+        /* duplicate the filter coefficient */
+        coeff = _mm_shuffle_epi32(coeff, _MM_SHUFFLE(0, 0, 0, 0));
+        /* multiply and accumulate the result:
+         * 0000vvvv_0000vvvv_0000vvvv_0000vvvv * 000000aa_000000rr_000000gg_000000bb */
+        accum = _mm_madd_epi16(v_src, coeff);
+        // we can't use a 32 bit mulitply in sse2
+        // sse4.1 gives us pmulld but it is apparenlty slow (6 clocks on Nehalem)
+        src++;
+        x++;
+        box -= start_coverage;
+        while (box >= pixel_coverage) {
+            a += ((*src >> 24) & 0xff) * pixel_coverage;
+            r += ((*src >> 16) & 0xff) * pixel_coverage;
+            g += ((*src >>  8) & 0xff) * pixel_coverage;
+            b += ((*src >>  0) & 0xff) * pixel_coverage;
+            src++;
+
+            box -= pixel_coverage;
+        }
+        /* multiply by whatever is leftover
+         * this ensures that we don't bias down.
+         * i.e. start_coverage + n*pixel_coverage + box == 1 << 24 */
+        if (box > 0) {
+            a += ((*src >> 24) & 0xff) * box;
+            r += ((*src >> 16) & 0xff) * box;
+            g += ((*src >>  8) & 0xff) * box;
+            b += ((*src >>  0) & 0xff) * box;
+        }
+
+        a >>= FIXED_SHIFT;
+        r >>= FIXED_SHIFT;
+        g >>= FIXED_SHIFT;
+        b >>= FIXED_SHIFT;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+    }
+}
+#endif
+
+static void downsample_row_box_filter (
+        int start, int width,
+        uint32_t *src, uint32_t *dest,
+        int coverage[], int pixel_coverage)
+{
+    /* we need an array of the pixel contribution of each destination pixel on the boundaries.
+     * we invert the value to get the value on the other size of the box */
+    /*
+
+       value  = a * contribution * 1/box_size
+       value += a * 1/box_size
+       value += a * 1/box_size
+       value += a * 1/box_size
+       value += a * (1 - contribution) * 1/box_size
+                a * (1/box_size - contribution * 1/box_size)
+
+        box size is constant
+
+
+       value = a * contribtion_a * 1/box_size + b * contribution_b * 1/box_size
+               contribution_b = (1 - contribution_a)
+                              = (1 - contribution_a_next)
+    */
+
+    /* box size = ceil(src_width/dest_width) */
+    int x = 0;
+
+    /* skip to start */
+    /* XXX: it might be possible to do this directly instead of iteratively, however
+     * the iterative solution is simple */
+    while (x < start)
+    {
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage = coverage[x];
+        box -= start_coverage;
+        src++;
+        while (box >= pixel_coverage)
+        {
+            src++;
+            box -= pixel_coverage;
+        }
+        x++;
+    }
+
+    while (x < start + width)
+    {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage = coverage[x];
+
+        a = ((*src >> 24) & 0xff) * start_coverage;
+        r = ((*src >> 16) & 0xff) * start_coverage;
+        g = ((*src >>  8) & 0xff) * start_coverage;
+        b = ((*src >>  0) & 0xff) * start_coverage;
+        src++;
+        x++;
+        box -= start_coverage;
+
+        while (box >= pixel_coverage)
+        {
+            a += ((*src >> 24) & 0xff) * pixel_coverage;
+            r += ((*src >> 16) & 0xff) * pixel_coverage;
+            g += ((*src >>  8) & 0xff) * pixel_coverage;
+            b += ((*src >>  0) & 0xff) * pixel_coverage;
+            src++;
+
+            box -= pixel_coverage;
+        }
+
+        /* multiply by whatever is leftover
+         * this ensures that we don't bias down.
+         * i.e. start_coverage + n*pixel_coverage + box == 1 << 24 */
+        if (box > 0)
+        {
+            a += ((*src >> 24) & 0xff) * box;
+            r += ((*src >> 16) & 0xff) * box;
+            g += ((*src >>  8) & 0xff) * box;
+            b += ((*src >>  0) & 0xff) * box;
+        }
+
+        a >>= FIXED_SHIFT;
+        r >>= FIXED_SHIFT;
+        g >>= FIXED_SHIFT;
+        b >>= FIXED_SHIFT;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+    }
+}
+
+static void downsample_columns_box_filter (
+        int n,
+        int start_coverage,
+        int pixel_coverage,
+        uint32_t *src, uint32_t *dest)
+{
+    int stride = n;
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        uint32_t *column_src = src;
+        int box = 1 << FIXED_SHIFT;
+
+        a = ((*column_src >> 24) & 0xff) * start_coverage;
+        r = ((*column_src >> 16) & 0xff) * start_coverage;
+        g = ((*column_src >>  8) & 0xff) * start_coverage;
+        b = ((*column_src >>  0) & 0xff) * start_coverage;
+        column_src += stride;
+        box -= start_coverage;
+
+        while (box >= pixel_coverage)
+        {
+            a += ((*column_src >> 24) & 0xff) * pixel_coverage;
+            r += ((*column_src >> 16) & 0xff) * pixel_coverage;
+            g += ((*column_src >>  8) & 0xff) * pixel_coverage;
+            b += ((*column_src >>  0) & 0xff) * pixel_coverage;
+            column_src += stride;
+            box -= pixel_coverage;
+        }
+
+        if (box > 0) {
+            a += ((*column_src >> 24) & 0xff) * box;
+            r += ((*column_src >> 16) & 0xff) * box;
+            g += ((*column_src >>  8) & 0xff) * box;
+            b += ((*column_src >>  0) & 0xff) * box;
+        }
+
+        a >>= FIXED_SHIFT;
+        r >>= FIXED_SHIFT;
+        g >>= FIXED_SHIFT;
+        b >>= FIXED_SHIFT;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src++;
+    }
+}
+
+static int compute_coverage (int coverage[], int src_length, int dest_length)
+{
+    int i;
+    /* num = src_length/dest_length
+       total = sum(pixel) / num
+
+       pixel * 1/num == pixel * dest_length / src_length
+    */
+    /* the average contribution of each source pixel */
+    int ratio = ((1 << 24)*(long long int)dest_length)/src_length;
+    /* because ((1 << 24)*(long long int)dest_length) won't always be divisible by src_length
+     * we'll need someplace to put the other bits.
+     *
+     * We want to ensure a + n*ratio < 1<<24
+     *
+     * 1<<24
+     * */
+
+    double scale = (double)src_length/dest_length;
+
+    /* for each destination pixel compute the coverage of the left most pixel included in the box */
+    /* I have a proof of this, which this margin is too narrow to contain */
+    for (i=0; i<dest_length; i++)
+    {
+        float left_side = i*scale;
+        float right_side = (i+1)*scale;
+        float right_fract = right_side - floor (right_side);
+        float left_fract = ceil (left_side) - left_side;
+        int overage;
+        /* find out how many source pixels will be used to fill the box */
+        int count = floor (right_side) - ceil (left_side);
+        /* what's the maximum value this expression can become?
+           floor((i+1)*scale) - ceil(i*scale)
+
+           (i+1)*scale - i*scale == scale
+
+           since floor((i+1)*scale) <= (i+1)*scale
+           and   ceil(i*scale)      >= i*scale
+
+           floor((i+1)*scale) - ceil(i*scale) <= scale
+
+           further since: floor((i+1)*scale) - ceil(i*scale) is an integer
+
+           therefore:
+           floor((i+1)*scale) - ceil(i*scale) <= floor(scale)
+        */
+
+        if (left_fract == 0.)
+            count--;
+
+        /* compute how much the right-most pixel contributes */
+        overage = ratio*(right_fract);
+
+        /* the remainder is the the amount that the left-most pixel
+         * contributes */
+        coverage[i] = (1<<24) - (count * ratio + overage);
+    }
+
+    return ratio;
+}
+
+PIXMAN_EXPORT
+int downscale_box_filter(const struct rescale_fetcher *fetcher, void *closure, unsigned orig_width, unsigned orig_height,
+		signed scaled_width, signed scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride)
+{
+    int pixel_coverage_x, pixel_coverage_y;
+    int dest_y;
+    int src_y = 0;
+
+    void (*fetch_scanline)(void *closure, int y, uint32_t **scanline) = fetcher->fetch_scanline;
+    uint32_t *scanline = fetcher->init(closure);
+
+    int *x_coverage = pixman_malloc_abc (orig_width, 1, sizeof(int));
+    int *y_coverage = pixman_malloc_abc (orig_height, 1, sizeof(int));
+
+    /* we need to allocate enough room for ceil(src_height/dest_height)+1
+      Example:
+      src_height = 140
+      dest_height = 50
+      src_height/dest_height = 2.8
+
+      |-------------|       2.8 pixels
+      |----|----|----|----| 4 pixels
+      need to sample 3 pixels
+
+         |-------------|       2.8 pixels
+      |----|----|----|----| 4 pixels
+      need to sample 4 pixels
+    */
+
+    uint32_t *temp_buf = pixman_malloc_abc ((orig_height + scaled_height-1)/scaled_height+1, scaled_width, sizeof(uint32_t));
+
+    if (!x_coverage || !y_coverage || !scanline || !temp_buf) {
+        return -1;
+    }
+
+    pixel_coverage_x = compute_coverage (x_coverage, orig_width, scaled_width);
+    pixel_coverage_y = compute_coverage (y_coverage, orig_height, scaled_height);
+
+    assert (width + start_column <= scaled_width);
+
+    /* skip the rows at the beginning */
+    for (dest_y = 0; dest_y < start_row; dest_y++)
+    {
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage_y = y_coverage[dest_y];
+        box -= start_coverage_y;
+        src_y++;
+        while (box >= pixel_coverage_y)
+        {
+            box -= pixel_coverage_y;
+            src_y++;
+        }
+    }
+
+    for (; dest_y < start_row + height; dest_y++)
+    {
+        int columns = 0;
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage_y = y_coverage[dest_y];
+
+        fetch_scanline (closure, src_y, &scanline);
+        downsample_row_box_filter (start_column, width, scanline, temp_buf + width * columns, x_coverage, pixel_coverage_x);
+        columns++;
+        src_y++;
+        box -= start_coverage_y;
+
+        while (box >= pixel_coverage_y)
+        {
+            fetch_scanline (closure, src_y, &scanline);
+            downsample_row_box_filter (start_column, width, scanline, temp_buf + width * columns, x_coverage, pixel_coverage_x);
+            columns++;
+            src_y++;
+            box -= pixel_coverage_y;
+        }
+
+        /* downsample any leftovers */
+        if (box > 0)
+        {
+            fetch_scanline (closure, src_y, &scanline);
+            downsample_row_box_filter (start_column, width, scanline, temp_buf + width * columns, x_coverage, pixel_coverage_x);
+            columns++;
+        }
+
+        /* now scale the rows we just downsampled in the y direction */
+        downsample_columns_box_filter (width, start_coverage_y, pixel_coverage_y, temp_buf, dest);
+        dest += dst_stride/4;
+
+        assert(width*columns <= ((orig_height + scaled_height-1)/scaled_height+1) * width);
+    }
+
+    assert (src_y<=orig_height);
+
+    fetcher->fini (closure, scanline);
+    free (x_coverage);
+    free (y_coverage);
+    free (temp_buf);
+
+    return 0;
+}
diff --git a/pixman/pixman-rescale-integer-box.c b/pixman/pixman-rescale-integer-box.c
new file mode 100644
index 0000000..5142705
--- /dev/null
+++ b/pixman/pixman-rescale-integer-box.c
@@ -0,0 +1,271 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+/*
+When box filtering with an integer sized box we only ever have two box sizes.
+Proof:
+    Assume w1 > w2.
+    r = w1/w2
+
+    The size of the two boxes is
+    r1 = ceil(r), r2 = floor(r)
+
+    we want to find non-negative integers p and q such that:
+    w1 = p*r1 + q*r2
+
+    if r1 == r2 then
+        r is an integer and thus w2 evenly divides w1
+        therefor p = w2 and q = 0
+    otherwise r1 != r2 and
+        r1 = r2 + 1
+        w1 = p*(r2 + 1) + q * r2
+           = p*r2 + q*r2 + p
+           = (p + q)*r2 + p
+
+        we then choose a value of p such that:
+            w1 = r2*w2 + p
+            thus p = w1 mod w2
+            and  q = w2 - p which is > 0 because
+                            p = w1 mod w2
+
+            subing into:
+            (p + q)*r2 + p
+            gives:
+            w1 = (p + w2 - p)*r2 + p
+            w1 = w2*r2 + w1 mod w2
+
+*/
+#define AVOID_DIVIDE
+
+/* downsample_row_box_filter is a Digital Differential Analyzer (like Bresenham's line algorithm)
+ * 'e' is the current fraction or error
+ */
+void downsample_row_box_filter(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                long dx2, long e, long src_dx)
+{
+    /* the filter sums only two different counts of samples.
+     * we compute the inverses for these two different counts
+     * so that we can do multiplications instead of divisions */
+    int div_inv[2];
+    int div = src_dx/dx2;
+    /* we can index on the bottom bit of the divisor because the two different counts
+     * are +1 */
+    div_inv[div & 1] = (1<<24)/div;
+    div += 1;
+    div_inv[div & 1] = (1<<24)/div;
+
+    while (n--) {
+        uint32_t a, r, g, b;
+        int div;
+        int count = 1;
+
+        /* we always have at least one source pixel per destination pixel
+         * because we are downscaling */
+        a = (*src >> 24) & 0xff;
+        r = (*src >> 16) & 0xff;
+        g = (*src >>  8) & 0xff;
+        b = (*src >>  0) & 0xff;
+        e -= dx2;
+        src++;
+
+        /* accumulate the rest of the source pixels */
+        while (e > 0) {
+            e -= dx2;
+            a += (*src >> 24) & 0xff;
+            r += (*src >> 16) & 0xff;
+            g += (*src >>  8) & 0xff;
+            b += (*src >>  0) & 0xff;
+            count++;
+            src++;
+        }
+
+#ifdef AVOID_DIVIDE
+        div = div_inv[count & 1];
+
+        a = (a * div + 0x10000) >> 24;
+        r = (r * div + 0x10000) >> 24;
+        g = (g * div + 0x10000) >> 24;
+        b = (b * div + 0x10000) >> 24;
+#else
+        a /= count;
+        r /= count;
+        g /= count;
+        b /= count;
+#endif
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+
+        e += src_dx;
+    }
+}
+
+void downsample_columns_box_filter(
+        int n,
+        int columns,
+        uint32_t *src, uint32_t *dest)
+{
+    int stride = n;
+    int div = (1<<24)/columns;
+    while (n--)
+    {
+        uint32_t a, r, g, b;
+        uint32_t *column_src = src;
+        int h;
+
+        a = (*column_src >> 24) & 0xff;
+        r = (*column_src >> 16) & 0xff;
+        g = (*column_src >>  8) & 0xff;
+        b = (*column_src >>  0) & 0xff;
+
+        for (h = 1; h < columns; h++)
+        {
+            a += (*column_src >> 24) & 0xff;
+            r += (*column_src >> 16) & 0xff;
+            g += (*column_src >>  8) & 0xff;
+            b += (*column_src >>  0) & 0xff;
+            column_src += stride;
+        }
+
+#ifdef AVOID_DIVIDE
+        a = (a * div + 0x10000) >> 24;
+        r = (r * div + 0x10000) >> 24;
+        g = (g * div + 0x10000) >> 24;
+        b = (b * div + 0x10000) >> 24;
+#else
+        a /= columns;
+        r /= columns;
+        g /= columns ;
+        b /= columns;
+#endif
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src++;
+    }
+}
+
+#define ROUND
+
+PIXMAN_EXPORT
+int downscale_integer_box_filter (const struct rescale_fetcher *fetcher, void *closure, unsigned orig_width, unsigned orig_height,
+		signed scaled_width, signed scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride)
+{
+    uint32_t *temp_buf;
+    uint32_t *scanline;
+    int src_offset = 0;
+    int src_dy, e, dx2;
+    int dest_dx, src_dx, ex, dx2x;
+    int y = 0;
+    int d;
+
+    /* setup our parameters for runing our DDA in the y direction */
+    /* multiplying by 2 is for rounding purposes. i.e. to move the center */
+    src_dy = orig_height * 2; /* dy *= 2 */
+    e = (orig_height*2) - scaled_height; /* e = dy*2 - dx */
+    dx2 = scaled_height*2; /* dx2 = dx*2 */
+
+    /* setup our parameters for doing DDA in the x direction */
+    dest_dx = scaled_width;
+    src_dx = orig_width;
+    ex = (src_dx*2) - dest_dx;
+    dx2x = dest_dx*2;
+    src_dx *= 2;
+
+    /* Compute the src_offset and associated error value:
+     * There are two ways to do this: */
+    /* 1. Directly
+    int nsrc_offset = (src_dx*startColumn + dest_dx)/(dest_dx*2);
+    long nex = (src_dx*startColumn + dest_dx)%(dest_dx*2) - dest_dx*2 + src_dx; */
+    /* 2. Iteratively */
+    for (d = 0; d < start_column; d++) {
+        while (ex > 0) {
+            src_offset++;
+            ex -= dx2x;
+        }
+        ex += src_dx;
+    }
+
+    /* seek to the begining */
+    for (d = 0; d < start_row; d++) {
+        while (e > 0) {
+            e -= dx2;
+            y++;
+        }
+        e += src_dy;
+    }
+
+    /* we need to allocate enough room for ceil(src_height/dest_height) scanlines */
+    temp_buf = pixman_malloc_abc ((orig_height + scaled_height-1)/scaled_height, scaled_width, sizeof(uint32_t));
+
+    scanline = fetcher->init (closure);
+
+    if (!scanline || !temp_buf) {
+        return -1;
+    }
+
+    for (d = start_row; d < start_row + height; d++)
+    {
+        int columns = 0;
+        while (e > 0)
+        {
+            fetcher->fetch_scanline (closure, y, &scanline);
+
+            /* XXX: We could turn this multiplication into addition. It
+             * probably doesn't matter though  */
+            downsample_row_box_filter (width, scanline + src_offset, temp_buf + width * columns,
+                                       dx2x, ex, src_dx);
+
+            e -= dx2;
+            columns++;
+            y++;
+        }
+
+        downsample_columns_box_filter (width, columns, temp_buf, dest);
+
+        dest += dst_stride/4;
+        e += src_dy;
+    }
+
+    fetcher->fini (closure, scanline);
+    free (temp_buf);
+
+    return 0;
+}
+
diff --git a/pixman/pixman-rescale-lanczos.c b/pixman/pixman-rescale-lanczos.c
new file mode 100644
index 0000000..864fd6d
--- /dev/null
+++ b/pixman/pixman-rescale-lanczos.c
@@ -0,0 +1,483 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ *
+ * Portions derived from Chromium:
+ *
+ * Copyright (c) 2006-2008 The Chromium Authors. All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are
+ * met:
+ *
+ *    * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *    * Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following disclaimer
+ * in the documentation and/or other materials provided with the
+ * distribution.
+ *    * Neither the name of Google Inc. nor the names of its
+ * contributors may be used to endorse or promote products derived from
+ * this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include <float.h>
+#include <math.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+
+#define FILTER_SHIFT 14
+#define LANCZOS_LOBES 2
+
+/* contains the filter for each destination pixel */
+struct filter {
+    int count; /* filter size */
+    int16_t *values; /* filter coefficients */
+    int offset; /* filter start offset */
+};
+
+static int clamp(int a)
+{
+    if (a > 255)
+        return 255;
+    if (a < 0)
+        return 0;
+    return a;
+}
+
+/* from ffmpeg */
+static inline uint8_t clip_uint8(int a)
+{
+    if (a&(~255)) /* check if any of the high bits are set */
+        return (-a)>>31; /* clip to either 0 or 255 */
+    else
+        return a;
+}
+
+#if 0
+#include <pmmintrin.h>
+#include <emmintrin.h>
+static void
+downsample_row_convolve_sse2(int n,
+		uint32_t *src, uint32_t *dest,
+                const struct filter *filter, int src_size)
+{
+    int pixel = 0;
+    while (n--) {
+        int i;
+        __m128i accum = _mm_setzero_si128 ();
+        __m128i zero  = _mm_setzero_si128 ();
+        for (i=0; i<filter[pixel].count; i++) {
+            __m128i v_src = _mm_cvtsi32_si128 (src[filter[pixel].offset + i]);
+            v_src = _mm_unpacklo_epi16 (_mm_unpacklo_epi8 (v_src, zero), zero);
+
+            __m128i coeff = _mm_cvtsi32_si128 (filter[pixel].values[i]);
+            /* duplicate the filter coefficient */
+            coeff = _mm_shuffle_epi32 (coeff, _MM_SHUFFLE (0, 0, 0, 0));
+
+            /* multiply and accumulate the result:
+             * 0000vvvv_0000vvvv_0000vvvv_0000vvvv * 000000aa_000000rr_000000gg_000000bb */
+            __m128i result = _mm_madd_epi16 (v_src, coeff);
+            accum = _mm_add_epi32 (accum, result);
+        }
+
+        /* scale the accumulator down */
+        accum = _mm_srai_epi32 (accum, FILTER_SHIFT);
+
+        /* pack 000000aa_000000rr_000000gg_000000bb -> 00000000_00000000_00000000_aarrggbb */
+        accum = _mm_packs_epi32 (accum, accum);
+
+        //XXX: this should be need to saturate properly but doesn't seem to make a difference
+        accum = _mm_max_epi16 (accum, zero);
+
+        accum = _mm_packus_epi16 (accum, accum);
+
+        *dest = _mm_cvtsi128_si32 (accum);
+        dest++;
+        pixel++;
+    }
+}
+#endif
+
+static void
+downsample_row_convolve(int n,
+		uint32_t *src, uint32_t *dest,
+                struct filter *filter, int src_size)
+{
+    int pixel = 0;
+    while (n--)
+    {
+        int32_t a = 0;
+        int32_t r = 0;
+        int32_t g = 0;
+        int32_t b = 0;
+        int i;
+
+        for (i=0; i<filter[pixel].count; i++) {
+            a += ((src[filter[pixel].offset + i] >> 24) & 0xff) * filter[pixel].values[i];
+            r += ((src[filter[pixel].offset + i] >> 16) & 0xff) * filter[pixel].values[i];
+            g += ((src[filter[pixel].offset + i] >>  8) & 0xff) * filter[pixel].values[i];
+            b += ((src[filter[pixel].offset + i] >>  0) & 0xff) * filter[pixel].values[i];
+        }
+
+        a >>= FILTER_SHIFT;
+        r >>= FILTER_SHIFT;
+        g >>= FILTER_SHIFT;
+        b >>= FILTER_SHIFT;
+
+        a = clamp (a);
+        r = clamp (r);
+        g = clamp (g);
+        b = clamp (b);
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        pixel++;
+    }
+}
+
+/* src1 and src2 are two contiguous regions of samples. We need two to handle
+ * the ring buffer that is used by the caller
+ *
+ * instead of src1, src1_length, src2, filter_length we could pass:
+ * ring_buf, start_index, ring_buf_length, filter_length,
+ * this is implicitly coded in the parameters we do pass:
+ * src1 = ring_buf + start_index
+ * src1_length = min(ring_buf_length - start_index, filter_length)
+ * src2 = ring_buf */
+static void
+downsample_columns_convolve(int n,
+        uint32_t *src1, int src1_length,
+        uint32_t *src2,
+        uint32_t *dest,
+        int16_t *filter, int filter_length)
+{
+    int stride = n;
+    while (n--)
+    {
+        int32_t a = 0;
+        int32_t r = 0;
+        int32_t g = 0;
+        int32_t b = 0;
+        int i;
+        uint32_t *column_src;
+
+        /* we do the accumulation in two steps because the source lines are in a ring buffer
+         * so won't be contiguous */
+        column_src = src1;
+        for (i=0; i<src1_length; i++) /* loop till the end of the ring buffer */
+        {
+            a += ((*column_src >> 24) & 0xff) * filter[i];
+            r += ((*column_src >> 16) & 0xff) * filter[i];
+            g += ((*column_src >>  8) & 0xff) * filter[i];
+            b += ((*column_src >>  0) & 0xff) * filter[i];
+            column_src += stride;
+        }
+
+        /* accumulate the remaining samples starting at the begining of the ring buffer */
+        column_src = src2;
+        for (; i<filter_length; i++)
+        {
+            a += ((*column_src >> 24) & 0xff) * filter[i];
+            r += ((*column_src >> 16) & 0xff) * filter[i];
+            g += ((*column_src >>  8) & 0xff) * filter[i];
+            b += ((*column_src >>  0) & 0xff) * filter[i];
+            column_src += stride;
+        }
+
+        a >>= FILTER_SHIFT;
+        r >>= FILTER_SHIFT;
+        g >>= FILTER_SHIFT;
+        b >>= FILTER_SHIFT;
+
+        a = clamp (a);
+        r = clamp (r);
+        g = clamp (g);
+        b = clamp (b);
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src1++;
+        src2++;
+    }
+}
+
+
+/* Evaluates the Lanczos filter of the given filter size window for the given
+ * position.
+ *
+ * |filter_size| is the width of the filter (the "window"), outside of which
+ * the value of the function is 0. Inside of the window, the value is the
+ * normalized sinc function:
+ *   lanczos(x) = sinc(x) * sinc(x / filter_size);
+ * where
+ *   sinc(x) = sin(pi*x) / (pi*x);
+ */
+float eval_lanczos(int filter_size, float x)
+{
+  if (x <= -filter_size || x >= filter_size)
+    return 0.0f;  /* Outside of the window. */
+  if (x > -FLT_EPSILON &&
+      x < FLT_EPSILON)
+    return 1.0f;  /* Special case the discontinuity at the origin. */
+  else {
+      float xpi = x * (M_PI);
+      return (sin(xpi) / xpi) *  /* sinc(x) */
+          sin(xpi / filter_size) / (xpi / filter_size);  /* sinc(x/filter_size) */
+  }
+}
+
+/* dealing with the edges:
+   some options:
+   we could always have approximately the same number of samples in the filter and just pad the image out
+   chromium seems to truncate the filter though...
+   I don't really have a good reason to choose either approach...
+   one way to get an idea is to see what other implementation do.
+   - it looks like quartz pads
+   - chromium truncates the filter
+   - opera pads
+*/
+
+int floor_int(float a)
+{
+    return floor(a);
+}
+
+int ceil_int(float a)
+{
+    return ceil(a);
+}
+
+int float_to_fixed(float a)
+{
+    return a * (1 << FILTER_SHIFT);
+}
+
+/* this method does not do source clipping
+ * but will do dest clipping */
+struct filter *compute_lanczos_filter(int dest_subset_lo, int dest_subset_size, int src_size, float scale)
+{
+    /* this is half the number of pixels that the filter uses in filter space */
+    int dest_support = LANCZOS_LOBES;
+    float src_support = dest_support / scale;
+
+    /* we need to compute a set of filters for each pixel */
+    /* filter width */
+    int i;
+    struct filter *filter = malloc (dest_subset_size * sizeof(struct filter));
+    int max_filter_size = ceil_int (src_support * 2) - floor_int (src_support * -2) + 1;
+    float *filter_values = malloc (max_filter_size * sizeof(float)); /* this is a good candidate for stack allocation */
+    int dest_subset_hi = dest_subset_lo + dest_subset_size; /* [lo, hi) */
+
+
+    /* When we're doing a magnification, the scale will be larger than one. This
+     * means the destination pixels are much smaller than the source pixels, and
+     * that the range covered by the filter won't necessarily cover any source
+     * pixel boundaries. Therefore, we use these clamped values (max of 1) for
+     * some computations.
+     */
+    float clamped_scale = MIN (1., scale);
+
+    /* Speed up the divisions below by turning them into multiplies. */
+    float inv_scale = 1. / scale;
+
+    /* Loop over all pixels in the output range. We will generate one set of
+     * filter values for each one. Those values will tell us how to blend the
+     * source pixels to compute the destination pixel.
+     */
+    int dest_subset_i;
+    int pixel = 0;
+    for (dest_subset_i = dest_subset_lo; dest_subset_i < dest_subset_hi; dest_subset_i++, pixel++) {
+
+        float src_pixel = dest_subset_i * inv_scale;
+
+        int src_begin = MAX (0, floor_int (src_pixel - src_support));
+        int src_end = MIN (src_size - 1, ceil_int (src_pixel + src_support));
+
+        /* Compute the unnormalized filter value at each location of the source
+         * it covers. */
+        float filter_sum = 0.; /* sum of the filter values for normalizing */
+        int filter_size = 0;
+        int cur_filter_pixel;
+        int j = 0;
+
+        assert(src_begin >= 0);
+
+        for (cur_filter_pixel = src_begin; cur_filter_pixel <= src_end;
+                cur_filter_pixel++)
+        {
+            /* Distance from the center of the filter, this is the filter coordinate
+             * in source space. */
+            float src_filter_pos = cur_filter_pixel - src_pixel;
+
+            /* Since the filter really exists in dest space, map it there. */
+            float dest_filter_pos = src_filter_pos * clamped_scale;
+
+            /* Compute the filter value at that location. */
+            float filter_value = eval_lanczos(LANCZOS_LOBES, dest_filter_pos);
+            filter_sum += filter_value;
+            filter_values[j] = filter_value;
+
+            filter_size++;
+            j++;
+        }
+
+        assert(src_end >= src_begin);
+        assert(filter_size <= max_filter_size);
+
+        {
+            int16_t leftovers;
+            /* XXX: eventually we should avoid doing malloc here and just malloc a big
+             * chunk of max_filter_size * sizeof(int16_t) */
+            int16_t *fixed_filter_values = malloc (filter_size * sizeof(int16_t));
+
+            /* the filter must be normalized so that we don't affect the brightness of
+             * the image. Convert to normalized fixed point */
+            /* XXX: It might be better if we didn't have to do this in a separate pass */
+            int16_t fixed_sum = 0; /* XXX: should we use a regular int here? */
+            for (i=0; i<filter_size; i++)
+            {
+                int16_t cur_fixed = float_to_fixed(filter_values[i] / filter_sum);
+                fixed_sum += cur_fixed;
+                fixed_filter_values[i] = cur_fixed;
+            }
+
+            /* The conversion to fixed point will leave some rounding errors, which
+             * we add back in to avoid affecting the brightness of the image. We
+             * arbitrarily add this to the center of the filter array (this won't always
+             * be the center of the filter function since it could get clipped on the
+             * edges, but it doesn't matter enough to worry about that case). */
+            leftovers = float_to_fixed(1.0f) - fixed_sum;
+            fixed_filter_values[filter_size / 2] += leftovers;
+
+            filter[pixel].values = fixed_filter_values;
+            filter[pixel].count = filter_size;
+            filter[pixel].offset = src_begin;
+            assert (filter[pixel].offset >= 0);
+            assert (filter[pixel].offset + filter[pixel].count - 1 < src_size);
+        }
+    }
+
+    free (filter_values);
+    return filter;
+}
+
+/* start_column and start_row are in destination space */
+PIXMAN_EXPORT
+int downscale_lanczos_filter(const struct rescale_fetcher *fetcher, void *closure, unsigned orig_width, unsigned orig_height,
+		signed scaled_width, signed scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride)
+{
+    struct filter *x_filter, *y_filter;
+    int filter_index;
+    int src_index = 0;
+    int i;
+
+    /* XXX: this duplicates code in compute_lanczos_filter */
+    int dest_support = LANCZOS_LOBES;
+    float src_support = dest_support / ((float)scaled_height/orig_height);
+
+    int max_filter_size = ceil_int (src_support * 2) - floor_int (src_support * -2) + 1;
+
+    /* we use a ring buffer to store the downsampled rows because we want to
+     * reuse the downsampled rows across different destination pixels */
+    int ring_buf_size = max_filter_size;
+    uint32_t *ring_buf = pixman_malloc_abc (ring_buf_size, scaled_width, sizeof(uint32_t));
+
+    uint32_t *scanline = fetcher->init (closure);
+    void (*fetch_scanline)(void *closure, int y, uint32_t **scanline) = fetcher->fetch_scanline;
+
+    assert (start_column + width  <= scaled_width);
+    assert (start_row    + height <= scaled_height);
+
+    x_filter = compute_lanczos_filter (start_column, width,  orig_width,  (float)scaled_width/orig_width);
+    y_filter = compute_lanczos_filter (start_row,    height, orig_height, (float)scaled_height/orig_height);
+
+    if (!ring_buf || !scanline || !x_filter || !y_filter)
+        return -1;
+
+    for (filter_index = 0; filter_index < height; filter_index++)
+    {
+        int filter_size   = y_filter[filter_index].count;
+        int filter_offset = y_filter[filter_index].offset;
+        int src1_index, src1_size;
+        uint32_t *src1;
+        assert(filter_offset >= 0);
+
+        /* read and downsample the rows needed to downsample the next column */
+        while (src_index < filter_offset + filter_size)
+        {
+            fetch_scanline (closure, src_index, &scanline);
+            /* downsample_row_convolve_sse2 (width, scanline, ring_buf + width * (src_index % ring_buf_size), x_filter, orig_width); */
+            downsample_row_convolve(width, scanline, ring_buf + width * (src_index % ring_buf_size), x_filter, orig_width);
+            src_index++;
+        }
+
+        src1_index = filter_offset % ring_buf_size;
+        assert (src1_index >= 0);
+        assert (filter_size <= ring_buf_size);
+        src1 = ring_buf + width * src1_index;
+
+        src1_size = MIN (ring_buf_size - src1_index, filter_size);
+        assert (src1_size >= 0);
+        assert (filter_size >= src1_size);
+
+        downsample_columns_convolve (width, src1, src1_size, ring_buf, dest, y_filter[filter_index].values, y_filter[filter_index].count);
+        dest += dst_stride/4;
+    }
+
+    free (ring_buf);
+    fetcher->fini (closure, scanline);
+
+    for (i=0; i<width; i++)
+        free (x_filter[i].values);
+    for (i=0; i<height; i++)
+        free (y_filter[i].values);
+    free (x_filter);
+    free (y_filter);
+
+    return 0;
+}
diff --git a/pixman/pixman-rescale-mult-old.c b/pixman/pixman-rescale-mult-old.c
new file mode 100644
index 0000000..861d9e7
--- /dev/null
+++ b/pixman/pixman-rescale-mult-old.c
@@ -0,0 +1,290 @@
+/* -*- Mode: c; c-basic-offset: 4; tab-width: 8; indent-tabs-mode: t; -*- */
+/*
+ * Copyright © 2009 Mozilla Corporation
+ *
+ * Permission to use, copy, modify, distribute, and sell this software and its
+ * documentation for any purpose is hereby granted without fee, provided that
+ * the above copyright notice appear in all copies and that both that
+ * copyright notice and this permission notice appear in supporting
+ * documentation, and that the name of Mozilla Corporation not be used in
+ * advertising or publicity pertaining to distribution of the software without
+ * specific, written prior permission.  Mozilla Corporation makes no
+ * representations about the suitability of this software for any purpose.  It
+ * is provided "as is" without express or implied warranty.
+ *
+ * MOZILLA CORPORATION DISCLAIMS ALL WARRANTIES WITH REGARD TO THIS SOFTWARE,
+ * INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS, IN NO EVENT
+ * SHALL MOZILLA CORPORATION BE LIABLE FOR ANY SPECIAL, INDIRECT OR
+ * CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
+ * DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER
+ * TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE
+ * OF THIS SOFTWARE.
+ *
+ * Author: Jeff Muizelaar, Mozilla Corp.
+ */
+
+#if 0
+
+/* This implements a box filter that supports non-integer box sizes */
+
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdint.h>
+#include <stdio.h>
+#include <assert.h>
+#include <stdlib.h>
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+#define abs(a) (a) > 0 ? (a) : -(a)
+#define sign(x) ((x)>0 ? 1:-1)
+
+static void fetch_scanline(void *closure, int y, uint32_t *scanline)
+{
+    pixman_image_t *pict = closure;
+    bits_image_t *bits = &pict->bits;
+    if (y > pict->bits.height-1)
+        y = pict->bits.height-1;
+    if (!pixman_image_has_source_clipping(pict)) {
+        fetchProc32 fetch = ACCESS(pixman_fetchProcForPicture32)(bits);
+        fetch(bits, 0, y, pict->bits.width, scanline);
+    } else {
+        int x;
+        fetchPixelProc32 fetch = ACCESS(pixman_fetchPixelProcForPicture32)(bits);
+        for (x=0; x<bits->width; x++) {
+            if (pixman_region32_contains_point (bits->common.src_clip, x, y,NULL))
+                scanline[x] = fetch (bits, x, y);
+            else
+                scanline[x] = 0;
+        }
+    }
+}
+
+/* we work in fixed point where 1. == 1 << 24 */
+#define FIXED_SHIFT 24
+
+static void downsample_row_box_filter(
+		int n,
+		uint32_t *src, uint32_t *dest,
+                int coverage[], int pixel_coverage)
+{
+    /* we need an array of the pixel contribution of each destination pixel on the boundaries.
+     * we invert the value to get the value on the other size of the box */
+    /*
+
+       value  = a * contribution * 1/box_size
+       value += a * 1/box_size
+       value += a * 1/box_size
+       value += a * 1/box_size
+       value += a * (1 - contribution) * 1/box_size
+                a * (1/box_size - contribution * 1/box_size)
+
+        box size is constant
+
+
+       value = a * contribtion_a * 1/box_size + b * contribution_b * 1/box_size
+               contribution_b = (1 - contribution_a)
+                              = (1 - contribution_a_next)
+    */
+
+    /* box size = ceil(src_width/dest_width) */
+
+    int x = 0;
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        int box = 1 << FIXED_SHIFT;
+        int start_coverage = coverage[x];
+        a = ((*src >> 24) & 0xff) * start_coverage;
+        r = ((*src >> 16) & 0xff) * start_coverage;
+        g = ((*src >>  8) & 0xff) * start_coverage;
+        b = ((*src >>  0) & 0xff) * start_coverage;
+        src++;
+        x++;
+        box -= start_coverage;
+        while (box >= pixel_coverage) {
+            a += ((*src >> 24) & 0xff) * pixel_coverage;
+            r += ((*src >> 16) & 0xff) * pixel_coverage;
+            g += ((*src >>  8) & 0xff) * pixel_coverage;
+            b += ((*src >>  0) & 0xff) * pixel_coverage;
+            src++;
+            x++;
+
+            box -= pixel_coverage;
+        }
+        /* multiply by whatever is leftover
+         * this ensures that we don't bias down.
+         * i.e. start_coverage + n*pixel_coverage + box == 1 << 24 */
+        if (box > 0) {
+            a += ((*src >> 24) & 0xff) * box;
+            r += ((*src >> 16) & 0xff) * box;
+            g += ((*src >>  8) & 0xff) * box;
+            b += ((*src >>  0) & 0xff) * box;
+        }
+
+        a >>= FIXED_SHIFT;
+        r >>= FIXED_SHIFT;
+        g >>= FIXED_SHIFT;
+        b >>= FIXED_SHIFT;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+    }
+}
+
+static void downsample_columns_box_filter(
+        int n,
+        int start_coverage,
+        int pixel_coverage,
+        uint32_t *src, uint32_t *dest)
+{
+    int stride = n;
+    while (n--) {
+        uint32_t a = 0;
+        uint32_t r = 0;
+        uint32_t g = 0;
+        uint32_t b = 0;
+        uint32_t *column_src = src;
+        int box = 1 << FIXED_SHIFT;
+        a = ((*column_src >> 24) & 0xff) * start_coverage;
+        r = ((*column_src >> 16) & 0xff) * start_coverage;
+        g = ((*column_src >>  8) & 0xff) * start_coverage;
+        b = ((*column_src >>  0) & 0xff) * start_coverage;
+        column_src += stride;
+        box -= start_coverage;
+        while (box >= pixel_coverage) {
+            a += ((*column_src >> 24) & 0xff) * pixel_coverage;
+            r += ((*column_src >> 16) & 0xff) * pixel_coverage;
+            g += ((*column_src >>  8) & 0xff) * pixel_coverage;
+            b += ((*column_src >>  0) & 0xff) * pixel_coverage;
+            column_src += stride;
+            box -= pixel_coverage;
+        }
+        if (box > 0) {
+            a += ((*column_src >> 24) & 0xff) * box;
+            r += ((*column_src >> 16) & 0xff) * box;
+            g += ((*column_src >>  8) & 0xff) * box;
+            b += ((*column_src >>  0) & 0xff) * box;
+        }
+        a >>= FIXED_SHIFT;
+        r >>= FIXED_SHIFT;
+        g >>= FIXED_SHIFT;
+        b >>= FIXED_SHIFT;
+
+        *dest = (a << 24) | (r << 16) | (g << 8) | b;
+        dest++;
+        src++;
+    }
+}
+
+#if 1
+int compute_coverage(int coverage[], int src_length, int dest_length) {
+    int i;
+    /* num = src_length/dest_length
+       total = sum(pixel) / num
+
+       pixel * 1/num == pixel * dest_length / src_length
+    */
+    int full = 1<<24;
+    /* the average contribution of each source pixel */
+    int ratio = ((1 << 24)*(long long int)dest_length)/src_length;
+    int finished = ((1 << 24)*(long long int)dest_length) - (((1 << 24)*(long long int)dest_length)/src_length)*src_length;
+    int p=0;
+    int p2;
+    int remainder = full;
+    /* because ((1 << 24)*(long long int)dest_length) won't always be divisible by src_length
+     * we'll need someplace to put the other bits.
+     *
+     * We want to ensure a + n*ratio < 1<<24
+     *
+     * 1<<24
+     * */
+    for (i=0; i<src_length; i++) {
+        if (remainder < ratio) {
+            p = ratio - remainder;
+            remainder = full - p;
+        } else {
+            p = ratio;
+            remainder -= ratio;
+        }
+
+        if ((((i+1)*ratio) % full) < ratio) {
+            p2 = (((i+1)*ratio) % full);
+        } else {
+            p2 = ratio;
+        }
+
+        //printf("%d %d %d %d %d\n", i, p, remainder, (i+1)*ratio, p2);
+        //assert(p == p2);
+
+        coverage[i] = p;
+    }
+    //assert(remainder == 0);
+    return ratio;
+}
+#endif
+
+
+PIXMAN_EXPORT
+int downscale_box_mult_old_filter(pixman_image_t *pict, unsigned origWidth, unsigned origHeight,
+		signed scaledWidth, signed scaledHeight,
+		uint16_t startColumn, uint16_t startRow,
+		uint16_t width, uint16_t height,
+		uint32_t *src, int srcStride,
+		uint32_t *dest, int dstStride)
+{
+   // printf("%d %d %d %d\n", scaledWidth, scaledHeight, origWidth, origHeight);
+        int yd1 = 0;
+        int d;
+
+        /* we need to allocate enough room for ceil(src_height/dest_height) scanlines */
+        uint32_t *temp_buf = pixman_malloc_abc ((origHeight + scaledHeight-1)/scaledHeight, scaledWidth, sizeof(uint32_t));
+	if (!temp_buf)
+            return -1;
+
+	//XXX: I suppose we should check whether this will succeed
+        uint32_t *scanline = pixman_malloc_abc (origWidth, 1, sizeof(uint32_t));
+
+        int *x_coverage = pixman_malloc_abc (origWidth, 1, sizeof(int));
+        int *y_coverage = pixman_malloc_abc (origHeight, 1, sizeof(int));
+        int pixel_coverage_x = compute_coverage(x_coverage, origWidth, scaledWidth);
+        int pixel_coverage_y = compute_coverage(y_coverage, origHeight, scaledHeight);
+
+        int y = 0;
+        for (d = 0; d < startRow + height; d++)
+	{
+            int columns = 0;
+            int box = 1 << FIXED_SHIFT;
+            int start_coverage_y = y_coverage[y];
+            fetch_scanline(pict, y, scanline);
+            downsample_row_box_filter(width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+            columns++;
+            y++;
+            box -= start_coverage_y;
+
+            while (box >= pixel_coverage_y) {
+                fetch_scanline(pict, y, scanline);
+                downsample_row_box_filter(width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+                columns++;
+                y++;
+                box -= pixel_coverage_y;
+            }
+            if (box > 0) {
+                fetch_scanline(pict, y, scanline);
+                downsample_row_box_filter(width, scanline, temp_buf + width * columns,
+                    x_coverage, pixel_coverage_x);
+                columns++;
+            }
+            downsample_columns_box_filter(width, start_coverage_y, pixel_coverage_y, temp_buf, dest + (yd1 - startRow)*dstStride/4);
+            yd1++;
+        }
+        free(temp_buf);
+        return 0;
+}
+
+#endif
diff --git a/pixman/pixman-rescale.c b/pixman/pixman-rescale.c
new file mode 100644
index 0000000..20e2e7c
--- /dev/null
+++ b/pixman/pixman-rescale.c
@@ -0,0 +1,124 @@
+#ifdef HAVE_CONFIG_H
+#include <config.h>
+#endif
+
+#include <stdlib.h>
+
+#include "pixman-private.h"
+#include "pixman-rescale.h"
+
+/* Fetcher documentation:
+ * Goals: we need a generic interface to fetch lines of ARGB32
+ * - we want to support converting to argb32
+ * - we also want to support having the rescaler read directly from the buffer
+ *   if the buffer is of a format compatible with the rescaler */
+
+static void fetch_convert(void *closure, int y, uint32_t **scanline_ptr)
+{
+    pixman_image_t *pict = closure;
+    uint32_t *scanline = *scanline_ptr;
+    pict->bits.fetch_scanline_raw_32(pict, 0, y, pict->bits.width, scanline, NULL, 0);
+
+}
+
+static uint32_t *fetch_convert_init(void *closure)
+{
+    pixman_image_t *pict = closure;
+    return pixman_malloc_abc (pict->bits.width, 1, sizeof(uint32_t));
+}
+
+static void fetch_convert_fini(void *closure, uint32_t *scanline)
+{
+    free(scanline);
+}
+
+
+static void fetch_raw(void *closure, int y, uint32_t **scanline)
+{
+    pixman_image_t *pict = closure;
+    bits_image_t *bits = &pict->bits;
+    /* set the scanline to the begining of the yth row */
+    *scanline = bits->bits + y*bits->rowstride;
+}
+
+static uint32_t *fetch_raw_init(void *closure)
+{
+    return closure;
+}
+
+static void fetch_raw_fini(void *closure, uint32_t *scanline)
+{
+}
+
+const struct rescale_fetcher fetcher_convert = {
+    fetch_convert,
+    fetch_convert_init,
+    fetch_convert_fini
+};
+
+const struct rescale_fetcher fetcher_raw = {
+    fetch_raw,
+    fetch_raw_init,
+    fetch_raw_fini
+};
+
+/* downscale an pixman_image_t into an ARGB32 buffer
+ *
+ * scaled_width / original_width is the scale ratio
+ * using scaled_width instead of just passing in the ratio avoids any rounding errors because
+ * the ratio can be specified exactly
+ *
+ * start_column, start_row, width, height
+ * specify a rectangle in the scaled source space
+ *
+ * dest, dst_stride, width, and height allow specifying
+ * a rectangle in the destination buffer
+ */
+PIXMAN_EXPORT
+int downscale_image(pixman_image_t *pict, enum downscale_type type,
+		int scaled_width, int scaled_height,
+		int start_column, int start_row,
+		int width, int height,
+		uint32_t *dest, int dst_stride)
+{
+    const struct rescale_fetcher *fetcher;
+    int orig_height = pict->bits.height;
+    int orig_width  = pict->bits.width;
+
+    /* choose a fetcher */
+    if (PIXMAN_FORMAT_BPP(pict->bits.format) == 32 &&
+            PIXMAN_FORMAT_R(pict->bits.format) == 8 &&
+            PIXMAN_FORMAT_G(pict->bits.format) == 8 &&
+            PIXMAN_FORMAT_B(pict->bits.format) == 8)
+    {
+        fetcher = &fetcher_raw;
+    } else
+    {
+        fetcher = &fetcher_convert;
+    }
+
+    /* choose and run downscaler */
+    switch (type)
+    {
+        case LANCZOS:
+            return downscale_lanczos_filter(fetcher, pict, orig_width, orig_height,
+                scaled_width, scaled_height,
+                start_column, start_row,
+                width, height,
+                dest, dst_stride);
+        case BOX:
+            return downscale_box_filter(fetcher, pict, orig_width, orig_height,
+                scaled_width, scaled_height,
+                start_column, start_row,
+                width, height,
+                dest, dst_stride);
+        case INTEGER_BOX:
+            return downscale_integer_box_filter(fetcher, pict, orig_width, orig_height,
+                scaled_width, scaled_height,
+                start_column, start_row,
+                width, height,
+                dest, dst_stride);
+        default:
+            return -1;
+    }
+}
diff --git a/pixman/pixman-rescale.h b/pixman/pixman-rescale.h
new file mode 100644
index 0000000..9c8cf73
--- /dev/null
+++ b/pixman/pixman-rescale.h
@@ -0,0 +1,44 @@
+#include "pixman-private.h"
+struct rescale_fetcher
+{
+    void (*fetch_scanline)(void *closure, int y, uint32_t **scanline_ptr);
+    uint32_t *(*init)(void *closure);
+    void (*fini)(void *closure, uint32_t *scanline);
+};
+
+enum downscale_type
+{
+    INTEGER_BOX,
+    BOX,
+    LANCZOS
+};
+
+int downscale_image(pixman_image_t *image, enum downscale_type type,
+		int scaled_width, int scaled_height,
+		int start_column, int start_row,
+		int width, int height,
+		uint32_t *dest, int dst_stride);
+
+int downscale_integer_box_filter(
+        const struct rescale_fetcher *fetcher, void *closure,
+        unsigned orig_width, unsigned orig_height,
+		signed scaledWidth, signed scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride);
+
+int downscale_box_filter(
+        const struct rescale_fetcher *fetcher, void *closure,
+        unsigned orig_width, unsigned orig_height,
+		signed scaled_width, signed scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride);
+
+int downscale_lanczos_filter(
+        const struct rescale_fetcher *fetcher, void *closure,
+        unsigned orig_width, unsigned orig_height,
+		signed scaledWidth, signed scaled_height,
+		uint16_t start_column, uint16_t start_row,
+		uint16_t width, uint16_t height,
+		uint32_t *dest, int dst_stride);