Mesa (pipe-video): [g3dvl] move quantification into shaders

Mon May 30 18:22:40 UTC 2011

Module: Mesa
Branch: pipe-video
Commit: 912dc8ff09cd7c28926762c2e562de5a99d3e27a
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=912dc8ff09cd7c28926762c2e562de5a99d3e27a

Author: Christian König <deathsimple at vodafone.de>
Date:   Sun May 29 19:53:45 2011 +0200

[g3dvl] move quantification into shaders

---

 src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c |  107 +++++++------------
 src/gallium/auxiliary/vl/vl_mpeg12_decoder.c   |   23 ++++-
 src/gallium/auxiliary/vl/vl_zscan.c            |  138 ++++++++++++++++++------
 src/gallium/auxiliary/vl/vl_zscan.h            |   12 +--
 4 files changed, 168 insertions(+), 112 deletions(-)

diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
index bf9b6cd..e7fbc31 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_bitstream.c
@@ -55,7 +55,6 @@
 #include <pipe/p_video_state.h>
 
 #include "vl_vlc.h"
-#include "vl_zscan.h"
 #include "vl_mpeg12_bitstream.h"
 
 /* take num bits from the high part of bit_buf and zero extend them */
@@ -64,12 +63,6 @@
 /* take num bits from the high part of bit_buf and sign extend them */
 #define SBITS(buf,num) (((int32_t)(buf)) >> (32 - (num)))
 
-#define SATURATE(val)			\
-do {					\
-   if ((uint32_t)(val + 2048) > 4095)	\
-      val = (val > 0) ? 2047 : -2048;	\
-} while (0)
-
 /* macroblock modes */
 #define MACROBLOCK_INTRA 1
 #define MACROBLOCK_PATTERN 2
@@ -721,7 +714,7 @@ get_chroma_dc_dct_diff(struct vl_mpg12_bs *bs)
 }
 
 static inline void
-get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
+get_intra_block_B14(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
    int i, val;
    const DCTtab *tab;
@@ -742,12 +735,10 @@ get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
       normal_code:
          bs->vlc.buf <<= tab->len;
          bs->vlc.bits += tab->len + 1;
-         val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+         val = tab->level * quantizer_scale;
 
-         /* if (bitstream_get (1)) val = -val; */
          val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);
 
-         SATURATE (val);
          dest[i] = val;
 
          bs->vlc.buf <<= 1;
@@ -771,9 +762,8 @@ get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 
          vl_vlc_dumpbits(&bs->vlc, 12);
          vl_vlc_needbits(&bs->vlc);
-         val = (vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale * quant_matrix[i]) / 16;
+         val = vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale;
 
-         SATURATE (val);
          dest[i] = val;
 
          vl_vlc_dumpbits(&bs->vlc, 12);
@@ -811,7 +801,7 @@ get_intra_block_B14(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 }
 
 static inline void
-get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
+get_intra_block_B15(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
    int i, val;
    const DCTtab * tab;
@@ -831,12 +821,10 @@ get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
          normal_code:
             bs->vlc.buf <<= tab->len;
             bs->vlc.bits += tab->len + 1;
-            val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+            val = tab->level * quantizer_scale;
 
-            /* if (bitstream_get (1)) val = -val; */
             val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);
 
-            SATURATE (val);
             dest[i] = val;
 
             bs->vlc.buf <<= 1;
@@ -859,9 +847,8 @@ get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 
             vl_vlc_dumpbits(&bs->vlc, 12);
             vl_vlc_needbits(&bs->vlc);
-            val = (vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale * quant_matrix[i]) / 16;
+            val = vl_vlc_sbits(&bs->vlc, 12) * quantizer_scale;
 
-            SATURATE (val);
             dest[i] = val;
 
             vl_vlc_dumpbits(&bs->vlc, 12);
@@ -900,7 +887,7 @@ get_intra_block_B15(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 }
 
 static inline void
-get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
+get_non_intra_block(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
    int i, val;
    const DCTtab *tab;
@@ -927,12 +914,10 @@ get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
       normal_code:
          bs->vlc.buf <<= tab->len;
          bs->vlc.bits += tab->len + 1;
-         val = ((2*tab->level+1) * quantizer_scale * quant_matrix[i]) >> 5;
+         val = ((2*tab->level+1) * quantizer_scale) >> 1;
 
-         /* if (bitstream_get (1)) val = -val; */
          val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);
 
-         SATURATE (val);
          dest[i] = val;
 
          bs->vlc.buf <<= 1;
@@ -960,9 +945,8 @@ get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
          vl_vlc_dumpbits(&bs->vlc, 12);
          vl_vlc_needbits(&bs->vlc);
          val = 2 * (vl_vlc_sbits(&bs->vlc, 12) + vl_vlc_sbits(&bs->vlc, 1)) + 1;
-         val = (val * quantizer_scale * quant_matrix[i]) / 32;
+         val = (val * quantizer_scale) / 2;
 
-         SATURATE (val);
          dest[i] = val;
 
          vl_vlc_dumpbits(&bs->vlc, 12);
@@ -999,7 +983,7 @@ get_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quan
 }
 
 static inline void
-get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
+get_mpeg1_intra_block(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
    int i, val;
    const DCTtab * tab;
@@ -1020,7 +1004,7 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
       normal_code:
          bs->vlc.buf <<= tab->len;
          bs->vlc.bits += tab->len + 1;
-         val = (tab->level * quantizer_scale * quant_matrix[i]) >> 4;
+         val = tab->level * quantizer_scale;
 
          /* oddification */
          val = (val - 1) | 1;
@@ -1028,7 +1012,6 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
          /* if (bitstream_get (1)) val = -val; */
          val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);
 
-         SATURATE (val);
          dest[i] = val;
 
          bs->vlc.buf <<= 1;
@@ -1057,12 +1040,11 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
             vl_vlc_dumpbits(&bs->vlc, 8);
             val = vl_vlc_ubits(&bs->vlc, 8) + 2 * val;
          }
-         val = (val * quantizer_scale * quant_matrix[i]) / 16;
+         val = val * quantizer_scale;
 
          /* oddification */
          val = (val + ~SBITS (val, 1)) | 1;
 
-         SATURATE (val);
          dest[i] = val;
 
          vl_vlc_dumpbits(&bs->vlc, 8);
@@ -1099,7 +1081,7 @@ get_mpeg1_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int qu
 }
 
 static inline void
-get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], int quantizer_scale, short *dest)
+get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, int quantizer_scale, short *dest)
 {
    int i, val;
    const DCTtab * tab;
@@ -1126,7 +1108,7 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
       normal_code:
          bs->vlc.buf <<= tab->len;
          bs->vlc.bits += tab->len + 1;
-         val = ((2*tab->level+1) * quantizer_scale * quant_matrix[i]) >> 5;
+         val = ((2*tab->level+1) * quantizer_scale) >> 1;
 
          /* oddification */
          val = (val - 1) | 1;
@@ -1134,7 +1116,6 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
          /* if (bitstream_get (1)) val = -val; */
          val = (val ^ vl_vlc_sbits(&bs->vlc, 1)) - vl_vlc_sbits(&bs->vlc, 1);
 
-         SATURATE (val);
          dest[i] = val;
 
          bs->vlc.buf <<= 1;
@@ -1167,12 +1148,11 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
             val = vl_vlc_ubits(&bs->vlc, 8) + 2 * val;
          }
          val = 2 * (val + SBITS (val, 1)) + 1;
-         val = (val * quantizer_scale * quant_matrix[i]) / 32;
+         val = (val * quantizer_scale) / 2;
 
          /* oddification */
          val = (val + ~SBITS (val, 1)) | 1;
 
-         SATURATE (val);
          dest[i] = val;
 
          vl_vlc_dumpbits(&bs->vlc, 8);
@@ -1209,7 +1189,7 @@ get_mpeg1_non_intra_block(struct vl_mpg12_bs *bs, const int quant_matrix[64], in
 }
 
 static inline void
-slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, const int quant_matrix[64], int cc,
+slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, int cc,
                  unsigned x, unsigned y, enum pipe_mpeg12_dct_type coding, int quantizer_scale, int dc_dct_pred[3])
 {
    short dest[64];
@@ -1228,14 +1208,14 @@ slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * pictur
       dc_dct_pred[cc] += get_chroma_dc_dct_diff(bs);
 
    memset(dest, 0, sizeof(int16_t) * 64);
-   dest[0] = dc_dct_pred[cc] << (3 - picture->intra_dc_precision);
+   dest[0] = dc_dct_pred[cc];
    if (picture->mpeg1) {
       if (picture->picture_coding_type != D_TYPE)
-          get_mpeg1_intra_block(bs, quant_matrix, quantizer_scale, dest);
+          get_mpeg1_intra_block(bs, quantizer_scale, dest);
    } else if (picture->intra_vlc_format)
-      get_intra_block_B15(bs, quant_matrix, quantizer_scale, dest);
+      get_intra_block_B15(bs, quantizer_scale, dest);
    else
-      get_intra_block_B14(bs, quant_matrix, quantizer_scale, dest);
+      get_intra_block_B14(bs, quantizer_scale, dest);
 
    memcpy(bs->ycbcr_buffer[cc], dest, sizeof(int16_t) * 64);
 
@@ -1245,7 +1225,7 @@ slice_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * pictur
 }
 
 static inline void
-slice_non_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, const int quant_matrix[64], int cc,
+slice_non_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture, int cc,
                     unsigned x, unsigned y,  enum pipe_mpeg12_dct_type coding, int quantizer_scale)
 {
    short dest[64];
@@ -1257,9 +1237,9 @@ slice_non_intra_DCT(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * pi
 
    memset(dest, 0, sizeof(int16_t) * 64);
    if (picture->mpeg1)
-      get_mpeg1_non_intra_block(bs, quant_matrix, quantizer_scale, dest);
+      get_mpeg1_non_intra_block(bs, quantizer_scale, dest);
    else
-      get_non_intra_block(bs, quant_matrix, quantizer_scale, dest);
+      get_non_intra_block(bs, quantizer_scale, dest);
 
    memcpy(bs->ycbcr_buffer[cc], dest, sizeof(int16_t) * 64);
 
@@ -1571,8 +1551,7 @@ slice_init(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc * picture,
 }
 
 static inline bool
-decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture,
-             const int intra_quantizer_matrix[64], const int non_intra_quantizer_matrix[64])
+decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture)
 {
    enum pipe_video_field_select default_field_select;
    struct pipe_motionvector mv_fwd, mv_bwd;
@@ -1659,12 +1638,12 @@ decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture,
          mv_bwd.top.weight = mv_bwd.bottom.weight = PIPE_VIDEO_MV_WEIGHT_MIN;
 
          // unravaled loop of 6 block(i) calls in macroblock()
-         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+0, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
-         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+1, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
-         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+0, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
-         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 0, x*2+1, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
-         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
-         slice_intra_DCT(bs, picture, intra_quantizer_matrix, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
+         slice_intra_DCT(bs, picture, 0, x*2+0, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
+         slice_intra_DCT(bs, picture, 0, x*2+1, y*2+0, dct_type, quantizer_scale, dc_dct_pred);
+         slice_intra_DCT(bs, picture, 0, x*2+0, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
+         slice_intra_DCT(bs, picture, 0, x*2+1, y*2+1, dct_type, quantizer_scale, dc_dct_pred);
+         slice_intra_DCT(bs, picture, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
+         slice_intra_DCT(bs, picture, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale, dc_dct_pred);
 
          if (picture->picture_coding_type == D_TYPE) {
             vl_vlc_needbits(&bs->vlc);
@@ -1722,17 +1701,17 @@ decode_slice(struct vl_mpg12_bs *bs, struct pipe_mpeg12_picture_desc *picture,
 
             // TODO  optimize not fully used for idct accel only mc.
             if (coded_block_pattern & 0x20)
-               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+0, y*2+0, dct_type, quantizer_scale); // cc0  luma 0
+               slice_non_intra_DCT(bs, picture, 0, x*2+0, y*2+0, dct_type, quantizer_scale); // cc0  luma 0
             if (coded_block_pattern & 0x10)
-               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+1, y*2+0, dct_type, quantizer_scale); // cc0 luma 1
+               slice_non_intra_DCT(bs, picture, 0, x*2+1, y*2+0, dct_type, quantizer_scale); // cc0 luma 1
             if (coded_block_pattern & 0x08)
-               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+0, y*2+1, dct_type, quantizer_scale); // cc0 luma 2
+               slice_non_intra_DCT(bs, picture, 0, x*2+0, y*2+1, dct_type, quantizer_scale); // cc0 luma 2
             if (coded_block_pattern & 0x04)
-               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 0, x*2+1, y*2+1, dct_type, quantizer_scale); // cc0 luma 3
+               slice_non_intra_DCT(bs, picture, 0, x*2+1, y*2+1, dct_type, quantizer_scale); // cc0 luma 3
             if (coded_block_pattern & 0x2)
-               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc1 croma
+               slice_non_intra_DCT(bs, picture, 1, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc1 croma
             if (coded_block_pattern & 0x1)
-               slice_non_intra_DCT(bs, picture, non_intra_quantizer_matrix, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc2 croma
+               slice_non_intra_DCT(bs, picture, 2, x, y, PIPE_MPEG12_DCT_TYPE_FRAME, quantizer_scale); // cc2 croma
          }
 
          dc_dct_pred[0] = dc_dct_pred[1] = dc_dct_pred[2] = 0;
@@ -1845,12 +1824,6 @@ void
 vl_mpg12_bs_decode(struct vl_mpg12_bs *bs, unsigned num_bytes, const void *buffer,
                    struct pipe_mpeg12_picture_desc *picture, unsigned num_ycbcr_blocks[3])
 {
-   int intra_quantizer_matrix[64];
-   int non_intra_quantizer_matrix[64];
-
-   const int *scan;
-   unsigned i;
-
    assert(bs);
    assert(num_ycbcr_blocks);
    assert(buffer && num_bytes);
@@ -1859,11 +1832,5 @@ vl_mpg12_bs_decode(struct vl_mpg12_bs *bs, unsigned num_bytes, const void *buffe
 
    vl_vlc_init(&bs->vlc, buffer, num_bytes);
 
-   scan = picture->alternate_scan ? vl_zscan_alternate : vl_zscan_normal;
-   for (i = 0; i < 64; ++i) {
-      intra_quantizer_matrix[i] = picture->intra_quantizer_matrix[scan[i]];
-      non_intra_quantizer_matrix[i] = picture->non_intra_quantizer_matrix[scan[i]];
-   }
-
-   while(decode_slice(bs, picture, intra_quantizer_matrix, non_intra_quantizer_matrix));
+   while(decode_slice(bs, picture));
 }
diff --git a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
index f96d7f0..ca790e7 100644
--- a/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
+++ b/src/gallium/auxiliary/vl/vl_mpeg12_decoder.c
@@ -312,8 +312,21 @@ vl_mpeg12_buffer_map(struct pipe_video_decode_buffer *buffer)
 
       vl_mpg12_bs_set_buffers(&buf->bs, ycbcr_stream, buf->texels, mv_stream);
    } else {
-      for (i = 0; i < VL_MAX_PLANES; ++i)
+      static const uint8_t dummy_quant[64] = {
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10,
+         0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10, 0x10
+      };
+
+      for (i = 0; i < VL_MAX_PLANES; ++i) {
          vl_zscan_set_layout(&buf->zscan[i], dec->zscan_linear);
+         vl_zscan_upload_quant(&buf->zscan[i], dummy_quant, dummy_quant);
+      }
    }
 }
 
@@ -365,6 +378,7 @@ vl_mpeg12_buffer_decode_bitstream(struct pipe_video_decode_buffer *buffer,
                                   unsigned num_ycbcr_blocks[3])
 {
    struct vl_mpeg12_buffer *buf = (struct vl_mpeg12_buffer*)buffer;
+   uint8_t intra_quantizer_matrix[64];
    struct vl_mpeg12_decoder *dec;
    unsigned i;
 
@@ -373,8 +387,13 @@ vl_mpeg12_buffer_decode_bitstream(struct pipe_video_decode_buffer *buffer,
    dec = (struct vl_mpeg12_decoder *)buf->base.decoder;
    assert(dec);
 
-   for (i = 0; i < VL_MAX_PLANES; ++i)
+   memcpy(intra_quantizer_matrix, picture->intra_quantizer_matrix, sizeof(intra_quantizer_matrix));
+   intra_quantizer_matrix[0] = 1 << (7 - picture->intra_dc_precision);
+
+   for (i = 0; i < VL_MAX_PLANES; ++i) {
       vl_zscan_set_layout(&buf->zscan[i], picture->alternate_scan ? dec->zscan_alternate : dec->zscan_normal);
+      vl_zscan_upload_quant(&buf->zscan[i], intra_quantizer_matrix, picture->non_intra_quantizer_matrix);
+   }
 
    vl_mpg12_bs_decode(&buf->bs, num_bytes, data, picture, num_ycbcr_blocks);
 }
diff --git a/src/gallium/auxiliary/vl/vl_zscan.c b/src/gallium/auxiliary/vl/vl_zscan.c
index 0f468df..4af3962 100644
--- a/src/gallium/auxiliary/vl/vl_zscan.c
+++ b/src/gallium/auxiliary/vl/vl_zscan.c
@@ -136,11 +136,11 @@ create_vert_shader(struct vl_zscan *zscan)
    ureg_MUL(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_XY), ureg_src(tmp), scale);
    ureg_MOV(shader, ureg_writemask(o_vpos, TGSI_WRITEMASK_ZW), ureg_imm1f(shader, 1.0f));
 
-   ureg_MUL(shader, ureg_writemask(tmp, TGSI_WRITEMASK_XZ), ureg_scalar(instance, TGSI_SWIZZLE_X),
+   ureg_MUL(shader, ureg_writemask(tmp, TGSI_WRITEMASK_XW), ureg_scalar(instance, TGSI_SWIZZLE_X),
             ureg_imm1f(shader, 1.0f / zscan->blocks_per_line));
 
    ureg_FRC(shader, ureg_writemask(tmp, TGSI_WRITEMASK_Y), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_X));
-   ureg_FLR(shader, ureg_writemask(tmp, TGSI_WRITEMASK_Z), ureg_src(tmp));
+   ureg_FLR(shader, ureg_writemask(tmp, TGSI_WRITEMASK_W), ureg_src(tmp));
 
    for (i = 0; i < zscan->num_channels; ++i) {
       ureg_ADD(shader, ureg_writemask(tmp, TGSI_WRITEMASK_X), ureg_scalar(ureg_src(tmp), TGSI_SWIZZLE_Y),
@@ -149,7 +149,8 @@ create_vert_shader(struct vl_zscan *zscan)
       ureg_MAD(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_X), vrect,
                ureg_imm1f(shader, 1.0f / zscan->blocks_per_line), ureg_src(tmp));
       ureg_MOV(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_Y), vrect);
-      ureg_MUL(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_Z), ureg_src(tmp),
+      ureg_MOV(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_Z), vpos);
+      ureg_MUL(shader, ureg_writemask(o_vtex[i], TGSI_WRITEMASK_W), ureg_src(tmp),
                ureg_imm1f(shader, (float)zscan->blocks_per_line / zscan->blocks_total));
    }
 
@@ -165,10 +166,10 @@ create_frag_shader(struct vl_zscan *zscan)
    struct ureg_program *shader;
    struct ureg_src vtex[zscan->num_channels];
 
-   struct ureg_src src, scan, quant;
+   struct ureg_src samp_src, samp_scan, samp_quant;
 
    struct ureg_dst tmp[zscan->num_channels];
-   struct ureg_dst fragment;
+   struct ureg_dst quant, fragment;
 
    unsigned i;
 
@@ -179,12 +180,13 @@ create_frag_shader(struct vl_zscan *zscan)
    for (i = 0; i < zscan->num_channels; ++i)
       vtex[i] = ureg_DECL_fs_input(shader, TGSI_SEMANTIC_GENERIC, VS_O_VTEX + i, TGSI_INTERPOLATE_LINEAR);
 
-   src = ureg_DECL_sampler(shader, 0);
-   scan = ureg_DECL_sampler(shader, 1);
-   quant = ureg_DECL_sampler(shader, 2);
+   samp_src = ureg_DECL_sampler(shader, 0);
+   samp_scan = ureg_DECL_sampler(shader, 1);
+   samp_quant = ureg_DECL_sampler(shader, 2);
 
    for (i = 0; i < zscan->num_channels; ++i)
       tmp[i] = ureg_DECL_temporary(shader);
+   quant = ureg_DECL_temporary(shader);
 
    fragment = ureg_DECL_output(shader, TGSI_SEMANTIC_COLOR, 0);
 
@@ -194,17 +196,18 @@ create_frag_shader(struct vl_zscan *zscan)
     * fragment = tex(tmp, 0) * quant
     */
    for (i = 0; i < zscan->num_channels; ++i)
-      ureg_TEX(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_X), TGSI_TEXTURE_2D, vtex[i], scan);
+      ureg_TEX(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_X), TGSI_TEXTURE_2D, vtex[i], samp_scan);
 
    for (i = 0; i < zscan->num_channels; ++i)
-      ureg_MOV(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_Y), ureg_scalar(vtex[i], TGSI_SWIZZLE_Z));
+      ureg_MOV(shader, ureg_writemask(tmp[i], TGSI_WRITEMASK_Y), ureg_scalar(vtex[i], TGSI_SWIZZLE_W));
 
-   for (i = 0; i < zscan->num_channels; ++i)
-      ureg_TEX(shader, tmp[i], TGSI_TEXTURE_2D, ureg_src(tmp[i]), src);
+   for (i = 0; i < zscan->num_channels; ++i) {
+      ureg_TEX(shader, ureg_writemask(tmp[0], TGSI_WRITEMASK_X << i), TGSI_TEXTURE_2D, ureg_src(tmp[i]), samp_src);
+      ureg_TEX(shader, ureg_writemask(quant, TGSI_WRITEMASK_X << i), TGSI_TEXTURE_3D, vtex[i], samp_quant);
+   }
 
-   // TODO: Fetch quant and use it
-   for (i = 0; i < zscan->num_channels; ++i)
-      ureg_MUL(shader, ureg_writemask(fragment, TGSI_WRITEMASK_X << i), ureg_src(tmp[i]), ureg_imm1f(shader, 1.0f));
+   ureg_MUL(shader, quant, ureg_src(quant), ureg_imm1f(shader, 16.0f));
+   ureg_MUL(shader, fragment, ureg_src(tmp[0]), ureg_src(quant));
 
    for (i = 0; i < zscan->num_channels; ++i)
       ureg_release_temporary(shader, tmp[i]);
@@ -283,7 +286,7 @@ init_state(struct vl_zscan *zscan)
       memset(&sampler, 0, sizeof(sampler));
       sampler.wrap_s = PIPE_TEX_WRAP_REPEAT;
       sampler.wrap_t = PIPE_TEX_WRAP_REPEAT;
-      sampler.wrap_r = PIPE_TEX_WRAP_REPEAT;
+      sampler.wrap_r = PIPE_TEX_WRAP_CLAMP_TO_EDGE;
       sampler.min_img_filter = PIPE_TEX_FILTER_NEAREST;
       sampler.min_mip_filter = PIPE_TEX_MIPFILTER_NONE;
       sampler.mag_img_filter = PIPE_TEX_FILTER_NEAREST;
@@ -413,15 +416,6 @@ error_resource:
    return NULL;
 }
 
-#if 0
-// TODO
-struct pipe_sampler_view *
-vl_zscan_normal(struct pipe_context *pipe, unsigned blocks_per_line);
-
-struct pipe_sampler_view *
-vl_zscan_alternate(struct pipe_context *pipe, unsigned blocks_per_line);
-#endif
-
 bool
 vl_zscan_init(struct vl_zscan *zscan, struct pipe_context *pipe,
               unsigned buffer_width, unsigned buffer_height,
@@ -457,16 +451,13 @@ vl_zscan_cleanup(struct vl_zscan *zscan)
    cleanup_state(zscan);
 }
 
-#if 0
-// TODO
-void
-vl_zscan_upload_quant(struct vl_zscan *zscan, ...);
-#endif
-
 bool
 vl_zscan_init_buffer(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer,
                      struct pipe_sampler_view *src, struct pipe_surface *dst)
 {
+   struct pipe_resource res_tmpl, *res;
+   struct pipe_sampler_view sv_tmpl;
+
    assert(zscan && buffer);
 
    memset(buffer, 0, sizeof(struct vl_zscan_buffer));
@@ -489,6 +480,28 @@ vl_zscan_init_buffer(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer,
    buffer->fb_state.nr_cbufs = 1;
    pipe_surface_reference(&buffer->fb_state.cbufs[0], dst);
 
+   memset(&res_tmpl, 0, sizeof(res_tmpl));
+   res_tmpl.target = PIPE_TEXTURE_3D;
+   res_tmpl.format = PIPE_FORMAT_R8_UNORM;
+   res_tmpl.width0 = BLOCK_WIDTH * zscan->blocks_per_line;
+   res_tmpl.height0 = BLOCK_HEIGHT;
+   res_tmpl.depth0 = 2;
+   res_tmpl.array_size = 1;
+   res_tmpl.usage = PIPE_USAGE_IMMUTABLE;
+   res_tmpl.bind = PIPE_BIND_SAMPLER_VIEW;
+
+   res = zscan->pipe->screen->resource_create(zscan->pipe->screen, &res_tmpl);
+   if (!res)
+      return false;
+
+   memset(&sv_tmpl, 0, sizeof(sv_tmpl));
+   u_sampler_view_default_template(&sv_tmpl, res, res->format);
+   sv_tmpl.swizzle_r = sv_tmpl.swizzle_g = sv_tmpl.swizzle_b = sv_tmpl.swizzle_a = TGSI_SWIZZLE_X;
+   buffer->quant = zscan->pipe->create_sampler_view(zscan->pipe, res, &sv_tmpl);
+   pipe_resource_reference(&res, NULL);
+   if (!buffer->quant)
+      return false;
+
    return true;
 }
 
@@ -513,6 +526,65 @@ vl_zscan_set_layout(struct vl_zscan_buffer *buffer, struct pipe_sampler_view *la
 }
 
 void
+vl_zscan_upload_quant(struct vl_zscan_buffer *buffer,
+                      const uint8_t intra_matrix[64],
+                      const uint8_t non_intra_matrix[64])
+{
+   struct pipe_context *pipe;
+   struct pipe_transfer *buf_transfer;
+   unsigned x, y, i, pitch;
+   uint8_t *intra, *non_intra;
+
+   struct pipe_box rect =
+   {
+      0, 0, 0,
+      BLOCK_WIDTH,
+      BLOCK_HEIGHT,
+      2
+   };
+
+   assert(buffer);
+   assert(intra_matrix);
+   assert(non_intra_matrix);
+
+   pipe = buffer->zscan->pipe;
+
+   rect.width *= buffer->zscan->blocks_per_line;
+
+   buf_transfer = pipe->get_transfer
+   (
+      pipe, buffer->quant->texture,
+      0, PIPE_TRANSFER_WRITE | PIPE_TRANSFER_DISCARD,
+      &rect
+   );
+   if (!buf_transfer)
+      goto error_transfer;
+
+   pitch = buf_transfer->stride;
+
+   non_intra = pipe->transfer_map(pipe, buf_transfer);
+   if (!non_intra)
+      goto error_map;
+
+   intra = non_intra + BLOCK_HEIGHT * pitch;
+
+   for (i = 0; i < buffer->zscan->blocks_per_line; ++i)
+      for (y = 0; y < BLOCK_HEIGHT; ++y)
+         for (x = 0; x < BLOCK_WIDTH; ++x) {
+            intra[i * BLOCK_WIDTH + y * pitch + x] = intra_matrix[x + y * BLOCK_WIDTH];
+            non_intra[i * BLOCK_WIDTH + y * pitch + x] = non_intra_matrix[x + y * BLOCK_WIDTH];
+         }
+
+   pipe->transfer_unmap(pipe, buf_transfer);
+
+error_map:
+   pipe->transfer_destroy(pipe, buf_transfer);
+
+error_transfer:
+   return;
+}
+
+void
 vl_zscan_render(struct vl_zscan_buffer *buffer, unsigned num_instances)
 {
    struct vl_zscan *zscan;
@@ -523,10 +595,10 @@ vl_zscan_render(struct vl_zscan_buffer *buffer, unsigned num_instances)
 
    zscan->pipe->bind_rasterizer_state(zscan->pipe, zscan->rs_state);
    zscan->pipe->bind_blend_state(zscan->pipe, zscan->blend);
-   zscan->pipe->bind_fragment_sampler_states(zscan->pipe, 2, zscan->samplers);
+   zscan->pipe->bind_fragment_sampler_states(zscan->pipe, 3, zscan->samplers);
    zscan->pipe->set_framebuffer_state(zscan->pipe, &buffer->fb_state);
    zscan->pipe->set_viewport_state(zscan->pipe, &buffer->viewport);
-   zscan->pipe->set_fragment_sampler_views(zscan->pipe, 2, &buffer->src);
+   zscan->pipe->set_fragment_sampler_views(zscan->pipe, 3, &buffer->src);
    zscan->pipe->bind_vs_state(zscan->pipe, zscan->vs);
    zscan->pipe->bind_fs_state(zscan->pipe, zscan->fs);
    util_draw_arrays_instanced(zscan->pipe, PIPE_PRIM_QUADS, 0, 4, 0, num_instances);
diff --git a/src/gallium/auxiliary/vl/vl_zscan.h b/src/gallium/auxiliary/vl/vl_zscan.h
index ccc6bc4..be12b8e 100644
--- a/src/gallium/auxiliary/vl/vl_zscan.h
+++ b/src/gallium/auxiliary/vl/vl_zscan.h
@@ -53,8 +53,6 @@ struct vl_zscan
    void *samplers[3];
 
    void *vs, *fs;
-
-   struct pipe_sampler_view *quant;
 };
 
 struct vl_zscan_buffer
@@ -84,11 +82,6 @@ vl_zscan_init(struct vl_zscan *zscan, struct pipe_context *pipe,
 void
 vl_zscan_cleanup(struct vl_zscan *zscan);
 
-#if 0
-void
-vl_zscan_upload_quant(struct vl_zscan *zscan, ...);
-#endif
-
 bool
 vl_zscan_init_buffer(struct vl_zscan *zscan, struct vl_zscan_buffer *buffer,
                      struct pipe_sampler_view *src, struct pipe_surface *dst);
@@ -100,6 +93,11 @@ void
 vl_zscan_set_layout(struct vl_zscan_buffer *buffer, struct pipe_sampler_view *layout);
 
 void
+vl_zscan_upload_quant(struct vl_zscan_buffer *buffer,
+                      const uint8_t intra_matrix[64],
+                      const uint8_t non_intra_matrix[64]);
+
+void
 vl_zscan_render(struct vl_zscan_buffer *buffer, unsigned num_instances);
 
 #endif