[Mesa-dev] [PATCH] llvmpipe: reduce alignment requirement for 1d resources from 4x4 to 4x1

Fri May 31 11:37:54 PDT 2013

From: Roland Scheidegger <sroland at vmware.com>

For rendering to buffers, we cannot have any y alignment.
So make sure that tile clear commands only clear up to the fb width/height,
not more (this is actually done for all resources, as clearing more seems
pointless for other resources too).
For the jit fs function, completely skip execution of the lower half of the
fragment shader for the 4x4 stamp; for depth/stencil, only load/store the
values from the first row (the other row is replaced with undef).
For the blend function, likewise only load half the values from the fs output
and drop the second row after untwiddling (this also fixes some issues there
caused by inconsistent usage of block_width/block_height/block_size, num_fs
and fs type length).
Also reduce the alignment of 1d and 1d array resources, because they can be
handled the same as buffers, so there is no need to waste memory.
---
 src/gallium/auxiliary/gallivm/lp_bld_conv.c |   90 +++++++----
 src/gallium/auxiliary/gallivm/lp_bld_pack.c |    6 +-
 src/gallium/drivers/llvmpipe/lp_bld_depth.c |   19 ++-
 src/gallium/drivers/llvmpipe/lp_bld_depth.h |    2 +
 src/gallium/drivers/llvmpipe/lp_rast.c      |    8 +-
 src/gallium/drivers/llvmpipe/lp_scene.c     |    2 -
 src/gallium/drivers/llvmpipe/lp_scene.h     |    4 -
 src/gallium/drivers/llvmpipe/lp_state_fs.c  |  228 +++++++++++++++++++--------
 src/gallium/drivers/llvmpipe/lp_state_fs.h  |    1 +
 src/gallium/drivers/llvmpipe/lp_texture.c   |   24 ++-
 src/gallium/drivers/llvmpipe/lp_texture.h   |   21 +++
 11 files changed, 281 insertions(+), 124 deletions(-)

diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
index eb2d096..f11361a 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
@@ -530,24 +530,22 @@ int lp_build_conv_auto(struct gallivm_state *gallivm,
        dst_type->width    == 8)
    {
       /* Special case 4x4f --> 1x16ub */
-      if (src_type.length == 4 && util_cpu_caps.has_sse2)
+      if (src_type.length == 4 &&
+          util_cpu_caps.has_sse2)
       {
-         assert((num_srcs % 4) == 0);
-
-         num_dsts = num_srcs / 4;
-         dst_type->length = 16;
+         num_dsts = (num_srcs + 3) / 4;
+         dst_type->length = num_srcs * 4 >= 16 ? 16 : num_srcs * 4;
 
          lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
          return num_dsts;
       }
 
       /* Special case 2x8f --> 1x16ub */
-      if (src_type.length == 8 && util_cpu_caps.has_avx)
+      if (src_type.length == 8 &&
+          util_cpu_caps.has_avx)
       {
-         assert((num_srcs % 2) == 0);
-
-         num_dsts = num_srcs / 2;
-         dst_type->length = 16;
+         num_dsts = (num_srcs + 1) / 2;
+         dst_type->length = num_srcs * 8 >= 16 ? 16 : num_srcs * 8;
 
          lp_build_conv(gallivm, src_type, *dst_type, src, num_srcs, dst, num_dsts);
          return num_dsts;
@@ -602,7 +600,7 @@ lp_build_conv(struct gallivm_state *gallivm,
    num_tmps = num_srcs;
 
 
-   /* Special case 4x4f --> 1x16ub 
+   /* Special case 4x4f --> 1x16ub, 2x4f -> 1x8ub, 1x4f -> 1x4ub
     */
    if (src_type.floating == 1 &&
        src_type.fixed    == 0 &&
@@ -616,20 +614,23 @@ lp_build_conv(struct gallivm_state *gallivm,
        dst_type.sign     == 0 &&
        dst_type.norm     == 1 &&
        dst_type.width    == 8 &&
-       dst_type.length   == 16 &&
 
-       4 * num_dsts      == num_srcs &&
+       ((dst_type.length == 16 && 4 * num_dsts == num_srcs) ||
+        (num_dsts == 1 && dst_type.length * num_srcs == 16 && num_srcs != 3)) &&
 
        util_cpu_caps.has_sse2)
    {
       struct lp_build_context bld;
-      struct lp_type int16_type = dst_type;
-      struct lp_type int32_type = dst_type;
+      struct lp_type int16_type, int32_type;
+      struct lp_type dst_type_ext = dst_type;
       LLVMValueRef const_255f;
       unsigned i, j;
 
       lp_build_context_init(&bld, gallivm, src_type);
 
+      dst_type_ext.length = 16;
+      int16_type = int32_type = dst_type_ext;
+
       int16_type.width *= 2;
       int16_type.length /= 2;
       int16_type.sign = 1;
@@ -643,21 +644,34 @@ lp_build_conv(struct gallivm_state *gallivm,
       for (i = 0; i < num_dsts; ++i, src += 4) {
          LLVMValueRef lo, hi;
 
-         for (j = 0; j < 4; ++j) {
+         for (j = 0; j < dst_type.length / 4; ++j) {
             tmp[j] = LLVMBuildFMul(builder, src[j], const_255f, "");
             tmp[j] = lp_build_iround(&bld, tmp[j]);
          }
 
+         if (num_srcs == 1) {
+            tmp[1] = tmp[0];
+         }
+
          /* relying on clamping behavior of sse2 intrinsics here */
          lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
-         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+
+         if (num_srcs < 4) {
+            hi = lo;
+         }
+         else {
+            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+         }
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
+      }
+      if (num_srcs < 4) {
+         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
       }
 
       return; 
    }
 
-   /* Special case 2x8f --> 1x16ub
+   /* Special case 2x8f --> 1x16ub, 1x8f ->1x8ub
     */
    else if (src_type.floating == 1 &&
       src_type.fixed    == 0 &&
@@ -671,20 +685,23 @@ lp_build_conv(struct gallivm_state *gallivm,
       dst_type.sign     == 0 &&
       dst_type.norm     == 1 &&
       dst_type.width    == 8 &&
-      dst_type.length   == 16 &&
 
-      2 * num_dsts      == num_srcs &&
+      ((dst_type.length == 16 && 2 * num_dsts == num_srcs) ||
+       (num_dsts == 1 && dst_type.length * num_srcs == 8)) &&
 
       util_cpu_caps.has_avx) {
 
       struct lp_build_context bld;
-      struct lp_type int16_type = dst_type;
-      struct lp_type int32_type = dst_type;
+      struct lp_type int16_type, int32_type;
+      struct lp_type dst_type_ext = dst_type;
       LLVMValueRef const_255f;
       unsigned i;
 
       lp_build_context_init(&bld, gallivm, src_type);
 
+      dst_type_ext.length = 16;
+      int16_type = int32_type = dst_type_ext;
+
       int16_type.width *= 2;
       int16_type.length /= 2;
       int16_type.sign = 1;
@@ -699,21 +716,30 @@ lp_build_conv(struct gallivm_state *gallivm,
          LLVMValueRef lo, hi, a, b;
 
          a = LLVMBuildFMul(builder, src[0], const_255f, "");
-         b = LLVMBuildFMul(builder, src[1], const_255f, "");
-
          a = lp_build_iround(&bld, a);
-         b = lp_build_iround(&bld, b);
-
          tmp[0] = lp_build_extract_range(gallivm, a, 0, 4);
          tmp[1] = lp_build_extract_range(gallivm, a, 4, 4);
-         tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
-         tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
-
          /* relying on clamping behavior of sse2 intrinsics here */
          lo = lp_build_pack2(gallivm, int32_type, int16_type, tmp[0], tmp[1]);
-         hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
-         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type, lo, hi);
+
+         if (num_srcs == 1) {
+            hi = lo;
+         }
+         else {
+            b = LLVMBuildFMul(builder, src[1], const_255f, "");
+            b = lp_build_iround(&bld, b);
+            tmp[2] = lp_build_extract_range(gallivm, b, 0, 4);
+            tmp[3] = lp_build_extract_range(gallivm, b, 4, 4);
+            hi = lp_build_pack2(gallivm, int32_type, int16_type, tmp[2], tmp[3]);
+
+         }
+         dst[i] = lp_build_pack2(gallivm, int16_type, dst_type_ext, lo, hi);
       }
+
+      if (num_srcs == 1) {
+         dst[0] = lp_build_extract_range(gallivm, dst[0], 0, dst_type.length);
+      }
+
       return;
    }
 
diff --git a/src/gallium/auxiliary/gallivm/lp_bld_pack.c b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
index 0a57e39..14fcd38 100644
--- a/src/gallium/auxiliary/gallivm/lp_bld_pack.c
+++ b/src/gallium/auxiliary/gallivm/lp_bld_pack.c
@@ -241,8 +241,12 @@ lp_build_concat_n(struct gallivm_state *gallivm,
    assert(num_srcs >= num_dsts);
    assert((num_srcs % size) == 0);
 
-   if (num_srcs == num_dsts)
+   if (num_srcs == num_dsts) {
+      for (i = 0; i < num_dsts; ++i) {
+         dst[i] = src[i];
+      }
       return 1;
+   }
 
    for (i = 0; i < num_dsts; ++i) {
       dst[i] = lp_build_concat(gallivm, &src[i * size], src_type, size);
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.c b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
index df6a6c4..a8bd15f 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.c
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.c
@@ -525,6 +525,7 @@ lp_build_occlusion_count(struct gallivm_state *gallivm,
  *
  * \param type  the data type of the fragment depth/stencil values
  * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
  * \param loop_counter  the current loop iteration
  * \param depth_ptr  pointer to the depth/stencil values of this 4x4 block
  * \param depth_stride  stride of the depth/stencil buffer
@@ -535,6 +536,7 @@ void
 lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
                                      struct lp_type z_src_type,
                                      const struct util_format_description *format_desc,
+                                     boolean is_1d,
                                      LLVMValueRef depth_ptr,
                                      LLVMValueRef depth_stride,
                                      LLVMValueRef *z_fb,
@@ -592,9 +594,14 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
    zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset1, 1, "");
    zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
    zs_dst1 = LLVMBuildLoad(builder, zs_dst_ptr, "");
-   zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
-   zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
-   zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   if (is_1d) {
+      zs_dst2 = lp_build_undef(gallivm, zs_load_type);
+   }
+   else {
+      zs_dst_ptr = LLVMBuildGEP(builder, depth_ptr, &depth_offset2, 1, "");
+      zs_dst_ptr = LLVMBuildBitCast(builder, zs_dst_ptr, load_ptr_type, "");
+      zs_dst2 = LLVMBuildLoad(builder, zs_dst_ptr, "");
+   }
 
    *z_fb = LLVMBuildShuffleVector(builder, zs_dst1, zs_dst2,
                                   LLVMConstVector(shuffles, zs_type.length), "");
@@ -648,6 +655,7 @@ lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
  *
  * \param type  the data type of the fragment depth/stencil values
  * \param format_desc  description of the depth/stencil surface
+ * \param is_1d  whether this resource has only one dimension
  * \param mask  the alive/dead pixel mask for the quad (vector)
  * \param z_fb  z values read from fb (with padding)
  * \param s_fb  s values read from fb (with padding)
@@ -661,6 +669,7 @@ void
 lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
                                       struct lp_type z_src_type,
                                       const struct util_format_description *format_desc,
+                                      boolean is_1d,
                                       struct lp_build_mask_context *mask,
                                       LLVMValueRef z_fb,
                                       LLVMValueRef s_fb,
@@ -791,7 +800,9 @@ lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
    }
 
    LLVMBuildStore(builder, zs_dst1, zs_dst_ptr1);
-   LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+   if (!is_1d) {
+      LLVMBuildStore(builder, zs_dst2, zs_dst_ptr2);
+   }
 }
 
 /**
diff --git a/src/gallium/drivers/llvmpipe/lp_bld_depth.h b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
index 2534dc3..d169c89 100644
--- a/src/gallium/drivers/llvmpipe/lp_bld_depth.h
+++ b/src/gallium/drivers/llvmpipe/lp_bld_depth.h
@@ -74,6 +74,7 @@ void
 lp_build_depth_stencil_load_swizzled(struct gallivm_state *gallivm,
                                      struct lp_type z_src_type,
                                      const struct util_format_description *format_desc,
+                                     boolean is_1d,
                                      LLVMValueRef depth_ptr,
                                      LLVMValueRef depth_stride,
                                      LLVMValueRef *z_fb,
@@ -84,6 +85,7 @@ void
 lp_build_depth_stencil_write_swizzled(struct gallivm_state *gallivm,
                                       struct lp_type z_src_type,
                                       const struct util_format_description *format_desc,
+                                      boolean is_1d,
                                       struct lp_build_mask_context *mask,
                                       LLVMValueRef z_fb,
                                       LLVMValueRef s_fb,
diff --git a/src/gallium/drivers/llvmpipe/lp_rast.c b/src/gallium/drivers/llvmpipe/lp_rast.c
index be5a286..981dd71 100644
--- a/src/gallium/drivers/llvmpipe/lp_rast.c
+++ b/src/gallium/drivers/llvmpipe/lp_rast.c
@@ -95,10 +95,10 @@ lp_rast_tile_begin(struct lp_rasterizer_task *task,
    task->bin = bin;
    task->x = x * TILE_SIZE;
    task->y = y * TILE_SIZE;
-   task->width = TILE_SIZE + x * TILE_SIZE > task->scene->width_aligned ?
-                    task->scene->width_aligned - x * TILE_SIZE : TILE_SIZE;
-   task->height = TILE_SIZE + y * TILE_SIZE > task->scene->height_aligned ?
-                    task->scene->height_aligned - y * TILE_SIZE : TILE_SIZE;
+   task->width = TILE_SIZE + x * TILE_SIZE > task->scene->fb.width ?
+                    task->scene->fb.width - x * TILE_SIZE : TILE_SIZE;
+   task->height = TILE_SIZE + y * TILE_SIZE > task->scene->fb.height ?
+                    task->scene->fb.height - y * TILE_SIZE : TILE_SIZE;
 
    /* reset pointers to color and depth tile(s) */
    memset(task->color_tiles, 0, sizeof(task->color_tiles));
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.c b/src/gallium/drivers/llvmpipe/lp_scene.c
index 2dfc7ff..771ad08 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.c
+++ b/src/gallium/drivers/llvmpipe/lp_scene.c
@@ -505,8 +505,6 @@ void lp_scene_begin_binning( struct lp_scene *scene,
 
    scene->tiles_x = align(fb->width, TILE_SIZE) / TILE_SIZE;
    scene->tiles_y = align(fb->height, TILE_SIZE) / TILE_SIZE;
-   scene->width_aligned = align(fb->width, LP_RASTER_BLOCK_SIZE);
-   scene->height_aligned = align(fb->height, LP_RASTER_BLOCK_SIZE);
 
    assert(scene->tiles_x <= TILES_X);
    assert(scene->tiles_y <= TILES_Y);
diff --git a/src/gallium/drivers/llvmpipe/lp_scene.h b/src/gallium/drivers/llvmpipe/lp_scene.h
index bc6c448..fa5bbca 100644
--- a/src/gallium/drivers/llvmpipe/lp_scene.h
+++ b/src/gallium/drivers/llvmpipe/lp_scene.h
@@ -144,10 +144,6 @@ struct lp_scene {
    /** list of resources referenced by the scene commands */
    struct resource_ref *resources;
 
-   /** aligned scene width, height */
-   unsigned width_aligned;
-   unsigned height_aligned;
-
    /** Total memory used by the scene (in bytes).  This sums all the
     * data blocks and counts all bins, state, resource references and
     * other random allocations within the scene.
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.c b/src/gallium/drivers/llvmpipe/lp_state_fs.c
index a20cc78..29b97cd 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.c
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.c
@@ -100,6 +100,7 @@
 #include "lp_tex_sample.h"
 #include "lp_flush.h"
 #include "lp_state_fs.h"
+#include "lp_rast.h"
 
 
 /** Fragment shader number (for debugging) */
@@ -352,7 +353,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
 
    if (depth_mode & EARLY_DEPTH_TEST) {
       lp_build_depth_stencil_load_swizzled(gallivm, type,
-                                           zs_format_desc,
+                                           zs_format_desc, key->is_1d,
                                            depth_ptr, depth_stride,
                                            &z_fb, &s_fb, loop_state.counter);
       lp_build_depth_stencil_test(gallivm,
@@ -368,7 +369,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
                                   !simple_shader);
 
       if (depth_mode & EARLY_DEPTH_WRITE) {
-         lp_build_depth_stencil_write_swizzled(gallivm, type, zs_format_desc,
+         lp_build_depth_stencil_write_swizzled(gallivm, type,
+                                               zs_format_desc, key->is_1d,
                                                NULL, NULL, NULL, loop_state.counter,
                                                depth_ptr, depth_stride,
                                                z_value, s_value);
@@ -423,7 +425,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
       }
 
       lp_build_depth_stencil_load_swizzled(gallivm, type,
-                                           zs_format_desc,
+                                           zs_format_desc, key->is_1d,
                                            depth_ptr, depth_stride,
                                            &z_fb, &s_fb, loop_state.counter);
 
@@ -440,7 +442,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
                                   !simple_shader);
       /* Late Z write */
       if (depth_mode & LATE_DEPTH_WRITE) {
-         lp_build_depth_stencil_write_swizzled(gallivm, type, zs_format_desc,
+         lp_build_depth_stencil_write_swizzled(gallivm, type,
+                                               zs_format_desc, key->is_1d,
                                                NULL, NULL, NULL, loop_state.counter,
                                                depth_ptr, depth_stride,
                                                z_value, s_value);
@@ -453,7 +456,8 @@ generate_fs_loop(struct gallivm_state *gallivm,
        * depth value, update from zs_value with the new mask value and
        * write that out.
        */
-      lp_build_depth_stencil_write_swizzled(gallivm, type, zs_format_desc,
+      lp_build_depth_stencil_write_swizzled(gallivm, type,
+                                            zs_format_desc, key->is_1d,
                                             &mask, z_fb, s_fb, loop_state.counter,
                                             depth_ptr, depth_stride,
                                             z_value, s_value);
@@ -507,6 +511,7 @@ generate_fs_loop(struct gallivm_state *gallivm,
  *
  * @param type            fragment shader type (4x or 8x float)
  * @param num_fs          number of fs_src
+ * @param is_1d           whether we're outputting to a 1d resource
  * @param dst_channels    number of output channels
  * @param fs_src          output from fragment shader
  * @param dst             pointer to store result
@@ -517,6 +522,7 @@ static int
 generate_fs_twiddle(struct gallivm_state *gallivm,
                     struct lp_type type,
                     unsigned num_fs,
+                    boolean is_1d,
                     unsigned dst_channels,
                     LLVMValueRef fs_src[][4],
                     LLVMValueRef* dst,
@@ -528,7 +534,7 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
    bool twiddle;
    bool split;
 
-   unsigned pixels = num_fs == 4 ? 1 : 2;
+   unsigned pixels = type.length / 4;
    unsigned reorder_group;
    unsigned src_channels;
    unsigned src_count;
@@ -537,7 +543,7 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
    src_channels = dst_channels < 3 ? dst_channels : 4;
    src_count = num_fs * src_channels;
 
-   assert(pixels == 2 || num_fs == 4);
+   assert(pixels == 2 || pixels == 1);
    assert(num_fs * src_channels <= Elements(src));
 
    /*
@@ -598,6 +604,8 @@ generate_fs_twiddle(struct gallivm_state *gallivm,
        *
        * src_count =  8 -> 0 2 1 3 4 6 5 7
        * src_count = 16 -> 0 1 4 5 2 3 6 7 8 9 12 13 10 11 14 15
+       * This also works for 1d case (just src_count will be half, picking
+       * only the first half of the values)
        */
       const unsigned reorder_sw[] = { 0, 2, 1, 3 };
 
@@ -917,6 +925,7 @@ scale_bits(struct gallivm_state *gallivm,
  */
 static void
 convert_to_blend_type(struct gallivm_state *gallivm,
+                      unsigned block_size,
                       const struct util_format_description *src_fmt,
                       struct lp_type src_type,
                       struct lp_type dst_type,
@@ -928,7 +937,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
    struct lp_type blend_type;
    struct lp_type mem_type;
    unsigned i, j, k;
-   unsigned pixels = 16 / num_srcs;
+   unsigned pixels = block_size / num_srcs;
    bool is_arith;
 
    /*
@@ -945,30 +954,54 @@ convert_to_blend_type(struct gallivm_state *gallivm,
       assert(dst_type.floating);
       assert(dst_type.width == 32);
       assert(dst_type.length % 4 == 0);
+
       for (i = 0; i < 4; i++) {
          tmpsrc[i] = src[i];
       }
-      for (i = 0; i < num_srcs / 4; i++) {
-         LLVMValueRef tmpsoa[4];
-         LLVMValueRef tmps = tmpsrc[i];
-         if (num_srcs == 8) {
-            LLVMValueRef shuffles[8];
-            unsigned j;
-            /* fetch was 4 values but need 8-wide output values */
-            tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
-            /*
-             * for 8-wide aos transpose would give us wrong order not matching
-             * incoming converted fs values and mask. ARGH.
-             */
-            for (j = 0; j < 4; j++) {
-               shuffles[j] = lp_build_const_int32(gallivm, j * 2);
-               shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
+
+      if (dst_type.length == 4) {
+         for (i = 0; i < num_srcs / 4; i++) {
+            LLVMValueRef tmpsoa[4];
+            lp_build_r11g11b10_to_float(gallivm, tmpsrc[i], tmpsoa);
+            lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
+         }
+      }
+      else {
+         assert(dst_type.length == 8);
+
+         if (num_srcs >= 4) { /* do r11g11b10aos->r32g32b32a32soa 8-wide */
+            for (i = 0; i < num_srcs / 4; i++) {
+               LLVMValueRef tmpsoa[4];
+               LLVMValueRef tmps = tmpsrc[i];
+               LLVMValueRef shuffles[8];
+               unsigned j;
+               /* fetch was 4 values but need 8-wide output values */
+               tmps = lp_build_concat(gallivm, &tmpsrc[i * 2], src_type, 2);
+               /*
+                * for 8-wide aos transpose would give us wrong order not matching
+                * incoming converted fs values and mask. ARGH.
+                */
+               for (j = 0; j < 4; j++) {
+                  shuffles[j] = lp_build_const_int32(gallivm, j * 2);
+                  shuffles[j + 4] = lp_build_const_int32(gallivm, j * 2 + 1);
+               }
+               tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
+                                             LLVMConstVector(shuffles, 8), "");
+               lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
+               lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
+            }
+         }
+         else { /* do r11g11b10aos->r32g32b32a32soa 4-wide */
+            assert(num_srcs == 2);
+            dst_type.length = 4;
+            for (i = 0; i < num_srcs / 2; i++) {
+               LLVMValueRef tmpsoa[4];
+               lp_build_r11g11b10_to_float(gallivm, tmpsrc[i], tmpsoa);
+               lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
+               src[i * 2] = lp_build_concat(gallivm, &src[i * 4], dst_type, 2);
+               src[i * 2 + 1] = lp_build_concat(gallivm, &src[i * 4 + 2], dst_type, 2);
             }
-            tmps = LLVMBuildShuffleVector(builder, tmps, tmps,
-                                          LLVMConstVector(shuffles, 8), "");
          }
-         lp_build_r11g11b10_to_float(gallivm, tmps, tmpsoa);
-         lp_build_transpose_aos(gallivm, dst_type, tmpsoa, &src[i * 4]);
       }
       return;
    }
@@ -1062,6 +1095,7 @@ convert_to_blend_type(struct gallivm_state *gallivm,
  */
 static void
 convert_from_blend_type(struct gallivm_state *gallivm,
+                        unsigned block_size,
                         const struct util_format_description *src_fmt,
                         struct lp_type src_type,
                         struct lp_type dst_type,
@@ -1073,7 +1107,7 @@ convert_from_blend_type(struct gallivm_state *gallivm,
    struct lp_type mem_type;
    struct lp_type blend_type;
    LLVMBuilderRef builder = gallivm->builder;
-   unsigned pixels = 16 / num_srcs;
+   unsigned pixels = block_size / num_srcs;
    bool is_arith;
 
    /*
@@ -1090,28 +1124,51 @@ convert_from_blend_type(struct gallivm_state *gallivm,
       assert(src_type.width == 32);
       assert(src_type.length % 4 == 0);
       assert(dst_type.width == 32);
-      for (i = 0; i < num_srcs / 4; i++) {
-         LLVMValueRef tmpsoa[4], tmpdst;
-         lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
-         tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
-         if (num_srcs == 8) {
-            LLVMValueRef tmpaos, shuffles[8];
-            unsigned j;
-            /*
-             * for 8-wide aos transpose has given us wrong order not matching
-             * output order. HMPF. Also need to split the output values manually.
-             */
-            for (j = 0; j < 4; j++) {
-               shuffles[j * 2] = lp_build_const_int32(gallivm, j);
-               shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
-            }
-            tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
-                                            LLVMConstVector(shuffles, 8), "");
-            src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
-            src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
+
+      if (src_type.length == 4) {
+         for (i = 0; i < num_srcs / 4; i++) {
+            LLVMValueRef tmpsoa[4];
+            lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
+            src[i] = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
          }
-         else {
-            src[i] = tmpdst;
+      }
+      else {
+         assert(src_type.length == 8);
+
+         if (num_srcs >= 4) { /* do r32g32b32a32soa->r11g11b10aos 8-wide */
+            for (i = 0; i < num_srcs / 4; i++) {
+               LLVMValueRef tmpsoa[4], tmpdst;
+               LLVMValueRef tmpaos, shuffles[8];
+               unsigned j;
+               lp_build_transpose_aos(gallivm, src_type, &src[i * 4], tmpsoa);
+               tmpdst = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
+               /*
+                * for 8-wide aos transpose has given us wrong order not matching
+                * output order. HMPF. Also need to split the output values manually.
+                */
+               for (j = 0; j < 4; j++) {
+                  shuffles[j * 2] = lp_build_const_int32(gallivm, j);
+                  shuffles[j * 2 + 1] = lp_build_const_int32(gallivm, j + 4);
+               }
+               tmpaos = LLVMBuildShuffleVector(builder, tmpdst, tmpdst,
+                                               LLVMConstVector(shuffles, 8), "");
+               src[i * 2] = lp_build_extract_range(gallivm, tmpaos, 0, 4);
+               src[i * 2 + 1] = lp_build_extract_range(gallivm, tmpaos, 4, 4);
+             }
+         }
+         else { /* do r32g32b32a32soa->r11g11b10aos 4-wide */
+            assert(num_srcs == 2);
+
+            src_type.length = 4;
+            for (i = 0; i < num_srcs / 2; i++) {
+               LLVMValueRef tmpsoa[4], tmpaos[4];
+               tmpaos[0] = lp_build_extract_range(gallivm, src[i * 2], 0, 4);
+               tmpaos[1] = lp_build_extract_range(gallivm, src[i * 2], 4, 4);
+               tmpaos[2] = lp_build_extract_range(gallivm, src[i * 2 + 1], 0, 4);
+               tmpaos[3] = lp_build_extract_range(gallivm, src[i * 2 + 1], 4, 4);
+               lp_build_transpose_aos(gallivm, src_type, tmpaos, tmpsoa);
+               src[i] = lp_build_float_to_r11g11b10(gallivm, tmpsoa);
+            }
          }
       }
       return;
@@ -1217,6 +1274,7 @@ convert_alpha(struct gallivm_state *gallivm,
               struct lp_type alpha_type,
               const unsigned block_size,
               const unsigned block_height,
+              boolean is_1d,
               const unsigned src_count,
               const unsigned dst_channels,
               const bool pad_inline,
@@ -1228,9 +1286,9 @@ convert_alpha(struct gallivm_state *gallivm,
    row_type.length = alpha_type.length;
 
    /* Twiddle the alpha to match pixels */
-   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, 4, src_alpha);
+   lp_bld_quad_twiddle(gallivm, alpha_type, src_alpha, is_1d ? 2 : 4, src_alpha);
 
-   for (i = 0; i < 4; ++i) {
+   for (i = 0; i < block_height; ++i) {
       lp_build_conv(gallivm, alpha_type, row_type, &src_alpha[i], 1, &src_alpha[i], 1);
    }
 
@@ -1238,7 +1296,7 @@ convert_alpha(struct gallivm_state *gallivm,
    row_type.length = length;
 
    /* If only one channel we can only need the single alpha value per pixel */
-   if (src_count == 1) {
+   if (src_count == 1 && !is_1d) {
       assert(dst_channels == 1);
 
       lp_build_concat_n(gallivm, alpha_type, src_alpha, 4, src_alpha, src_count);
@@ -1249,7 +1307,8 @@ convert_alpha(struct gallivm_state *gallivm,
             unsigned pixels = block_size / src_count;
             unsigned idx = i - 1;
 
-            src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
+            src_alpha[idx] = lp_build_extract_range(gallivm, src_alpha[(idx * pixels) / 4],
+                                                    (idx * pixels) % 4, pixels);
          }
       }
 
@@ -1317,8 +1376,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                           boolean do_branch)
 {
    const unsigned alpha_channel = 3;
-   const unsigned block_width = 4;
-   const unsigned block_height = 4;
+   const unsigned block_width = LP_RASTER_BLOCK_SIZE;
+   const unsigned block_height = variant->key.is_1d ? 1 : LP_RASTER_BLOCK_SIZE;
    const unsigned block_size = block_width * block_height;
    const unsigned lp_integer_vector_width = 128;
 
@@ -1358,6 +1417,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
    bool has_alpha = false;
    const boolean dual_source_blend = variant->key.blend.rt[0].blend_enable &&
                                      util_blend_state_is_dual(&variant->key.blend, 0);
+   const boolean is_1d = variant->key.is_1d;
 
    mask_type = lp_int32_vec4_type();
    mask_type.length = fs_type.length;
@@ -1523,9 +1583,11 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
    /*
     * Pixel twiddle from fragment shader order to memory order
     */
-   src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src, src, pad_inline);
+   src_count = generate_fs_twiddle(gallivm, fs_type, num_fs, is_1d,
+                                   dst_channels, fs_src, src, pad_inline);
    if (dual_source_blend) {
-      generate_fs_twiddle(gallivm, fs_type, num_fs, dst_channels, fs_src1, src1, pad_inline);
+      generate_fs_twiddle(gallivm, fs_type, num_fs, is_1d, dst_channels,
+                          fs_src1, src1, pad_inline);
    }
 
    src_channels = dst_channels < 3 ? dst_channels : 4;
@@ -1538,6 +1600,14 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
    blend_type = row_type;
    mask_type.length = 4;
 
+   if (is_1d) {
+      /*
+       * Because num_fs was halved, only the upper 4x2 half of the stamp
+       * has been processed; drop half the src to reduce it to a single row.
+       */
+      src_count /= 2;
+   }
+
    /* Convert src to row_type */
    if (dual_source_blend) {
       struct lp_type old_row_type = row_type;
@@ -1553,7 +1623,9 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       unsigned bits = row_type.width * row_type.length;
       unsigned combined;
 
-      dst_count = src_count / (vector_width / bits);
+      /* Note: in the 1d case we can end up with a non-SSE vector size here */
+      dst_count = MAX2(1, src_count / (vector_width / bits));
+
       combined = lp_build_concat_n(gallivm, row_type, src, src_count, src, dst_count);
       if (dual_source_blend) {
          lp_build_concat_n(gallivm, row_type, src1, src_count, src1, dst_count);
@@ -1563,7 +1635,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       src_count /= combined;
 
       bits = row_type.width * row_type.length;
-      assert(bits == 128 || bits == 256);
+      assert(bits == 128 || bits == 256 || is_1d);
    }
 
 
@@ -1593,7 +1665,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
    /*
     * Mask conversion
     */
-   lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], 4, &src_mask[0]);
+   lp_bld_quad_twiddle(gallivm, mask_type, &src_mask[0], is_1d ? 2 : 4, &src_mask[0]);
 
    if (src_count < block_height) {
       lp_build_concat_n(gallivm, mask_type, src_mask, 4, src_mask, src_count);
@@ -1602,7 +1674,8 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
          unsigned pixels = block_size / src_count;
          unsigned idx = i - 1;
 
-         src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4], (idx * pixels) % 4, pixels);
+         src_mask[idx] = lp_build_extract_range(gallivm, src_mask[(idx * pixels) / 4],
+                                                (idx * pixels) % 4, pixels);
       }
    }
 
@@ -1636,12 +1709,12 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       struct lp_type alpha_type = fs_type;
       alpha_type.length = 4;
       convert_alpha(gallivm, row_type, alpha_type,
-                    block_size, block_height,
+                    block_size, block_height, is_1d,
                     src_count, dst_channels,
                     pad_inline, src_alpha);
       if (dual_source_blend) {
          convert_alpha(gallivm, row_type, alpha_type,
-                       block_size, block_height,
+                       block_size, block_height, is_1d,
                        src_count, dst_channels,
                        pad_inline, src1_alpha);
       }
@@ -1657,7 +1730,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       dst_count = src_count;
    }
 
-   dst_type.length *= 16 / dst_count;
+   dst_type.length *= block_size / dst_count;
 
    if (out_format == PIPE_FORMAT_R11G11B10_FLOAT) {
       /*
@@ -1677,7 +1750,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     * Convert from dst/output format to src/blending format.
     *
     * This is necessary as we can only read 1 row from memory at a time,
-    * so the minimum dst_count will ever be at this point is 4.
+    * so the minimum dst_count will ever be at this point is 4 (except for 1d case).
     *
     * With, for example, R8 format you can have all 16 pixels in a 128 bit vector,
     * this will take the 4 dsts and combine them into 1 src so we can perform blending
@@ -1699,7 +1772,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
     * It seems some cleanup could be done here (like skipping conversion/blend
     * when not needed).
     */
-   convert_to_blend_type(gallivm, out_format_desc, dst_type, row_type, dst, src_count);
+   convert_to_blend_type(gallivm, block_size, out_format_desc, dst_type, row_type, dst, src_count);
 
    for (i = 0; i < src_count; ++i) {
       dst[i] = lp_build_blend_aos(gallivm,
@@ -1719,7 +1792,7 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
                                   pad_inline ? 4 : dst_channels);
    }
 
-   convert_from_blend_type(gallivm, out_format_desc, row_type, dst_type, dst, src_count);
+   convert_from_blend_type(gallivm, block_size, out_format_desc, row_type, dst_type, dst, src_count);
 
    /* Split the blend rows back to memory rows */
    if (dst_count > src_count) {
@@ -1742,7 +1815,6 @@ generate_unswizzled_blend(struct gallivm_state *gallivm,
       src_count *= 2;
    }
 
-
    /*
     * Store blend result to memory
     */
@@ -1840,7 +1912,6 @@ generate_fragment(struct llvmpipe_context *lp,
    fs_type.norm = FALSE;         /* values are not limited to [0,1] or [-1,1] */
    fs_type.width = 32;           /* 32-bit float */
    fs_type.length = MIN2(lp_native_vector_width / 32, 16); /* n*4 elements per vector */
-   num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
 
    memset(&blend_type, 0, sizeof blend_type);
    blend_type.floating = FALSE; /* values are integers */
@@ -1929,6 +2000,11 @@ generate_fragment(struct llvmpipe_context *lp,
    /* code generated texture sampling */
    sampler = lp_llvm_sampler_soa_create(key->state, context_ptr);
 
+   num_fs = 16 / fs_type.length; /* number of loops per 4x4 stamp */
+   /* for 1d resources only run "upper half" of stamp */
+   if (key->is_1d)
+      num_fs /= 2;
+
    {
       LLVMValueRef num_loop = lp_build_const_int32(gallivm, num_fs);
       LLVMTypeRef mask_type = lp_build_int_vec_type(gallivm, fs_type);
@@ -2540,6 +2616,20 @@ make_variant_key(struct llvmpipe_context *lp,
 
    key->nr_cbufs = lp->framebuffer.nr_cbufs;
 
+   /* Figure out if this is a 1d resource (z and color must match in any case,
+    * so one is enough). XXX OpenGL might allow mixing 2d height-1 with 1d?
+    */
+   if (lp->framebuffer.zsbuf) {
+      if (llvmpipe_resource_is_1d(lp->framebuffer.zsbuf->texture)) {
+         key->is_1d = TRUE;
+      }
+   }
+   else if (key->nr_cbufs) {
+      if (llvmpipe_resource_is_1d(lp->framebuffer.cbufs[0]->texture)) {
+         key->is_1d = TRUE;
+      }
+   }
+
    if (!key->blend.independent_blend_enable) {
       /* we always need independent blend otherwise the fixups below won't work */
       for (i = 1; i < key->nr_cbufs; i++) {
diff --git a/src/gallium/drivers/llvmpipe/lp_state_fs.h b/src/gallium/drivers/llvmpipe/lp_state_fs.h
index c8dc1c3..669696c 100644
--- a/src/gallium/drivers/llvmpipe/lp_state_fs.h
+++ b/src/gallium/drivers/llvmpipe/lp_state_fs.h
@@ -75,6 +75,7 @@ struct lp_fragment_shader_variant_key
    unsigned nr_sampler_views:8; /* actually derivable from just the shader */
    unsigned flatshade:1;
    unsigned occlusion_count:1;
+   unsigned is_1d:1;
 
    enum pipe_format zsbuf_format;
    enum pipe_format cbuf_format[PIPE_MAX_COLOR_BUFS];
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.c b/src/gallium/drivers/llvmpipe/lp_texture.c
index 56eb499..a02ddbc 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.c
+++ b/src/gallium/drivers/llvmpipe/lp_texture.c
@@ -83,22 +83,30 @@ llvmpipe_texture_layout(struct llvmpipe_screen *screen,
 
       /* Row stride and image stride */
       {
-         unsigned alignment, nblocksx, nblocksy, block_size;
+         unsigned align_x, align_y, nblocksx, nblocksy, block_size;
 
          /* For non-compressed formats we need 4x4 pixel alignment
-          * (for now). We also want cache line size in x direction,
+          * so we can read/write LP_RASTER_BLOCK_SIZE blocks when rendering
+          * to them. We also want cache line size in x direction,
           * otherwise same cache line could end up in multiple threads.
-          * XXX this blows up 1d/1d array textures by a factor of 4.
+          * For explicit 1d resources, however, we reduce this to 4x1 and
+          * handle them specially in the render output code (as buffers need
+          * special handling there in any case).
           */
          if (util_format_is_compressed(pt->format))
-            alignment = 1;
-         else
-            alignment = LP_RASTER_BLOCK_SIZE;
+            align_x = align_y = 1;
+         else {
+            align_x = LP_RASTER_BLOCK_SIZE;
+            if (llvmpipe_resource_is_1d(&lpr->base))
+               align_y = 1;
+            else
+               align_y = LP_RASTER_BLOCK_SIZE;
+         }
 
          nblocksx = util_format_get_nblocksx(pt->format,
-                                             align(width, alignment));
+                                             align(width, align_x));
          nblocksy = util_format_get_nblocksy(pt->format,
-                                             align(height, alignment));
+                                             align(height, align_y));
          block_size = util_format_get_blocksize(pt->format);
 
          if (util_format_is_compressed(pt->format))
diff --git a/src/gallium/drivers/llvmpipe/lp_texture.h b/src/gallium/drivers/llvmpipe/lp_texture.h
index faba6f2..e73d449 100644
--- a/src/gallium/drivers/llvmpipe/lp_texture.h
+++ b/src/gallium/drivers/llvmpipe/lp_texture.h
@@ -159,6 +159,27 @@ llvmpipe_resource_is_texture(const struct pipe_resource *resource)
 }
 
 
+static INLINE boolean
+llvmpipe_resource_is_1d(const struct pipe_resource *resource)
+{
+   switch (resource->target) {
+   case PIPE_BUFFER:
+   case PIPE_TEXTURE_1D:
+   case PIPE_TEXTURE_1D_ARRAY:
+      return TRUE;
+   case PIPE_TEXTURE_2D:
+   case PIPE_TEXTURE_2D_ARRAY:
+   case PIPE_TEXTURE_RECT:
+   case PIPE_TEXTURE_3D:
+   case PIPE_TEXTURE_CUBE:
+      return FALSE;
+   default:
+      assert(0);
+      return FALSE;
+   }
+}
+
+
 static INLINE unsigned
 llvmpipe_resource_stride(struct pipe_resource *resource,
                          unsigned level)
-- 
1.7.9.5