[Mesa-dev] [PATCH 20/75] st/nine: Initial ProcessVertices support

Axel Davy axel.davy at ens.fr
Wed Oct 5 20:08:53 UTC 2016


For now, only vertex shader 3.0 is supported; older shader versions and the fixed-function pipeline are rejected with D3DERR_INVALIDCALL.

This enables The Sims 2 to work.

Signed-off-by: Axel Davy <axel.davy at ens.fr>
---
 src/gallium/state_trackers/nine/device9.c          | 143 +++++---
 src/gallium/state_trackers/nine/device9.h          |   7 +
 src/gallium/state_trackers/nine/nine_shader.c      |  69 +++-
 src/gallium/state_trackers/nine/nine_shader.h      |  75 +++++
 src/gallium/state_trackers/nine/nine_state.c       | 364 +++++++++++++++++++++
 src/gallium/state_trackers/nine/nine_state.h       |  12 +
 src/gallium/state_trackers/nine/pixelshader9.c     |   2 +
 .../state_trackers/nine/vertexdeclaration9.c       |  65 +++-
 .../state_trackers/nine/vertexdeclaration9.h       |  10 +-
 src/gallium/state_trackers/nine/vertexshader9.c    |  45 +++
 src/gallium/state_trackers/nine/vertexshader9.h    |  12 +-
 11 files changed, 735 insertions(+), 69 deletions(-)

diff --git a/src/gallium/state_trackers/nine/device9.c b/src/gallium/state_trackers/nine/device9.c
index 012cabf..cacba56 100644
--- a/src/gallium/state_trackers/nine/device9.c
+++ b/src/gallium/state_trackers/nine/device9.c
@@ -152,6 +152,7 @@ NineDevice9_ctor( struct NineDevice9 *This,
     list_inithead(&This->managed_textures);
 
     This->screen = pScreen;
+    This->screen_sw = pCTX->ref;
     This->caps = *pCaps;
     This->d3d9 = pD3D9;
     This->params = *pCreationParameters;
@@ -195,9 +196,13 @@ NineDevice9_ctor( struct NineDevice9 *This,
 
     This->pipe = This->screen->context_create(This->screen, NULL, 0);
     if (!This->pipe) { return E_OUTOFMEMORY; } /* guess */
+    This->pipe_sw = This->screen_sw->context_create(This->screen_sw, NULL, 0);
+    if (!This->pipe_sw) { return E_OUTOFMEMORY; }
 
     This->cso = cso_create_context(This->pipe);
     if (!This->cso) { return E_OUTOFMEMORY; } /* also a guess */
+    This->cso_sw = cso_create_context(This->pipe_sw);
+    if (!This->cso_sw) { return E_OUTOFMEMORY; }
 
     /* Create first, it messes up our state. */
     This->hud = hud_create(This->pipe, This->cso); /* NULL result is fine */
@@ -423,10 +428,14 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->driver_caps.user_vbufs = GET_PCAP(USER_VERTEX_BUFFERS);
     This->driver_caps.user_ibufs = GET_PCAP(USER_INDEX_BUFFERS);
     This->driver_caps.user_cbufs = GET_PCAP(USER_CONSTANT_BUFFERS);
+    This->driver_caps.user_sw_vbufs = This->screen_sw->get_param(This->screen_sw, PIPE_CAP_USER_VERTEX_BUFFERS);
+    This->driver_caps.user_sw_cbufs = This->screen_sw->get_param(This->screen_sw, PIPE_CAP_USER_CONSTANT_BUFFERS);
 
     if (!This->driver_caps.user_vbufs)
         This->vertex_uploader = u_upload_create(This->pipe, 65536,
                                                 PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
+    This->vertex_sw_uploader = u_upload_create(This->pipe_sw, 65536,
+                                            PIPE_BIND_VERTEX_BUFFER, PIPE_USAGE_STREAM);
     if (!This->driver_caps.user_ibufs)
         This->index_uploader = u_upload_create(This->pipe, 128 * 1024,
                                                PIPE_BIND_INDEX_BUFFER, PIPE_USAGE_STREAM);
@@ -436,6 +445,9 @@ NineDevice9_ctor( struct NineDevice9 *This,
                                                   PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
     }
 
+    This->constbuf_sw_uploader = u_upload_create(This->pipe_sw, 128 * 1024,
+                                                 PIPE_BIND_CONSTANT_BUFFER, PIPE_USAGE_STREAM);
+
     This->driver_caps.window_space_position_support = GET_PCAP(TGSI_VS_WINDOW_SPACE_POSITION);
     This->driver_caps.vs_integer = pScreen->get_shader_param(pScreen, PIPE_SHADER_VERTEX, PIPE_SHADER_CAP_INTEGERS);
     This->driver_caps.ps_integer = pScreen->get_shader_param(pScreen, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_INTEGERS);
@@ -454,6 +466,8 @@ NineDevice9_ctor( struct NineDevice9 *This,
     This->update = &This->state;
     nine_update_state(This);
 
+    nine_state_init_sw(This);
+
     ID3DPresentGroup_Release(This->present);
 
     return D3D_OK;
@@ -470,6 +484,7 @@ NineDevice9_dtor( struct NineDevice9 *This )
     if (This->pipe && This->cso)
         nine_pipe_context_clear(This);
     nine_ff_fini(This);
+    nine_state_destroy_sw(This);
     nine_state_clear(&This->state, TRUE);
 
     if (This->vertex_uploader)
@@ -478,6 +493,10 @@ NineDevice9_dtor( struct NineDevice9 *This )
         u_upload_destroy(This->index_uploader);
     if (This->constbuf_uploader)
         u_upload_destroy(This->constbuf_uploader);
+    if (This->vertex_sw_uploader)
+        u_upload_destroy(This->vertex_sw_uploader);
+    if (This->constbuf_sw_uploader)
+        u_upload_destroy(This->constbuf_sw_uploader);
 
     nine_bind(&This->record, NULL);
 
@@ -499,13 +518,11 @@ NineDevice9_dtor( struct NineDevice9 *This )
         FREE(This->swapchains);
     }
 
-    /* state stuff */
-    if (This->pipe) {
-        if (This->cso) {
-            cso_destroy_context(This->cso);
-        }
-        if (This->pipe->destroy) { This->pipe->destroy(This->pipe); }
-    }
+    /* Destroy cso first */
+    if (This->cso) { cso_destroy_context(This->cso); }
+    if (This->cso_sw) { cso_destroy_context(This->cso_sw); }
+    if (This->pipe && This->pipe->destroy) { This->pipe->destroy(This->pipe); }
+    if (This->pipe_sw && This->pipe_sw->destroy) { This->pipe_sw->destroy(This->pipe_sw); }
 
     if (This->present) { ID3DPresentGroup_Release(This->present); }
     if (This->d3d9) { IDirect3D9_Release(This->d3d9); }
@@ -3162,9 +3179,6 @@ NineDevice9_DrawIndexedPrimitiveUP( struct NineDevice9 *This,
     return D3D_OK;
 }
 
-/* TODO: Write to pDestBuffer directly if vertex declaration contains
- * only f32 formats.
- */
 HRESULT NINE_WINAPI
 NineDevice9_ProcessVertices( struct NineDevice9 *This,
                              UINT SrcStartIndex,
@@ -3174,33 +3188,69 @@ NineDevice9_ProcessVertices( struct NineDevice9 *This,
                              IDirect3DVertexDeclaration9 *pVertexDecl,
                              DWORD Flags )
 {
-    struct pipe_screen *screen = This->screen;
+    struct pipe_screen *screen_sw = This->screen_sw;
+    struct pipe_context *pipe_sw = This->pipe_sw;
     struct NineVertexDeclaration9 *vdecl = NineVertexDeclaration9(pVertexDecl);
+    struct NineVertexBuffer9 *dst = NineVertexBuffer9(pDestBuffer);
     struct NineVertexShader9 *vs;
     struct pipe_resource *resource;
+    struct pipe_transfer *transfer = NULL;
+    struct pipe_stream_output_info so;
     struct pipe_stream_output_target *target;
     struct pipe_draw_info draw;
+    struct pipe_box box;
+    unsigned offsets[1] = {0};
     HRESULT hr;
-    unsigned buffer_offset, buffer_size;
+    unsigned buffer_size;
+    void *map;
 
     DBG("This=%p SrcStartIndex=%u DestIndex=%u VertexCount=%u "
         "pDestBuffer=%p pVertexDecl=%p Flags=%d\n",
         This, SrcStartIndex, DestIndex, VertexCount, pDestBuffer,
         pVertexDecl, Flags);
 
-    if (!screen->get_param(screen, PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS))
-        STUB(D3DERR_INVALIDCALL);
+    if (!screen_sw->get_param(screen_sw, PIPE_CAP_MAX_STREAM_OUTPUT_BUFFERS)) {
+        DBG("ProcessVertices not supported\n");
+        return D3DERR_INVALIDCALL;
+    }
 
-    nine_update_state(This);
 
-    /* TODO: Create shader with stream output. */
-    STUB(D3DERR_INVALIDCALL);
-    struct NineVertexBuffer9 *dst = NineVertexBuffer9(pDestBuffer);
+    vs = This->state.programmable_vs ? This->state.vs : This->ff.vs;
+    /* Note: version is 0 for ff */
+    user_assert(vdecl || (vs->byte_code.version < 0x30 && dst->desc.FVF),
+                D3DERR_INVALIDCALL);
+    if (!vdecl) {
+        DWORD FVF = dst->desc.FVF;
+        vdecl = util_hash_table_get(This->ff.ht_fvf, &FVF);
+        if (!vdecl) {
+            hr = NineVertexDeclaration9_new_from_fvf(This, FVF, &vdecl);
+            if (FAILED(hr))
+                return hr;
+            vdecl->fvf = FVF;
+            util_hash_table_set(This->ff.ht_fvf, &vdecl->fvf, vdecl);
+            NineUnknown_ConvertRefToBind(NineUnknown(vdecl));
+        }
+    }
 
-    vs = This->state.vs ? This->state.vs : This->ff.vs;
+    /* Flags: Can be 0 or D3DPV_DONOTCOPYDATA, and/or lock flags
+     * D3DPV_DONOTCOPYDATA -> Has effect only for ff. In particular
+     * if not set, everything from src will be used, and dst
+     * must match exactly the ff vs outputs.
+     * TODO: Handle all the checks, etc for ff */
+    user_assert(vdecl->position_t || This->state.programmable_vs,
+                D3DERR_INVALIDCALL);
+
+    /* TODO: Support vs < 3 and ff */
+    user_assert(vs->byte_code.version == 0x30,
+                D3DERR_INVALIDCALL);
+    /* TODO: Not hardcode the constant buffers for swvp */
+    user_assert(This->may_swvp,
+                D3DERR_INVALIDCALL);
+
+    nine_state_prepare_draw_sw(This, vdecl, SrcStartIndex, VertexCount, &so);
 
-    buffer_size = VertexCount * vs->so->stride[0];
-    if (1) {
+    buffer_size = VertexCount * so.stride[0] * 4;
+    {
         struct pipe_resource templ;
 
         templ.target = PIPE_BUFFER;
@@ -3212,49 +3262,50 @@ NineDevice9_ProcessVertices( struct NineDevice9 *This,
         templ.height0 = templ.depth0 = templ.array_size = 1;
         templ.last_level = templ.nr_samples = 0;
 
-        resource = This->screen->resource_create(This->screen, &templ);
+        resource = screen_sw->resource_create(screen_sw, &templ);
         if (!resource)
             return E_OUTOFMEMORY;
-        buffer_offset = 0;
-    } else {
-        /* SO matches vertex declaration */
-        resource = NineVertexBuffer9_GetResource(dst);
-        buffer_offset = DestIndex * vs->so->stride[0];
     }
-    target = This->pipe->create_stream_output_target(This->pipe, resource,
-                                                     buffer_offset,
-                                                     buffer_size);
+    target = pipe_sw->create_stream_output_target(pipe_sw, resource,
+                                                  0, buffer_size);
     if (!target) {
         pipe_resource_reference(&resource, NULL);
         return D3DERR_DRIVERINTERNALERROR;
     }
 
-    if (!vdecl) {
-        hr = NineVertexDeclaration9_new_from_fvf(This, dst->desc.FVF, &vdecl);
-        if (FAILED(hr))
-            goto out;
-    }
-
     init_draw_info(&draw, This, D3DPT_POINTLIST, VertexCount);
     draw.instance_count = 1;
     draw.indexed = FALSE;
-    draw.start = SrcStartIndex;
+    draw.start = 0;
     draw.index_bias = 0;
-    draw.min_index = SrcStartIndex;
-    draw.max_index = SrcStartIndex + VertexCount - 1;
+    draw.min_index = 0;
+    draw.max_index = VertexCount - 1;
+
+
+    pipe_sw->set_stream_output_targets(pipe_sw, 1, &target, offsets);
 
-    This->pipe->set_stream_output_targets(This->pipe, 1, &target, 0);
-    This->pipe->draw_vbo(This->pipe, &draw);
-    This->pipe->set_stream_output_targets(This->pipe, 0, NULL, 0);
-    This->pipe->stream_output_target_destroy(This->pipe, target);
+    pipe_sw->draw_vbo(pipe_sw, &draw);
+
+    pipe_sw->set_stream_output_targets(pipe_sw, 0, NULL, 0);
+    pipe_sw->stream_output_target_destroy(pipe_sw, target);
+
+    u_box_1d(0, VertexCount * so.stride[0] * 4, &box);
+    map = pipe_sw->transfer_map(pipe_sw, resource, 0, PIPE_TRANSFER_READ, &box,
+                                &transfer);
+    if (!map) {
+        hr = D3DERR_DRIVERINTERNALERROR;
+        goto out;
+    }
 
     hr = NineVertexDeclaration9_ConvertStreamOutput(vdecl,
                                                     dst, DestIndex, VertexCount,
-                                                    resource, vs->so);
+                                                    map, &so);
+    if (transfer)
+        pipe_sw->transfer_unmap(pipe_sw, transfer);
+
 out:
+    nine_state_after_draw_sw(This);
     pipe_resource_reference(&resource, NULL);
-    if (!pVertexDecl)
-        NineUnknown_Release(NineUnknown(vdecl));
     return hr;
 }
 
diff --git a/src/gallium/state_trackers/nine/device9.h b/src/gallium/state_trackers/nine/device9.h
index b6aa5e0..12be643 100644
--- a/src/gallium/state_trackers/nine/device9.h
+++ b/src/gallium/state_trackers/nine/device9.h
@@ -52,8 +52,11 @@ struct NineDevice9
 
     /* G3D context */
     struct pipe_screen *screen;
+    struct pipe_screen *screen_sw;
     struct pipe_context *pipe;
+    struct pipe_context *pipe_sw;
     struct cso_context *cso;
+    struct cso_context *cso_sw;
 
     /* creation parameters */
     D3DCAPS9 caps;
@@ -115,6 +118,8 @@ struct NineDevice9
         boolean user_vbufs;
         boolean user_ibufs;
         boolean user_cbufs;
+        boolean user_sw_vbufs;
+        boolean user_sw_cbufs;
         boolean window_space_position_support;
         boolean vs_integer;
         boolean ps_integer;
@@ -128,6 +133,8 @@ struct NineDevice9
     struct u_upload_mgr *vertex_uploader;
     struct u_upload_mgr *index_uploader;
     struct u_upload_mgr *constbuf_uploader;
+    struct u_upload_mgr *vertex_sw_uploader;
+    struct u_upload_mgr *constbuf_sw_uploader;
     unsigned constbuf_alignment;
 
     struct nine_range_pool range_pool;
diff --git a/src/gallium/state_trackers/nine/nine_shader.c b/src/gallium/state_trackers/nine/nine_shader.c
index 3a2bfa8..ae97c42 100644
--- a/src/gallium/state_trackers/nine/nine_shader.c
+++ b/src/gallium/state_trackers/nine/nine_shader.c
@@ -26,6 +26,7 @@
 #include "device9.h"
 #include "nine_debug.h"
 #include "nine_state.h"
+#include "vertexdeclaration9.h"
 
 #include "util/macros.h"
 #include "util/u_memory.h"
@@ -467,6 +468,7 @@ struct shader_translator
     struct {
         struct ureg_dst *r;
         struct ureg_dst oPos;
+        struct ureg_dst oPos_out; /* the real output when doing streamout */
         struct ureg_dst oFog;
         struct ureg_dst oPts;
         struct ureg_dst oCol[4];
@@ -511,6 +513,9 @@ struct shader_translator
     boolean indirect_const_access;
     boolean failure;
 
+    struct nine_vs_output_info output_info[16];
+    int num_outputs;
+
     struct nine_shader_info *info;
 
     int16_t op_info_map[D3DSIO_BREAKP + 1];
@@ -536,6 +541,17 @@ sm1_instruction_check(const struct sm1_instruction *insn)
     }
 }
 
+/* Append one declared output (D3D usage/index, mask, register) to tx's list. */
+static void
+nine_record_outputs(struct shader_translator *tx, BYTE Usage, BYTE UsageIndex,
+                    int mask, int output_index)
+{
+    tx->output_info[tx->num_outputs++] = (struct nine_vs_output_info) {
+        .output_semantic = Usage, .output_semantic_index = UsageIndex,
+        .mask = mask, .output_index = output_index
+    };
+}
+
 static boolean
 tx_lconstf(struct shader_translator *tx, struct ureg_src *src, INT index)
 {
@@ -2135,6 +2151,12 @@ DECL_SPECIAL(DCL)
             assert(ureg_dst_is_undef(tx->regs.o[sem.reg.idx]) && "Nine doesn't support yet packing");
             tx->regs.o[sem.reg.idx] = ureg_DECL_output_masked(
                 ureg, tgsi.Name, tgsi.Index, sem.reg.mask, 0, 1);
+            nine_record_outputs(tx, sem.usage, sem.usage_idx, sem.reg.mask, sem.reg.idx);
+            if (tx->info->process_vertices && sem.usage == D3DDECLUSAGE_POSITION && sem.usage_idx == 0) {
+                tx->regs.oPos_out = tx->regs.o[sem.reg.idx];
+                tx->regs.o[sem.reg.idx] = ureg_DECL_temporary(ureg);
+                tx->regs.oPos = tx->regs.o[sem.reg.idx];
+            }
 
             if (tgsi.Name == TGSI_SEMANTIC_PSIZE) {
                 tx->regs.o[sem.reg.idx] = ureg_DECL_temporary(ureg);
@@ -3346,6 +3368,8 @@ tx_ctor(struct shader_translator *tx, struct nine_shader_info *info)
 
     info->version = (tx->version.major << 4) | tx->version.minor;
 
+    tx->num_outputs = 0;
+
     create_op_info_map(tx);
 }
 
@@ -3359,6 +3383,26 @@ tx_dtor(struct shader_translator *tx)
     FREE(tx);
 }
 
+/* Epilogue for ProcessVertices shaders: writes oPos.xyz to the real output.
+ * CONST[0].xyz = w/2, -h/2, zmax-zmin; CONST[1].xyz = x+w/2, y+h/2, zmin (viewport) */
+static void
+shader_add_vs_viewport_transform(struct shader_translator *tx)
+{
+    struct ureg_program *ureg = tx->ureg;
+    struct ureg_src c0 = NINE_CONSTANT_SRC(0); /* only needed by the disabled code below */
+    struct ureg_src c1 = NINE_CONSTANT_SRC(1); /* only needed by the disabled code below */
+    /* struct ureg_dst pos_tmp = ureg_DECL_temporary(ureg);*/
+    /* Address the constants through 2D index 4, declared as the viewport data. */
+    c0 = ureg_src_dimension(c0, 4);
+    c1 = ureg_src_dimension(c1, 4);
+    /* TODO: find out when we need to apply the viewport transformation or not.
+     * Likely will be XYZ vs XYZRHW in vdecl_out
+     * ureg_MUL(ureg, ureg_writemask(pos_tmp, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oPos), c0);
+     * ureg_ADD(ureg, ureg_writemask(tx->regs.oPos_out, TGSI_WRITEMASK_XYZ), ureg_src(pos_tmp), c1);
+     */
+    ureg_MOV(ureg, ureg_writemask(tx->regs.oPos_out, TGSI_WRITEMASK_XYZ), ureg_src(tx->regs.oPos)); /* untransformed copy for now; only xyz is written */
+}
+
 static void
 shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
 {
@@ -3410,10 +3454,10 @@ shader_add_ps_fog_stage(struct shader_translator *tx, struct ureg_src src_col)
     ureg_MOV(ureg, ureg_writemask(oCol0, TGSI_WRITEMASK_W), src_col);
 }
 
-#define GET_CAP(n) device->screen->get_param( \
-      device->screen, PIPE_CAP_##n)
-#define GET_SHADER_CAP(n) device->screen->get_shader_param( \
-      device->screen, info->type, PIPE_SHADER_CAP_##n)
+#define GET_CAP(n) screen->get_param( \
+      screen, PIPE_CAP_##n)
+#define GET_SHADER_CAP(n) screen->get_shader_param( \
+      screen, info->type, PIPE_SHADER_CAP_##n)
 
 HRESULT
 nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
@@ -3421,6 +3465,8 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
     struct shader_translator *tx;
     HRESULT hr = D3D_OK;
     const unsigned processor = info->type;
+    struct pipe_screen *screen = info->process_vertices ? device->screen_sw : device->screen;
+    struct pipe_context *pipe = info->process_vertices ? device->pipe_sw : device->pipe;
 
     user_assert(processor != ~0, D3DERR_INVALIDCALL);
 
@@ -3533,6 +3579,9 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
         info->point_size = TRUE;
     }
 
+    if (info->process_vertices)
+        shader_add_vs_viewport_transform(tx);
+
     ureg_END(tx->ureg);
 
     /* record local constants */
@@ -3625,6 +3674,9 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
          ureg_DECL_constant2D(tx->ureg, 0, 511, 3);
     }
 
+    if (info->process_vertices)
+        ureg_DECL_constant2D(tx->ureg, 0, 2, 4); /* Viewport data */
+
     if (debug_get_bool_option("NINE_TGSI_DUMP", FALSE)) {
         unsigned count;
         const struct tgsi_token *toks = ureg_get_tokens(tx->ureg, &count);
@@ -3632,7 +3684,14 @@ nine_translate_shader(struct NineDevice9 *device, struct nine_shader_info *info)
         ureg_free_tokens(toks);
     }
 
-    info->cso = ureg_create_shader_and_destroy(tx->ureg, device->pipe);
+    if (info->process_vertices) {
+        NineVertexDeclaration9_FillStreamOutputInfo(info->vdecl_out,
+                                                    tx->output_info,
+                                                    tx->num_outputs,
+                                                    &(info->so));
+        info->cso = ureg_create_shader_with_so_and_destroy(tx->ureg, pipe, &(info->so));
+    } else
+        info->cso = ureg_create_shader_and_destroy(tx->ureg, pipe);
     if (!info->cso) {
         hr = D3DERR_DRIVERINTERNALERROR;
         FREE(info->lconstf.data);
diff --git a/src/gallium/state_trackers/nine/nine_shader.h b/src/gallium/state_trackers/nine/nine_shader.h
index 092ae63..72a28b8 100644
--- a/src/gallium/state_trackers/nine/nine_shader.h
+++ b/src/gallium/state_trackers/nine/nine_shader.h
@@ -26,10 +26,12 @@
 #include "d3d9types.h"
 #include "d3d9caps.h"
 #include "nine_defines.h"
+#include "nine_helpers.h"
 #include "pipe/p_state.h" /* PIPE_MAX_ATTRIBS */
 #include "util/u_memory.h"
 
 struct NineDevice9;
+struct NineVertexDeclaration9;
 
 struct nine_lconstf /* NOTE: both pointers should be FREE'd by the user */
 {
@@ -78,6 +80,18 @@ struct nine_shader_info
     uint8_t bumpenvmat_needed;
 
     boolean swvp_on;
+
+    boolean process_vertices;
+    struct NineVertexDeclaration9 *vdecl_out;
+    struct pipe_stream_output_info so;
+};
+
+struct nine_vs_output_info
+{ /* one vertex shader output, as recorded by nine_record_outputs */
+    BYTE output_semantic; /* D3D declaration usage of the output */
+    int output_semantic_index; /* usage index of that declaration */
+    int mask; /* write mask declared for the output register */
+    int output_index; /* index of the shader output register */
 };
 
 static inline void
@@ -147,4 +161,65 @@ nine_shader_variants_free(struct nine_shader_variant *list)
     }
 }
 
+struct nine_shader_variant_so
+{ /* singly linked list of stream-output shader variants, keyed by vdecl */
+    struct nine_shader_variant_so *next; /* following variant, NULL at the tail */
+    struct NineVertexDeclaration9 *vdecl; /* lookup key, held via nine_bind; NULL while the head is unused */
+    struct pipe_stream_output_info so; /* stream output info stored with this variant */
+    void *cso; /* shader handle stored with this variant */
+};
+
+/* Look up the cso for vdecl, copying its SO info to *so; NULL if not found. */
+static inline void *
+nine_shader_variant_so_get(struct nine_shader_variant_so *list,
+                           struct NineVertexDeclaration9 *vdecl,
+                           struct pipe_stream_output_info *so)
+{
+    while (list && list->vdecl != vdecl)
+        list = list->next;
+    if (!list)
+        return NULL;
+    *so = list->so;
+    return list->cso;
+}
+
+/* Store a (vdecl, so, cso) variant: in the embedded head while its vdecl is
+ * NULL, otherwise in a new node appended at the tail. Returns FALSE on OOM. */
+static inline boolean
+nine_shader_variant_so_add(struct nine_shader_variant_so *list,
+                           struct NineVertexDeclaration9 *vdecl,
+                           struct pipe_stream_output_info *so, void *cso)
+{
+    if (list->vdecl != NULL) { /* head already in use: append a node */
+        while (list->next) {
+            assert(list->vdecl != vdecl);
+            list = list->next;
+        }
+        /* zero-initialised, so nine_bind below sees a NULL vdecl to replace */
+        list->next = CALLOC_STRUCT(nine_shader_variant_so);
+        if (!list->next)
+            return FALSE;
+        list = list->next;
+    }
+    list->next = NULL;
+    /* bind on the node being filled, never on its predecessor */
+    nine_bind(&list->vdecl, vdecl);
+    list->so = *so;
+    list->cso = cso;
+    return TRUE;
+}
+
+static inline void
+nine_shader_variants_so_free(struct nine_shader_variant_so *list)
+{ /* frees every node after the head and drops all vdecl bindings; cso is not touched */
+    while (list->next) {
+        struct nine_shader_variant_so *ptr = list->next;
+        list->next = ptr->next; /* unlink before freeing */
+        nine_bind(&ptr->vdecl, NULL); /* release the node's vdecl binding */
+        FREE(ptr);
+    }
+    if (list->vdecl)
+        nine_bind(&list->vdecl, NULL); /* the head itself is never freed here */
+}
+
 #endif /* _NINE_SHADER_H_ */
diff --git a/src/gallium/state_trackers/nine/nine_state.c b/src/gallium/state_trackers/nine/nine_state.c
index 024e639..a832a13 100644
--- a/src/gallium/state_trackers/nine/nine_state.c
+++ b/src/gallium/state_trackers/nine/nine_state.c
@@ -26,6 +26,7 @@
 #include "buffer9.h"
 #include "indexbuffer9.h"
 #include "surface9.h"
+#include "vertexbuffer9.h"
 #include "vertexdeclaration9.h"
 #include "vertexshader9.h"
 #include "pixelshader9.h"
@@ -36,6 +37,8 @@
 #include "cso_cache/cso_context.h"
 #include "util/u_upload_mgr.h"
 #include "util/u_math.h"
+#include "util/u_box.h"
+#include "util/u_simple_shaders.h"
 
 #define DBG_CHANNEL DBG_DEVICE
 
@@ -1356,6 +1359,367 @@ nine_state_clear(struct nine_state *state, const boolean device)
     }
 }
 
+void
+nine_state_init_sw(struct NineDevice9 *device)
+{
+    struct pipe_context *pipe_sw = device->pipe_sw;
+    struct pipe_rasterizer_state rast;
+    struct pipe_blend_state blend;
+    struct pipe_depth_stencil_alpha_state dsa;
+    struct pipe_framebuffer_state fb;
+    /* Sets up the state of the sw context (cso_sw); called from the device ctor. */
+    /* Only used with Streamout, so rasterization is discarded */
+    memset(&rast, 0, sizeof(rast));
+    rast.rasterizer_discard = true;
+    rast.point_quad_rasterization = 1; /* to make llvmpipe happy */
+    cso_set_rasterizer(device->cso_sw, &rast);
+
+    /* dummy settings: zeroed blend/dsa/framebuffer state, empty fragment shader */
+    memset(&blend, 0, sizeof(blend));
+    memset(&dsa, 0, sizeof(dsa));
+    memset(&fb, 0, sizeof(fb));
+    cso_set_blend(device->cso_sw, &blend);
+    cso_set_depth_stencil_alpha(device->cso_sw, &dsa);
+    cso_set_framebuffer(device->cso_sw, &fb);
+    cso_set_viewport_dims(device->cso_sw, 1.0, 1.0, false);
+    cso_set_fragment_shader_handle(device->cso_sw, util_make_empty_fragment_shader(pipe_sw)); /* NOTE(review): the handle is not kept here - confirm who deletes it */
+}
+
+/* Sets the vertex elements of the sw context (cso_sw) from the bound vdecl
+ * and the inputs of the active vertex shader.
+ * There is duplication with update_vertex_elements. TODO: Share the code */
+static void
+update_vertex_elements_sw(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+    const struct NineVertexDeclaration9 *vdecl = device->state.vdecl;
+    const struct NineVertexShader9 *vs;
+    unsigned n, b, i;
+    int index;
+    signed char vdecl_index_map[16]; /* vs->num_inputs <= 16; signed: -1 = unmatched, plain char may be unsigned */
+    char used_streams[device->caps.MaxStreams];
+    int dummy_vbo_stream = -1;
+    BOOL need_dummy_vbo = FALSE;
+    struct pipe_vertex_element ve[PIPE_MAX_ATTRIBS];
+
+    state->stream_usage_mask = 0;
+    memset(vdecl_index_map, -1, sizeof(vdecl_index_map));
+    memset(used_streams, 0, device->caps.MaxStreams);
+    vs = state->programmable_vs ? device->state.vs : device->ff.vs;
+
+    if (vdecl) {
+        for (n = 0; n < vs->num_inputs; ++n) {
+            DBG("looking up input %u (usage %u) from vdecl(%p)\n",
+                n, vs->input_map[n].ndecl, vdecl);
+
+            for (i = 0; i < vdecl->nelems; i++) {
+                if (vdecl->usage_map[i] == vs->input_map[n].ndecl) {
+                    vdecl_index_map[n] = i;
+                    used_streams[vdecl->elems[i].vertex_buffer_index] = 1;
+                    break;
+                }
+            }
+            if (vdecl_index_map[n] < 0)
+                need_dummy_vbo = TRUE;
+        }
+    } else {
+        /* No vertex declaration. Likely will never happen in practice,
+         * but we need not crash on this */
+        need_dummy_vbo = TRUE;
+    }
+
+    if (need_dummy_vbo) {
+        for (i = 0; i < device->caps.MaxStreams; i++ ) {
+            if (!used_streams[i]) {
+                dummy_vbo_stream = i;
+                break;
+            }
+        }
+    }
+    /* there are fewer vertex shader inputs than stream slots,
+     * so if we need a slot for the dummy vbo, we should have found one */
+    assert (!need_dummy_vbo || dummy_vbo_stream != -1);
+
+    for (n = 0; n < vs->num_inputs; ++n) {
+        index = vdecl_index_map[n];
+        if (index >= 0) {
+            ve[n] = vdecl->elems[index];
+            b = ve[n].vertex_buffer_index;
+            state->stream_usage_mask |= 1 << b;
+            /* XXX wine just uses 1 here: */
+            if (state->stream_freq[b] & D3DSTREAMSOURCE_INSTANCEDATA)
+                ve[n].instance_divisor = state->stream_freq[b] & 0x7FFFFF;
+        } else {
+            /* if the vertex declaration is incomplete compared to what the
+             * vertex shader needs, we bind a dummy vbo with 0 0 0 0.
+             * This is not specified by the spec, but is the behaviour
+             * tested on win */
+            ve[n].vertex_buffer_index = dummy_vbo_stream;
+            ve[n].src_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
+            ve[n].src_offset = 0;
+            ve[n].instance_divisor = 0;
+        }
+    }
+
+    if (state->dummy_vbo_bound_at != dummy_vbo_stream) {
+        if (state->dummy_vbo_bound_at >= 0)
+            state->changed.vtxbuf |= 1 << state->dummy_vbo_bound_at;
+        if (dummy_vbo_stream >= 0) {
+            state->changed.vtxbuf |= 1 << dummy_vbo_stream;
+            state->vbo_bound_done = FALSE;
+        }
+        state->dummy_vbo_bound_at = dummy_vbo_stream;
+    }
+
+    cso_set_vertex_elements(device->cso_sw, vs->num_inputs, ve);
+}
+
+static void
+update_vertex_buffers_sw(struct NineDevice9 *device, int start_vertice, int num_vertices)
+{
+    struct pipe_context *pipe = device->pipe;
+    struct pipe_context *pipe_sw = device->pipe_sw;
+    struct nine_state *state = &device->state;
+    struct pipe_vertex_buffer vtxbuf;
+    uint32_t mask = 0xf; /* only streams 0-3 are visited */
+    unsigned i;
+    /* Maps the needed range of each bound stream via pipe and sets it on pipe_sw. */
+    DBG("mask=%x\n", mask);
+
+    assert (state->dummy_vbo_bound_at < 0);
+    /* TODO: handle dummy_vbo_bound_at */
+
+    for (i = 0; mask; mask >>= 1, ++i) {
+        if (mask & 1) {
+            if (state->vtxbuf[i].buffer) {
+                struct pipe_resource *buf;
+                struct pipe_box box;
+
+                vtxbuf = state->vtxbuf[i];
+
+                DBG("Locking %p (offset %d, length %d)\n", vtxbuf.buffer,
+                    vtxbuf.buffer_offset, num_vertices * vtxbuf.stride);
+                /* Read-map only the bytes of vertices start_vertice .. start_vertice + num_vertices - 1 */
+                u_box_1d(vtxbuf.buffer_offset + start_vertice * vtxbuf.stride,
+                         num_vertices * vtxbuf.stride, &box);
+                buf = vtxbuf.buffer;
+                vtxbuf.user_buffer = pipe->transfer_map(pipe, buf, 0, PIPE_TRANSFER_READ, &box, /* NOTE(review): result is not checked for NULL */
+                                                        &(state->transfers_so[i])); /* transfer is kept; presumably unmapped after the draw - confirm */
+                vtxbuf.buffer = NULL;
+                if (!device->driver_caps.user_sw_vbufs) { /* no user vertex buffers on the sw screen: upload a copy */
+                    u_upload_data(device->vertex_sw_uploader,
+                                  0,
+                                  box.width,
+                                  16,
+                                  vtxbuf.user_buffer,
+                                  &(vtxbuf.buffer_offset),
+                                  &(vtxbuf.buffer));
+                    u_upload_unmap(device->vertex_sw_uploader);
+                    vtxbuf.user_buffer = NULL;
+                }
+                pipe_sw->set_vertex_buffers(pipe_sw, i, 1, &vtxbuf);
+                if (vtxbuf.buffer) /* drop the local reference to the uploaded buffer */
+                    pipe_resource_reference(&vtxbuf.buffer, NULL);
+            } else
+                pipe_sw->set_vertex_buffers(pipe_sw, i, 1, NULL); /* nothing bound on this stream */
+        }
+    }
+}
+
+static void
+update_vs_constants_sw(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+    struct pipe_context *pipe_sw = device->pipe_sw;
+
+    DBG("updating\n");
+
+    {
+        struct pipe_constant_buffer cb;
+        const void *buf;
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 4096 * sizeof(float[4]);
+        cb.user_buffer = state->vs_const_f_swvp;
+
+        if (state->vs->lconstf.ranges) {
+            const struct nine_lconstf *lconstf =  &device->state.vs->lconstf;
+            const struct nine_range *r = lconstf->ranges;
+            unsigned n = 0;
+            float *dst = device->state.vs_lconstf_temp;
+            float *src = (float *)cb.user_buffer;
+            memcpy(dst, src, 8192 * sizeof(float[4]));
+            while (r) {
+                unsigned p = r->bgn;
+                unsigned c = r->end - r->bgn;
+                memcpy(&dst[p * 4], &lconstf->data[n * 4], c * 4 * sizeof(float));
+                n += c;
+                r = r->next;
+            }
+            cb.user_buffer = dst;
+        }
+
+        buf = cb.user_buffer;
+        if (!device->driver_caps.user_sw_cbufs) {
+            u_upload_data(device->constbuf_sw_uploader,
+                          0,
+                          cb.buffer_size,
+                          16,
+                          cb.user_buffer,
+                          &(cb.buffer_offset),
+                          &(cb.buffer));
+            u_upload_unmap(device->constbuf_sw_uploader);
+            cb.user_buffer = NULL;
+        }
+
+        pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 0, &cb);
+        if (cb.buffer)
+            pipe_resource_reference(&cb.buffer, NULL);
+
+        cb.user_buffer = (char *)buf + 4096 * sizeof(float[4]);
+        if (!device->driver_caps.user_sw_cbufs) {
+            u_upload_data(device->constbuf_sw_uploader,
+                          0,
+                          cb.buffer_size,
+                          16,
+                          cb.user_buffer,
+                          &(cb.buffer_offset),
+                          &(cb.buffer));
+            u_upload_unmap(device->constbuf_sw_uploader);
+            cb.user_buffer = NULL;
+        }
+
+        pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 1, &cb);
+        if (cb.buffer)
+            pipe_resource_reference(&cb.buffer, NULL);
+    }
+
+    {
+        struct pipe_constant_buffer cb;
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 2048 * sizeof(float[4]);
+        cb.user_buffer = state->vs_const_i;
+
+        if (!device->driver_caps.user_sw_cbufs) {
+            u_upload_data(device->constbuf_sw_uploader,
+                          0,
+                          cb.buffer_size,
+                          16,
+                          cb.user_buffer,
+                          &(cb.buffer_offset),
+                          &(cb.buffer));
+            u_upload_unmap(device->constbuf_sw_uploader);
+            cb.user_buffer = NULL;
+        }
+
+        pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 2, &cb);
+        if (cb.buffer)
+            pipe_resource_reference(&cb.buffer, NULL);
+    }
+
+    {
+        struct pipe_constant_buffer cb;
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 512 * sizeof(float[4]);
+        cb.user_buffer = state->vs_const_b;
+
+        if (!device->driver_caps.user_sw_cbufs) {
+            u_upload_data(device->constbuf_sw_uploader,
+                          0,
+                          cb.buffer_size,
+                          16,
+                          cb.user_buffer,
+                          &(cb.buffer_offset),
+                          &(cb.buffer));
+            u_upload_unmap(device->constbuf_sw_uploader);
+            cb.user_buffer = NULL;
+        }
+
+        pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 3, &cb);
+        if (cb.buffer)
+            pipe_resource_reference(&cb.buffer, NULL);
+    }
+
+    {
+        struct pipe_constant_buffer cb;
+        const D3DVIEWPORT9 *vport = &device->state.viewport;
+        float viewport_data[8] = {(float)vport->Width * 0.5f,
+            (float)vport->Height * -0.5f, vport->MaxZ - vport->MinZ, 0.f,
+            (float)vport->Width * 0.5f + (float)vport->X,
+            (float)vport->Height * 0.5f + (float)vport->Y,
+            vport->MinZ, 0.f};
+
+        cb.buffer = NULL;
+        cb.buffer_offset = 0;
+        cb.buffer_size = 2 * sizeof(float[4]);
+        cb.user_buffer = viewport_data;
+
+        {
+            u_upload_data(device->constbuf_sw_uploader,
+                          0,
+                          cb.buffer_size,
+                          16,
+                          cb.user_buffer,
+                          &(cb.buffer_offset),
+                          &(cb.buffer));
+            u_upload_unmap(device->constbuf_sw_uploader);
+            cb.user_buffer = NULL;
+        }
+
+        pipe_sw->set_constant_buffer(pipe_sw, PIPE_SHADER_VERTEX, 4, &cb);
+        if (cb.buffer)
+            pipe_resource_reference(&cb.buffer, NULL);
+    }
+
+}
+/* Bind the ProcessVertices VS variant on device->cso_sw, then refresh sw vertex elements, buffers and VS constants. */
+void
+nine_state_prepare_draw_sw(struct NineDevice9 *device, struct NineVertexDeclaration9 *vdecl_out,
+                           int start_vertice, int num_vertices, struct pipe_stream_output_info *so)
+{
+    struct nine_state *state = &device->state;
+    /* The device->ff.vs alternative is only reachable when the assert below is compiled out. */
+    struct NineVertexShader9 *vs = state->programmable_vs ? device->state.vs : device->ff.vs;
+
+    assert(state->programmable_vs);
+    /* NOTE(review): GetVariantProcessVertices returns NULL when translation fails; the result is bound unchecked. */
+    DBG("Preparing draw\n");
+    cso_set_vertex_shader_handle(device->cso_sw,
+                                 NineVertexShader9_GetVariantProcessVertices(vs, vdecl_out, so));
+    update_vertex_elements_sw(device);
+    update_vertex_buffers_sw(device, start_vertice, num_vertices);
+    update_vs_constants_sw(device);
+    DBG("Preparation succeeded\n");
+}
+/* Per-draw cleanup: pass NULL for sw vertex buffer slots 0-3 and unmap/reset every entry of state->transfers_so. */
+void
+nine_state_after_draw_sw(struct NineDevice9 *device)
+{
+    struct nine_state *state = &device->state;
+    struct pipe_context *pipe = device->pipe;
+    struct pipe_context *pipe_sw = device->pipe_sw;
+    int i;
+
+    for (i = 0; i < 4; i++) { /* 4 matches the size of nine_state::transfers_so */
+        pipe_sw->set_vertex_buffers(pipe_sw, i, 1, NULL); /* presumably unbinds slot i on the sw context */
+        if (state->transfers_so[i])
+            pipe->transfer_unmap(pipe, state->transfers_so[i]); /* goes through device->pipe, not pipe_sw */
+        state->transfers_so[i] = NULL;
+    }
+}
+/* Teardown hook for the sw path; currently does nothing and ignores its argument. */
+void
+nine_state_destroy_sw(struct NineDevice9 *device)
+{
+    (void) device; /* parameter intentionally unused */
+    /* Nothing to release here: everything is destroyed together with the cso */
+}
+
 /*
 static const DWORD nine_render_states_pixel[] =
 {
diff --git a/src/gallium/state_trackers/nine/nine_state.h b/src/gallium/state_trackers/nine/nine_state.h
index 2aa424d..05eb2c1 100644
--- a/src/gallium/state_trackers/nine/nine_state.h
+++ b/src/gallium/state_trackers/nine/nine_state.h
@@ -242,6 +242,9 @@ struct nine_state
         struct pipe_constant_buffer cb_vs_ff;
         struct pipe_constant_buffer cb_ps_ff;
     } pipe;
+
+    /* sw: transfers unmapped and reset by nine_state_after_draw_sw() */
+    struct pipe_transfer *transfers_so[4];
 };
 
 /* map D3DRS -> NINE_STATE_x
@@ -263,6 +266,15 @@ void nine_state_set_defaults(struct NineDevice9 *, const D3DCAPS9 *,
                              boolean is_reset);
 void nine_state_clear(struct nine_state *, const boolean device);
 
+void nine_state_init_sw(struct NineDevice9 *device);
+void nine_state_prepare_draw_sw(struct NineDevice9 *device,
+                                struct NineVertexDeclaration9 *vdecl_out,
+                                int start_vertice,
+                                int num_vertices,
+                                struct pipe_stream_output_info *so);
+void nine_state_after_draw_sw(struct NineDevice9 *device);
+void nine_state_destroy_sw(struct NineDevice9 *device);
+
 /* If @alloc is FALSE, the return value may be a const identity matrix.
  * Therefore, do not modify if you set alloc to FALSE !
  */
diff --git a/src/gallium/state_trackers/nine/pixelshader9.c b/src/gallium/state_trackers/nine/pixelshader9.c
index 8bf4f4b..9e28032 100644
--- a/src/gallium/state_trackers/nine/pixelshader9.c
+++ b/src/gallium/state_trackers/nine/pixelshader9.c
@@ -59,6 +59,7 @@ NinePixelShader9_ctor( struct NinePixelShader9 *This,
     info.sampler_ps1xtypes = 0x0;
     info.fog_enable = 0;
     info.projected = 0;
+    info.process_vertices = false;
 
     hr = nine_translate_shader(device, &info);
     if (FAILED(hr))
@@ -162,6 +163,7 @@ NinePixelShader9_GetVariant( struct NinePixelShader9 *This )
         info.fog_mode = device->state.rs[D3DRS_FOGTABLEMODE];
         info.force_color_in_centroid = key >> 34 & 1;
         info.projected = (key >> 48) & 0xffff;
+        info.process_vertices = false;
 
         hr = nine_translate_shader(This->base.device, &info);
         if (FAILED(hr))
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.c b/src/gallium/state_trackers/nine/vertexdeclaration9.c
index 955cdbd..e1256e2 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.c
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.c
@@ -24,12 +24,12 @@
 #include "vertexbuffer9.h"
 #include "device9.h"
 #include "nine_helpers.h"
+#include "nine_shader.h"
 
 #include "pipe/p_format.h"
 #include "pipe/p_context.h"
 #include "util/u_math.h"
 #include "util/u_format.h"
-#include "util/u_box.h"
 #include "translate/translate.h"
 
 #define DBG_CHANNEL DBG_VERTEXDECLARATION
@@ -409,6 +409,53 @@ NineVertexDeclaration9_new_from_fvf( struct NineDevice9 *pDevice,
     NINE_DEVICE_CHILD_NEW(VertexDeclaration9, ppOut, /* args */ pDevice, elems);
 }
 
+void
+NineVertexDeclaration9_FillStreamOutputInfo(
+    struct NineVertexDeclaration9 *This,
+    struct nine_vs_output_info *ShaderOutputsInfo,
+    unsigned numOutputs,
+    struct pipe_stream_output_info *so )
+{ /* Fills *so with one entry per shader output whose usage and usage index match an element of This. */
+    unsigned so_outputs = 0;
+    int i, j; /* NOTE(review): signed counters, compared against the unsigned numOutputs below */
+    /* Start from a zeroed description; entries are appended in shader-output order. */
+    memset(so, 0, sizeof(struct pipe_stream_output_info));
+    /* NOTE(review): so_outputs is not checked against the capacity of so->output (capacity not visible here). */
+    for (i = 0; i < numOutputs; i++) {
+        BYTE output_semantic = ShaderOutputsInfo[i].output_semantic;
+        unsigned output_semantic_index = ShaderOutputsInfo[i].output_semantic_index;
+        /* Search the declaration for the first element with the same usage and usage index. */
+        for (j = 0; j < This->nelems; j++) {
+            if ((This->decls[j].Usage == output_semantic ||
+                 (output_semantic == D3DDECLUSAGE_POSITION && /* a POSITION output also matches POSITIONT */
+                  This->decls[j].Usage == D3DDECLUSAGE_POSITIONT)) &&
+                This->decls[j].UsageIndex == output_semantic_index) {
+                DBG("Matching %s %d: o%d -> %d\n",
+                    nine_declusage_name(nine_d3d9_to_nine_declusage(This->decls[j].Usage, 0)),
+                    This->decls[j].UsageIndex, i, j);
+                so->output[so_outputs].register_index = ShaderOutputsInfo[i].output_index;
+                so->output[so_outputs].start_component = 0;
+                if (ShaderOutputsInfo[i].mask & 8) /* highest set mask bit decides the component count */
+                    so->output[so_outputs].num_components = 4;
+                else if (ShaderOutputsInfo[i].mask & 4)
+                    so->output[so_outputs].num_components = 3;
+                else if (ShaderOutputsInfo[i].mask & 2)
+                    so->output[so_outputs].num_components = 2;
+                else
+                    so->output[so_outputs].num_components = 1;
+                so->output[so_outputs].output_buffer = 0;
+                so->output[so_outputs].dst_offset = so_outputs * sizeof(float[4])/4; /* one float[4]-wide slot per entry, whatever num_components is */
+                so->output[so_outputs].stream = 0;
+                so_outputs++;
+                break; /* only the first matching element is used */
+            }
+        }
+    }
+
+    so->num_outputs = so_outputs;
+    so->stride[0] = so_outputs * sizeof(float[4])/4; /* ConvertStreamOutput multiplies this by 4 to get the byte stride */
+}
+
 /* ProcessVertices runs stream output into a temporary buffer to capture
  * all outputs.
  * Now we have to convert them to the format and order set by the vertex
@@ -422,17 +469,13 @@ NineVertexDeclaration9_ConvertStreamOutput(
     struct NineVertexBuffer9 *pDstBuf,
     UINT DestIndex,
     UINT VertexCount,
-    struct pipe_resource *pSrcBuf,
+    void *pSrcBuf,
     const struct pipe_stream_output_info *so )
 {
-    struct pipe_context *pipe = This->base.device->pipe;
-    struct pipe_transfer *transfer = NULL;
     struct translate *translate;
     struct translate_key transkey;
-    struct pipe_box box;
     HRESULT hr;
     unsigned i;
-    void *src_map;
     void *dst_map;
 
     DBG("This=%p pDstBuf=%p DestIndex=%u VertexCount=%u pSrcBuf=%p so=%p\n",
@@ -477,20 +520,12 @@ NineVertexDeclaration9_ConvertStreamOutput(
     if (FAILED(hr))
         goto out;
 
-    src_map = pipe->transfer_map(pipe, pSrcBuf, 0, PIPE_TRANSFER_READ, &box,
-                                 &transfer);
-    if (!src_map) {
-        hr = D3DERR_DRIVERINTERNALERROR;
-        goto out;
-    }
-    translate->set_buffer(translate, 0, src_map, so->stride[0], ~0);
+    translate->set_buffer(translate, 0, pSrcBuf, so->stride[0] * 4, ~0);
 
     translate->run(translate, 0, VertexCount, 0, 0, dst_map);
 
     NineVertexBuffer9_Unlock(pDstBuf);
 out:
-    if (transfer)
-        pipe->transfer_unmap(pipe, transfer);
     translate->release(translate); /* TODO: cache these */
     return hr;
 }
diff --git a/src/gallium/state_trackers/nine/vertexdeclaration9.h b/src/gallium/state_trackers/nine/vertexdeclaration9.h
index 9d3b1bd..7b94f84 100644
--- a/src/gallium/state_trackers/nine/vertexdeclaration9.h
+++ b/src/gallium/state_trackers/nine/vertexdeclaration9.h
@@ -31,6 +31,7 @@ struct pipe_vertex_element;
 struct pipe_stream_output_info;
 struct NineDevice9;
 struct NineVertexBuffer9;
+struct nine_vs_output_info;
 
 struct NineVertexDeclaration9
 {
@@ -78,6 +79,13 @@ NineVertexDeclaration9_GetDeclaration( struct NineVertexDeclaration9 *This,
                                        D3DVERTEXELEMENT9 *pElement,
                                        UINT *pNumElements );
 
+void
+NineVertexDeclaration9_FillStreamOutputInfo(
+    struct NineVertexDeclaration9 *This,
+    struct nine_vs_output_info *ShaderOutputsInfo,
+    unsigned numOutputs,
+    struct pipe_stream_output_info *so );
+
 /* Convert stream output data to the vertex declaration's format. */
 HRESULT
 NineVertexDeclaration9_ConvertStreamOutput(
@@ -85,7 +93,7 @@ NineVertexDeclaration9_ConvertStreamOutput(
     struct NineVertexBuffer9 *pDstBuf,
     UINT DestIndex,
     UINT VertexCount,
-    struct pipe_resource *pSrcBuf,
+    void *pSrcBuf,
     const struct pipe_stream_output_info *so );
 
 #endif /* _NINE_VERTEXDECLARATION9_H_ */
diff --git a/src/gallium/state_trackers/nine/vertexshader9.c b/src/gallium/state_trackers/nine/vertexshader9.c
index 92f8f6b..a8c7c9b 100644
--- a/src/gallium/state_trackers/nine/vertexshader9.c
+++ b/src/gallium/state_trackers/nine/vertexshader9.c
@@ -23,10 +23,12 @@
 #include "nine_helpers.h"
 #include "nine_shader.h"
 
+#include "vertexdeclaration9.h"
 #include "vertexshader9.h"
 
 #include "device9.h"
 #include "pipe/p_context.h"
+#include "cso_cache/cso_context.h"
 
 #define DBG_CHANNEL DBG_VERTEXSHADER
 
@@ -64,6 +66,7 @@ NineVertexShader9_ctor( struct NineVertexShader9 *This,
     info.point_size_min = 0;
     info.point_size_max = 0;
     info.swvp_on = !!(device->params.BehaviorFlags & D3DCREATE_SOFTWARE_VERTEXPROCESSING);
+    info.process_vertices = false;
 
     hr = nine_translate_shader(device, &info);
     if (hr == D3DERR_INVALIDCALL &&
@@ -109,6 +112,7 @@ NineVertexShader9_dtor( struct NineVertexShader9 *This )
     if (This->base.device) {
         struct pipe_context *pipe = This->base.device->pipe;
         struct nine_shader_variant *var = &This->variant;
+        struct nine_shader_variant_so *var_so = &This->variant_so;
 
         do {
             if (var->cso) {
@@ -119,6 +123,13 @@ NineVertexShader9_dtor( struct NineVertexShader9 *This )
             var = var->next;
         } while (var);
 
+        while (var_so && var_so->vdecl) {
+            if (var_so->cso) {
+                cso_delete_vertex_shader(This->base.device->cso_sw, var_so->cso );
+            }
+            var_so = var_so->next;
+        }
+
         if (This->ff_cso) {
             if (This->ff_cso == This->base.device->state.cso.vs)
                 pipe->bind_vs_state(pipe, NULL);
@@ -126,6 +137,7 @@ NineVertexShader9_dtor( struct NineVertexShader9 *This )
         }
     }
     nine_shader_variants_free(&This->variant);
+    nine_shader_variants_so_free(&This->variant_so);
 
     FREE((void *)This->byte_code.tokens); /* const_cast */
 
@@ -178,6 +190,7 @@ NineVertexShader9_GetVariant( struct NineVertexShader9 *This )
         info.point_size_min = asfloat(device->state.rs[D3DRS_POINTSIZE_MIN]);
         info.point_size_max = asfloat(device->state.rs[D3DRS_POINTSIZE_MAX]);
         info.swvp_on = device->swvp;
+        info.process_vertices = false;
 
         hr = nine_translate_shader(This->base.device, &info);
         if (FAILED(hr))
@@ -192,6 +205,38 @@ NineVertexShader9_GetVariant( struct NineVertexShader9 *This )
     return cso;
 }
 
+void *
+NineVertexShader9_GetVariantProcessVertices( struct NineVertexShader9 *This,
+                                             struct NineVertexDeclaration9 *vdecl_out,
+                                             struct pipe_stream_output_info *so )
+{ /* Returns the CSO of the ProcessVertices variant for vdecl_out, translating it on a cache miss; NULL if translation fails. */
+    struct nine_shader_info info;
+    HRESULT hr;
+    void *cso;
+    /* Look in This->variant_so first; a non-NULL hit is returned as is. */
+    cso = nine_shader_variant_so_get(&This->variant_so, vdecl_out, so);
+    if (cso)
+        return cso;
+    /* Miss: translate the byte code. NOTE(review): info is not zero-initialized; only the fields below are set. */
+    info.type = PIPE_SHADER_VERTEX;
+    info.const_i_base = 0;
+    info.const_b_base = 0;
+    info.byte_code = This->byte_code.tokens;
+    info.sampler_mask_shadow = 0;
+    info.fog_enable = false;
+    info.point_size_min = 0;
+    info.point_size_max = 0;
+    info.swvp_on = true;
+    info.vdecl_out = vdecl_out;
+    info.process_vertices = true;
+    hr = nine_translate_shader(This->base.device, &info);
+    if (FAILED(hr))
+        return NULL;
+    *so = info.so; /* hand the translator's stream-output description back to the caller */
+    nine_shader_variant_so_add(&This->variant_so, vdecl_out, so, info.cso); /* presumably cached for later lookups */
+    return info.cso;
+}
+
 IDirect3DVertexShader9Vtbl NineVertexShader9_vtable = {
     (void *)NineUnknown_QueryInterface,
     (void *)NineUnknown_AddRef,
diff --git a/src/gallium/state_trackers/nine/vertexshader9.h b/src/gallium/state_trackers/nine/vertexshader9.h
index 823c71a..1f0cfd6 100644
--- a/src/gallium/state_trackers/nine/vertexshader9.h
+++ b/src/gallium/state_trackers/nine/vertexshader9.h
@@ -31,6 +31,8 @@
 #include "nine_shader.h"
 #include "nine_state.h"
 
+struct NineVertexDeclaration9;
+
 struct NineVertexShader9
 {
     struct NineUnknown base;
@@ -57,8 +59,6 @@ struct NineVertexShader9
 
     struct nine_lconstf lconstf;
 
-    const struct pipe_stream_output_info *so;
-
     uint64_t ff_key[3];
     void *ff_cso;
 
@@ -66,6 +66,9 @@ struct NineVertexShader9
     void *last_cso;
 
     uint64_t next_key;
+
+    /* so: variants looked up and cached by NineVertexShader9_GetVariantProcessVertices() */
+    struct nine_shader_variant_so variant_so;
 };
 static inline struct NineVertexShader9 *
 NineVertexShader9( void *data )
@@ -107,6 +110,11 @@ NineVertexShader9_UpdateKey( struct NineVertexShader9 *vs,
 void *
 NineVertexShader9_GetVariant( struct NineVertexShader9 *vs );
 
+void *
+NineVertexShader9_GetVariantProcessVertices( struct NineVertexShader9 *vs,
+                                             struct NineVertexDeclaration9 *vdecl_out,
+                                             struct pipe_stream_output_info *so );
+
 /*** public ***/
 
 HRESULT
-- 
2.10.0



More information about the mesa-dev mailing list