Mesa (master): radeonsi: use a C++ template to decrease draw_vbo overhead by 13 %

Wed Dec 9 21:57:53 UTC 2020

Module: Mesa
Branch: master
Commit: 2b09bde1f5450152ce121a5f58943e01223ff783
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=2b09bde1f5450152ce121a5f58943e01223ff783

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Wed Aug 26 00:16:23 2020 -0400

radeonsi: use a C++ template to decrease draw_vbo overhead by 13 %

Measured with GALLIUM_THREAD=0 to disable draw merging.

Before:
   1, DrawElements ( 1 VBO| 0 UBO|  0    ) w/ no state change,                 8736

After:
   1, DrawElements ( 1 VBO| 0 UBO|  0    ) w/ no state change,                10059

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7807>

---

 src/amd/common/amd_family.h                     |   2 +
 src/gallium/drivers/radeonsi/si_pipe.h          |  38 +++
 src/gallium/drivers/radeonsi/si_state_draw.cpp  | 360 ++++++++++++++----------
 src/gallium/drivers/radeonsi/si_state_shaders.c |   4 +
 4 files changed, 263 insertions(+), 141 deletions(-)

diff --git a/src/amd/common/amd_family.h b/src/amd/common/amd_family.h
index cfb3f47f835..a6175e8b4fd 100644
--- a/src/amd/common/amd_family.h
+++ b/src/amd/common/amd_family.h
@@ -126,6 +126,8 @@ enum chip_class
    GFX9,
    GFX10,
    GFX10_3,
+
+   NUM_GFX_VERSIONS,
 };
 
 enum ring_type
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index b0b7d5469fa..0f1c94ac7e6 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -131,6 +131,26 @@ extern "C" {
 #define SI_RESOURCE_FLAG_UNCACHED          (PIPE_RESOURCE_FLAG_DRV_PRIV << 12)
 #define SI_RESOURCE_FLAG_DRIVER_INTERNAL   (PIPE_RESOURCE_FLAG_DRV_PRIV << 13)
 
+enum si_has_gs {
+   GS_OFF,
+   GS_ON,
+};
+
+enum si_has_tess {
+   TESS_OFF,
+   TESS_ON,
+};
+
+enum si_has_ngg {
+   NGG_OFF,
+   NGG_ON,
+};
+
+enum si_has_prim_discard_cs {
+   PRIM_DISCARD_CS_OFF,
+   PRIM_DISCARD_CS_ON,
+};
+
 enum si_clear_code
 {
    DCC_CLEAR_COLOR_0000 = 0x00000000,
@@ -884,6 +904,12 @@ struct si_small_prim_cull_info {
    float scale[2], translate[2];
 };
 
+typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
+                                   const struct pipe_draw_info *info,
+                                   const struct pipe_draw_indirect_info *indirect,
+                                   const struct pipe_draw_start_count *draws,
+                                   unsigned num_draws);
+
 struct si_context {
    struct pipe_context b; /* base class */
 
@@ -1268,6 +1294,8 @@ struct si_context {
     * a context flush.
     */
    struct hash_table *dirty_implicit_resources;
+
+   pipe_draw_vbo_func draw_vbo[NUM_GFX_VERSIONS - GFX6][2][2][2][2];
 };
 
 /* si_blit.c */
@@ -1912,6 +1940,16 @@ static inline unsigned si_get_shader_wave_size(struct si_shader *shader)
                            shader->key.opt.vs_as_prim_discard_cs);
 }
 
+static inline void si_select_draw_vbo(struct si_context *sctx)
+{
+   sctx->b.draw_vbo = sctx->draw_vbo[sctx->chip_class - GFX6]
+                                    [!!sctx->tes_shader.cso]
+                                    [!!sctx->gs_shader.cso]
+                                    [sctx->ngg]
+                                    [si_compute_prim_discard_enabled(sctx)];
+   assert(sctx->b.draw_vbo);
+}
+
 #define PRINT_ERR(fmt, args...)                                                                    \
    fprintf(stderr, "EE %s:%d %s - " fmt, __FILE__, __LINE__, __func__, ##args)
 
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 2460a33dccc..e1ad9999674 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -521,7 +521,7 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_info *info,
    }
 }
 
-ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE
 static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
                                           const struct pipe_draw_info *info,
                                           const struct pipe_draw_indirect_info *indirect,
@@ -533,9 +533,9 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
    unsigned primgroup_size;
    unsigned ia_multi_vgt_param;
 
-   if (sctx->tes_shader.cso) {
+   if (HAS_TESS) {
       primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
-   } else if (sctx->gs_shader.cso) {
+   } else if (HAS_GS) {
       primgroup_size = 64; /* recommended with a GS */
    } else {
       primgroup_size = 128; /* recommended without a GS and tess */
@@ -552,9 +552,9 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
    ia_multi_vgt_param =
       sctx->ia_multi_vgt_param[key.index] | S_028AA8_PRIMGROUP_SIZE(primgroup_size - 1);
 
-   if (sctx->gs_shader.cso) {
+   if (HAS_GS) {
       /* GS requirement. */
-      if (sctx->chip_class <= GFX8 &&
+      if (GFX_VERSION <= GFX8 &&
           SI_GS_PER_ES / primgroup_size >= sctx->screen->gs_table_depth - 3)
          ia_multi_vgt_param |= S_028AA8_PARTIAL_ES_WAVE_ON(1);
 
@@ -562,7 +562,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
        * The hw doc says all multi-SE chips are affected, but Vulkan
        * only applies it to Hawaii. Do what Vulkan does.
        */
-      if (sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
+      if (GFX_VERSION == GFX7 &&
+          sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
           num_instanced_prims_less_than(info, indirect, prim, min_vertex_count, instance_count, 2))
          sctx->flags |= SI_CONTEXT_VGT_FLUSH;
    }
@@ -597,7 +598,7 @@ static unsigned si_conv_prim_to_gs_out(unsigned mode)
 }
 
 /* rast_prim is the primitive type after GS. */
-ALWAYS_INLINE
+template<si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
 static void si_emit_rasterizer_prim_state(struct si_context *sctx)
 {
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
@@ -620,7 +621,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
    }
 
    unsigned gs_out_prim = si_conv_prim_to_gs_out(rast_prim);
-   if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (sctx->ngg || sctx->gs_shader.cso))) {
+   if (unlikely(gs_out_prim != sctx->last_gs_out_prim && (NGG || HAS_GS))) {
       radeon_set_context_reg(cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out_prim);
       sctx->last_gs_out_prim = gs_out_prim;
    }
@@ -628,7 +629,7 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
    if (initial_cdw != cs->current.cdw)
       sctx->context_roll = true;
 
-   if (sctx->ngg) {
+   if (NGG) {
       struct si_shader *hw_vs = si_get_vs_state(sctx);
 
       if (hw_vs->uses_vs_state_provoking_vertex) {
@@ -696,7 +697,7 @@ static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitiv
                                 sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
 }
 
-ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE
 static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pipe_draw_info *info,
                                        const struct pipe_draw_indirect_info *indirect,
                                        enum pipe_prim_type prim, unsigned num_patches,
@@ -707,15 +708,16 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pip
    unsigned ia_multi_vgt_param;
 
    ia_multi_vgt_param =
-      si_get_ia_multi_vgt_param(sctx, info, indirect, prim, num_patches, instance_count,
-                                primitive_restart, min_vertex_count);
+      si_get_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>
+         (sctx, info, indirect, prim, num_patches, instance_count, primitive_restart,
+          min_vertex_count);
 
    /* Draw state. */
    if (ia_multi_vgt_param != sctx->last_multi_vgt_param) {
-      if (sctx->chip_class == GFX9)
+      if (GFX_VERSION == GFX9)
          radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030960_IA_MULTI_VGT_PARAM, 4,
                                     ia_multi_vgt_param);
-      else if (sctx->chip_class >= GFX7)
+      else if (GFX_VERSION >= GFX7)
          radeon_set_context_reg_idx(cs, R_028AA8_IA_MULTI_VGT_PARAM, 1, ia_multi_vgt_param);
       else
          radeon_set_context_reg(cs, R_028AA8_IA_MULTI_VGT_PARAM, ia_multi_vgt_param);
@@ -727,14 +729,14 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx, const struct pip
 /* GFX10 removed IA_MULTI_VGT_PARAM in exchange for GE_CNTL.
  * We overload last_multi_vgt_param.
  */
-ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
 static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
 {
    union si_vgt_param_key key = sctx->ia_multi_vgt_param_key;
    unsigned ge_cntl;
 
-   if (sctx->ngg) {
-      if (sctx->tes_shader.cso) {
+   if (NGG) {
+      if (HAS_TESS) {
          ge_cntl = S_03096C_PRIM_GRP_SIZE(num_patches) |
                    S_03096C_VERT_GRP_SIZE(0) |
                    S_03096C_BREAK_WAVE_AT_EOI(key.u.tess_uses_prim_id);
@@ -745,10 +747,10 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
       unsigned primgroup_size;
       unsigned vertgroup_size;
 
-      if (sctx->tes_shader.cso) {
+      if (HAS_TESS) {
          primgroup_size = num_patches; /* must be a multiple of NUM_PATCHES */
          vertgroup_size = 0;
-      } else if (sctx->gs_shader.cso) {
+      } else if (HAS_GS) {
          unsigned vgt_gs_onchip_cntl = sctx->gs_shader.current->ctx_reg.gs.vgt_gs_onchip_cntl;
          primgroup_size = G_028A44_GS_PRIMS_PER_SUBGRP(vgt_gs_onchip_cntl);
          vertgroup_size = G_028A44_ES_VERTS_PER_SUBGRP(vgt_gs_onchip_cntl);
@@ -769,7 +771,7 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
    }
 }
 
-ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
 static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_draw_info *info,
                                    const struct pipe_draw_indirect_info *indirect,
                                    enum pipe_prim_type prim, unsigned num_patches,
@@ -779,16 +781,17 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr
    struct radeon_cmdbuf *cs = &sctx->gfx_cs;
    unsigned vgt_prim = si_conv_pipe_prim(prim);
 
-   if (sctx->chip_class >= GFX10)
-      gfx10_emit_ge_cntl(sctx, num_patches);
+   if (GFX_VERSION >= GFX10)
+      gfx10_emit_ge_cntl<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, num_patches);
    else
-      si_emit_ia_multi_vgt_param(sctx, info, indirect, prim, num_patches, instance_count,
-                                 primitive_restart, min_vertex_count);
+      si_emit_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>
+         (sctx, info, indirect, prim, num_patches, instance_count, primitive_restart,
+          min_vertex_count);
 
    if (vgt_prim != sctx->last_prim) {
-      if (sctx->chip_class >= GFX10)
+      if (GFX_VERSION >= GFX10)
          radeon_set_uconfig_reg(cs, R_030908_VGT_PRIMITIVE_TYPE, vgt_prim);
-      else if (sctx->chip_class >= GFX7)
+      else if (GFX_VERSION >= GFX7)
          radeon_set_uconfig_reg_idx(cs, sctx->screen, R_030908_VGT_PRIMITIVE_TYPE, 1, vgt_prim);
       else
          radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, vgt_prim);
@@ -798,7 +801,7 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr
 
    /* Primitive restart. */
    if (primitive_restart != sctx->last_primitive_restart_en) {
-      if (sctx->chip_class >= GFX9)
+      if (GFX_VERSION >= GFX9)
          radeon_set_uconfig_reg(cs, R_03092C_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
       else
          radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN, primitive_restart);
@@ -812,6 +815,7 @@ static void si_emit_draw_registers(struct si_context *sctx, const struct pipe_dr
    }
 }
 
+template <chip_class GFX_VERSION, si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
 static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
                                  const struct pipe_draw_indirect_info *indirect,
                                  const struct pipe_draw_start_count *draws,
@@ -852,19 +856,19 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
          case 2:
             index_type =
                V_028A7C_VGT_INDEX_16 |
-               (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
+               (SI_BIG_ENDIAN && GFX_VERSION <= GFX7 ? V_028A7C_VGT_DMA_SWAP_16_BIT : 0);
             break;
          case 4:
             index_type =
                V_028A7C_VGT_INDEX_32 |
-               (SI_BIG_ENDIAN && sctx->chip_class <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
+               (SI_BIG_ENDIAN && GFX_VERSION <= GFX7 ? V_028A7C_VGT_DMA_SWAP_32_BIT : 0);
             break;
          default:
             assert(!"unreachable");
             return;
          }
 
-         if (sctx->chip_class >= GFX9) {
+         if (GFX_VERSION >= GFX9) {
             radeon_set_uconfig_reg_idx(cs, sctx->screen, R_03090C_VGT_INDEX_TYPE, 2, index_type);
          } else {
             radeon_emit(cs, PKT3(PKT3_INDEX_TYPE, 0, 0));
@@ -874,7 +878,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
          sctx->last_index_size = index_size;
       }
 
-      if (original_index_size) {
+      /* If !ALLOW_PRIM_DISCARD_CS, index_size == original_index_size. */
+      if (!ALLOW_PRIM_DISCARD_CS || original_index_size) {
          index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
          /* Skip draw calls with 0-sized index buffers.
           * They cause a hang on some chips, like Navi10-14.
@@ -891,7 +896,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
       /* On GFX7 and later, non-indexed draws overwrite VGT_INDEX_TYPE,
        * so the state must be re-emitted before the next indexed draw.
        */
-      if (sctx->chip_class >= GFX7)
+      if (GFX_VERSION >= GFX7)
          sctx->last_index_size = -1;
    }
 
@@ -1018,7 +1023,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
       set_draw_id &= info->increment_draw_id;
 
       if (index_size) {
-         if (dispatch_prim_discard_cs) {
+         if (ALLOW_PRIM_DISCARD_CS && dispatch_prim_discard_cs) {
             for (unsigned i = 0; i < num_draws; i++) {
                uint64_t va = index_va + draws[0].start * original_index_size;
 
@@ -1049,13 +1054,13 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
                          * can be changed between draws and GS fast launch must be disabled.
                          * NOT_EOP doesn't work on gfx9 and older.
                          */
-                        S_0287F0_NOT_EOP(sctx->chip_class >= GFX10 &&
+                        S_0287F0_NOT_EOP(GFX_VERSION >= GFX10 &&
                                          !set_draw_id &&
                                          i < num_draws - 1));
          }
       } else {
          /* Set the index buffer for fast launch. The VS prolog will load the indices. */
-         if (sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
+         if (NGG && sctx->ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0)) {
             index_max_size = (indexbuf->width0 - index_offset) >> util_logbase2(original_index_size);
 
             radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(indexbuf),
@@ -1587,7 +1592,7 @@ void si_emit_cache_flush(struct si_context *sctx)
    sctx->flags = 0;
 }
 
-ALWAYS_INLINE
+template <chip_class GFX_VERSION> ALWAYS_INLINE
 static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
 {
    unsigned i, count = sctx->num_vertex_elements;
@@ -1651,7 +1656,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
       uint64_t va = buf->gpu_address + offset;
 
       int64_t num_records = (int64_t)buf->b.b.width0 - offset;
-      if (sctx->chip_class != GFX8 && vb->stride) {
+      if (GFX_VERSION != GFX8 && vb->stride) {
          /* Round up by rounding down and adding 1 */
          num_records = (num_records - velems->format_size[i]) / vb->stride + 1;
       }
@@ -1663,7 +1668,7 @@ static bool si_upload_vertex_buffer_descriptors(struct si_context *sctx)
        *  - 1: index >= NUM_RECORDS (Structured)
        *  - 3: offset >= NUM_RECORDS (Raw)
        */
-      if (sctx->chip_class >= GFX10)
+      if (GFX_VERSION >= GFX10)
          rsrc_word3 |= S_008F0C_OOB_SELECT(vb->stride ? V_008F0C_OOB_SELECT_STRUCTURED
                                                       : V_008F0C_OOB_SELECT_RAW);
 
@@ -1760,6 +1765,7 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d
    }
 }
 
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
 static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
                                const struct pipe_draw_indirect_info *indirect,
                                enum pipe_prim_type prim, unsigned instance_count,
@@ -1768,8 +1774,8 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 {
    unsigned num_patches = 0;
 
-   si_emit_rasterizer_prim_state(sctx);
-   if (sctx->tes_shader.cso)
+   si_emit_rasterizer_prim_state<HAS_GS, NGG>(sctx);
+   if (HAS_TESS)
       si_emit_derived_tess_state(sctx, info, &num_patches);
 
    /* Emit state atoms. */
@@ -1795,8 +1801,9 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
 
    /* Emit draw states. */
    si_emit_vs_state(sctx, info);
-   si_emit_draw_registers(sctx, info, indirect, prim, num_patches, instance_count,
-                          primitive_restart, min_vertex_count);
+   si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+         (sctx, info, indirect, prim, num_patches, instance_count, primitive_restart,
+          min_vertex_count);
 }
 
 static bool si_all_vs_resources_read_only(struct si_context *sctx, struct pipe_resource *indexbuf)
@@ -1891,6 +1898,8 @@ static ALWAYS_INLINE bool pd_msg(const char *s)
          pipe_resource_reference(&indexbuf, NULL);        \
    } while (0)
 
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+          si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
 static void si_draw_vbo(struct pipe_context *ctx,
                         const struct pipe_draw_info *info,
                         const struct pipe_draw_indirect_info *indirect,
@@ -1920,7 +1929,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
    struct si_shader_selector *vs = sctx->vs_shader.cso;
    if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs ||
                 (!sctx->ps_shader.cso && !rs->rasterizer_discard) ||
-                (!!sctx->tes_shader.cso != (prim == PIPE_PRIM_PATCHES)))) {
+                (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) {
       assert(0);
       return;
    }
@@ -1949,10 +1958,10 @@ static void si_draw_vbo(struct pipe_context *ctx,
     * This must be done after si_decompress_textures, which can call
     * draw_vbo recursively, and before si_update_shaders, which uses
     * current_rast_prim for this draw_vbo call. */
-   if (sctx->gs_shader.cso) {
+   if (HAS_GS) {
       /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
       rast_prim = sctx->gs_shader.cso->rast_prim;
-   } else if (sctx->tes_shader.cso) {
+   } else if (HAS_TESS) {
       /* Only possibilities: POINTS, LINE_STRIP, TRIANGLES */
       rast_prim = sctx->tes_shader.cso->rast_prim;
    } else if (util_rast_prim_is_triangles(prim)) {
@@ -1971,12 +1980,12 @@ static void si_draw_vbo(struct pipe_context *ctx,
       sctx->do_update_shaders = true;
    }
 
-   if (sctx->tes_shader.cso) {
+   if (HAS_TESS) {
       struct si_shader_selector *tcs = sctx->tcs_shader.cso;
 
       /* The rarely occuring tcs == NULL case is not optimized. */
       bool same_patch_vertices =
-         sctx->chip_class >= GFX9 &&
+         GFX_VERSION >= GFX9 &&
          tcs && info->vertices_per_patch == tcs->info.base.tess.tcs_vertices_out;
 
       if (sctx->same_patch_vertices != same_patch_vertices) {
@@ -1984,7 +1993,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
          sctx->do_update_shaders = true;
       }
 
-      if (sctx->screen->info.has_ls_vgpr_init_bug) {
+      if (GFX_VERSION == GFX9 && sctx->screen->info.has_ls_vgpr_init_bug) {
          /* Determine whether the LS VGPR fix should be applied.
           *
           * It is only required when num input CPs > num output CPs,
@@ -2002,7 +2011,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
       }
    }
 
-   if (sctx->chip_class <= GFX9 && sctx->gs_shader.cso) {
+   if (GFX_VERSION <= GFX9 && HAS_GS) {
       /* Determine whether the GS triangle strip adjacency fix should
        * be applied. Rotate every other triangle if
        * - triangle strips with adjacency are fed to the GS and
@@ -2010,7 +2019,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
        *   when the restart occurs after an odd number of triangles).
        */
       bool gs_tri_strip_adj_fix =
-         !sctx->tes_shader.cso && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart;
+         !HAS_TESS && prim == PIPE_PRIM_TRIANGLE_STRIP_ADJACENCY && !primitive_restart;
 
       if (gs_tri_strip_adj_fix != sctx->gs_tri_strip_adj_fix) {
          sctx->gs_tri_strip_adj_fix = gs_tri_strip_adj_fix;
@@ -2021,7 +2030,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
    if (index_size) {
       /* Translate or upload, if needed. */
       /* 8-bit indices are supported on GFX8. */
-      if (sctx->chip_class <= GFX7 && index_size == 1) {
+      if (GFX_VERSION <= GFX7 && index_size == 1) {
          unsigned start, count, start_offset, size, offset;
          void *ptr;
 
@@ -2056,7 +2065,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
 
          /* info->start will be added by the drawing code */
          index_offset -= start_offset;
-      } else if (sctx->chip_class <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
+      } else if (GFX_VERSION <= GFX7 && si_resource(indexbuf)->TC_L2_dirty) {
          /* GFX8 reads index buffers through TC L2, so it doesn't
           * need this. */
          sctx->flags |= SI_CONTEXT_WB_L2;
@@ -2077,7 +2086,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
          si_context_add_resource_size(sctx, indirect->buffer);
 
       /* Indirect buffers use TC L2 on GFX9, but not older hw. */
-      if (sctx->chip_class <= GFX8) {
+      if (GFX_VERSION <= GFX8) {
          if (indirect->buffer && si_resource(indirect->buffer)->TC_L2_dirty) {
             sctx->flags |= SI_CONTEXT_WB_L2;
             si_resource(indirect->buffer)->TC_L2_dirty = false;
@@ -2101,7 +2110,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
    }
 
    /* Determine if we can use the primitive discard compute shader. */
-   if (si_compute_prim_discard_enabled(sctx) &&
+   if (ALLOW_PRIM_DISCARD_CS &&
        (avg_direct_count > sctx->prim_discard_vertex_count_threshold
            ? (sctx->compute_num_verts_rejected += total_direct_count, true)
            : /* Add, then return true. */
@@ -2130,8 +2139,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
         (!sctx->num_pipeline_stat_queries && !sctx->streamout.prims_gen_query_enabled) ||
         pd_msg("pipestat or primgen query")) &&
        (!sctx->vertex_elements->instance_divisor_is_fetched || pd_msg("loads instance divisors")) &&
-       (!sctx->tes_shader.cso || pd_msg("uses tess")) &&
-       (!sctx->gs_shader.cso || pd_msg("uses GS")) &&
+       (!HAS_TESS || pd_msg("uses tess")) &&
+       (!HAS_GS || pd_msg("uses GS")) &&
        (!sctx->ps_shader.cso->info.uses_primid || pd_msg("PS uses PrimID")) &&
        !rs->polygon_mode_enabled &&
 #if SI_PRIM_DISCARD_DEBUG /* same as cso->prim_discard_cs_allowed */
@@ -2176,64 +2185,66 @@ static void si_draw_vbo(struct pipe_context *ctx,
       }
    }
 
-   if (prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
+   if (ALLOW_PRIM_DISCARD_CS &&
+       prim_discard_cs_instancing != sctx->prim_discard_cs_instancing) {
       sctx->prim_discard_cs_instancing = prim_discard_cs_instancing;
       sctx->do_update_shaders = true;
    }
 
    /* Update NGG culling settings. */
    uint8_t old_ngg_culling = sctx->ngg_culling;
-   struct si_shader_selector *hw_vs;
-   if (sctx->ngg && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
-       (hw_vs = si_get_vs(sctx)->cso) &&
-       (avg_direct_count > hw_vs->ngg_cull_vert_threshold ||
-        (!index_size &&
-         avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
-         prim & ((1 << PIPE_PRIM_TRIANGLES) |
-                 (1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
-      uint8_t ngg_culling = 0;
-
-      if (rs->rasterizer_discard) {
-         ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
-      } else {
-         /* Polygon mode can't use view and small primitive culling,
-          * because it draws points or lines where the culling depends
-          * on the point or line width.
-          */
-         if (!rs->polygon_mode_enabled)
-            ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
-
-         if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
-            ngg_culling |= SI_NGG_CULL_FRONT_FACE;
-         if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
-            ngg_culling |= SI_NGG_CULL_BACK_FACE;
-      }
+   if (GFX_VERSION >= GFX10) {
+      struct si_shader_selector *hw_vs;
+      if (NGG && !dispatch_prim_discard_cs && rast_prim == PIPE_PRIM_TRIANGLES &&
+          (hw_vs = si_get_vs(sctx)->cso) &&
+          (avg_direct_count > hw_vs->ngg_cull_vert_threshold ||
+           (!index_size &&
+            avg_direct_count > hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold &&
+            prim & ((1 << PIPE_PRIM_TRIANGLES) |
+                    (1 << PIPE_PRIM_TRIANGLE_STRIP))))) {
+         uint8_t ngg_culling = 0;
+
+         if (rs->rasterizer_discard) {
+            ngg_culling |= SI_NGG_CULL_FRONT_FACE | SI_NGG_CULL_BACK_FACE;
+         } else {
+            /* Polygon mode can't use view and small primitive culling,
+             * because it draws points or lines where the culling depends
+             * on the point or line width.
+             */
+            if (!rs->polygon_mode_enabled)
+               ngg_culling |= SI_NGG_CULL_VIEW_SMALLPRIMS;
+
+            if (sctx->viewports.y_inverted ? rs->cull_back : rs->cull_front)
+               ngg_culling |= SI_NGG_CULL_FRONT_FACE;
+            if (sctx->viewports.y_inverted ? rs->cull_front : rs->cull_back)
+               ngg_culling |= SI_NGG_CULL_BACK_FACE;
+         }
 
-      /* Use NGG fast launch for certain primitive types.
-       * A draw must have at least 1 full primitive.
-       */
-      if (ngg_culling &&
-          hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold < UINT32_MAX &&
-          min_direct_count >= 3 && !sctx->tes_shader.cso &&
-          !sctx->gs_shader.cso) {
-         if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
-            ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
-         } else if (prim == PIPE_PRIM_TRIANGLE_STRIP && !primitive_restart) {
-            ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |
-                           SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));
-            /* The index buffer will be emulated. */
-            index_size = 0;
+         /* Use NGG fast launch for certain primitive types.
+          * A draw must have at least 1 full primitive.
+          */
+         if (ngg_culling &&
+             hw_vs->ngg_cull_nonindexed_fast_launch_vert_threshold < UINT32_MAX &&
+             min_direct_count >= 3 && !HAS_TESS && !HAS_GS) {
+            if (prim == PIPE_PRIM_TRIANGLES && !index_size) {
+               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_LIST;
+            } else if (prim == PIPE_PRIM_TRIANGLE_STRIP && !primitive_restart) {
+               ngg_culling |= SI_NGG_CULL_GS_FAST_LAUNCH_TRI_STRIP |
+                              SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(MIN2(index_size, 3));
+               /* The index buffer will be emulated. */
+               index_size = 0;
+            }
          }
-      }
 
-      if (ngg_culling != old_ngg_culling) {
-         /* If shader compilation is not ready, this setting will be rejected. */
-         sctx->ngg_culling = ngg_culling;
+         if (ngg_culling != old_ngg_culling) {
+            /* If shader compilation is not ready, this setting will be rejected. */
+            sctx->ngg_culling = ngg_culling;
+            sctx->do_update_shaders = true;
+         }
+      } else if (old_ngg_culling) {
+         sctx->ngg_culling = 0;
          sctx->do_update_shaders = true;
       }
-   } else if (old_ngg_culling) {
-      sctx->ngg_culling = false;
-      sctx->do_update_shaders = true;
    }
 
    if (sctx->shader_has_inlinable_uniforms_mask &
@@ -2255,29 +2266,31 @@ static void si_draw_vbo(struct pipe_context *ctx,
        *
        * This is the setting that is used by the draw.
        */
-      uint8_t ngg_culling = si_get_vs(sctx)->current->key.opt.ngg_culling;
-      if (sctx->chip_class == GFX10 &&
-          !(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&
-          ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
-         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+      if (GFX_VERSION >= GFX10) {
+         uint8_t ngg_culling = si_get_vs(sctx)->current->key.opt.ngg_culling;
+         if (GFX_VERSION == GFX10 &&
+             !(old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL) &&
+             ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_ALL)
+            sctx->flags |= SI_CONTEXT_VGT_FLUSH;
+
+         if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
+             !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
+            /* Need to re-set these, because we have bound an index buffer there. */
+            sctx->shader_pointers_dirty |=
+               (1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
+               (1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
+            si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+         }
 
-      if (old_ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0) &&
-          !(ngg_culling & SI_NGG_CULL_GS_FAST_LAUNCH_INDEX_SIZE_PACKED(~0))) {
-         /* Need to re-set these, because we have bound an index buffer there. */
-         sctx->shader_pointers_dirty |=
-            (1u << si_const_and_shader_buffer_descriptors_idx(PIPE_SHADER_GEOMETRY)) |
-            (1u << si_sampler_and_image_descriptors_idx(PIPE_SHADER_GEOMETRY));
-         si_mark_atom_dirty(sctx, &sctx->atoms.s.shader_pointers);
+         /* Set this to the correct value determined by si_update_shaders. */
+         sctx->ngg_culling = ngg_culling;
       }
-
-      /* Set this to the correct value determined by si_update_shaders. */
-      sctx->ngg_culling = ngg_culling;
    }
 
    si_need_gfx_cs_space(sctx, num_draws);
 
    /* If we're using a secure context, determine if cs must be secure or not */
-   if (unlikely(radeon_uses_secure_bos(sctx->ws))) {
+   if (GFX_VERSION >= GFX9 && unlikely(radeon_uses_secure_bos(sctx->ws))) {
       bool secure = si_gfx_resources_check_encrypted(sctx);
       if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
          si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
@@ -2295,7 +2308,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
    if (unlikely(!si_upload_graphics_shader_descriptors(sctx) ||
                 (sctx->vertex_buffers_dirty &&
                  sctx->num_vertex_elements &&
-                 !si_upload_vertex_buffer_descriptors(sctx)))) {
+                 !si_upload_vertex_buffer_descriptors<GFX_VERSION>(sctx)))) {
       DRAW_CLEANUP;
       return;
    }
@@ -2307,7 +2320,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
    unsigned masked_atoms = 0;
    bool gfx9_scissor_bug = false;
 
-   if (sctx->screen->info.has_gfx9_scissor_bug) {
+   if (GFX_VERSION == GFX9 && sctx->screen->info.has_gfx9_scissor_bug) {
       masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
       gfx9_scissor_bug = true;
 
@@ -2329,8 +2342,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
          masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
 
       /* Emit all states except possibly render condition. */
-      si_emit_all_states(sctx, info, indirect, prim, instance_count, min_direct_count,
-                         primitive_restart, masked_atoms);
+      si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+            (sctx, info, indirect, prim, instance_count, min_direct_count,
+             primitive_restart, masked_atoms);
       sctx->emit_cache_flush(sctx);
       /* <-- CUs are idle here. */
 
@@ -2339,22 +2353,23 @@ static void si_draw_vbo(struct pipe_context *ctx,
          sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
       }
 
-      if (gfx9_scissor_bug &&
+      if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
           (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
          sctx->atoms.s.scissors.emit(sctx);
          sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
       }
       assert(sctx->dirty_atoms == 0);
 
-      si_emit_draw_packets(sctx, info, indirect, draws, num_draws,
-                           indexbuf, index_size, index_offset, instance_count,
-                           dispatch_prim_discard_cs, original_index_size);
+      si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
+            (sctx, info, indirect, draws, num_draws, indexbuf, index_size,
+             index_offset, instance_count, dispatch_prim_discard_cs,
+             original_index_size);
       /* <-- CUs are busy here. */
 
       /* Start prefetches after the draw has been started. Both will run
        * in parallel, but starting the draw first is more important.
        */
-      if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+      if (GFX_VERSION >= GFX7 && sctx->prefetch_L2_mask)
          cik_emit_prefetch_L2(sctx, false);
    } else {
       /* If we don't wait for idle, start prefetches first, then set
@@ -2364,31 +2379,36 @@ static void si_draw_vbo(struct pipe_context *ctx,
          sctx->emit_cache_flush(sctx);
 
       /* Only prefetch the API VS and VBO descriptors. */
-      if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+      if (GFX_VERSION >= GFX7 && sctx->prefetch_L2_mask)
          cik_emit_prefetch_L2(sctx, true);
 
-      si_emit_all_states(sctx, info, indirect, prim, instance_count, min_direct_count,
-                         primitive_restart, masked_atoms);
+      si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+            (sctx, info, indirect, prim, instance_count, min_direct_count,
+             primitive_restart, masked_atoms);
 
-      if (gfx9_scissor_bug &&
+      if (GFX_VERSION == GFX9 && gfx9_scissor_bug &&
           (sctx->context_roll || si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
          sctx->atoms.s.scissors.emit(sctx);
          sctx->dirty_atoms &= ~si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
       }
       assert(sctx->dirty_atoms == 0);
 
-      si_emit_draw_packets(sctx, info, indirect, draws, num_draws,
-                           indexbuf, index_size, index_offset, instance_count,
-                           dispatch_prim_discard_cs, original_index_size);
+      si_emit_draw_packets<GFX_VERSION, NGG, ALLOW_PRIM_DISCARD_CS>
+            (sctx, info, indirect, draws, num_draws, indexbuf, index_size,
+             index_offset, instance_count,
+             dispatch_prim_discard_cs, original_index_size);
 
       /* Prefetch the remaining shaders after the draw has been
        * started. */
-      if (sctx->chip_class >= GFX7 && sctx->prefetch_L2_mask)
+      if (GFX_VERSION >= GFX7 && sctx->prefetch_L2_mask)
          cik_emit_prefetch_L2(sctx, false);
    }
 
-   /* Clear the context roll flag after the draw call. */
-   sctx->context_roll = false;
+   /* Clear the context roll flag after the draw call.
+    * Only used by the gfx9 scissor bug.
+    */
+   if (GFX_VERSION == GFX9)
+      sctx->context_roll = false;
 
    if (unlikely(sctx->current_saved_cs)) {
       si_trace_emit(sctx);
@@ -2397,7 +2417,8 @@ static void si_draw_vbo(struct pipe_context *ctx,
 
    /* Workaround for a VGT hang when streamout is enabled.
     * It must be done after drawing. */
-   if ((sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) &&
+   if ((GFX_VERSION == GFX7 || GFX_VERSION == GFX8) &&
+       (sctx->family == CHIP_HAWAII || sctx->family == CHIP_TONGA || sctx->family == CHIP_FIJI) &&
        si_get_strmout_en(sctx)) {
       sctx->flags |= SI_CONTEXT_VGT_STREAMOUT_SYNC;
    }
@@ -2457,7 +2478,7 @@ static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elem
    sctx->vertex_buffer_pointer_dirty = false;
    sctx->vertex_buffer_user_sgprs_dirty = false;
 
-   si_draw_vbo(pipe, &info, NULL, &draw, 1);
+   pipe->draw_vbo(pipe, &info, NULL, &draw, 1);
 }
 
 extern "C"
@@ -2475,10 +2496,67 @@ void si_trace_emit(struct si_context *sctx)
       u_log_flush(sctx->log);
 }
 
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
+          si_has_ngg NGG, si_has_prim_discard_cs ALLOW_PRIM_DISCARD_CS>
+static void si_init_draw_vbo(struct si_context *sctx)
+{
+   /* Prim discard CS is only useful on gfx7+ because gfx6 doesn't have async compute. */
+   if (ALLOW_PRIM_DISCARD_CS && GFX_VERSION < GFX7)
+      return;
+
+   if (NGG && GFX_VERSION < GFX10)
+      return;
+
+   sctx->draw_vbo[GFX_VERSION - GFX6][HAS_TESS][HAS_GS][NGG][ALLOW_PRIM_DISCARD_CS] =
+      si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG, ALLOW_PRIM_DISCARD_CS>;
+}
+
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS>
+static void si_init_draw_vbo_all_internal_options(struct si_context *sctx)
+{
+   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_OFF>(sctx);
+   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_OFF, PRIM_DISCARD_CS_ON>(sctx);
+   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_OFF>(sctx);
+   si_init_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG_ON, PRIM_DISCARD_CS_ON>(sctx);
+}
+
+template <chip_class GFX_VERSION>
+static void si_init_draw_vbo_all_pipeline_options(struct si_context *sctx)
+{
+   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_OFF>(sctx);
+   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_OFF, GS_ON>(sctx);
+   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_OFF>(sctx);
+   si_init_draw_vbo_all_internal_options<GFX_VERSION, TESS_ON, GS_ON>(sctx);
+}
+
+static void si_init_draw_vbo_all_families(struct si_context *sctx)
+{
+   si_init_draw_vbo_all_pipeline_options<GFX6>(sctx);
+   si_init_draw_vbo_all_pipeline_options<GFX7>(sctx);
+   si_init_draw_vbo_all_pipeline_options<GFX8>(sctx);
+   si_init_draw_vbo_all_pipeline_options<GFX9>(sctx);
+   si_init_draw_vbo_all_pipeline_options<GFX10>(sctx);
+   si_init_draw_vbo_all_pipeline_options<GFX10_3>(sctx);
+}
+
+static void si_invalid_draw_vbo(struct pipe_context *pipe,
+                                const struct pipe_draw_info *info,
+                                const struct pipe_draw_indirect_info *indirect,
+                                const struct pipe_draw_start_count *draws,
+                                unsigned num_draws)
+{
+   unreachable("vertex shader not bound");
+}
+
 extern "C"
 void si_init_draw_functions(struct si_context *sctx)
 {
-   sctx->b.draw_vbo = si_draw_vbo;
+   si_init_draw_vbo_all_families(sctx);
+
+   /* Bind a fake draw_vbo, so that draw_vbo isn't NULL, which would skip
+    * initialization of callbacks in upper layers (such as u_threaded_context).
+    */
+   sctx->b.draw_vbo = si_invalid_draw_vbo;
    sctx->blitter->draw_rectangle = si_draw_rectangle;
 
    si_init_ia_multi_vgt_param_table(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 73fcf719671..d9a7f38c38f 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -3011,6 +3011,7 @@ static void si_bind_vs_shader(struct pipe_context *ctx, void *state)
       si_shader_change_notify(sctx);
 
    si_update_common_shader_state(sctx, sel, PIPE_SHADER_VERTEX);
+   si_select_draw_vbo(sctx);
    si_update_vs_viewport_state(sctx);
    si_update_streamout_state(sctx);
    si_update_clip_regs(sctx, old_hw_vs, old_hw_vs_variant, si_get_vs(sctx)->cso,
@@ -3059,6 +3060,7 @@ bool si_update_ngg(struct si_context *sctx)
 
       sctx->ngg = new_ngg;
       sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
+      si_select_draw_vbo(sctx);
       return true;
    }
    return false;
@@ -3081,6 +3083,7 @@ static void si_bind_gs_shader(struct pipe_context *ctx, void *state)
    sctx->ia_multi_vgt_param_key.u.uses_gs = sel != NULL;
 
    si_update_common_shader_state(sctx, sel, PIPE_SHADER_GEOMETRY);
+   si_select_draw_vbo(sctx);
    sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 
    ngg_changed = si_update_ngg(sctx);
@@ -3132,6 +3135,7 @@ static void si_bind_tes_shader(struct pipe_context *ctx, void *state)
    si_update_tess_uses_prim_id(sctx);
 
    si_update_common_shader_state(sctx, sel, PIPE_SHADER_TESS_EVAL);
+   si_select_draw_vbo(sctx);
    sctx->last_gs_out_prim = -1; /* reset this so that it gets updated */
 
    bool ngg_changed = si_update_ngg(sctx);