Mesa (main): mesa/st: implement hardware accelerated GL_SELECT

Mon Jun 6 19:05:07 UTC 2022

Module: Mesa
Branch: main
Commit: 9b22ab4167e6c2c0cf8868a9109545fe2c0d164b
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=9b22ab4167e6c2c0cf8868a9109545fe2c0d164b

Author: Qiang Yu <yuq825 at gmail.com>
Date:   Sat Apr  2 17:12:20 2022 +0800

mesa/st: implement hardware accelerated GL_SELECT

Use an internal geometry shader to handle input primitives. Do fully
accurate culling and clipping in the shader, and output the hit result and
min/max depth to an SSBO, from which they are finally written to the
select buffer.

With multiple result slots in the SSBO we can leave multiple draws in
flight and only wait for them to finish when the buffer is full or when
exiting GL_SELECT mode.

This provides a quicker selection response compared to the software-based
solution. Tested on Discovery Studio 2020: some complex models originally
needed 1~2s of selection response time; now they are selected almost
immediately.

Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Signed-off-by: Qiang Yu <yuq825 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/15765>

---

 src/mesa/main/feedback.c                   |   4 +-
 src/mesa/meson.build                       |   1 +
 src/mesa/state_tracker/st_cb_feedback.c    |  20 +-
 src/mesa/state_tracker/st_context.c        |   7 +
 src/mesa/state_tracker/st_context.h        |   2 +
 src/mesa/state_tracker/st_draw.c           |  67 +++
 src/mesa/state_tracker/st_draw.h           |   9 +
 src/mesa/state_tracker/st_draw_hw_select.c | 821 +++++++++++++++++++++++++++++
 8 files changed, 924 insertions(+), 7 deletions(-)

diff --git a/src/mesa/main/feedback.c b/src/mesa/main/feedback.c
index e6d925b274d..3a035b67eeb 100644
--- a/src/mesa/main/feedback.c
+++ b/src/mesa/main/feedback.c
@@ -637,9 +637,11 @@ _mesa_RenderMode( GLenum mode )
 	 return 0;
    }
 
-   ctx->RenderMode = mode;
    st_RenderMode( ctx, mode );
 
+   /* finally update render mode to new one */
+   ctx->RenderMode = mode;
+
    return result;
 }
 
diff --git a/src/mesa/meson.build b/src/mesa/meson.build
index 522a4c2eb11..8fba301efc5 100644
--- a/src/mesa/meson.build
+++ b/src/mesa/meson.build
@@ -351,6 +351,7 @@ files_libmesa = files(
   'state_tracker/st_draw.c',
   'state_tracker/st_draw.h',
   'state_tracker/st_draw_feedback.c',
+  'state_tracker/st_draw_hw_select.c',
   'state_tracker/st_extensions.c',
   'state_tracker/st_extensions.h',
   'state_tracker/st_format.c',
diff --git a/src/mesa/state_tracker/st_cb_feedback.c b/src/mesa/state_tracker/st_cb_feedback.c
index c5e6f779cd0..e937386fde0 100644
--- a/src/mesa/state_tracker/st_cb_feedback.c
+++ b/src/mesa/state_tracker/st_cb_feedback.c
@@ -291,12 +291,16 @@ st_RenderMode(struct gl_context *ctx, GLenum newMode )
       st_init_draw_functions(st->screen, &ctx->Driver);
    }
    else if (newMode == GL_SELECT) {
-      if (!st->selection_stage)
-         st->selection_stage = draw_glselect_stage(ctx, draw);
-      draw_set_rasterize_stage(draw, st->selection_stage);
-      /* Plug in new vbo draw function */
-      ctx->Driver.DrawGallium = _mesa_draw_gallium_fallback;
-      ctx->Driver.DrawGalliumMultiMode = _mesa_draw_gallium_multimode_fallback;
+      if (ctx->Const.HardwareAcceleratedSelect)
+         st_init_hw_select_draw_functions(st->screen, &ctx->Driver);
+      else {
+         if (!st->selection_stage)
+            st->selection_stage = draw_glselect_stage(ctx, draw);
+         draw_set_rasterize_stage(draw, st->selection_stage);
+         /* Plug in new vbo draw function */
+         ctx->Driver.DrawGallium = _mesa_draw_gallium_fallback;
+         ctx->Driver.DrawGalliumMultiMode = _mesa_draw_gallium_multimode_fallback;
+      }
    }
    else {
       struct gl_program *vp = st->ctx->VertexProgram._Current;
@@ -311,4 +315,8 @@ st_RenderMode(struct gl_context *ctx, GLenum newMode )
       if (vp)
          st->dirty |= ST_NEW_VERTEX_PROGRAM(st, vp);
    }
+
+   /* Restore geometry shader states when leaving GL_SELECT mode. */
+   if (ctx->RenderMode == GL_SELECT && ctx->Const.HardwareAcceleratedSelect)
+      st->dirty |= ST_NEW_GS_SSBOS | ST_NEW_GS_CONSTANTS | ST_NEW_GS_STATE;
 }
diff --git a/src/mesa/state_tracker/st_context.c b/src/mesa/state_tracker/st_context.c
index 7c889727dbc..ffd50ec3e17 100644
--- a/src/mesa/state_tracker/st_context.c
+++ b/src/mesa/state_tracker/st_context.c
@@ -64,6 +64,7 @@
 #include "util/u_upload_mgr.h"
 #include "util/u_vbuf.h"
 #include "util/u_memory.h"
+#include "util/hash_table.h"
 #include "cso_cache/cso_context.h"
 #include "compiler/glsl/glsl_parser_extras.h"
 #include "nir/nir_to_tgsi.h"
@@ -958,6 +959,12 @@ st_destroy_context(struct st_context *st)
    st_release_program(st, &st->tep);
    st_release_program(st, &st->cp);
 
+   if (st->hw_select_shaders) {
+      hash_table_foreach(st->hw_select_shaders, entry)
+         st->pipe->delete_gs_state(st->pipe, entry->data);
+      _mesa_hash_table_destroy(st->hw_select_shaders, NULL);
+   }
+
    /* release framebuffer in the winsys buffers list */
    LIST_FOR_EACH_ENTRY_SAFE_REV(stfb, next, &st->winsys_buffers, head) {
       _mesa_reference_framebuffer(&stfb, NULL);
diff --git a/src/mesa/state_tracker/st_context.h b/src/mesa/state_tracker/st_context.h
index d3eb1075617..749d3a4fea5 100644
--- a/src/mesa/state_tracker/st_context.h
+++ b/src/mesa/state_tracker/st_context.h
@@ -383,6 +383,8 @@ struct st_context
       struct st_zombie_shader_node list;
       simple_mtx_t mutex;
    } zombie_shaders;
+
+   struct hash_table *hw_select_shaders;
 };
 
 
diff --git a/src/mesa/state_tracker/st_draw.c b/src/mesa/state_tracker/st_draw.c
index cd0b8d6e8b2..1fbfc5c1170 100644
--- a/src/mesa/state_tracker/st_draw.c
+++ b/src/mesa/state_tracker/st_draw.c
@@ -519,3 +519,70 @@ st_draw_quad(struct st_context *st,
 
    return true;
 }
+
+static void
+st_hw_select_draw_gallium(struct gl_context *ctx,
+                          struct pipe_draw_info *info,
+                          unsigned drawid_offset,
+                          const struct pipe_draw_start_count_bias *draws,
+                          unsigned num_draws)
+{
+   struct st_context *st = st_context(ctx);
+
+   prepare_draw(st, ctx, ST_PIPELINE_RENDER_STATE_MASK, ST_PIPELINE_RENDER);
+
+   if (!prepare_indexed_draw(st, ctx, info, draws, num_draws))
+      return;
+
+   if (!st_draw_hw_select_prepare_common(ctx) ||
+       !st_draw_hw_select_prepare_mode(ctx, info))
+      return;
+
+   cso_multi_draw(st->cso_context, info, drawid_offset, draws, num_draws);
+}
+
+static void
+st_hw_select_draw_gallium_multimode(struct gl_context *ctx,
+                                    struct pipe_draw_info *info,
+                                    const struct pipe_draw_start_count_bias *draws,
+                                    const unsigned char *mode,
+                                    unsigned num_draws)
+{
+   struct st_context *st = st_context(ctx);
+
+   prepare_draw(st, ctx, ST_PIPELINE_RENDER_STATE_MASK, ST_PIPELINE_RENDER);
+
+   if (!prepare_indexed_draw(st, ctx, info, draws, num_draws))
+      return;
+
+   if (!st_draw_hw_select_prepare_common(ctx))
+      return;
+
+   unsigned i, first;
+   struct cso_context *cso = st->cso_context;
+
+   /* Find consecutive draws where mode doesn't vary. */
+   for (i = 0, first = 0; i <= num_draws; i++) {
+      if (i == num_draws || mode[i] != mode[first]) {
+         info->mode = mode[first];
+
+         if (st_draw_hw_select_prepare_mode(ctx, info))
+            cso_multi_draw(cso, info, 0, &draws[first], i - first);
+
+         first = i;
+
+         /* We can pass the reference only once. st_buffer_object keeps
+          * the reference alive for later draws.
+          */
+         info->take_index_buffer_ownership = false;
+      }
+   }
+}
+
+void
+st_init_hw_select_draw_functions(struct pipe_screen *screen,
+                                 struct dd_function_table *functions)
+{
+   functions->DrawGallium = st_hw_select_draw_gallium;
+   functions->DrawGalliumMultiMode = st_hw_select_draw_gallium_multimode;
+}
diff --git a/src/mesa/state_tracker/st_draw.h b/src/mesa/state_tracker/st_draw.h
index e0e6e472bbd..46ee64df6fa 100644
--- a/src/mesa/state_tracker/st_draw.h
+++ b/src/mesa/state_tracker/st_draw.h
@@ -99,4 +99,13 @@ st_indirect_draw_vbo(struct gl_context *ctx,
                      const struct _mesa_index_buffer *ib,
                      bool primitive_restart,
                      unsigned restart_index);
+
+bool
+st_draw_hw_select_prepare_common(struct gl_context *ctx);
+bool
+st_draw_hw_select_prepare_mode(struct gl_context *ctx, struct pipe_draw_info *info);
+void
+st_init_hw_select_draw_functions(struct pipe_screen *screen,
+                                 struct dd_function_table *functions);
+
 #endif
diff --git a/src/mesa/state_tracker/st_draw_hw_select.c b/src/mesa/state_tracker/st_draw_hw_select.c
new file mode 100644
index 00000000000..5a032da046a
--- /dev/null
+++ b/src/mesa/state_tracker/st_draw_hw_select.c
@@ -0,0 +1,821 @@
+/*
+ * Copyright 2022 Advanced Micro Devices, Inc.
+ * All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included
+ * in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
+ * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
+ * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
+ * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
+ * OTHER DEALINGS IN THE SOFTWARE.
+ */
+
+#include "main/enums.h"
+#include "main/context.h"
+
+#include "st_context.h"
+#include "st_nir.h"
+#include "st_draw.h"
+
+#include "nir.h"
+#include "nir_builtin_builder.h"
+
+#include "u_memory.h"
+
+union state_key {
+   struct {
+      unsigned num_user_clip_planes:4;
+      unsigned face_culling_enabled:1;
+      unsigned result_offset_from_attribute:1;
+      unsigned primitive:4;
+   };
+   uint32_t u32;
+};
+
+enum primitive_state {
+   HW_SELECT_PRIM_NONE,
+   HW_SELECT_PRIM_POINTS,
+   HW_SELECT_PRIM_LINES,
+   HW_SELECT_PRIM_TRIANGLES,
+   HW_SELECT_PRIM_QUADS,
+};
+
+struct geometry_constant {
+   float depth_scale;
+   float depth_transport;
+   uint32_t culling_config;
+   uint32_t result_offset;
+   float clip_planes[MAX_CLIP_PLANES][4];
+};
+
+#define set_uniform_location(var, field, packed)                 \
+   do {                                                          \
+      unsigned offset = Offset(struct geometry_constant, field); \
+      var->data.driver_location = offset >> (packed ? 2 : 4);    \
+      var->data.location_frac = (offset >> 2) & 0x3;             \
+   } while (0)
+
+static nir_ssa_def *
+has_nan_or_inf(nir_builder *b, nir_ssa_def *v)
+{
+   nir_ssa_def *nan = nir_bany_fnequal4(b, v, v);
+
+   nir_ssa_def *imm = nir_imm_float(b, INFINITY);
+   nir_ssa_def *inf = nir_bany(b, nir_feq(b, nir_fabs(b, v), imm));
+
+   return nir_ior(b, nan, inf);
+}
+
+static void
+return_if_true(nir_builder *b, nir_ssa_def *cond)
+{
+   nir_if *if_cond = nir_push_if(b, cond);
+   nir_jump(b, nir_jump_return);
+   nir_pop_if(b, if_cond);
+}
+
+static void
+get_input_vertices(nir_builder *b, nir_ssa_def **v)
+{
+   const int num_in_vert = b->shader->info.gs.vertices_in;
+
+   nir_variable *in_pos = nir_variable_create(
+      b->shader, nir_var_shader_in, glsl_array_type(glsl_vec4_type(), num_in_vert, 0),
+      "gl_Position");
+   in_pos->data.location = VARYING_SLOT_POS;
+
+   nir_ssa_def *is_nan_or_inf = NULL;
+   for (int i = 0; i < num_in_vert; i++) {
+      v[i] = nir_load_array_var_imm(b, in_pos, i);
+      nir_ssa_def *r = has_nan_or_inf(b, v[i]);
+      is_nan_or_inf = i ? nir_ior(b, is_nan_or_inf, r) : r;
+   }
+   return_if_true(b, is_nan_or_inf);
+}
+
+static void
+face_culling(nir_builder *b, nir_ssa_def **v, bool packed)
+{
+   /* use the z value of the face normal to determine if the face points to us:
+    *   Nz = (x1 - x0) * (y2 - y0) - (y1 - y0) * (x2 - x0)
+    *
+    * it should be in NDC (Normalized Device Coordinate), but now we are in clip
+    * space (Vd = Vc / Vc.w), so multiply Nz with w0*w1*w2 to get the clip space
+    * value:
+    *   det = x0 * (y1 * w2 - y2 * w1) +
+    *         x1 * (y2 * w0 - y0 * w2) +
+    *         x2 * (y0 * w1 - y1 * w0)
+    *
+    * we only care about the sign of the det, but also need to count the sign of
+    * w0/w1/w2 as a negative w would change the direction of Nz < 0
+    */
+   nir_ssa_def *y1w2 = nir_fmul(b, nir_channel(b, v[1], 1), nir_channel(b, v[2], 3));
+   nir_ssa_def *y2w1 = nir_fmul(b, nir_channel(b, v[2], 1), nir_channel(b, v[1], 3));
+   nir_ssa_def *y2w0 = nir_fmul(b, nir_channel(b, v[2], 1), nir_channel(b, v[0], 3));
+   nir_ssa_def *y0w2 = nir_fmul(b, nir_channel(b, v[0], 1), nir_channel(b, v[2], 3));
+   nir_ssa_def *y0w1 = nir_fmul(b, nir_channel(b, v[0], 1), nir_channel(b, v[1], 3));
+   nir_ssa_def *y1w0 = nir_fmul(b, nir_channel(b, v[1], 1), nir_channel(b, v[0], 3));
+   nir_ssa_def *t0 = nir_fmul(b, nir_channel(b, v[0], 0), nir_fsub(b, y1w2, y2w1));
+   nir_ssa_def *t1 = nir_fmul(b, nir_channel(b, v[1], 0), nir_fsub(b, y2w0, y0w2));
+   nir_ssa_def *t2 = nir_fmul(b, nir_channel(b, v[2], 0), nir_fsub(b, y0w1, y1w0));
+   nir_ssa_def *det = nir_fadd(b, nir_fadd(b, t0, t1), t2);
+
+   /* invert det sign once any vertex w < 0 */
+   nir_ssa_def *n0 = nir_flt(b, nir_channel(b, v[0], 3), nir_imm_float(b, 0));
+   nir_ssa_def *n1 = nir_flt(b, nir_channel(b, v[1], 3), nir_imm_float(b, 0));
+   nir_ssa_def *n2 = nir_flt(b, nir_channel(b, v[2], 3), nir_imm_float(b, 0));
+   nir_ssa_def *cond = nir_ixor(b, nir_ixor(b, n0, n1), n2);
+   det = nir_bcsel(b, cond, nir_fneg(b, det), det);
+
+   nir_variable *culling_config = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_uint_type(), "culling_config");
+   set_uniform_location(culling_config, culling_config, packed);
+   nir_ssa_def *config = nir_i2b(b, nir_load_var(b, culling_config));
+
+   /* det < 0 then z points to camera */
+   nir_ssa_def *zero = nir_imm_zero(b, 1, det->bit_size);
+   nir_ssa_def *is_zero = nir_feq(b, det, zero);
+   nir_ssa_def *is_neg = nir_flt(b, det, zero);
+   nir_ssa_def *cull = nir_ixor(b, is_neg, config);
+   return_if_true(b, nir_ior(b, is_zero, cull));
+}
+
+static void
+fast_frustum_culling(nir_builder *b, nir_ssa_def **v)
+{
+   nir_ssa_def *cull = NULL;
+
+   /* there are six culling planes for the visible volume:
+    *   1.  x + w = 0
+    *   2. -x + w = 0
+    *   3.  y + w = 0
+    *   4. -y + w = 0
+    *   5.  z + w = 0
+    *   6. -z + w = 0
+    *
+    * if all vertices of the primitive are outside (plane equation <0) of
+    * any plane, the primitive must be invisible.
+    */
+   for (int i = 0; i < 6; i++) {
+      nir_ssa_def *outside = NULL;
+
+      for (int j = 0; j < b->shader->info.gs.vertices_in; j++) {
+         nir_ssa_def *c = nir_channel(b, v[j], i >> 1);
+         if (i & 1)
+            c = nir_fneg(b, c);
+
+         nir_ssa_def *r = nir_flt(b, nir_channel(b, v[j], 3), c);
+         outside = j ? nir_iand(b, outside, r) : r;
+      }
+
+      cull = i ? nir_ior(b, cull, outside) : outside;
+   }
+
+   return_if_true(b, cull);
+}
+
+static nir_ssa_def *
+get_intersection(nir_builder *b, nir_ssa_def *v1, nir_ssa_def *v2,
+                 nir_ssa_def *d1, nir_ssa_def *d2)
+{
+   nir_ssa_def *factor = nir_fdiv(b, d1, nir_fsub(b, d1, d2));
+   return nir_fmad(b, nir_fsub(b, v2, v1), factor, v1);
+}
+
+#define begin_for_loop(name, max)                                       \
+   nir_variable *name##_index =                                         \
+      nir_local_variable_create(b->impl, glsl_int_type(), #name "_i");  \
+   nir_store_var(b, name##_index, nir_imm_int(b, 0), 1);                \
+                                                                        \
+   nir_loop *name = nir_push_loop(b);                                   \
+   {                                                                    \
+      nir_ssa_def *idx = nir_load_var(b, name##_index);                 \
+      nir_if *if_in_loop = nir_push_if(b, nir_ilt(b, idx, max));
+
+#define end_for_loop(name)                                              \
+         nir_store_var(b, name##_index, nir_iadd_imm(b, idx, 1), 1);    \
+      nir_push_else(b, if_in_loop);                                     \
+         nir_jump(b, nir_jump_break);                                   \
+      nir_pop_if(b, if_in_loop);                                        \
+   }                                                                    \
+   nir_pop_loop(b, name);
+
+static void
+clip_with_plane(nir_builder *b, nir_variable *vert, nir_variable *num_vert,
+                int max_vert, nir_ssa_def *plane)
+{
+   nir_variable *all_clipped = nir_local_variable_create(
+      b->impl, glsl_bool_type(), "all_clipped");
+   nir_store_var(b, all_clipped, nir_imm_true(b), 1);
+
+   nir_variable *dist = nir_local_variable_create(
+      b->impl, glsl_array_type(glsl_float_type(), max_vert, 0), "dist");
+
+   nir_ssa_def *num = nir_load_var(b, num_vert);
+   begin_for_loop(dist_loop, num)
+   {
+      nir_ssa_def *v = nir_load_array_var(b, vert, idx);
+      nir_ssa_def *d = nir_fdot(b, v, plane);
+      nir_store_array_var(b, dist, idx, d, 1);
+
+      nir_ssa_def *clipped = nir_flt(b, d, nir_imm_float(b, 0));
+      nir_store_var(b, all_clipped,
+                    nir_iand(b, nir_load_var(b, all_clipped), clipped), 1);
+   }
+   end_for_loop(dist_loop)
+
+   return_if_true(b, nir_load_var(b, all_clipped));
+
+   /* Use +/0/- to denote the dist[i] sign, which means:
+    * +: inside plane
+    * -: outside plane
+    * 0: just on the plane
+    *
+    * Some example:
+    * ++++: all vertex not clipped
+    * ----: all vertex clipped
+    * +-++: one vertex clipped, need to insert two vertex at '-', array grow
+    * +--+: two vertex clipped, need to insert two vertex at '--', array same
+    * +---: three vertex clipped, need to insert two vertex at '---', array trim
+    * +-0+: one vertex clipped, need to insert one vertex at '-', array same
+    *
+    * Plane clip only produce convex polygon, so '-' must be contiguous, there's
+    * no '+-+-', so one clip plane can only grow array by 1.
+    */
+
+   /* when array grow or '-' has been replaced with inserted vertex, save the
+    * original vert to be used by following calculation.
+    */
+   nir_variable *saved =
+      nir_local_variable_create(b->impl, glsl_vec4_type(), "saved");
+
+   nir_variable *vert_index =
+      nir_local_variable_create(b->impl, glsl_int_type(), "vert_index");
+   nir_store_var(b, vert_index, nir_imm_int(b, 0), 1);
+
+   begin_for_loop(vert_loop, num)
+   {
+      nir_ssa_def *di = nir_load_array_var(b, dist, idx);
+      nir_if *if_clipped = nir_push_if(b, nir_flt(b, di, nir_imm_float(b, 0)));
+      {
+         /* - case, we need to take care of sign change and insert vertex */
+
+         nir_ssa_def *prev = nir_bcsel(b, nir_ieq_imm(b, idx, 0),
+                                       nir_iadd_imm(b, num, -1),
+                                       nir_iadd_imm(b, idx, -1));
+         nir_ssa_def *dp = nir_load_array_var(b, dist, prev);
+         nir_if *prev_if = nir_push_if(b, nir_flt(b, nir_imm_float(b, 0), dp));
+         {
+            /* +- case, replace - with inserted vertex
+             * assert(vert_index <= idx), array is sure to not grow here
+             * but need to save vert[idx] when vert_index==idx
+             */
+
+            nir_ssa_def *vi = nir_load_array_var(b, vert, idx);
+            nir_store_var(b, saved, vi, 0xf);
+
+            nir_ssa_def *vp = nir_load_array_var(b, vert, prev);
+            nir_ssa_def *iv = get_intersection(b, vp, vi, dp, di);
+            nir_ssa_def *index = nir_load_var(b, vert_index);
+            nir_store_array_var(b, vert, index, iv, 0xf);
+
+            nir_store_var(b, vert_index, nir_iadd_imm(b, index, 1), 1);
+         }
+         nir_pop_if(b, prev_if);
+
+         nir_ssa_def *next = nir_bcsel(b, nir_ieq(b, idx, nir_iadd_imm(b, num, -1)),
+                                       nir_imm_int(b, 0), nir_iadd_imm(b, idx, 1));
+         nir_ssa_def *dn = nir_load_array_var(b, dist, next);
+         nir_if *next_if = nir_push_if(b, nir_flt(b, nir_imm_float(b, 0), dn));
+         {
+            /* -+ case, may grow array:
+             *   vert_index > idx: +-+ case, grow array, current vertex in 'saved',
+             *     save next + to 'saved', will replace it with inserted vertex.
+             *   vert_index <= idx: --+ case, will replace last - with inserted vertex,
+             *     no need to save last -, because + case won't use - value.
+             */
+
+            nir_ssa_def *index = nir_load_var(b, vert_index);
+            nir_ssa_def *vi = nir_bcsel(b, nir_flt(b, idx, index),
+                                        nir_load_var(b, saved),
+                                        nir_load_array_var(b, vert, idx));
+            nir_ssa_def *vn = nir_load_array_var(b, vert, next);
+            nir_ssa_def *iv = get_intersection(b, vn, vi, dn, di);
+
+            nir_store_var(b, saved, nir_load_array_var(b, vert, index), 0xf);
+            nir_store_array_var(b, vert, index, iv, 0xf);
+
+            nir_store_var(b, vert_index, nir_iadd_imm(b, index, 1), 1);
+         }
+         nir_pop_if(b, next_if);
+      }
+      nir_push_else(b, if_clipped);
+      {
+         /* +/0 case, just keep the vert
+          *   vert_index > idx: array grew case, vert[idx] is inserted vertex or prev
+          *     +/0 vertex, current vertex is in 'saved', need to save next vertex
+          *   vert_index < idx: array trim case
+          */
+
+         nir_ssa_def *index = nir_load_var(b, vert_index);
+         nir_ssa_def *vi = nir_bcsel(b, nir_flt(b, idx, index),
+                                     nir_load_var(b, saved),
+                                     nir_load_array_var(b, vert, idx));
+
+         nir_store_var(b, saved, nir_load_array_var(b, vert, index), 0xf);
+         nir_store_array_var(b, vert, index, vi, 0xf);
+
+         nir_store_var(b, vert_index, nir_iadd_imm(b, index, 1), 1);
+      }
+      nir_pop_if(b, if_clipped);
+   }
+   end_for_loop(vert_loop);
+
+   nir_copy_var(b, num_vert, vert_index);
+}
+
+static nir_ssa_def *
+get_user_clip_plane(nir_builder *b, int index, bool packed)
+{
+   char name[16];
+   snprintf(name, sizeof(name), "gl_ClipPlane%d", index);
+   nir_variable *plane = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_vec4_type(), name);
+
+   set_uniform_location(plane, clip_planes[index][0], packed);
+
+   return nir_load_var(b, plane);
+}
+
+static void
+get_depth_range_transform(nir_builder *b, bool packed, nir_ssa_def **trans)
+{
+   nir_variable *depth_scale = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_float_type(), "depth_scale");
+   set_uniform_location(depth_scale, depth_scale, packed);
+
+   nir_variable *depth_transport = nir_variable_create(
+      b->shader, nir_var_uniform, glsl_float_type(), "depth_transport");
+   set_uniform_location(depth_transport, depth_transport, packed);
+
+   trans[0] = nir_load_var(b, depth_scale);
+   trans[1] = nir_load_var(b, depth_transport);
+}
+
+static nir_ssa_def *
+get_window_space_depth(nir_builder *b, nir_ssa_def *v, nir_ssa_def **trans)
+{
+   nir_ssa_def *z = nir_channel(b, v, 2);
+   nir_ssa_def *w = nir_channel(b, v, 3);
+
+   /* do perspective division, if w==0, xyz must be 0 too (otherwise can't pass
+    * the clip test), 0/0=NaN, but we want it to be the nearest point.
+    */
+   nir_ssa_def *c = nir_feq(b, w, nir_imm_float(b, 0));
+   nir_ssa_def *d = nir_bcsel(b, c, nir_imm_float(b, -1), nir_fdiv(b, z, w));
+
+   /* map [-1, 1] to [near, far] set by glDepthRange(near, far) */
+   return nir_fmad(b, trans[0], d, trans[1]);
+}
+
+static void
+update_result_buffer(nir_builder *b, nir_ssa_def *dmin, nir_ssa_def *dmax,
+                     bool offset_from_attribute, bool packed)
+{
+   nir_ssa_def *offset;
+   if (offset_from_attribute) {
+      nir_variable *in_offset = nir_variable_create(
+         b->shader, nir_var_shader_in,
+         glsl_array_type(glsl_uint_type(), b->shader->info.gs.vertices_in, 0),
+         "result_offset");
+      in_offset->data.location = VARYING_SLOT_VAR0;
+      offset = nir_load_array_var_imm(b, in_offset, 0);
+   } else {
+      nir_variable *uni_offset = nir_variable_create(
+         b->shader, nir_var_uniform, glsl_uint_type(), "result_offset");
+      set_uniform_location(uni_offset, result_offset, packed);
+      offset = nir_load_var(b, uni_offset);
+   }
+
+   nir_variable_create(b->shader, nir_var_mem_ssbo,
+                       glsl_array_type(glsl_uint_type(), 0, 0), "result");
+   /* driver_location = 0 (slot 0) */
+
+   nir_ssa_def *ssbo = nir_imm_int(b, 0);
+   nir_ssbo_atomic_exchange(b, 32, ssbo, offset, nir_imm_int(b, 1));
+   nir_ssbo_atomic_umin(b, 32, ssbo, nir_iadd_imm(b, offset, 4), dmin);
+   nir_ssbo_atomic_umax(b, 32, ssbo, nir_iadd_imm(b, offset, 8), dmax);
+}
+
+static void
+build_point_nir_shader(nir_builder *b, union state_key state, bool packed)
+{
+   assert(b->shader->info.gs.vertices_in == 1);
+
+   nir_ssa_def *v;
+   get_input_vertices(b, &v);
+
+   fast_frustum_culling(b, &v);
+
+   nir_ssa_def *outside = NULL;
+   for (int i = 0; i < state.num_user_clip_planes; i++) {
+      nir_ssa_def *p = get_user_clip_plane(b, i, packed);
+      nir_ssa_def *d = nir_fdot(b, v, p);
+      nir_ssa_def *r = nir_flt(b, d, nir_imm_float(b, 0));
+      outside = i ? nir_ior(b, outside, r) : r;
+   }
+   if (outside)
+      return_if_true(b, outside);
+
+   nir_ssa_def *trans[2];
+   get_depth_range_transform(b, packed, trans);
+
+   nir_ssa_def *depth = get_window_space_depth(b, v, trans);
+   nir_ssa_def *fdepth = nir_fmul_imm(b, depth, 4294967295.0);
+   nir_ssa_def *idepth = nir_f2uN(b, fdepth, 32);
+
+   update_result_buffer(b, idepth, idepth, state.result_offset_from_attribute, packed);
+}
+
+static nir_variable *
+create_clip_planes(nir_builder *b, int num_clip_planes, bool packed)
+{
+   nir_variable *clip_planes = nir_local_variable_create(
+      b->impl, glsl_array_type(glsl_vec4_type(), num_clip_planes, 0), "clip_planes");
+
+   nir_ssa_def *unit_clip_planes[6] = {
+      nir_imm_vec4(b,  1,  0,  0,  1),
+      nir_imm_vec4(b, -1,  0,  0,  1),
+      nir_imm_vec4(b,  0,  1,  0,  1),
+      nir_imm_vec4(b,  0, -1,  0,  1),
+      nir_imm_vec4(b,  0,  0,  1,  1),
+      nir_imm_vec4(b,  0,  0, -1,  1),
+   };
+   for (int i = 0; i < 6; i++)
+      nir_store_array_var_imm(b, clip_planes, i, unit_clip_planes[i], 0xf);
+
+   for (int i = 6; i < num_clip_planes; i++) {
+      nir_ssa_def *p = get_user_clip_plane(b, i - 6, packed);
+      nir_store_array_var_imm(b, clip_planes, i, p, 0xf);
+   }
+
+   return clip_planes;
+}
+
+static void
+build_line_nir_shader(nir_builder *b, union state_key state, bool packed)
+{
+   assert(b->shader->info.gs.vertices_in == 2);
+
+   nir_ssa_def *v[2];
+   get_input_vertices(b, v);
+
+   fast_frustum_culling(b, v);
+
+   nir_variable *vert0 = nir_local_variable_create(b->impl, glsl_vec4_type(), "vert0");
+   nir_store_var(b, vert0, v[0], 0xf);
+
+   nir_variable *vert1 = nir_local_variable_create(b->impl, glsl_vec4_type(), "vert1");
+   nir_store_var(b, vert1, v[1], 0xf);
+
+   const int num_clip_planes = 6 + state.num_user_clip_planes;
+   nir_variable *clip_planes = create_clip_planes(b, num_clip_planes, packed);
+
+   begin_for_loop(clip_loop, nir_imm_int(b, num_clip_planes))
+   {
+      nir_ssa_def *plane = nir_load_array_var(b, clip_planes, idx);
+      nir_ssa_def *v0 = nir_load_var(b, vert0);
+      nir_ssa_def *v1 = nir_load_var(b, vert1);
+      nir_ssa_def *d0 = nir_fdot(b, v0, plane);
+      nir_ssa_def *d1 = nir_fdot(b, v1, plane);
+      nir_ssa_def *n0 = nir_flt(b, d0, nir_imm_float(b, 0));
+      nir_ssa_def *n1 = nir_flt(b, d1, nir_imm_float(b, 0));
+
+      return_if_true(b, nir_iand(b, n0, n1));
+
+      nir_if *clip_if = nir_push_if(b, nir_ior(b, n0, n1));
+      {
+         nir_ssa_def *iv = get_intersection(b, v0, v1, d0, d1);
+         nir_store_var(b, vert0, nir_bcsel(b, n0, iv, v0), 0xf);
+         nir_store_var(b, vert1, nir_bcsel(b, n1, iv, v1), 0xf);
+      }
+      nir_pop_if(b, clip_if);
+   }
+   end_for_loop(clip_loop)
+
+   nir_ssa_def *trans[2];
+   get_depth_range_transform(b, packed, trans);
+
+   nir_ssa_def *d0 = get_window_space_depth(b, nir_load_var(b, vert0), trans);
+   nir_ssa_def *d1 = get_window_space_depth(b, nir_load_var(b, vert1), trans);
+
+   nir_ssa_def *dmin = nir_fmin(b, d0, d1);
+   nir_ssa_def *dmax = nir_fmax(b, d0, d1);
+
+   nir_ssa_def *fdmin = nir_fmul_imm(b, dmin, 4294967295.0);
+   nir_ssa_def *idmin = nir_f2uN(b, fdmin, 32);
+
+   nir_ssa_def *fdmax = nir_fmul_imm(b, dmax, 4294967295.0);
+   nir_ssa_def *idmax = nir_f2uN(b, fdmax, 32);
+
+   update_result_buffer(b, idmin, idmax, state.result_offset_from_attribute, packed);
+}
+
+static void
+build_planar_primitive_nir_shader(nir_builder *b, union state_key state, bool packed)
+{
+   const int num_in_vert = b->shader->info.gs.vertices_in;
+   assert(num_in_vert == 3 || num_in_vert == 4);
+
+   nir_ssa_def *v[4];
+   get_input_vertices(b, v);
+
+   if (state.face_culling_enabled)
+      face_culling(b, v, packed);
+
+   /* fast frustum culling, this should filter out most primitives */
+   fast_frustum_culling(b, v);
+
+   const int num_clip_planes = 6 + state.num_user_clip_planes;
+   const int max_vert = num_in_vert + num_clip_planes;
+
+   /* TODO: could use shared memory (ie. AMD GPU LDS) for this array
+    * to reduce register usage.
+    */
+   nir_variable *vert = nir_local_variable_create(
+      b->impl, glsl_array_type(glsl_vec4_type(), max_vert, 0), "vert");
+   for (int i = 0; i < num_in_vert; i++)
+      nir_store_array_var_imm(b, vert, i, v[i], 0xf);
+
+   nir_variable *num_vert =
+      nir_local_variable_create(b->impl, glsl_int_type(), "num_vert");
+   nir_store_var(b, num_vert, nir_imm_int(b, num_in_vert), 1);
+
+   nir_variable *clip_planes = create_clip_planes(b, num_clip_planes, packed);
+
+   /* accurate clipping with all clip planes */
+   begin_for_loop(clip_loop, nir_imm_int(b, num_clip_planes))
+   {
+      nir_ssa_def *plane = nir_load_array_var(b, clip_planes, idx);
+      clip_with_plane(b, vert, num_vert, max_vert, plane);
+   }
+   end_for_loop(clip_loop)
+
+   nir_ssa_def *trans[2];
+   get_depth_range_transform(b, packed, trans);
+
+   nir_variable *dmin =
+      nir_local_variable_create(b->impl, glsl_float_type(), "dmin");
+   nir_store_var(b, dmin, nir_imm_float(b, 1), 1);
+
+   nir_variable *dmax =
+      nir_local_variable_create(b->impl, glsl_float_type(), "dmax");
+   nir_store_var(b, dmax, nir_imm_float(b, 0), 1);
+
+   begin_for_loop(depth_loop, nir_load_var(b, num_vert))
+   {
+      nir_ssa_def *vtx = nir_load_array_var(b, vert, idx);
+      nir_ssa_def *depth = get_window_space_depth(b, vtx, trans);
+      nir_store_var(b, dmin, nir_fmin(b, nir_load_var(b, dmin), depth), 1);
+      nir_store_var(b, dmax, nir_fmax(b, nir_load_var(b, dmax), depth), 1);
+   }
+   end_for_loop(depth_loop)
+
+   nir_ssa_def *fdmin = nir_fmul_imm(b, nir_load_var(b, dmin), 4294967295.0);
+   nir_ssa_def *idmin = nir_f2uN(b, fdmin, 32);
+
+   nir_ssa_def *fdmax = nir_fmul_imm(b, nir_load_var(b, dmax), 4294967295.0);
+   nir_ssa_def *idmax = nir_f2uN(b, fdmax, 32);
+
+   update_result_buffer(b, idmin, idmax, state.result_offset_from_attribute, packed);
+}
+
+/**
+ * Build the internal geometry shader that implements hardware GL_SELECT for
+ * one state key.
+ *
+ * The shader is declared as reading VARYING_BIT_POS (plus VARYING_BIT_VAR(0)
+ * when state.result_offset_from_attribute is set), using one SSBO, and
+ * emitting at most one point. The input primitive and the shader body are
+ * chosen from state.primitive.
+ *
+ * Returns the value produced by st_nir_finish_builtin_shader().
+ */
+static void *
+hw_select_create_gs(struct st_context *st, union state_key state)
+{
+   const bool packed = st->ctx->Const.PackedDriverUniformStorage;
+
+   nir_builder b = nir_builder_init_simple_shader(
+      MESA_SHADER_GEOMETRY,
+      st_get_nir_compiler_options(st, MESA_SHADER_GEOMETRY),
+      "hw select GS");
+   nir_shader *shader = b.shader;
+
+   /* Inputs and resources. */
+   shader->info.inputs_read = VARYING_BIT_POS;
+   if (state.result_offset_from_attribute)
+      shader->info.inputs_read |= VARYING_BIT_VAR(0);
+   shader->info.num_ssbos = 1;
+
+   /* Output side is the same for every variant. */
+   shader->info.gs.output_primitive = SHADER_PRIM_POINTS;
+   shader->info.gs.vertices_out = 1;
+   shader->info.gs.invocations = 1;
+   shader->info.gs.active_stream_mask = 1;
+
+   /* Input side depends on the primitive class stored in the key. It is
+    * filled in before the shader body is emitted.
+    */
+   switch (state.primitive) {
+   case HW_SELECT_PRIM_POINTS:
+      shader->info.gs.input_primitive = SHADER_PRIM_POINTS;
+      shader->info.gs.vertices_in = 1;
+      break;
+   case HW_SELECT_PRIM_LINES:
+      shader->info.gs.input_primitive = SHADER_PRIM_LINES;
+      shader->info.gs.vertices_in = 2;
+      break;
+   case HW_SELECT_PRIM_TRIANGLES:
+      shader->info.gs.input_primitive = SHADER_PRIM_TRIANGLES;
+      shader->info.gs.vertices_in = 3;
+      break;
+   case HW_SELECT_PRIM_QUADS:
+      /* geometry shader has no quad primitive, use lines_adjacency instead */
+      shader->info.gs.input_primitive = SHADER_PRIM_LINES_ADJACENCY;
+      shader->info.gs.vertices_in = 4;
+      break;
+   default:
+      unreachable("unexpected primitive");
+   }
+
+   /* Emit the body: points and lines have dedicated builders, triangles and
+    * quads share the planar one.
+    */
+   if (state.primitive == HW_SELECT_PRIM_POINTS)
+      build_point_nir_shader(&b, state, packed);
+   else if (state.primitive == HW_SELECT_PRIM_LINES)
+      build_line_nir_shader(&b, state, packed);
+   else
+      build_planar_primitive_nir_shader(&b, state, packed);
+
+   nir_lower_returns(shader);
+
+   return st_nir_finish_builtin_shader(st, shader);
+}
+
+/**
+ * Bind the state shared by hardware GL_SELECT draws to the geometry stage:
+ * the constants read by the internal geometry shader (constant buffer 0) and
+ * the buffer from ctx->Select.Result (shader buffer 0).
+ *
+ * Returns false, after printing a message to stderr, when st->gp, st->tcp or
+ * st->tep is set; returns true otherwise.
+ */
+bool
+st_draw_hw_select_prepare_common(struct gl_context *ctx)
+{
+   struct st_context *st = st_context(ctx);
+
+   if (st->gp || st->tcp || st->tep) {
+      fprintf(stderr, "HW GL_SELECT does not support user geometry/tessellation shader\n");
+      return false;
+   }
+
+   struct pipe_context *pipe = st->pipe;
+   struct geometry_constant consts;
+
+   /* Depth terms computed from viewport 0's near/far values. */
+   const float z_near = ctx->ViewportArray[0].Near;
+   const float z_far = ctx->ViewportArray[0].Far;
+   consts.depth_scale = (z_far - z_near) / 2;
+   consts.depth_transport = (z_far + z_near) / 2;
+
+   /* this field is not used when face culling disabled */
+   const bool cull_back = ctx->Polygon.CullFaceMode == GL_BACK;
+   const bool front_ccw = ctx->Polygon.FrontFace == GL_CCW;
+   consts.culling_config = cull_back != front_ccw;
+
+   /* this field is not used when passing result offset by attribute */
+   consts.result_offset = st->ctx->Select.ResultOffset;
+
+   /* Pack the enabled user clip planes contiguously at the front of the
+    * array.
+    */
+   int plane_count = 0;
+   u_foreach_bit(bit, ctx->Transform.ClipPlanesEnabled) {
+      COPY_4V(consts.clip_planes[plane_count], ctx->Transform._ClipUserPlane[bit]);
+      plane_count++;
+   }
+
+   /* Upload only up to the last plane that was filled in; the unused tail of
+    * clip_planes[] is left out of the buffer size.
+    */
+   struct pipe_constant_buffer cb = {
+      .buffer = NULL,
+      .user_buffer = &consts,
+      .buffer_offset = 0,
+      .buffer_size = sizeof(consts) - (MAX_CLIP_PLANES - plane_count) * 4 * sizeof(float),
+   };
+   pipe->set_constant_buffer(pipe, PIPE_SHADER_GEOMETRY, 0, false, &cb);
+
+   struct pipe_shader_buffer result_ssbo;
+   memset(&result_ssbo, 0, sizeof(result_ssbo));
+   result_ssbo.buffer = ctx->Select.Result->buffer;
+   result_ssbo.buffer_size = MAX_NAME_STACK_RESULT_NUM * 3 * sizeof(int);
+
+   pipe->set_shader_buffers(pipe, PIPE_SHADER_GEOMETRY, 0, 1, &result_ssbo, 0x1);
+
+   return true;
+}
+
+/**
+ * Derive the geometry shader variant key for a draw of primitive type `mode`
+ * from the current GL state.
+ *
+ * Returns an all-zero key, after printing a message to stderr, when the draw
+ * mode is not handled or when the current vertex program has a non-zero
+ * clip/cull distance array size.
+ */
+static union state_key
+make_state_key(struct gl_context *ctx, int mode)
+{
+   const union state_key unsupported = {0};
+   union state_key key = {0};
+
+   /* Map the GL draw mode onto one of the shader's primitive classes. */
+   switch (mode) {
+   case GL_POINTS:
+      key.primitive = HW_SELECT_PRIM_POINTS;
+      break;
+
+   case GL_LINES:
+   case GL_LINE_STRIP:
+   case GL_LINE_LOOP:
+      key.primitive = HW_SELECT_PRIM_LINES;
+      break;
+
+   case GL_TRIANGLES:
+   case GL_TRIANGLE_STRIP:
+   case GL_TRIANGLE_FAN:
+   /* These will be broken into triangles. */
+   case GL_QUAD_STRIP:
+   case GL_POLYGON:
+      key.primitive = HW_SELECT_PRIM_TRIANGLES;
+      break;
+
+   case GL_QUADS:
+      key.primitive = HW_SELECT_PRIM_QUADS;
+      break;
+
+   default:
+      fprintf(stderr, "HW GL_SELECT does not support draw mode %s\n",
+              _mesa_enum_to_string(mode));
+      return unsupported;
+   }
+
+   /* TODO: support gl_ClipDistance/gl_CullDistance, but it costs more regs */
+   const struct gl_program *vp = ctx->st->vp;
+   if (vp->info.clip_distance_array_size || vp->info.cull_distance_array_size) {
+      fprintf(stderr, "HW GL_SELECT does not support gl_ClipDistance/gl_CullDistance\n");
+      return unsupported;
+   }
+
+   key.num_user_clip_planes = util_bitcount(ctx->Transform.ClipPlanesEnabled);
+
+   /* face culling only apply to 2D primitives */
+   const bool planar = key.primitive == HW_SELECT_PRIM_QUADS ||
+                       key.primitive == HW_SELECT_PRIM_TRIANGLES;
+   if (planar)
+      key.face_culling_enabled = ctx->Polygon.CullFlag;
+
+   /* The result offset comes from a vertex attribute only for fixed-function
+    * vertex processing with the select-result-offset input present.
+    */
+   const bool ff_with_offset =
+      ctx->VertexProgram._VPMode == VP_MODE_FF &&
+      (ctx->VertexProgram._VaryingInputs & VERT_BIT_SELECT_RESULT_OFFSET);
+   key.result_offset_from_attribute = ff_with_offset;
+
+   return key;
+}
+
+/**
+ * Select (creating and caching on first use) the internal geometry shader
+ * matching the current state and info->mode, bind it through the CSO context,
+ * and rewrite info->mode to a primitive type the geometry shader accepts.
+ *
+ * Returns false when make_state_key() yields an all-zero key, when the shader
+ * cache table cannot be created, or when the shader cannot be created;
+ * returns true otherwise. On failure info->mode is left untouched.
+ */
+bool
+st_draw_hw_select_prepare_mode(struct gl_context *ctx, struct pipe_draw_info *info)
+{
+   union state_key key = make_state_key(ctx, info->mode);
+   if (!key.u32)
+      return false;
+
+   struct st_context *st = st_context(ctx);
+   if (!st->hw_select_shaders) {
+      st->hw_select_shaders = _mesa_hash_table_create_u32_keys(NULL);
+      /* Table creation can fail; bail out instead of searching a NULL
+       * table below.
+       */
+      if (!st->hw_select_shaders)
+         return false;
+   }
+
+   /* The 32-bit key value itself is used as the table key. */
+   void *hash_key = (void*)(uintptr_t)key.u32;
+
+   struct hash_entry *he = _mesa_hash_table_search(st->hw_select_shaders,
+                                                   hash_key);
+   void *gs;
+   if (!he) {
+      gs = hw_select_create_gs(st, key);
+      if (!gs)
+         return false;
+
+      _mesa_hash_table_insert(st->hw_select_shaders, hash_key, gs);
+   } else {
+      gs = he->data;
+   }
+
+   struct cso_context *cso = st->cso_context;
+   cso_set_geometry_shader_handle(cso, gs);
+
+   /* Replace the draw mode with an equivalent one which the geometry shader
+    * supports.
+    *
+    * The new mode consumes the same vertex buffer structure and produces
+    * primitives with the same vertices (they need not be the same type of
+    * primitive, because the geometry shader operates on vertices and emits
+    * nothing).
+    *
+    * We can break QUAD and POLYGON into triangles with the same shape. But we
+    * can't further break them into single lines or points, because the new
+    * primitive needs to contain >=3 vertices so that it's still handled in the
+    * 2D (planar) way instead of the 1D (line) or 0D (point) way, which use
+    * different algorithms.
+    */
+   switch (info->mode) {
+   case GL_QUADS:
+      info->mode = GL_LINES_ADJACENCY;
+      break;
+   case GL_QUAD_STRIP:
+      info->mode = GL_TRIANGLE_STRIP;
+      break;
+   case GL_POLYGON:
+      info->mode = GL_TRIANGLE_FAN;
+      break;
+   default:
+      break;
+   }
+
+   /* Only normal glBegin/End draws pass the result offset by attribute, to
+    * avoid flushing vertices when the name stack changes, so multiple
+    * glBegin/End sections before/after name stack calls can be merged into a
+    * single draw call. To achieve this we mark the name stack result buffer
+    * as used in glEnd instead of in the last draw call.
+    *
+    * Other cases like glDrawArrays and display list replay won't merge draws
+    * across name stack calls, so we just mark the name stack result buffer as
+    * used here.
+    */
+   if (!key.result_offset_from_attribute)
+      ctx->Select.ResultUsed = GL_TRUE;
+
+   return true;
+}