[Mesa-dev] [PATCH 19/22] anv/nir: add support for dvec3/4 consuming two locations

Fri Nov 25 08:52:49 UTC 2016

One difference between OpenGL and Vulkan regarding 64-bit vertex
attribute types is that dvec3 and dvec4 consumes just one location in
OpenGL, while in Vulkan it consumes two locations.

Thus, in OpenGL for each dvec3/dvec4 vertex attrib we mark just one bit
in our internal inputs_read bitmap (and also the corresponding bit in
double_inputs_read bitmap) while in Vulkan we mark two consecutive bits
in both bitmaps.

This is handled with a nir option called "dvec3_consumes_two_locations",
which is set to True for Vulkan code. And all the computation regarding
emitting vertices as well as the mapping between attributes and physical
registers use this option to correctly do the work.
---
 src/amd/vulkan/radv_pipeline.c               |  1 +
 src/compiler/nir/nir.h                       |  5 +++
 src/compiler/nir/nir_gather_info.c           |  6 +--
 src/gallium/drivers/freedreno/ir3/ir3_nir.c  |  1 +
 src/intel/vulkan/anv_device.c                |  2 +-
 src/intel/vulkan/genX_pipeline.c             | 62 +++++++++++++++++-----------
 src/mesa/drivers/dri/i965/brw_compiler.c     | 23 ++++++++++-
 src/mesa/drivers/dri/i965/brw_compiler.h     |  2 +-
 src/mesa/drivers/dri/i965/brw_fs_visitor.cpp | 14 +++++--
 src/mesa/drivers/dri/i965/brw_nir.c          | 18 +++++---
 src/mesa/drivers/dri/i965/brw_vec4.cpp       | 13 ++++--
 src/mesa/drivers/dri/i965/intel_screen.c     |  3 +-
 12 files changed, 105 insertions(+), 45 deletions(-)

diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index ee5d812..90d4650 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -59,6 +59,7 @@ static const struct nir_shader_compiler_options nir_options = {
 	.lower_unpack_unorm_4x8 = true,
 	.lower_extract_byte = true,
 	.lower_extract_word = true,
+	.dvec3_consumes_two_locations = true,
 };
 
 VkResult radv_CreateShaderModule(
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 1679d89..0fc8f39 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -1794,6 +1794,11 @@ typedef struct nir_shader_compiler_options {
     * information must be inferred from the list of input nir_variables.
     */
    bool use_interpolated_input_intrinsics;
+
+   /**
+    * In Vulkan, a dvec3/dvec4 consumes two locations instead just one.
+    */
+   bool dvec3_consumes_two_locations;
 } nir_shader_compiler_options;
 
 typedef struct nir_shader {
diff --git a/src/compiler/nir/nir_gather_info.c b/src/compiler/nir/nir_gather_info.c
index 07c9949..8c80671 100644
--- a/src/compiler/nir/nir_gather_info.c
+++ b/src/compiler/nir/nir_gather_info.c
@@ -96,7 +96,7 @@ mark_whole_variable(nir_shader *shader, nir_variable *var)
 
    const unsigned slots =
       var->data.compact ? DIV_ROUND_UP(glsl_get_length(type), 4)
-                        : glsl_count_attribute_slots(type, is_vertex_input);
+                        : glsl_count_attribute_slots(type, is_vertex_input && !shader->options->dvec3_consumes_two_locations);
 
    set_io_mask(shader, var, 0, slots);
 }
@@ -168,7 +168,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref)
        var->data.mode == nir_var_shader_in)
       is_vertex_input = true;
 
-   unsigned offset = get_io_offset(deref, is_vertex_input);
+   unsigned offset = get_io_offset(deref, is_vertex_input && !shader->options->dvec3_consumes_two_locations);
    if (offset == -1)
       return false;
 
@@ -184,7 +184,7 @@ try_mask_partial_io(nir_shader *shader, nir_deref_var *deref)
    }
 
    /* double element width for double types that takes two slots */
-   if (!is_vertex_input &&
+   if ((!is_vertex_input || shader->options->dvec3_consumes_two_locations) &&
        glsl_type_is_dual_slot(glsl_without_array(type))) {
       elem_width *= 2;
    }
diff --git a/src/gallium/drivers/freedreno/ir3/ir3_nir.c b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
index 2d86a52..5c5c9ad 100644
--- a/src/gallium/drivers/freedreno/ir3/ir3_nir.c
+++ b/src/gallium/drivers/freedreno/ir3/ir3_nir.c
@@ -50,6 +50,7 @@ static const nir_shader_compiler_options options = {
 		.vertex_id_zero_based = true,
 		.lower_extract_byte = true,
 		.lower_extract_word = true,
+		.dvec3_consumes_two_locations = false,
 };
 
 struct nir_shader *
diff --git a/src/intel/vulkan/anv_device.c b/src/intel/vulkan/anv_device.c
index 2c8ac49..725848f 100644
--- a/src/intel/vulkan/anv_device.c
+++ b/src/intel/vulkan/anv_device.c
@@ -167,7 +167,7 @@ anv_physical_device_init(struct anv_physical_device *device,
 
    brw_process_intel_debug_variable();
 
-   device->compiler = brw_compiler_create(NULL, &device->info);
+   device->compiler = brw_compiler_create(NULL, &device->info, true);
    if (device->compiler == NULL) {
       result = vk_error(VK_ERROR_OUT_OF_HOST_MEMORY);
       goto fail;
diff --git a/src/intel/vulkan/genX_pipeline.c b/src/intel/vulkan/genX_pipeline.c
index cb164ad..97c40b8 100644
--- a/src/intel/vulkan/genX_pipeline.c
+++ b/src/intel/vulkan/genX_pipeline.c
@@ -33,26 +33,33 @@
 static uint32_t
 vertex_element_comp_control(enum isl_format format, unsigned comp)
 {
-   uint8_t bits;
    switch (comp) {
-   case 0: bits = isl_format_layouts[format].channels.r.bits; break;
-   case 1: bits = isl_format_layouts[format].channels.g.bits; break;
-   case 2: bits = isl_format_layouts[format].channels.b.bits; break;
-   case 3: bits = isl_format_layouts[format].channels.a.bits; break;
-   default: unreachable("Invalid component");
-   }
-
-   if (bits) {
-      return VFCOMP_STORE_SRC;
-   } else if (comp < 3) {
-      return VFCOMP_STORE_0;
-   } else if (isl_format_layouts[format].channels.r.type == ISL_UINT ||
-            isl_format_layouts[format].channels.r.type == ISL_SINT) {
-      assert(comp == 3);
-      return VFCOMP_STORE_1_INT;
-   } else {
-      assert(comp == 3);
-      return VFCOMP_STORE_1_FP;
+   case 0:
+      return isl_format_layouts[format].channels.r.bits ?
+         VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+   case 1:
+      return isl_format_layouts[format].channels.g.bits ?
+         VFCOMP_STORE_SRC : VFCOMP_STORE_0;
+   case 2:
+      return isl_format_layouts[format].channels.b.bits ?
+         VFCOMP_STORE_SRC : ((isl_format_layouts[format].channels.r.type == ISL_RAW) ?
+                             VFCOMP_NOSTORE : VFCOMP_STORE_0);
+   case 3:
+      if (isl_format_layouts[format].channels.a.bits)
+         return VFCOMP_STORE_SRC;
+      else
+         switch (isl_format_layouts[format].channels.r.type) {
+         case ISL_RAW:
+            return isl_format_layouts[format].channels.b.bits ?
+               VFCOMP_STORE_0 : VFCOMP_NOSTORE;
+         case ISL_UINT:
+         case ISL_SINT:
+            return VFCOMP_STORE_1_INT;
+         default:
+            return VFCOMP_STORE_1_FP;
+         }
+   default:
+      unreachable("Invalid component");
    }
 }
 
@@ -64,8 +71,10 @@ emit_vertex_input(struct anv_pipeline *pipeline,
 
    /* Pull inputs_read out of the VS prog data */
    const uint64_t inputs_read = vs_prog_data->inputs_read;
+   const uint64_t double_inputs_read = vs_prog_data->double_inputs_read;
    assert((inputs_read & ((1 << VERT_ATTRIB_GENERIC0) - 1)) == 0);
    const uint32_t elements = inputs_read >> VERT_ATTRIB_GENERIC0;
+   const uint32_t elements_double = double_inputs_read >> VERT_ATTRIB_GENERIC0;
 
 #if GEN_GEN >= 8
    /* On BDW+, we only need to allocate space for base ids.  Setting up
@@ -83,13 +92,16 @@ emit_vertex_input(struct anv_pipeline *pipeline,
                                 vs_prog_data->uses_baseinstance;
 #endif
 
-   uint32_t elem_count = __builtin_popcount(elements) + needs_svgs_elem;
-   if (elem_count == 0)
+   uint32_t elem_count =
+      __builtin_popcount(elements) - DIV_ROUND_UP(__builtin_popcount(elements_double), 2);
+
+   uint32_t total_elems = elem_count + needs_svgs_elem;
+   if (total_elems == 0)
       return;
 
    uint32_t *p;
 
-   const uint32_t num_dwords = 1 + elem_count * 2;
+   const uint32_t num_dwords = 1 + total_elems * 2;
    p = anv_batch_emitn(&pipeline->batch, num_dwords,
                        GENX(3DSTATE_VERTEX_ELEMENTS));
    memset(p + 1, 0, (num_dwords - 1) * 4);
@@ -107,7 +119,9 @@ emit_vertex_input(struct anv_pipeline *pipeline,
       if ((elements & (1 << desc->location)) == 0)
          continue; /* Binding unused */
 
-      uint32_t slot = __builtin_popcount(elements & ((1 << desc->location) - 1));
+      uint32_t slot =
+         __builtin_popcount(elements & ((1 << desc->location) - 1)) -
+         DIV_ROUND_UP(__builtin_popcount(elements_double & ((1 << desc->location) - 1)), 2);
 
       struct GENX(VERTEX_ELEMENT_STATE) element = {
          .VertexBufferIndex = desc->binding,
@@ -137,7 +151,7 @@ emit_vertex_input(struct anv_pipeline *pipeline,
 #endif
    }
 
-   const uint32_t id_slot = __builtin_popcount(elements);
+   const uint32_t id_slot = elem_count;
    if (needs_svgs_elem) {
       /* From the Broadwell PRM for the 3D_Vertex_Component_Control enum:
        *    "Within a VERTEX_ELEMENT_STATE structure, if a Component
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.c b/src/mesa/drivers/dri/i965/brw_compiler.c
index 1aa72bc..b9eceeb 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.c
+++ b/src/mesa/drivers/dri/i965/brw_compiler.c
@@ -55,6 +55,22 @@ static const struct nir_shader_compiler_options scalar_nir_options = {
    .lower_unpack_snorm_4x8 = true,
    .lower_unpack_unorm_2x16 = true,
    .lower_unpack_unorm_4x8 = true,
+   .dvec3_consumes_two_locations = false,
+};
+
+static const struct nir_shader_compiler_options vulkan_scalar_nir_options = {
+   COMMON_OPTIONS,
+   .lower_pack_half_2x16 = true,
+   .lower_pack_snorm_2x16 = true,
+   .lower_pack_snorm_4x8 = true,
+   .lower_pack_unorm_2x16 = true,
+   .lower_pack_unorm_4x8 = true,
+   .lower_unpack_half_2x16 = true,
+   .lower_unpack_snorm_2x16 = true,
+   .lower_unpack_snorm_4x8 = true,
+   .lower_unpack_unorm_2x16 = true,
+   .lower_unpack_unorm_4x8 = true,
+   .dvec3_consumes_two_locations = true,
 };
 
 static const struct nir_shader_compiler_options vector_nir_options = {
@@ -75,6 +91,7 @@ static const struct nir_shader_compiler_options vector_nir_options = {
    .lower_unpack_unorm_2x16 = true,
    .lower_extract_byte = true,
    .lower_extract_word = true,
+   .dvec3_consumes_two_locations = false,
 };
 
 static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
@@ -92,10 +109,11 @@ static const struct nir_shader_compiler_options vector_nir_options_gen6 = {
    .lower_unpack_unorm_2x16 = true,
    .lower_extract_byte = true,
    .lower_extract_word = true,
+   .dvec3_consumes_two_locations = false,
 };
 
 struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
+brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo, bool is_vulkan)
 {
    struct brw_compiler *compiler = rzalloc(mem_ctx, struct brw_compiler);
 
@@ -138,7 +156,8 @@ brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo)
          compiler->glsl_compiler_options[i].EmitNoIndirectSampler = true;
 
       if (is_scalar) {
-         compiler->glsl_compiler_options[i].NirOptions = &scalar_nir_options;
+         compiler->glsl_compiler_options[i].NirOptions = is_vulkan ? &vulkan_scalar_nir_options
+                                                                   : &scalar_nir_options;
       } else {
          compiler->glsl_compiler_options[i].NirOptions =
             devinfo->gen < 6 ? &vector_nir_options : &vector_nir_options_gen6;
diff --git a/src/mesa/drivers/dri/i965/brw_compiler.h b/src/mesa/drivers/dri/i965/brw_compiler.h
index 65a7478..34df343 100644
--- a/src/mesa/drivers/dri/i965/brw_compiler.h
+++ b/src/mesa/drivers/dri/i965/brw_compiler.h
@@ -760,7 +760,7 @@ DEFINE_PROG_DATA_DOWNCAST(sf)
 /** @} */
 
 struct brw_compiler *
-brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo);
+brw_compiler_create(void *mem_ctx, const struct gen_device_info *devinfo, bool is_vulkan);
 
 /**
  * Compile a vertex shader.
diff --git a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
index 14415bd..00c5bb4 100644
--- a/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
+++ b/src/mesa/drivers/dri/i965/brw_fs_visitor.cpp
@@ -35,10 +35,16 @@ using namespace brw;
 fs_reg *
 fs_visitor::emit_vs_system_value(int location)
 {
-   fs_reg *reg = new(this->mem_ctx)
-      fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
-                        _mesa_bitcount_64(nir->info->double_inputs_read)),
-             BRW_REGISTER_TYPE_D);
+   fs_reg *reg;
+   if (nir->options->dvec3_consumes_two_locations)
+      reg = new(this->mem_ctx)
+         fs_reg(ATTR, 4 * _mesa_bitcount_64(nir->info->inputs_read),
+                BRW_REGISTER_TYPE_D);
+   else
+      reg = new(this->mem_ctx)
+         fs_reg(ATTR, 4 * (_mesa_bitcount_64(nir->info->inputs_read) +
+                           _mesa_bitcount_64(nir->info->double_inputs_read)),
+                BRW_REGISTER_TYPE_D);
    struct brw_vs_prog_data *vs_prog_data = brw_vs_prog_data(prog_data);
 
    switch (location) {
diff --git a/src/mesa/drivers/dri/i965/brw_nir.c b/src/mesa/drivers/dri/i965/brw_nir.c
index 763e3ec..daa22c6 100644
--- a/src/mesa/drivers/dri/i965/brw_nir.c
+++ b/src/mesa/drivers/dri/i965/brw_nir.c
@@ -96,7 +96,7 @@ add_const_offset_to_base(nir_shader *nir, nir_variable_mode mode)
 }
 
 static bool
-remap_vs_attrs(nir_block *block, shader_info *nir_info)
+remap_vs_attrs(nir_block *block, nir_shader *nir)
 {
    nir_foreach_instr(instr, block) {
       if (instr->type != nir_instr_type_intrinsic)
@@ -111,10 +111,13 @@ remap_vs_attrs(nir_block *block, shader_info *nir_info)
           * before it and counting the bits.
           */
          int attr = intrin->const_index[0];
-         int slot = _mesa_bitcount_64(nir_info->inputs_read &
+         int slot = _mesa_bitcount_64(nir->info->inputs_read &
+                                      BITFIELD64_MASK(attr));
+
+         int dslot = 0;
+         if (!nir->options->dvec3_consumes_two_locations)
+            dslot = _mesa_bitcount_64(nir->info->double_inputs_read &
                                       BITFIELD64_MASK(attr));
-         int dslot = _mesa_bitcount_64(nir_info->double_inputs_read &
-                                       BITFIELD64_MASK(attr));
          intrin->const_index[0] = 4 * (slot + dslot);
       }
    }
@@ -204,7 +207,10 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
     * loaded as one vec4 or dvec4 per element (or matrix column), depending on
     * whether it is a double-precision type or not.
     */
-   nir_lower_io(nir, nir_var_shader_in, type_size_vs_input, 0);
+   nir_lower_io(nir,
+                nir_var_shader_in,
+                nir->options->dvec3_consumes_two_locations ? type_size_vec4 : type_size_vs_input,
+                0);
 
    /* This pass needs actual constants */
    nir_opt_constant_folding(nir);
@@ -220,7 +226,7 @@ brw_nir_lower_vs_inputs(nir_shader *nir,
       nir_foreach_function(function, nir) {
          if (function->impl) {
             nir_foreach_block(block, function->impl) {
-               remap_vs_attrs(block, nir->info);
+               remap_vs_attrs(block, nir);
             }
          }
       }
diff --git a/src/mesa/drivers/dri/i965/brw_vec4.cpp b/src/mesa/drivers/dri/i965/brw_vec4.cpp
index b9e592f..0aab97f 100644
--- a/src/mesa/drivers/dri/i965/brw_vec4.cpp
+++ b/src/mesa/drivers/dri/i965/brw_vec4.cpp
@@ -2129,6 +2129,9 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
 
    unsigned nr_attributes = _mesa_bitcount_64(prog_data->inputs_read);
 
+   if (shader->options->dvec3_consumes_two_locations)
+      nr_attributes -= DIV_ROUND_UP(_mesa_bitcount_64(prog_data->double_inputs_read), 2);
+
    /* gl_VertexID and gl_InstanceID are system values, but arrive via an
     * incoming vertex attribute.  So, add an extra slot.
     */
@@ -2146,9 +2149,13 @@ brw_compile_vs(const struct brw_compiler *compiler, void *log_data,
       nr_attributes++;
    }
 
-   unsigned nr_attribute_slots =
-      nr_attributes +
-      _mesa_bitcount_64(shader->info->double_inputs_read);
+   unsigned nr_attribute_slots = nr_attributes;
+   if (shader->options->dvec3_consumes_two_locations)
+      nr_attribute_slots +=
+         DIV_ROUND_UP(_mesa_bitcount_64(shader->info->double_inputs_read), 2);
+   else
+      nr_attribute_slots +=
+         _mesa_bitcount_64(shader->info->double_inputs_read);
 
    /* The 3DSTATE_VS documentation lists the lower bound on "Vertex URB Entry
     * Read Length" as 1 in vec4 mode, and 0 in SIMD8 mode.  Empirically, in
diff --git a/src/mesa/drivers/dri/i965/intel_screen.c b/src/mesa/drivers/dri/i965/intel_screen.c
index e1c3c19..c050f86 100644
--- a/src/mesa/drivers/dri/i965/intel_screen.c
+++ b/src/mesa/drivers/dri/i965/intel_screen.c
@@ -1688,7 +1688,8 @@ __DRIconfig **intelInitScreen2(__DRIscreen *dri_screen)
       ? screenExtensions : intelRobustScreenExtensions;
 
    screen->compiler = brw_compiler_create(screen,
-                                          &screen->devinfo);
+                                          &screen->devinfo,
+                                          false);
    screen->compiler->shader_debug_log = shader_debug_log_mesa;
    screen->compiler->shader_perf_log = shader_perf_log_mesa;
    screen->program_id = 1;
-- 
2.7.4