Mesa (master): nir: add many passes that lower and optimize 16-bit inputs/outputs and samplers

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Apr 13 05:32:52 UTC 2021


Module: Mesa
Branch: master
Commit: fb29cef8ddabdd05aeddc5220017bb28a83bb19c
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=fb29cef8ddabdd05aeddc5220017bb28a83bb19c

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Sun Feb  7 21:10:08 2021 -0500

nir: add many passes that lower and optimize 16-bit inputs/outputs and samplers

Added:
* a pass that renumbers bases of IO intrinsics
* a pass that converts mediump IO to 16 bits, optionally using the new
  packed varying slots
* a pass that sets (forces) mediump in IO intrinsics (for testing)
* a pass that remaps VARYING_SLOT_VAR[0..15]_16BIT to VARYING_SLOT_VAR[0..31]
  (if some shader stages don't want packed varyings)
* a pass that folds type conversions around texture opcodes into those
  opcodes (e.g. tex(f2f32(coord), ..) is changed into tex accepting f16)
* a pass that changes (legalizes) sampler src and dst types based on specified
  hw constraints (e.g. derivatives must be the same type as coordinates)

Reviewed-by: Matt Turner <mattst88 at gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/9050>

---

 src/compiler/Makefile.sources                |   2 +-
 src/compiler/nir/meson.build                 |   2 +-
 src/compiler/nir/nir.h                       |  18 +-
 src/compiler/nir/nir_lower_mediump.c         | 611 +++++++++++++++++++++++++++
 src/compiler/nir/nir_lower_mediump_outputs.c |  79 ----
 src/freedreno/ir3/ir3_nir.c                  |   2 +-
 src/gallium/drivers/radeonsi/si_shader_nir.c |   2 +-
 src/util/bitset.h                            |  19 +
 8 files changed, 651 insertions(+), 84 deletions(-)
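
For context, these passes are meant to be driven from a driver's NIR
lowering chain. The sketch below is illustrative only: the ordering and the
idea of forcing mediump for testing are assumptions, not part of this
commit, while the signatures come from the nir.h hunk and the NIR_PASS_V
call sites updated further down.

   /* Force the mediump flag on float VS inputs, e.g. for testing. */
   NIR_PASS_V(nir, nir_force_mediump_io, nir_var_shader_in, nir_type_float);

   /* Lower mediump FS outputs to 16 bits without packed slots, exactly as
    * the ir3 and radeonsi call sites below do. */
   NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, 0, false);

   /* Or lower all generic mediump varyings into the packed
    * VARYING_SLOT_VARn_16BIT slots... */
   NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, ~0ull, true);

   /* ...and remap them back to VARYING_SLOT_VAR0..31 in a consumer stage
    * that doesn't want packed varyings. */
   NIR_PASS_V(nir, nir_unpack_16bit_varying_slots, nir_var_shader_in);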

diff --git a/src/compiler/Makefile.sources b/src/compiler/Makefile.sources
index 0a4133d49a3..66777c7213e 100644
--- a/src/compiler/Makefile.sources
+++ b/src/compiler/Makefile.sources
@@ -279,7 +279,7 @@ NIR_FILES = \
 	nir/nir_lower_io_to_scalar.c \
 	nir/nir_lower_io_to_vector.c \
 	nir/nir_lower_multiview.c \
-	nir/nir_lower_mediump_outputs.c \
+	nir/nir_lower_mediump.c \
 	nir/nir_lower_memcpy.c \
 	nir/nir_lower_memory_model.c \
 	nir/nir_lower_non_uniform_access.c \
diff --git a/src/compiler/nir/meson.build b/src/compiler/nir/meson.build
index f45a9c17b43..d17bbe504a5 100644
--- a/src/compiler/nir/meson.build
+++ b/src/compiler/nir/meson.build
@@ -168,7 +168,7 @@ files_libnir = files(
   'nir_lower_io_to_scalar.c',
   'nir_lower_io_to_vector.c',
   'nir_lower_multiview.c',
-  'nir_lower_mediump_outputs.c',
+  'nir_lower_mediump.c',
   'nir_lower_memcpy.c',
   'nir_lower_memory_model.c',
   'nir_lower_non_uniform_access.c',
diff --git a/src/compiler/nir/nir.h b/src/compiler/nir/nir.h
index 2054fd761d6..bc386b5435c 100644
--- a/src/compiler/nir/nir.h
+++ b/src/compiler/nir/nir.h
@@ -4991,7 +4991,23 @@ bool nir_lower_doubles(nir_shader *shader, const nir_shader *softfp64,
                        nir_lower_doubles_options options);
 bool nir_lower_pack(nir_shader *shader);
 
-void nir_lower_mediump_outputs(nir_shader *nir);
+bool nir_recompute_io_bases(nir_function_impl *impl, nir_variable_mode modes);
+bool nir_lower_mediump_io(nir_shader *nir, nir_variable_mode modes,
+                          uint64_t varying_mask, bool use_16bit_slots);
+bool nir_force_mediump_io(nir_shader *nir, nir_variable_mode modes,
+                          nir_alu_type types);
+bool nir_unpack_16bit_varying_slots(nir_shader *nir, nir_variable_mode modes);
+bool nir_fold_16bit_sampler_conversions(nir_shader *nir,
+                                        unsigned tex_src_types);
+
+typedef struct {
+   bool legalize_type;         /* whether this src should be legalized */
+   uint8_t bit_size;           /* bit_size to enforce */
+   nir_tex_src_type match_src; /* if bit_size is 0, match bit size of this */
+} nir_tex_src_type_constraint, nir_tex_src_type_constraints[nir_num_tex_src_types];
+
+bool nir_legalize_16bit_sampler_srcs(nir_shader *nir,
+                                     nir_tex_src_type_constraints constraints);
 
 bool nir_lower_point_size(nir_shader *shader, float min, float max);
 
diff --git a/src/compiler/nir/nir_lower_mediump.c b/src/compiler/nir/nir_lower_mediump.c
new file mode 100644
index 00000000000..0cc58c1e755
--- /dev/null
+++ b/src/compiler/nir/nir_lower_mediump.c
@@ -0,0 +1,611 @@
+/*
+ * Copyright (C) 2020 Google, Inc.
+ * Copyright (C) 2021 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "nir.h"
+#include "nir_builder.h"
+
+/**
+ * Return the intrinsic if it matches the mask in "modes", else return NULL.
+ */
+static nir_intrinsic_instr *
+get_io_intrinsic(nir_instr *instr, nir_variable_mode modes,
+                 nir_variable_mode *out_mode)
+{
+   if (instr->type != nir_instr_type_intrinsic)
+      return NULL;
+
+   nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+
+   switch (intr->intrinsic) {
+   case nir_intrinsic_load_input:
+   case nir_intrinsic_load_input_vertex:
+   case nir_intrinsic_load_interpolated_input:
+   case nir_intrinsic_load_per_vertex_input:
+      *out_mode = nir_var_shader_in;
+      return modes & nir_var_shader_in ? intr : NULL;
+   case nir_intrinsic_load_output:
+   case nir_intrinsic_load_per_vertex_output:
+   case nir_intrinsic_store_output:
+   case nir_intrinsic_store_per_vertex_output:
+      *out_mode = nir_var_shader_out;
+      return modes & nir_var_shader_out ? intr : NULL;
+   default:
+      return NULL;
+   }
+}
+
+/**
+ * Recompute the IO "base" indices from scratch, using the IO locations to
+ * assign new bases. This removes holes and fixes bases left stale by
+ * changes to the IO locations. The resulting mapping from locations to
+ * bases is monotonically increasing.
+ */
+bool
+nir_recompute_io_bases(nir_function_impl *impl, nir_variable_mode modes)
+{
+   BITSET_DECLARE(inputs, NUM_TOTAL_VARYING_SLOTS);
+   BITSET_DECLARE(outputs, NUM_TOTAL_VARYING_SLOTS);
+   BITSET_ZERO(inputs);
+   BITSET_ZERO(outputs);
+
+   /* Gather the bitmasks of used locations. */
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         nir_variable_mode mode;
+         nir_intrinsic_instr *intr = get_io_intrinsic(instr, modes, &mode);
+         if (!intr)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+         unsigned num_slots = sem.num_slots;
+         if (sem.medium_precision)
+            num_slots = (num_slots + sem.high_16bits + 1) / 2;
+
+         if (mode == nir_var_shader_in) {
+            for (unsigned i = 0; i < num_slots; i++)
+               BITSET_SET(inputs, sem.location + i);
+         } else if (!sem.dual_source_blend_index) {
+            for (unsigned i = 0; i < num_slots; i++)
+               BITSET_SET(outputs, sem.location + i);
+         }
+      }
+   }
+
+   /* Renumber bases. */
+   bool changed = false;
+
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         nir_variable_mode mode;
+         nir_intrinsic_instr *intr = get_io_intrinsic(instr, modes, &mode);
+         if (!intr)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+         unsigned num_slots = sem.num_slots;
+         if (sem.medium_precision)
+            num_slots = (num_slots + sem.high_16bits + 1) / 2;
+
+         if (mode == nir_var_shader_in) {
+            nir_intrinsic_set_base(intr,
+                                   BITSET_PREFIX_SUM(inputs, sem.location));
+         } else if (sem.dual_source_blend_index) {
+            nir_intrinsic_set_base(intr,
+                                   BITSET_PREFIX_SUM(outputs, NUM_TOTAL_VARYING_SLOTS));
+         } else {
+            nir_intrinsic_set_base(intr,
+                                   BITSET_PREFIX_SUM(outputs, sem.location));
+         }
+         changed = true;
+      }
+   }
+
+   nir_metadata_preserve(impl, nir_metadata_all);
+   return changed;
+}
+
+/**
+ * Lower mediump inputs and/or outputs to 16 bits.
+ *
+ * \param modes            Whether to lower inputs, outputs, or both.
+ * \param varying_mask     Determines which varyings to skip (VS inputs,
+ *    FS outputs, and patch varyings ignore this mask).
+ * \param use_16bit_slots  Remap lowered slots to VARYING_SLOT_VARn_16BIT.
+ */
+bool
+nir_lower_mediump_io(nir_shader *nir, nir_variable_mode modes,
+                     uint64_t varying_mask, bool use_16bit_slots)
+{
+   bool changed = false;
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         nir_variable_mode mode;
+         nir_intrinsic_instr *intr = get_io_intrinsic(instr, modes, &mode);
+         if (!intr)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+         nir_ssa_def *(*convert)(nir_builder *, nir_ssa_def *);
+         bool is_varying = !(nir->info.stage == MESA_SHADER_VERTEX &&
+                             mode == nir_var_shader_in) &&
+                           !(nir->info.stage == MESA_SHADER_FRAGMENT &&
+                             mode == nir_var_shader_out);
+
+         if (!sem.medium_precision ||
+             (is_varying && sem.location <= VARYING_SLOT_VAR31 &&
+              !(varying_mask & BITFIELD64_BIT(sem.location))))
+            continue; /* can't lower */
+
+         if (nir_intrinsic_has_src_type(intr)) {
+            /* Stores. */
+            nir_alu_type type = nir_intrinsic_src_type(intr);
+
+            switch (type) {
+            case nir_type_float32:
+               convert = nir_f2fmp;
+               break;
+            case nir_type_int32:
+            case nir_type_uint32:
+               convert = nir_i2imp;
+               break;
+            default:
+               continue; /* already lowered? */
+            }
+
+            /* Convert the 32-bit store into a 16-bit store. */
+            b.cursor = nir_before_instr(&intr->instr);
+            nir_instr_rewrite_src_ssa(&intr->instr, &intr->src[0],
+                                      convert(&b, intr->src[0].ssa));
+            nir_intrinsic_set_src_type(intr, (type & ~32) | 16);
+         } else {
+            /* Loads. */
+            nir_alu_type type = nir_intrinsic_dest_type(intr);
+
+            switch (type) {
+            case nir_type_float32:
+               convert = nir_f2f32;
+               break;
+            case nir_type_int32:
+               convert = nir_i2i32;
+               break;
+            case nir_type_uint32:
+               convert = nir_u2u32;
+               break;
+            default:
+               continue; /* already lowered? */
+            }
+
+            /* Convert the 32-bit load into a 16-bit load. */
+            b.cursor = nir_after_instr(&intr->instr);
+            intr->dest.ssa.bit_size = 16;
+            nir_intrinsic_set_dest_type(intr, (type & ~32) | 16);
+            nir_ssa_def *dst = convert(&b, &intr->dest.ssa);
+            nir_ssa_def_rewrite_uses_after(&intr->dest.ssa, dst,
+                                           dst->parent_instr);
+         }
+
+         if (use_16bit_slots && is_varying &&
+             sem.location >= VARYING_SLOT_VAR0 &&
+             sem.location <= VARYING_SLOT_VAR31) {
+            unsigned index = sem.location - VARYING_SLOT_VAR0;
+
+            sem.location = VARYING_SLOT_VAR0_16BIT + index / 2;
+            sem.high_16bits = index % 2;
+            nir_intrinsic_set_io_semantics(intr, sem);
+         }
+         changed = true;
+      }
+   }
+
+   if (changed)
+      nir_recompute_io_bases(impl, modes);
+
+   nir_metadata_preserve(impl, nir_metadata_all);
+   return changed;
+}
+
+/**
+ * Set the mediump precision bit on shader inputs and outputs whose mode is
+ * included in the "modes" mask. Non-generic varyings (which GLES3 doesn't
+ * have) are ignored. The "types" mask can be e.g. (nir_type_float |
+ * nir_type_int).
+ */
+bool
+nir_force_mediump_io(nir_shader *nir, nir_variable_mode modes,
+                     nir_alu_type types)
+{
+   bool changed = false;
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         nir_variable_mode mode;
+         nir_intrinsic_instr *intr = get_io_intrinsic(instr, modes, &mode);
+         if (!intr)
+            continue;
+
+         nir_alu_type type;
+         if (nir_intrinsic_has_src_type(intr))
+            type = nir_intrinsic_src_type(intr);
+         else
+            type = nir_intrinsic_dest_type(intr);
+         if (!(type & types))
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         if (nir->info.stage == MESA_SHADER_FRAGMENT &&
+             mode == nir_var_shader_out) {
+            /* Only accept FS outputs. */
+            if (sem.location < FRAG_RESULT_DATA0 &&
+                sem.location != FRAG_RESULT_COLOR)
+               continue;
+         } else if (nir->info.stage == MESA_SHADER_VERTEX &&
+                    mode == nir_var_shader_in) {
+            /* Accept all VS inputs. */
+         } else {
+            /* Only accept generic varyings. */
+            if (sem.location < VARYING_SLOT_VAR0 ||
+                sem.location > VARYING_SLOT_VAR31)
+               continue;
+         }
+
+         sem.medium_precision = 1;
+         nir_intrinsic_set_io_semantics(intr, sem);
+         changed = true;
+      }
+   }
+
+   nir_metadata_preserve(impl, nir_metadata_all);
+   return changed;
+}
+
+/**
+ * Remap 16-bit varying slots to the original 32-bit varying slots.
+ * This only changes IO semantics and bases.
+ */
+bool
+nir_unpack_16bit_varying_slots(nir_shader *nir, nir_variable_mode modes)
+{
+   bool changed = false;
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         nir_variable_mode mode;
+         nir_intrinsic_instr *intr = get_io_intrinsic(instr, modes, &mode);
+         if (!intr)
+            continue;
+
+         nir_io_semantics sem = nir_intrinsic_io_semantics(intr);
+
+         if (sem.location < VARYING_SLOT_VAR0_16BIT ||
+             sem.location > VARYING_SLOT_VAR15_16BIT)
+            continue;
+
+         sem.location = VARYING_SLOT_VAR0 +
+                        (sem.location - VARYING_SLOT_VAR0_16BIT) * 2 +
+                        sem.high_16bits;
+         sem.high_16bits = 0;
+         nir_intrinsic_set_io_semantics(intr, sem);
+         changed = true;
+      }
+   }
+
+   if (changed)
+      nir_recompute_io_bases(impl, modes);
+
+   nir_metadata_preserve(impl, nir_metadata_all);
+   return changed;
+}
+
+static bool
+is_n_to_m_conversion(nir_instr *instr, unsigned n, nir_op m)
+{
+   if (instr->type != nir_instr_type_alu)
+      return false;
+
+   nir_alu_instr *alu = nir_instr_as_alu(instr);
+   return alu->op == m && alu->src[0].src.ssa->bit_size == n;
+}
+
+static bool
+is_f16_to_f32_conversion(nir_instr *instr)
+{
+   return is_n_to_m_conversion(instr, 16, nir_op_f2f32);
+}
+
+static bool
+is_f32_to_f16_conversion(nir_instr *instr)
+{
+   return is_n_to_m_conversion(instr, 32, nir_op_f2f16) ||
+          is_n_to_m_conversion(instr, 32, nir_op_f2f16_rtne) ||
+          is_n_to_m_conversion(instr, 32, nir_op_f2fmp);
+}
+
+static bool
+is_i16_to_i32_conversion(nir_instr *instr)
+{
+   return is_n_to_m_conversion(instr, 16, nir_op_i2i32);
+}
+
+static bool
+is_u16_to_u32_conversion(nir_instr *instr)
+{
+   return is_n_to_m_conversion(instr, 16, nir_op_u2u32);
+}
+
+static bool
+is_i32_to_i16_conversion(nir_instr *instr)
+{
+   return is_n_to_m_conversion(instr, 32, nir_op_i2i16);
+}
+
+static void
+replace_with_mov(nir_builder *b, nir_instr *instr, nir_src *src,
+                 nir_alu_instr *alu)
+{
+   nir_ssa_def *mov = nir_mov_alu(b, alu->src[0],
+                                  nir_dest_num_components(alu->dest.dest));
+   assert(!alu->dest.saturate);
+   nir_instr_rewrite_src_ssa(instr, src, mov);
+}
+
+/**
+ * If texture source operands are 16->32-bit conversions, or if all uses of
+ * the return value are 16->32 or 32->16-bit conversions, fold those
+ * conversions into the texture opcode. This benefits drivers whose texture
+ * opcodes can accept and return 16-bit types.
+ *
+ * "tex_src_types" is a mask of nir_tex_src_* operands that should be
+ * handled; the destination is always handled.
+ *
+ * This should be run after late algebraic optimizations.
+ * Copy propagation and DCE should be run after this.
+ */
+bool
+nir_fold_16bit_sampler_conversions(nir_shader *nir,
+                                   unsigned tex_src_types)
+{
+   bool changed = false;
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         if (instr->type != nir_instr_type_tex)
+            continue;
+
+         nir_tex_instr *tex = nir_instr_as_tex(instr);
+         nir_instr *src;
+         nir_alu_instr *src_alu;
+
+         /* Skip because AMD doesn't support 16-bit types with these. */
+         if ((tex->op == nir_texop_txs ||
+              tex->op == nir_texop_query_levels) ||
+             tex->sampler_dim == GLSL_SAMPLER_DIM_CUBE)
+            continue;
+
+         /* Optimize source operands. */
+         for (unsigned i = 0; i < tex->num_srcs; i++) {
+            /* Filter out sources that should be ignored. */
+            if (!(BITFIELD_BIT(tex->src[i].src_type) & tex_src_types))
+               continue;
+
+            src = tex->src[i].src.ssa->parent_instr;
+            if (src->type != nir_instr_type_alu)
+               continue;
+
+            src_alu = nir_instr_as_alu(src);
+            b.cursor = nir_before_instr(src);
+
+            if (src_alu->op == nir_op_mov) {
+               assert(!"The IR shouldn't contain any movs to make this pass"
+                       " effective.");
+               continue;
+            }
+
+            /* Handle vector sources that are made of scalar instructions. */
+            if (nir_op_is_vec(src_alu->op)) {
+               /* See if the vector is made of f16->f32 opcodes. */
+               unsigned num = nir_dest_num_components(src_alu->dest.dest);
+               bool is_f16_to_f32 = true;
+               bool is_u16_to_u32 = true;
+
+               for (unsigned comp = 0; comp < num; comp++) {
+                  nir_instr *instr = src_alu->src[comp].src.ssa->parent_instr;
+                  is_f16_to_f32 &= is_f16_to_f32_conversion(instr);
+                  /* Zero-extension (u16) and sign-extension (i16) have
+                   * the same behavior here - txf returns 0 if bit 15 is set
+                   * because it's out of bounds and the higher bits don't
+                   * matter.
+                   */
+                  is_u16_to_u32 &= is_u16_to_u32_conversion(instr) ||
+                                   is_i16_to_i32_conversion(instr);
+               }
+
+               if (!is_f16_to_f32 && !is_u16_to_u32)
+                  continue;
+
+               nir_alu_instr *new_vec = nir_alu_instr_clone(nir, src_alu);
+               nir_instr_insert_after(&src_alu->instr, &new_vec->instr);
+
+               /* Replace conversions with mov. */
+               for (unsigned comp = 0; comp < num; comp++) {
+                  nir_instr *instr = new_vec->src[comp].src.ssa->parent_instr;
+                  replace_with_mov(&b, &new_vec->instr,
+                                   &new_vec->src[comp].src,
+                                   nir_instr_as_alu(instr));
+               }
+
+               new_vec->dest.dest.ssa.bit_size =
+                  new_vec->src[0].src.ssa->bit_size;
+               nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src,
+                                         &new_vec->dest.dest.ssa);
+               changed = true;
+            } else if (is_f16_to_f32_conversion(&src_alu->instr) ||
+                       is_u16_to_u32_conversion(&src_alu->instr) ||
+                       is_i16_to_i32_conversion(&src_alu->instr)) {
+               /* Handle scalar sources. */
+               replace_with_mov(&b, &tex->instr, &tex->src[i].src, src_alu);
+               changed = true;
+            }
+         }
+
+         /* Optimize the destination. */
+         bool is_f16_to_f32 = true;
+         bool is_f32_to_f16 = true;
+         bool is_i16_to_i32 = true;
+         bool is_i32_to_i16 = true; /* same behavior for int and uint */
+         bool is_u16_to_u32 = true;
+
+         nir_foreach_use(use, &tex->dest.ssa) {
+            is_f16_to_f32 &= is_f16_to_f32_conversion(use->parent_instr);
+            is_f32_to_f16 &= is_f32_to_f16_conversion(use->parent_instr);
+            is_i16_to_i32 &= is_i16_to_i32_conversion(use->parent_instr);
+            is_i32_to_i16 &= is_i32_to_i16_conversion(use->parent_instr);
+            is_u16_to_u32 &= is_u16_to_u32_conversion(use->parent_instr);
+         }
+
+         if (is_f16_to_f32 || is_f32_to_f16 || is_i16_to_i32 ||
+             is_i32_to_i16 || is_u16_to_u32) {
+            /* All uses are the same conversions. Replace them with mov. */
+            nir_foreach_use(use, &tex->dest.ssa) {
+               nir_alu_instr *conv = nir_instr_as_alu(use->parent_instr);
+               conv->op = nir_op_mov;
+               tex->dest.ssa.bit_size = conv->dest.dest.ssa.bit_size;
+               tex->dest_type = (tex->dest_type & (~16 & ~32 & ~64)) |
+                                conv->dest.dest.ssa.bit_size;
+            }
+            changed = true;
+         }
+      }
+   }
+
+   nir_metadata_preserve(impl, nir_metadata_all);
+   return changed;
+}
+
+/**
+ * Fix types of source operands of texture opcodes according to
+ * the constraints by inserting the appropriate conversion opcodes.
+ *
+ * For example, if the type of derivatives must be equal to the type of
+ * texture coordinates and the type of the texture bias must be 32-bit,
+ * there will be 2 constraints describing that.
+ */
+bool
+nir_legalize_16bit_sampler_srcs(nir_shader *nir,
+                                nir_tex_src_type_constraints constraints)
+{
+   bool changed = false;
+   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
+   assert(impl);
+
+   nir_builder b;
+   nir_builder_init(&b, impl);
+
+   nir_foreach_block_safe (block, impl) {
+      nir_foreach_instr_safe (instr, block) {
+         if (instr->type != nir_instr_type_tex)
+            continue;
+
+         nir_tex_instr *tex = nir_instr_as_tex(instr);
+         int8_t map[nir_num_tex_src_types];
+         memset(map, -1, sizeof(map));
+
+         /* Create a mapping from src_type to src[i]. */
+         for (unsigned i = 0; i < tex->num_srcs; i++)
+            map[tex->src[i].src_type] = i;
+
+         /* Legalize src types. */
+         for (unsigned i = 0; i < tex->num_srcs; i++) {
+            nir_tex_src_type_constraint c = constraints[tex->src[i].src_type];
+
+            if (!c.legalize_type)
+               continue;
+
+            /* Determine the required bit size for the src. */
+            unsigned bit_size;
+            if (c.bit_size) {
+               bit_size = c.bit_size;
+            } else {
+               if (map[c.match_src] == -1)
+                  continue; /* e.g. txs */
+
+               bit_size = tex->src[map[c.match_src]].src.ssa->bit_size;
+            }
+
+            /* Check if the type is legal. */
+            if (bit_size == tex->src[i].src.ssa->bit_size)
+               continue;
+
+            /* Fix the bit size. */
+            bool is_sint = i == nir_tex_src_offset;
+            bool is_uint = !is_sint &&
+                           (tex->op == nir_texop_txf ||
+                            tex->op == nir_texop_txf_ms ||
+                            tex->op == nir_texop_txs ||
+                            tex->op == nir_texop_samples_identical);
+            nir_ssa_def *(*convert)(nir_builder *, nir_ssa_def *);
+
+            switch (bit_size) {
+            case 16:
+               convert = is_sint ? nir_i2i16 :
+                         is_uint ? nir_u2u16 : nir_f2f16;
+               break;
+            case 32:
+               convert = is_sint ? nir_i2i32 :
+                         is_uint ? nir_u2u32 : nir_f2f32;
+               break;
+            default:
+               assert(!"unexpected bit size");
+               continue;
+            }
+
+            b.cursor = nir_before_instr(&tex->instr);
+            nir_ssa_def *conv =
+               convert(&b, nir_ssa_for_src(&b, tex->src[i].src,
+                                           tex->src[i].src.ssa->num_components));
+            nir_instr_rewrite_src_ssa(&tex->instr, &tex->src[i].src, conv);
+            changed = true;
+         }
+      }
+   }
+
+   nir_metadata_preserve(impl, nir_metadata_all);
+   return changed;
+}
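
As the comment on nir_fold_16bit_sampler_conversions notes, copy propagation
and DCE should run after it, because the pass rewrites the folded
conversions into movs instead of deleting them. A minimal call-site sketch
(the source mask is an assumed example, not taken from this commit):

   /* Fold 16-bit conversions around tex coordinates and bias into the
    * texture instructions, then clean up the leftover movs. */
   NIR_PASS_V(nir, nir_fold_16bit_sampler_conversions,
              BITFIELD_BIT(nir_tex_src_coord) |
              BITFIELD_BIT(nir_tex_src_bias));
   NIR_PASS_V(nir, nir_copy_prop);
   NIR_PASS_V(nir, nir_opt_dce);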
diff --git a/src/compiler/nir/nir_lower_mediump_outputs.c b/src/compiler/nir/nir_lower_mediump_outputs.c
deleted file mode 100644
index 5176cea99e7..00000000000
--- a/src/compiler/nir/nir_lower_mediump_outputs.c
+++ /dev/null
@@ -1,79 +0,0 @@
-/*
- * Copyright (C) 2020 Google, Inc.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a
- * copy of this software and associated documentation files (the "Software"),
- * to deal in the Software without restriction, including without limitation
- * the rights to use, copy, modify, merge, publish, distribute, sublicense,
- * and/or sell copies of the Software, and to permit persons to whom the
- * Software is furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice (including the next
- * paragraph) shall be included in all copies or substantial portions of the
- * Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
- * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-
-#include "nir.h"
-#include "nir_builder.h"
-
-/* Lower mediump outputs to float16, int16, or uint16. */
-
-void
-nir_lower_mediump_outputs(nir_shader *nir)
-{
-   nir_function_impl *impl = nir_shader_get_entrypoint(nir);
-   assert(impl);
-
-   /* Get rid of old derefs before we change the types of the variables */
-   nir_opt_dce(nir);
-
-   nir_builder b;
-   nir_builder_init(&b, impl);
-
-   nir_foreach_block_safe (block, impl) {
-      nir_foreach_instr_safe (instr, block) {
-         if (instr->type != nir_instr_type_intrinsic)
-            continue;
-
-         nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
-         if (intr->intrinsic != nir_intrinsic_store_output)
-            continue;
-
-         if (!nir_intrinsic_io_semantics(intr).medium_precision)
-            break; /* can't lower */
-
-         switch (nir_intrinsic_src_type(intr)) {
-         case nir_type_float32:
-            b.cursor = nir_before_instr(&intr->instr);
-            nir_instr_rewrite_src(&intr->instr, &intr->src[0],
-                  nir_src_for_ssa(nir_f2f16(&b, intr->src[0].ssa)));
-            nir_intrinsic_set_src_type(intr, nir_type_float16);
-            break;
-
-         case nir_type_int32:
-            b.cursor = nir_before_instr(&intr->instr);
-            nir_instr_rewrite_src(&intr->instr, &intr->src[0],
-                  nir_src_for_ssa(nir_i2i16(&b, intr->src[0].ssa)));
-            nir_intrinsic_set_src_type(intr, nir_type_int16);
-            break;
-
-         case nir_type_uint32:
-            b.cursor = nir_before_instr(&intr->instr);
-            nir_instr_rewrite_src(&intr->instr, &intr->src[0],
-                  nir_src_for_ssa(nir_u2u16(&b, intr->src[0].ssa)));
-            nir_intrinsic_set_src_type(intr, nir_type_uint16);
-            break;
-
-         default:;
-         }
-      }
-   }
-}
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index 101fa004335..759b2ccc18f 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -367,7 +367,7 @@ ir3_nir_post_finalize(struct ir3_compiler *compiler, nir_shader *s)
 	if (compiler->gpu_id >= 600 &&
 			s->info.stage == MESA_SHADER_FRAGMENT &&
 			!(ir3_shader_debug & IR3_DBG_NOFP16)) {
-		NIR_PASS_V(s, nir_lower_mediump_outputs);
+		NIR_PASS_V(s, nir_lower_mediump_io, nir_var_shader_out, 0, false);
 	}
 
 	/* we cannot ensure that ir3_finalize_nir() is only called once, so
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index f5a61de3c3c..d48c8483c4e 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -799,7 +799,7 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
    if (nir->info.stage == MESA_SHADER_FRAGMENT &&
        sscreen->info.has_packed_math_16bit &&
        sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
-      NIR_PASS_V(nir, nir_lower_mediump_outputs);
+      NIR_PASS_V(nir, nir_lower_mediump_io, nir_var_shader_out, 0, false);
 
    si_nir_opts(sscreen, nir, true);
 
diff --git a/src/util/bitset.h b/src/util/bitset.h
index 29de65e839c..b9e968293b1 100644
--- a/src/util/bitset.h
+++ b/src/util/bitset.h
@@ -80,6 +80,25 @@
    ((x)[BITSET_BITWORD(b)] &= ~BITSET_RANGE(b, e)) : \
    (assert (!"BITSET_CLEAR_RANGE: bit range crosses word boundary"), 0))
 
+static inline unsigned
+__bitset_prefix_sum(const BITSET_WORD *x, unsigned b, unsigned n)
+{
+   unsigned prefix = 0;
+
+   for (unsigned i = 0; i < n; i++) {
+      if ((i + 1) * BITSET_WORDBITS <= b) {
+         prefix += util_bitcount(x[i]);
+      } else {
+         prefix += util_bitcount(x[i] & BITFIELD_MASK(b - i * BITSET_WORDBITS));
+         break;
+      }
+   }
+   return prefix;
+}
+
+#define BITSET_PREFIX_SUM(x, b) \
+   __bitset_prefix_sum(x, b, ARRAY_SIZE(x))
+
 /* Get first bit set in a bitset.
  */
 static inline int
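
To pin down the new helper's semantics: BITSET_PREFIX_SUM(x, b) counts the
bits set at positions strictly below b, which is exactly what
nir_recompute_io_bases uses to turn a used-location bitmask into dense,
monotonically increasing bases. A small illustration (values are arbitrary):

   BITSET_DECLARE(used, 64);
   BITSET_ZERO(used);
   BITSET_SET(used, 3);
   BITSET_SET(used, 10);
   BITSET_SET(used, 12);

   assert(BITSET_PREFIX_SUM(used, 10) == 1); /* only bit 3 is below 10 */
   assert(BITSET_PREFIX_SUM(used, 12) == 2); /* bits 3 and 10 */
   assert(BITSET_PREFIX_SUM(used, 64) == 3); /* all set bits */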


