Mesa (master): radeonsi: implement 16-bit FS color outputs

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Tue Sep 22 02:55:25 UTC 2020


Module: Mesa
Branch: master
Commit: 98a52fecdaaac073943fb0f1322a29d01bfeb9c7
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=98a52fecdaaac073943fb0f1322a29d01bfeb9c7

Author: Marek Olšák <marek.olsak at amd.com>
Date:   Mon May 11 02:42:18 2020 -0400

radeonsi: implement 16-bit FS color outputs

This removes type conversions from 16 bits to 32 bits in the main function
and then back to 16 bits in the epilog.

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/6622>

---

 src/amd/llvm/ac_llvm_build.c                     |  22 +++++
 src/amd/llvm/ac_llvm_build.h                     |   2 +
 src/amd/llvm/ac_nir_to_llvm.c                    |   1 +
 src/gallium/drivers/radeonsi/si_shader.c         |   1 +
 src/gallium/drivers/radeonsi/si_shader.h         |  10 ++
 src/gallium/drivers/radeonsi/si_shader_llvm.c    |   7 +-
 src/gallium/drivers/radeonsi/si_shader_llvm_ps.c | 115 ++++++++++++++++++-----
 src/gallium/drivers/radeonsi/si_shader_nir.c     |  18 ++++
 8 files changed, 152 insertions(+), 24 deletions(-)

diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index 9166ba8721c..506ea58ec97 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -2285,6 +2285,28 @@ LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef a
    return LLVMBuildBitCast(ctx->builder, res, ctx->i32, "");
 }
 
+LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx,
+                                         LLVMValueRef args[2])
+{ /* Emit v_cvt_pknorm_i16_f16 as inline asm: takes the two f16 values in args[], yields one i32. */
+   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
+   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false); /* i32 (f16, f16), not variadic */
+   LLVMValueRef code = LLVMConstInlineAsm(calltype,
+                                          "v_cvt_pknorm_i16_f16 $0, $1, $2", "=v,v,v", /* $0 = result, $1/$2 = inputs; "v" is presumably the VGPR constraint */
+                                          false, false); /* no side effects, no stack alignment */
+   return LLVMBuildCall(ctx->builder, code, args, 2, "");
+}
+
+LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx,
+                                         LLVMValueRef args[2])
+{ /* Emit v_cvt_pknorm_u16_f16 as inline asm: takes the two f16 values in args[], yields one i32. */
+   LLVMTypeRef param_types[] = {ctx->f16, ctx->f16};
+   LLVMTypeRef calltype = LLVMFunctionType(ctx->i32, param_types, 2, false); /* i32 (f16, f16), not variadic */
+   LLVMValueRef code = LLVMConstInlineAsm(calltype,
+                                          "v_cvt_pknorm_u16_f16 $0, $1, $2", "=v,v,v", /* $0 = result, $1/$2 = inputs; "v" is presumably the VGPR constraint */
+                                          false, false); /* no side effects, no stack alignment */
+   return LLVMBuildCall(ctx->builder, code, args, 2, "");
+}
+
 /* The 8-bit and 10-bit clamping is for HW workarounds. */
 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                  bool hi)
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 756bbebd8f5..2e08a990b2d 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -432,6 +432,8 @@ LLVMValueRef ac_build_image_get_sample_count(struct ac_llvm_context *ctx, LLVMVa
 LLVMValueRef ac_build_cvt_pkrtz_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]);
 LLVMValueRef ac_build_cvt_pknorm_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2]);
 LLVMValueRef ac_build_cvt_pknorm_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2]);
+LLVMValueRef ac_build_cvt_pknorm_i16_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]);
+LLVMValueRef ac_build_cvt_pknorm_u16_f16(struct ac_llvm_context *ctx, LLVMValueRef args[2]);
 LLVMValueRef ac_build_cvt_pk_i16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
                                  bool hi);
 LLVMValueRef ac_build_cvt_pk_u16(struct ac_llvm_context *ctx, LLVMValueRef args[2], unsigned bits,
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index c86339205ee..c1411366871 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -2450,6 +2450,7 @@ static void visit_store_output(struct ac_nir_context *ctx, nir_intrinsic_instr *
       indir_index = get_src(ctx, offset);
 
    switch (ac_get_elem_bits(&ctx->ac, LLVMTypeOf(src))) {
+   case 16:
    case 32:
       break;
    case 64:
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 8a26ac0d06d..a83289ff1dd 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -2260,6 +2260,7 @@ void si_get_ps_epilog_key(struct si_shader *shader, union si_shader_part_key *ke
    struct si_shader_info *info = &shader->selector->info;
    memset(key, 0, sizeof(*key));
    key->ps_epilog.colors_written = info->colors_written;
+   key->ps_epilog.color_types = info->output_color_types;
    key->ps_epilog.writes_z = info->writes_z;
    key->ps_epilog.writes_stencil = info->writes_stencil;
    key->ps_epilog.writes_samplemask = info->writes_samplemask;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index d26f36a4388..da74404008e 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -315,6 +315,13 @@ struct si_compiler_ctx_state {
    bool is_debug_context;
 };
 
+enum si_color_output_type { /* Type of one FS color output; stored as a bit pair per output, so values must fit in 2 bits. */
+   SI_TYPE_ANY32, /* value 0: what remains when no 16-bit type is recorded; treated as already 32-bit */
+   SI_TYPE_FLOAT16, /* recorded for outputs stored as nir_type_float16 */
+   SI_TYPE_INT16, /* recorded for outputs stored as nir_type_int16 */
+   SI_TYPE_UINT16, /* recorded for outputs stored as nir_type_uint16 */
+};
+
 struct si_shader_info {
    shader_info base;
 
@@ -330,6 +337,7 @@ struct si_shader_info {
    ubyte output_usagemask[PIPE_MAX_SHADER_OUTPUTS];
    ubyte output_readmask[PIPE_MAX_SHADER_OUTPUTS];
    ubyte output_streams[PIPE_MAX_SHADER_OUTPUTS];
+   ubyte output_type[PIPE_MAX_SHADER_OUTPUTS]; /* enum nir_alu_type */
 
    ubyte color_interpolate[2];
    ubyte color_interpolate_loc[2];
@@ -341,6 +349,7 @@ struct si_shader_info {
 
    ubyte colors_read; /**< which color components are read by the FS */
    ubyte colors_written;
+   uint16_t output_color_types; /**< Each bit pair is enum si_color_output_type */
    bool color0_writes_all_cbufs; /**< gl_FragColor */
    bool reads_samplemask;   /**< does fragment shader read sample mask? */
    bool reads_tess_factors; /**< If TES reads TESSINNER or TESSOUTER */
@@ -577,6 +586,7 @@ union si_shader_part_key {
    struct {
       struct si_ps_epilog_bits states;
       unsigned colors_written : 8;
+      unsigned color_types : 16;
       unsigned writes_z : 1;
       unsigned writes_stencil : 1;
       unsigned writes_samplemask : 1;
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm.c b/src/gallium/drivers/radeonsi/si_shader_llvm.c
index ab3aed107e3..e5f14cc0c9c 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm.c
@@ -450,8 +450,13 @@ bool si_nir_build_llvm(struct si_shader_context *ctx, struct nir_shader *nir)
 
    const struct si_shader_info *info = &ctx->shader->selector->info;
    for (unsigned i = 0; i < info->num_outputs; i++) {
+      LLVMTypeRef type = ctx->ac.f32;
+
+      if (nir_alu_type_get_type_size(ctx->shader->selector->info.output_type[i]) == 16)
+         type = ctx->ac.f16;
+
       for (unsigned j = 0; j < 4; j++)
-         ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, ctx->ac.f32, "");
+         ctx->abi.outputs[i * 4 + j] = ac_build_alloca_undef(&ctx->ac, type, "");
    }
 
    ac_nir_translate(&ctx->ac, &ctx->abi, &ctx->args, nir);
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
index 37711eefa04..4527a9c4a88 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_ps.c
@@ -209,6 +209,9 @@ static void si_alpha_test(struct si_shader_context *ctx, LLVMValueRef alpha)
       assert(cond);
 
       LLVMValueRef alpha_ref = LLVMGetParam(ctx->main_fn, SI_PARAM_ALPHA_REF);
+      if (LLVMTypeOf(alpha) == ctx->ac.f16)
+         alpha_ref = LLVMBuildFPTrunc(ctx->ac.builder, alpha_ref, ctx->ac.f16, "");
+
       LLVMValueRef alpha_pass = LLVMBuildFCmp(ctx->ac.builder, cond, alpha, alpha_ref, "");
       ac_build_kill_if_false(&ctx->ac, alpha_pass);
    } else {
@@ -233,6 +236,9 @@ static LLVMValueRef si_scale_alpha_by_sample_mask(struct si_shader_context *ctx,
    coverage = LLVMBuildFMul(ctx->ac.builder, coverage,
                             LLVMConstReal(ctx->ac.f32, 1.0 / SI_NUM_SMOOTH_AA_SAMPLES), "");
 
+   if (LLVMTypeOf(alpha) == ctx->ac.f16)
+      coverage = LLVMBuildFPTrunc(ctx->ac.builder, coverage, ctx->ac.f16, "");
+
    return LLVMBuildFMul(ctx->ac.builder, alpha, coverage, "");
 }
 
@@ -241,10 +247,36 @@ struct si_ps_exports {
    struct ac_export_args args[10];
 };
 
+static LLVMValueRef pack_two_16bit(struct ac_llvm_context *ctx, LLVMValueRef args[2])
+{ /* Combine args[0] and args[1] into one value and bitcast it to v2f16; only a bitcast, no numeric conversion. */
+   LLVMValueRef tmp = ac_build_gather_values(ctx, args, 2); /* presumably builds a 2-element vector from args[] */
+   return LLVMBuildBitCast(ctx->builder, tmp, ctx->v2f16, "");
+}
+
+static LLVMValueRef get_color_32bit(struct si_shader_context *ctx, unsigned color_type,
+                                    LLVMValueRef value)
+{ /* Widen one color component to 32 bits according to color_type (an enum si_color_output_type value). */
+   switch (color_type) {
+   case SI_TYPE_FLOAT16:
+      return LLVMBuildFPExt(ctx->ac.builder, value, ctx->ac.f32, ""); /* float extend to f32 */
+   case SI_TYPE_INT16:
+      value = ac_to_integer(&ctx->ac, value);
+      value = LLVMBuildSExt(ctx->ac.builder, value, ctx->ac.i32, ""); /* sign-extend to i32 */
+      return ac_to_float(&ctx->ac, value);
+   case SI_TYPE_UINT16:
+      value = ac_to_integer(&ctx->ac, value);
+      value = LLVMBuildZExt(ctx->ac.builder, value, ctx->ac.i32, ""); /* zero-extend to i32 */
+      return ac_to_float(&ctx->ac, value);
+   case SI_TYPE_ANY32:
+      return value; /* returned unchanged */
+   }
+   return NULL; /* reached only when color_type matches none of the cases above */
+}
+
 /* Initialize arguments for the shader export intrinsic */
 static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValueRef *values,
                                         unsigned cbuf, unsigned compacted_mrt_index,
-                                        struct ac_export_args *args)
+                                        unsigned color_type, struct ac_export_args *args)
 {
    const struct si_shader_key *key = &ctx->shader->key;
    unsigned col_formats = key->part.ps.epilog.spi_shader_col_format;
@@ -289,49 +321,65 @@ static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue
 
    case V_028714_SPI_SHADER_32_R:
       args->enabled_channels = 1; /* writemask */
-      args->out[0] = values[0];
+      args->out[0] = get_color_32bit(ctx, color_type, values[0]);
       break;
 
    case V_028714_SPI_SHADER_32_GR:
       args->enabled_channels = 0x3; /* writemask */
-      args->out[0] = values[0];
-      args->out[1] = values[1];
+      args->out[0] = get_color_32bit(ctx, color_type, values[0]);
+      args->out[1] = get_color_32bit(ctx, color_type, values[1]);
       break;
 
    case V_028714_SPI_SHADER_32_AR:
       if (ctx->screen->info.chip_class >= GFX10) {
          args->enabled_channels = 0x3; /* writemask */
-         args->out[0] = values[0];
-         args->out[1] = values[3];
+         args->out[0] = get_color_32bit(ctx, color_type, values[0]);
+         args->out[1] = get_color_32bit(ctx, color_type, values[3]);
       } else {
          args->enabled_channels = 0x9; /* writemask */
-         args->out[0] = values[0];
-         args->out[3] = values[3];
+         args->out[0] = get_color_32bit(ctx, color_type, values[0]);
+         args->out[3] = get_color_32bit(ctx, color_type, values[3]);
       }
       break;
 
    case V_028714_SPI_SHADER_FP16_ABGR:
-      packf = ac_build_cvt_pkrtz_f16;
+      if (color_type != SI_TYPE_ANY32)
+         packf = pack_two_16bit;
+      else
+         packf = ac_build_cvt_pkrtz_f16;
       break;
 
    case V_028714_SPI_SHADER_UNORM16_ABGR:
-      packf = ac_build_cvt_pknorm_u16;
+      if (color_type != SI_TYPE_ANY32)
+         packf = ac_build_cvt_pknorm_u16_f16;
+      else
+         packf = ac_build_cvt_pknorm_u16;
       break;
 
    case V_028714_SPI_SHADER_SNORM16_ABGR:
-      packf = ac_build_cvt_pknorm_i16;
+      if (color_type != SI_TYPE_ANY32)
+         packf = ac_build_cvt_pknorm_i16_f16;
+      else
+         packf = ac_build_cvt_pknorm_i16;
       break;
 
    case V_028714_SPI_SHADER_UINT16_ABGR:
-      packi = ac_build_cvt_pk_u16;
+      if (color_type != SI_TYPE_ANY32)
+         packf = pack_two_16bit;
+      else
+         packi = ac_build_cvt_pk_u16;
       break;
 
    case V_028714_SPI_SHADER_SINT16_ABGR:
-      packi = ac_build_cvt_pk_i16;
+      if (color_type != SI_TYPE_ANY32)
+         packf = pack_two_16bit;
+      else
+         packi = ac_build_cvt_pk_i16;
       break;
 
    case V_028714_SPI_SHADER_32_ABGR:
-      memcpy(&args->out[0], values, sizeof(values[0]) * 4);
+      for (unsigned i = 0; i < 4; i++)
+         args->out[i] = get_color_32bit(ctx, color_type, values[i]);
       break;
    }
 
@@ -362,7 +410,7 @@ static void si_llvm_init_ps_export_args(struct si_shader_context *ctx, LLVMValue
 
 static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *color, unsigned index,
                                 unsigned compacted_mrt_index, unsigned samplemask_param,
-                                bool is_last, struct si_ps_exports *exp)
+                                bool is_last, unsigned color_type, struct si_ps_exports *exp)
 {
    int i;
 
@@ -373,7 +421,7 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col
 
    /* Alpha to one */
    if (ctx->shader->key.part.ps.epilog.alpha_to_one)
-      color[3] = ctx->ac.f32_1;
+      color[3] = LLVMConstReal(LLVMTypeOf(color[0]), 1);
 
    /* Alpha test */
    if (index == 0 && ctx->shader->key.part.ps.epilog.alpha_func != PIPE_FUNC_ALWAYS)
@@ -392,7 +440,8 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col
 
       /* Get the export arguments, also find out what the last one is. */
       for (c = 0; c <= ctx->shader->key.part.ps.epilog.last_cbuf; c++) {
-         si_llvm_init_ps_export_args(ctx, color, c, compacted_mrt_index, &args[c]);
+         si_llvm_init_ps_export_args(ctx, color, c, compacted_mrt_index,
+                                     color_type, &args[c]);
          if (args[c].enabled_channels) {
             compacted_mrt_index++;
             last = c;
@@ -415,7 +464,8 @@ static bool si_export_mrt_color(struct si_shader_context *ctx, LLVMValueRef *col
       struct ac_export_args args;
 
       /* Export */
-      si_llvm_init_ps_export_args(ctx, color, index, compacted_mrt_index, &args);
+      si_llvm_init_ps_export_args(ctx, color, index, compacted_mrt_index,
+                                  color_type, &args);
       if (is_last) {
          args.valid_mask = 1; /* whether the EXEC mask is valid */
          args.done = 1;       /* DONE bit */
@@ -500,8 +550,17 @@ static void si_llvm_return_fs_outputs(struct ac_shader_abi *abi, unsigned max_ou
       if (!color[i][0])
          continue;
 
-      for (j = 0; j < 4; j++)
-         ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+      if (LLVMTypeOf(color[i][0]) == ctx->ac.f16) {
+         for (j = 0; j < 2; j++) {
+            LLVMValueRef tmp = ac_build_gather_values(&ctx->ac, &color[i][j * 2], 2);
+            tmp = LLVMBuildBitCast(builder, tmp, ctx->ac.f32, "");
+            ret = LLVMBuildInsertValue(builder, ret, tmp, vgpr++, "");
+         }
+         vgpr += 2;
+      } else {
+         for (j = 0; j < 4; j++)
+            ret = LLVMBuildInsertValue(builder, ret, color[i][j], vgpr++, "");
+      }
    }
    if (depth)
       ret = LLVMBuildInsertValue(builder, ret, depth, vgpr++, "");
@@ -868,13 +927,23 @@ void si_llvm_build_ps_epilog(struct si_shader_context *ctx, union si_shader_part
    while (colors_written) {
       LLVMValueRef color[4];
       int output_index = u_bit_scan(&colors_written);
+      unsigned color_type = (key->ps_epilog.color_types >> (output_index * 2)) & 0x3;
 
-      for (i = 0; i < 4; i++)
-         color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
+      if (color_type != SI_TYPE_ANY32) {
+         for (i = 0; i < 4; i++) {
+            color[i] = LLVMGetParam(ctx->main_fn, vgpr + i / 2);
+            color[i] = LLVMBuildBitCast(ctx->ac.builder, color[i], ctx->ac.v2f16, "");
+            color[i] = ac_llvm_extract_elem(&ctx->ac, color[i], i % 2);
+         }
+         vgpr += 4;
+      } else {
+         for (i = 0; i < 4; i++)
+            color[i] = LLVMGetParam(ctx->main_fn, vgpr++);
+      }
 
       if (si_export_mrt_color(ctx, color, output_index, num_compacted_mrts,
                               ctx->args.arg_count - 1,
-                              output_index == last_color_export, &exp))
+                              output_index == last_color_export, color_type, &exp))
          num_compacted_mrts++;
    }
 
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 7b39c6511eb..8ed40441976 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -163,6 +163,11 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
                }
             }
 
+            if (nir_intrinsic_has_type(intr))
+               info->output_type[loc] = nir_intrinsic_type(intr);
+            else
+               info->output_type[loc] = nir_type_float32;
+
             info->output_usagemask[loc] |= mask;
             info->num_outputs = MAX2(info->num_outputs, loc + 1);
 
@@ -181,6 +186,13 @@ static void scan_io_usage(struct si_shader_info *info, nir_intrinsic_instr *intr
                   if (semantic >= FRAG_RESULT_DATA0 && semantic <= FRAG_RESULT_DATA7) {
                      unsigned index = semantic - FRAG_RESULT_DATA0;
                      info->colors_written |= 1 << (index + i);
+
+                     if (nir_intrinsic_type(intr) == nir_type_float16)
+                        info->output_color_types |= SI_TYPE_FLOAT16 << (index * 2);
+                     else if (nir_intrinsic_type(intr) == nir_type_int16)
+                        info->output_color_types |= SI_TYPE_INT16 << (index * 2);
+                     else if (nir_intrinsic_type(intr) == nir_type_uint16)
+                        info->output_color_types |= SI_TYPE_UINT16 << (index * 2);
                   }
                   break;
                }
@@ -678,6 +690,12 @@ static void si_lower_nir(struct si_screen *sscreen, struct nir_shader *nir)
    NIR_PASS_V(nir, nir_lower_load_const_to_scalar);
    NIR_PASS_V(nir, nir_lower_var_copies);
    NIR_PASS_V(nir, nir_opt_access);
+
+   if (nir->info.stage == MESA_SHADER_FRAGMENT &&
+       sscreen->info.has_packed_math_16bit &&
+       sscreen->b.get_shader_param(&sscreen->b, PIPE_SHADER_FRAGMENT, PIPE_SHADER_CAP_FP16))
+      NIR_PASS_V(nir, nir_lower_mediump_outputs);
+
    si_nir_opts(nir, true);
 
    /* Lower large variables that are always constant with load_constant



More information about the mesa-commit mailing list