Mesa (main): pan/bi: Implement fquantize2f16

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Apr 25 16:49:43 UTC 2022


Module: Mesa
Branch: main
Commit: c9b33fe7dcc5bc2f633f5ad56ddb6463417561bd
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=c9b33fe7dcc5bc2f633f5ad56ddb6463417561bd

Author: Alyssa Rosenzweig <alyssa at collabora.com>
Date:   Sat Apr 23 17:33:47 2022 -0400

pan/bi: Implement fquantize2f16

Implement as f2f32(f2f16(x)) with the conversions in flush-to-zero mode.
Accessing flush-to-zero mode on Bifrost is nontrivial: it is specified
per-clause, rather than per-instruction. I've opted to pipe support for ftz
clauses through the scheduler. This solution has two nice properties:

* It uses the native hardware for flushing subnormals, avoiding extra lowering.
* It's "smart" about scheduling around FTZ requirements, meaning we get good
code generated even for a shader that e.g. quantizes a vector.

With an unrelated scheduler fix, the *V2F32_TO_V2F16/+F16_TO_F32 operation fits
in a single tuple, minimizing the overhead of the special FTZ clause.

We'll have to do something a bit different for Valhall (FLUSH.f32), but we'll
worry about that when we actually have PanVK brought up on Valhall.

Fixes dEQP-VK.spirv_assembly.instruction.compute.opquantize.*

Signed-off-by: Alyssa Rosenzweig <alyssa at collabora.com>
Reviewed-by: Jason Ekstrand <jason.ekstrand at collabora.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/16123>

---

 src/panfrost/bifrost/ISA.xml              |  3 ++
 src/panfrost/bifrost/bi_builder.h.py      |  2 +-
 src/panfrost/bifrost/bi_pack.c            |  1 +
 src/panfrost/bifrost/bi_schedule.c        | 49 +++++++++++++++++++++++++++++++
 src/panfrost/bifrost/bifrost_compile.c    |  9 ++++++
 src/panfrost/bifrost/compiler.h           |  4 +++
 src/panfrost/ci/deqp-panfrost-g52-vk.toml |  1 +
 7 files changed, 68 insertions(+), 1 deletion(-)

diff --git a/src/panfrost/bifrost/ISA.xml b/src/panfrost/bifrost/ISA.xml
index 5970f7da675..55fb080ff54 100644
--- a/src/panfrost/bifrost/ISA.xml
+++ b/src/panfrost/bifrost/ISA.xml
@@ -2429,6 +2429,7 @@
       <opt>rtz</opt>
       <opt>rtna</opt>
     </mod>
+    <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
     <derived start="6" size="1">
       <and>
         <eq left="abs0" right="#none"/>
@@ -3870,6 +3871,7 @@
       <opt>h0</opt>
       <opt>h1</opt>
     </mod>
+    <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
   </ins>
 
   <ins name="+F16_TO_S32">
@@ -7938,6 +7940,7 @@
       <opt>rtz</opt>
       <opt>rtna</opt>
     </mod>
+    <mod name="ftz" start="9" size="1" opt="ftz" pseudo="true"/>
     <derived start="6" size="1">
       <and>
         <eq left="abs0" right="#none"/>
diff --git a/src/panfrost/bifrost/bi_builder.h.py b/src/panfrost/bifrost/bi_builder.h.py
index 5ba37818264..a41edb66750 100644
--- a/src/panfrost/bifrost/bi_builder.h.py
+++ b/src/panfrost/bifrost/bi_builder.h.py
@@ -21,7 +21,7 @@
 
 SKIP = set(["lane", "lane_dest", "lanes", "lanes", "replicate", "swz", "widen",
     "swap", "neg", "abs", "not", "sign", "extend", "divzero", "clamp", "sem",
-    "not_result", "skip", "round"])
+    "not_result", "skip", "round", "ftz"])
 
 TEMPLATE = """
 #ifndef _BI_BUILDER_H_
diff --git a/src/panfrost/bifrost/bi_pack.c b/src/panfrost/bifrost/bi_pack.c
index e400adfaf74..5cef648bf5e 100644
--- a/src/panfrost/bifrost/bi_pack.c
+++ b/src/panfrost/bifrost/bi_pack.c
@@ -57,6 +57,7 @@ bi_pack_header(bi_clause *clause, bi_clause *next_1, bi_clause *next_2)
                 .dependency_slot = clause->scoreboard_id,
                 .message_type = clause->message_type,
                 .next_message_type = next_1 ? next_1->message_type : 0,
+                .flush_to_zero = clause->ftz ? BIFROST_FTZ_ALWAYS : BIFROST_FTZ_DISABLE
         };
 
         uint64_t u = 0;
diff --git a/src/panfrost/bifrost/bi_schedule.c b/src/panfrost/bifrost/bi_schedule.c
index 01bf6299be0..a3e176b0a15 100644
--- a/src/panfrost/bifrost/bi_schedule.c
+++ b/src/panfrost/bifrost/bi_schedule.c
@@ -107,6 +107,17 @@ struct bi_const_state {
         unsigned word_idx;
 };
 
+enum bi_ftz_state {
+        /* No flush-to-zero state assigned yet */
+        BI_FTZ_STATE_NONE,
+
+        /* Never flush-to-zero */
+        BI_FTZ_STATE_DISABLE,
+
+        /* Always flush-to-zero */
+        BI_FTZ_STATE_ENABLE,
+};
+
 struct bi_clause_state {
         /* Has a message-passing instruction already been assigned? */
         bool message;
@@ -118,6 +129,9 @@ struct bi_clause_state {
 
         unsigned tuple_count;
         struct bi_const_state consts[8];
+
+        /* Numerical state of the clause */
+        enum bi_ftz_state ftz;
 };
 
 /* Determines messsage type by checking the table and a few special cases. Only
@@ -1027,6 +1041,28 @@ bi_write_count(bi_instr *instr, uint64_t live_after_temp)
         return count;
 }
 
+/*
+ * Test if an instruction requires flush-to-zero mode. Currently only supported
+ * for f16<-->f32 conversions to implement fquantize2f16
+ */
+static bool
+bi_needs_ftz(bi_instr *I)
+{
+        return (I->op == BI_OPCODE_F16_TO_F32 ||
+                I->op == BI_OPCODE_V2F32_TO_V2F16) && I->ftz;
+}
+
+/*
+ * Test if an instruction would be numerically incompatible with the clause. At
+ * present we only consider flush-to-zero modes.
+ */
+static bool
+bi_numerically_incompatible(struct bi_clause_state *clause, bi_instr *instr)
+{
+        return (clause->ftz != BI_FTZ_STATE_NONE) &&
+               ((clause->ftz == BI_FTZ_STATE_ENABLE) != bi_needs_ftz(instr));
+}
+
 /* Instruction placement entails two questions: what subset of instructions in
  * the block can legally be scheduled? and of those which is the best? That is,
  * we seek to maximize a cost function on a subset of the worklist satisfying a
@@ -1056,6 +1092,10 @@ bi_instr_schedulable(bi_instr *instr,
         if (bi_must_not_last(instr) && tuple->last)
                 return false;
 
+        /* Numerical properties must be compatible with the clause */
+        if (bi_numerically_incompatible(clause, instr))
+                return false;
+
         /* Message-passing instructions are not guaranteed write within the
          * same clause (most likely they will not), so if a later instruction
          * in the clause accesses the destination, the message-passing
@@ -1220,6 +1260,13 @@ bi_pop_instr(struct bi_clause_state *clause, struct bi_tuple_state *tuple,
                 if (bi_tuple_is_new_src(instr, &tuple->reg, s))
                         tuple->reg.reads[tuple->reg.nr_reads++] = instr->src[s];
         }
+
+        /* This could be optimized to allow pairing integer instructions with
+         * special flush-to-zero instructions, but punting on this until we have
+         * a workload that cares.
+         */
+        clause->ftz = bi_needs_ftz(instr) ? BI_FTZ_STATE_ENABLE :
+                                            BI_FTZ_STATE_DISABLE;
 }
 
 /* Choose the best instruction and pop it off the worklist. Returns NULL if no
@@ -1865,6 +1912,8 @@ bi_schedule_clause(bi_context *ctx, bi_block *block, struct bi_worklist st, uint
         clause->next_clause_prefetch = !last || (last->op != BI_OPCODE_JUMP);
         clause->block = block;
 
+        clause->ftz = (clause_state.ftz == BI_FTZ_STATE_ENABLE);
+
         /* We emit in reverse and emitted to the back of the tuples array, so
          * move it up front for easy indexing */
         memmove(clause->tuples,
diff --git a/src/panfrost/bifrost/bifrost_compile.c b/src/panfrost/bifrost/bifrost_compile.c
index 83507f477a0..871333630f2 100644
--- a/src/panfrost/bifrost/bifrost_compile.c
+++ b/src/panfrost/bifrost/bifrost_compile.c
@@ -2357,6 +2357,15 @@ bi_emit_alu(bi_builder *b, nir_alu_instr *instr)
                 bi_f16_to_f32_to(b, dst, s0);
                 break;
 
+        case nir_op_fquantize2f16:
+        {
+                bi_instr *f16 = bi_v2f32_to_v2f16_to(b, bi_temp(b->shader), s0, s0);
+                bi_instr *f32 = bi_f16_to_f32_to(b, dst, bi_half(f16->dest[0], false));
+
+                f16->ftz = f32->ftz = true;
+                break;
+        }
+
         case nir_op_f2i32:
                 if (src_sz == 32)
                         bi_f32_to_s32_to(b, dst, s0);
diff --git a/src/panfrost/bifrost/compiler.h b/src/panfrost/bifrost/compiler.h
index 3675965fd15..98ec840c4ca 100644
--- a/src/panfrost/bifrost/compiler.h
+++ b/src/panfrost/bifrost/compiler.h
@@ -465,6 +465,7 @@ typedef struct {
                 struct {
                         enum bi_special special; /* FADD_RSCALE, FMA_RSCALE */
                         enum bi_round round; /* FMA, converts, FADD, _RSCALE, etc */
+                        bool ftz; /* Flush-to-zero for F16_TO_F32 */
                 };
 
                 struct {
@@ -635,6 +636,9 @@ typedef struct {
 
         /* Discard helper threads */
         bool td;
+
+        /* Should flush-to-zero mode be enabled for this clause? */
+        bool ftz;
 } bi_clause;
 
 #define BI_NUM_SLOTS 8
diff --git a/src/panfrost/ci/deqp-panfrost-g52-vk.toml b/src/panfrost/ci/deqp-panfrost-g52-vk.toml
index 6858f2460e6..a223c499bc5 100644
--- a/src/panfrost/ci/deqp-panfrost-g52-vk.toml
+++ b/src/panfrost/ci/deqp-panfrost-g52-vk.toml
@@ -23,5 +23,6 @@ include = [
     "dEQP-VK.image.load_store.with_format.*",
     "dEQP-VK.pipeline.input_assembly.*",
     "dEQP-VK.pipeline.sampler.view_type.*.format.r*.address_modes.all_mode_clamp_to_border*",
+    "dEQP-VK.spirv_assembly.instruction.compute.opquantize.*",
     "dEQP-VK.ssbo.layout.single_basic_type.*",
 ]



More information about the mesa-commit mailing list