Mesa (staging/20.1): intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages

GitLab Mirror gitlab-mirror at kemper.freedesktop.org
Mon Aug 3 18:32:10 UTC 2020


Module: Mesa
Branch: staging/20.1
Commit: 99e7735280ab71fd144f01e489d1b79dbd8457b2
URL:    http://cgit.freedesktop.org/mesa/mesa/commit/?id=99e7735280ab71fd144f01e489d1b79dbd8457b2

Author: Jason Ekstrand <jason at jlekstrand.net>
Date:   Wed Apr 29 17:14:58 2020 -0500

intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages

We don't care about full IA coherency since we always have the
opportunity in GL or Vulkan to flush the data cache.  Using IA-coherent
mode is likely just making A64 access slower than it needs to be.

Reviewed-by: Caio Marcelo de Oliveira Filho <caio.oliveira at intel.com>
Reviewed-by: Kenneth Graunke <kenneth at whitecape.org>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/4819>
(cherry picked from commit 4985e380dd776ac65c4ae5627138211f9d9f03ce)

---

 .pick_status.json                   |  2 +-
 src/intel/compiler/brw_eu.h         | 12 ++++++++----
 src/intel/compiler/brw_eu_defines.h | 37 +++++++++++++++++++++++++++++++------
 3 files changed, 40 insertions(+), 11 deletions(-)

diff --git a/.pick_status.json b/.pick_status.json
index 85d1aecdf4d..362576fd6a6 100644
--- a/.pick_status.json
+++ b/.pick_status.json
@@ -32926,7 +32926,7 @@
         "description": "intel/eu: Use non-coherent mode (BTI=253) for stateless A64 messages",
         "nominated": false,
         "nomination_type": null,
-        "resolution": 4,
+        "resolution": 1,
         "master_sha": null,
         "because_sha": null
     },
diff --git a/src/intel/compiler/brw_eu.h b/src/intel/compiler/brw_eu.h
index 7ae17dbdd37..262c527b2e9 100644
--- a/src/intel/compiler/brw_eu.h
+++ b/src/intel/compiler/brw_eu.h
@@ -744,7 +744,8 @@ brw_dp_a64_untyped_surface_rw_desc(const struct gen_device_info *devinfo,
       SET_BITS(brw_mdc_cmask(num_channels), 3, 0) |
       SET_BITS(simd_mode, 5, 4);
 
-   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
+   return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
+                      msg_type, msg_control);
 }
 
 /**
@@ -782,7 +783,8 @@ brw_dp_a64_byte_scattered_rw_desc(const struct gen_device_info *devinfo,
       SET_BITS(brw_mdc_a64_ds(bit_size / 8), 3, 2) |
       SET_BITS(exec_size == 16, 4, 4);
 
-   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
+   return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
+                      msg_type, msg_control);
 }
 
 static inline uint32_t
@@ -803,7 +805,8 @@ brw_dp_a64_untyped_atomic_desc(const struct gen_device_info *devinfo,
       SET_BITS(bit_size == 64, 4, 4) |
       SET_BITS(response_expected, 5, 5);
 
-   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
+   return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
+                      msg_type, msg_control);
 }
 
 static inline uint32_t
@@ -822,7 +825,8 @@ brw_dp_a64_untyped_atomic_float_desc(const struct gen_device_info *devinfo,
       SET_BITS(atomic_op, 1, 0) |
       SET_BITS(response_expected, 5, 5);
 
-   return brw_dp_desc(devinfo, BRW_BTI_STATELESS, msg_type, msg_control);
+   return brw_dp_desc(devinfo, GEN8_BTI_STATELESS_NON_COHERENT,
+                      msg_type, msg_control);
 }
 
 static inline uint32_t
diff --git a/src/intel/compiler/brw_eu_defines.h b/src/intel/compiler/brw_eu_defines.h
index 742e12c2830..d63360222ec 100644
--- a/src/intel/compiler/brw_eu_defines.h
+++ b/src/intel/compiler/brw_eu_defines.h
@@ -1419,12 +1419,37 @@ enum brw_message_target {
 /* Dataport special binding table indices: */
 #define BRW_BTI_STATELESS                255
 #define GEN7_BTI_SLM                     254
-/* Note that on Gen8+ BTI 255 was redefined to be IA-coherent according to the
- * hardware spec, however because the DRM sets bit 4 of HDC_CHICKEN0 on BDW,
- * CHV and at least some pre-production steppings of SKL due to
- * WaForceEnableNonCoherent, HDC memory access may have been overridden by the
- * kernel to be non-coherent (matching the behavior of the same BTI on
- * pre-Gen8 hardware) and BTI 255 may actually be an alias for BTI 253.
+
+#define HSW_BTI_STATELESS_LOCALLY_COHERENT 255
+#define HSW_BTI_STATELESS_NON_COHERENT 253
+#define HSW_BTI_STATELESS_GLOBALLY_COHERENT 252
+#define HSW_BTI_STATELESS_LLC_COHERENT 251
+#define HSW_BTI_STATELESS_L3_UNCACHED 250
+
+/* The hardware docs are a bit contradictory here.  On Haswell, where they
+ * first added cache ability control, there were 5 different cache modes (see
+ * HSW_BTI_STATELESS_* above).  On Broadwell, they reduced to two:
+ *
+ *  - IA-Coherent (BTI=255): Coherent within Gen and coherent within the
+ *    entire IA cache memory hierarchy.
+ *
+ *  - Non-Coherent (BTI=253): Coherent within Gen, same cache type.
+ *
+ * Information about stateless cache coherency can be found in the "A32
+ * Stateless" section of the "3D Media GPGPU" volume of the PRM for each
+ * hardware generation.
+ *
+ * Unfortunately, the docs for MDC_STATELESS appear to have been copied and
+ * pasted from Haswell and give the Haswell definitions for the BTI values of
+ * 255 and 253 including a warning about accessing 253 surfaces from multiple
+ * threads.  This seems to be a copy+paste error and the definitions from the
+ * "A32 Stateless" section should be trusted instead.
+ *
+ * Note that because the DRM sets bit 4 of HDC_CHICKEN0 on BDW, CHV and at
+ * least some pre-production steppings of SKL due to WaForceEnableNonCoherent,
+ * HDC memory access may have been overridden by the kernel to be non-coherent
+ * (matching the behavior of the same BTI on pre-Gen8 hardware) and BTI 255
+ * may actually be an alias for BTI 253.
  */
 #define GEN8_BTI_STATELESS_IA_COHERENT   255
 #define GEN8_BTI_STATELESS_NON_COHERENT  253



More information about the mesa-commit mailing list