[Mesa-dev] [PATCH 2/2] SQUASH: intel/fs/bank_conflicts: Roll back to the nineties.

Thu Jun 22 19:20:12 UTC 2017

---
 src/intel/compiler/brw_fs_bank_conflicts.cpp | 274 ++++++++++++++++++---------
 1 file changed, 188 insertions(+), 86 deletions(-)

diff --git a/src/intel/compiler/brw_fs_bank_conflicts.cpp b/src/intel/compiler/brw_fs_bank_conflicts.cpp
index 0225c70..dc88cac 100644
--- a/src/intel/compiler/brw_fs_bank_conflicts.cpp
+++ b/src/intel/compiler/brw_fs_bank_conflicts.cpp
@@ -51,9 +51,6 @@
 #include "brw_fs.h"
 #include "brw_cfg.h"
 
-#include <vector>
-#include <array>
-
 #ifdef __SSE2__
 
 #include <emmintrin.h>
@@ -72,7 +69,9 @@ namespace {
    /**
     * SIMD integer vector data type.
     */
-   typedef std::array<__m128i, 2> vector_type;
+   struct vector_type {
+      __m128i v[2];
+   };
 
    /**
     * Scalar data type matching the representation of a single component of \p
@@ -88,8 +87,7 @@ namespace {
    /**
     * Number of components of a \p vector_type.
     */
-   const unsigned vector_width = 2 * sizeof(vector_type::value_type) /
-                                     sizeof(scalar_type);
+   const unsigned vector_width = 2 * sizeof(__m128i) / sizeof(scalar_type);
 
    /**
     * Set the i-th component of vector \p v to \p x.
@@ -98,7 +96,7 @@ namespace {
    set(vector_type &v, unsigned i, scalar_type x)
    {
       assert(i < vector_width);
-      memcpy((char *)v.data() + i * sizeof(x), &x, sizeof(x));
+      memcpy((char *)v.v + i * sizeof(x), &x, sizeof(x));
    }
 
    /**
@@ -109,7 +107,7 @@ namespace {
    {
       assert(i < vector_width);
       scalar_type x;
-      memcpy(&x, (char *)v.data() + i * sizeof(x), sizeof(x));
+      memcpy(&x, (char *)v.v + i * sizeof(x), sizeof(x));
       return x;
    }
 
@@ -119,10 +117,10 @@ namespace {
    vector_type
    adds(const vector_type &v, const vector_type &w)
    {
-      const vector_type u = {
-         _mm_adds_epi16(v[0], w[0]),
-         _mm_adds_epi16(v[1], w[1])
-      };
+      const vector_type u = {{
+            _mm_adds_epi16(v.v[0], w.v[0]),
+            _mm_adds_epi16(v.v[1], w.v[1])
+         }};
       return u;
    }
 
@@ -132,10 +130,10 @@ namespace {
    vector_type
    subs(const vector_type &v, const vector_type &w)
    {
-      const vector_type u = {
-         _mm_subs_epi16(v[0], w[0]),
-         _mm_subs_epi16(v[1], w[1])
-      };
+      const vector_type u = {{
+            _mm_subs_epi16(v.v[0], w.v[0]),
+            _mm_subs_epi16(v.v[1], w.v[1])
+         }};
       return u;
    }
 
@@ -145,10 +143,10 @@ namespace {
    vector_type
    mask(const vector_type &v, const vector_type &w)
    {
-      const vector_type u = {
-         _mm_and_si128(v[0], w[0]),
-         _mm_and_si128(v[1], w[1])
-      };
+      const vector_type u = {{
+            _mm_and_si128(v.v[0], w.v[0]),
+            _mm_and_si128(v.v[1], w.v[1])
+         }};
       return u;
    }
 
@@ -158,7 +156,7 @@ namespace {
    scalar_type
    sums(const vector_type &v)
    {
-      const __m128i v8 = _mm_adds_epi16(v[0], v[1]);
+      const __m128i v8 = _mm_adds_epi16(v.v[0], v.v[1]);
       const __m128i v4 = _mm_adds_epi16(v8, _mm_shuffle_epi32(v8, 0x4e));
       const __m128i v2 = _mm_adds_epi16(v4, _mm_shuffle_epi32(v4, 0xb1));
       const __m128i v1 = _mm_adds_epi16(v2, _mm_shufflelo_epi16(v2, 0xb1));
@@ -225,7 +223,7 @@ namespace {
    vector_type
    adds(vector_type v, vector_type w)
    {
-      return std::max(INT16_MIN, std::min(INT16_MAX, int(v) + w));
+      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) + w));
    }
 
    /**
@@ -234,7 +232,7 @@ namespace {
    vector_type
    subs(vector_type v, vector_type w)
    {
-      return std::max(INT16_MIN, std::min(INT16_MAX, int(v) - w));
+      return MAX2(INT16_MIN, MIN2(INT16_MAX, int(v) - w));
    }
 
    /**
@@ -258,6 +256,15 @@ namespace {
 
 #endif
 
+/**
+ * Swap lvalues \p x and \p y.  Each is evaluated twice; needs __typeof.
+ */
+#define SWAP(x, y) do {                          \
+      __typeof(y) _swap_tmp = (y);               \
+      (y) = (x);                                 \
+      (x) = _swap_tmp;                           \
+   } while (0)
+
 namespace {
    /**
     * Variable-length vector type intended to represent cycle-count costs for
@@ -267,7 +274,37 @@ namespace {
     * atoms are assigned the same bank b or opposite-parity banks b and b^1).
     * \sa shader_conflict_weight_matrix()
     */
-   typedef std::vector<vector_type> weight_vector_type;
+   struct weight_vector_type {
+      weight_vector_type() : v(NULL), size(0) {}
+
+      /** Zero-initialized vector of \p n scalar components. */
+      weight_vector_type(unsigned n) :
+         v(new vector_type[DIV_ROUND_UP(n, vector_width)]()), size(n) {}
+
+      /** Deep copy.  A default-constructed \p u has a NULL buffer, and
+       *  memcpy() from NULL is undefined even for zero bytes, so skip it. */
+      weight_vector_type(const weight_vector_type &u) :
+         v(u.v ? new vector_type[DIV_ROUND_UP(u.size, vector_width)] : NULL),
+         size(u.size)
+      {
+         if (v)
+            memcpy(v, u.v, DIV_ROUND_UP(u.size, vector_width) * sizeof(*v));
+      }
+
+      ~weight_vector_type() { delete[] v; }
+
+      /** Copy-and-swap: the destructor of \p u frees the old buffer. */
+      weight_vector_type &
+      operator=(weight_vector_type u)
+      {
+         SWAP(v, u.v);
+         SWAP(size, u.size);
+         return *this;
+      }
+
+      vector_type *v;      /**< Packed storage, or NULL when empty. */
+      unsigned size;       /**< Number of scalar components. */
+   };
 
    /**
     * Set the (i, p)-th component of weight vector \p v to \p x.
@@ -275,7 +312,7 @@ namespace {
    void
    set(weight_vector_type &v, unsigned i, unsigned p, scalar_type x)
    {
-      set(v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
+      set(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width, x);
    }
 
    /**
@@ -284,7 +321,7 @@ namespace {
    scalar_type
    get(const weight_vector_type &v, unsigned i, unsigned p)
    {
-      return get(v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
+      return get(v.v[(2 * i + p) / vector_width], (2 * i + p) % vector_width);
    }
 
    /**
@@ -316,13 +353,43 @@ namespace {
        * Create a (for the moment unrestricted) partitioning of a register
        * file of size \p n.  The units are arbitrary.
        */
-      partitioning(unsigned n) {
+      partitioning(unsigned n) :
+         max_reg(n),
+         offsets(new unsigned[n + num_terminator_atoms]),
+         atoms(new unsigned[n + num_terminator_atoms])
+      {
          for (unsigned i = 0; i < n + num_terminator_atoms; i++) {
-            offsets.push_back(i);
-            atoms.push_back(i);
+            offsets[i] = i;
+            atoms[i] = i;
          }
       }
 
+      partitioning(const partitioning &p) :
+         max_reg(p.max_reg),
+         offsets(new unsigned[p.num_atoms() + num_terminator_atoms]),
+         atoms(new unsigned[p.max_reg + num_terminator_atoms])
+      {
+         memcpy(offsets, p.offsets,
+                sizeof(unsigned) * (p.num_atoms() + num_terminator_atoms));
+         memcpy(atoms, p.atoms,
+                sizeof(unsigned) * (p.max_reg + num_terminator_atoms));
+      }
+
+      ~partitioning()
+      {
+         delete[] offsets;
+         delete[] atoms;
+      }
+
+      partitioning &
+      operator=(partitioning p)
+      {
+         SWAP(max_reg, p.max_reg);
+         SWAP(offsets, p.offsets);
+         SWAP(atoms, p.atoms);
+         return *this;
+      }
+
       /**
        * Require register range [reg, reg + n[ to be considered part of the
        * same atom.
@@ -336,7 +403,7 @@ namespace {
           * case that the specified contiguity requirement leads to the fusion
           * (yay) of one or more existing atoms.
           */
-         for (unsigned reg1 = reg + 1; reg1 < atoms.size(); reg1++) {
+         for (unsigned reg1 = reg + 1; reg1 <= max_reg; reg1++) {
             if (offsets[atoms[reg1]] < reg + n) {
                atoms[reg1] = r;
             } else {
@@ -347,11 +414,6 @@ namespace {
                atoms[reg1] = r;
             }
          }
-
-         /* Clean up the scraps if we ended up with less atoms than we started
-          * with.
-          */
-         offsets.erase(offsets.begin() + r + 1, offsets.end());
       }
 
       /**
@@ -388,7 +450,7 @@ namespace {
       unsigned
       num_atoms() const
       {
-         return offsets.size() - num_terminator_atoms;
+         return atoms[max_reg];
       }
 
    private:
@@ -398,8 +460,9 @@ namespace {
        * size_of_atom().
        */
       static const unsigned num_terminator_atoms = 1;
-      std::vector<unsigned> offsets;
-      std::vector<unsigned> atoms;
+      unsigned max_reg;
+      unsigned *offsets;
+      unsigned *atoms;
    };
 
    /**
@@ -455,10 +518,10 @@ namespace {
     * Return the set of GRF atoms that should be left untouched at their
     * original location to avoid violating hardware or software assumptions.
     */
-   std::vector<bool>
+   bool *
    shader_reg_constraints(const fs_visitor *v, const partitioning &p)
    {
-      std::vector<bool> constrained(p.num_atoms());
+      bool *constrained = new bool[p.num_atoms()]();
 
       /* These are read implicitly by some send-message instructions without
        * any indication at the IR level.  Assume they are unsafe to move
@@ -520,12 +583,13 @@ namespace {
     *           meantime optimizing based on Gen9 weights is likely to be more
     *           helpful than not optimizing at all.
     */
-   std::vector<weight_vector_type>
+   weight_vector_type *
    shader_conflict_weight_matrix(const fs_visitor *v, const partitioning &p)
    {
-      std::vector<weight_vector_type> conflicts(p.num_atoms(),
-         weight_vector_type(DIV_ROUND_UP(2 * p.num_atoms(),
-                                               vector_width)));
+      weight_vector_type *conflicts = new weight_vector_type[p.num_atoms()];
+      for (unsigned r = 0; r < p.num_atoms(); r++)
+         conflicts[r] = weight_vector_type(2 * p.num_atoms());
+
       /* Crude approximation of the number of times the current basic block
        * will be executed at run-time.
        */
@@ -575,8 +639,8 @@ namespace {
                 * between atoms r and s.  Note that the weight matrix is
                 * symmetric with respect to indices r and s by construction.
                 */
-               const scalar_type w = std::min(unsigned(max_scalar),
-                  get(conflicts[r], s, p) + cycle_scale);
+               const scalar_type w = MIN2(unsigned(max_scalar),
+                                          get(conflicts[r], s, p) + cycle_scale);
                set(conflicts[r], s, p, w);
                set(conflicts[s], r, p, w);
             }
@@ -592,14 +656,16 @@ namespace {
     * the specified \p conflicts matrix (\sa
     * shader_conflict_weight_matrix()).
     */
-   std::vector<bool>
-   have_any_conflicts(const std::vector<weight_vector_type> &conflicts)
+   bool *
+   have_any_conflicts(const partitioning &p,
+                      const weight_vector_type *conflicts)
    {
-      std::vector<bool> any_conflicts(conflicts.size());
+      bool *any_conflicts = new bool[p.num_atoms()]();
 
-      for (unsigned r = 0; r < conflicts.size(); r++) {
-         for (unsigned s = 0; s < conflicts[r].size(); s++)
-            any_conflicts[r] = any_conflicts[r] || sums(conflicts[r][s]);
+      for (unsigned r = 0; r < p.num_atoms(); r++) {
+         const unsigned m = DIV_ROUND_UP(conflicts[r].size, vector_width);
+         for (unsigned s = 0; s < m; s++)
+            any_conflicts[r] |= sums(conflicts[r].v[s]);
       }
 
       return any_conflicts;
@@ -627,27 +693,60 @@ namespace {
                    const weight_vector_type &bank_mask_n,
                    const weight_vector_type &conflicts)
    {
+      const unsigned m = DIV_ROUND_UP(conflicts.size, vector_width);
       vector_type s_p = {}, s_n = {};
 
-      for (unsigned r = 0; r < conflicts.size(); r++) {
-         s_p = adds(s_p, mask(bank_mask_p[r], conflicts[r]));
-         s_n = adds(s_n, mask(bank_mask_n[r], conflicts[r]));
+      for (unsigned r = 0; r < m; r++) {
+         s_p = adds(s_p, mask(bank_mask_p.v[r], conflicts.v[r]));
+         s_n = adds(s_n, mask(bank_mask_n.v[r], conflicts.v[r]));
       }
 
       return sums(subs(s_p, s_n));
    }
 
    /**
-    * Return an identity permutation of GRF atoms, represented as the start GRF
-    * offset each atom is mapped into.
+    * Register atom permutation, represented as the start GRF offset each atom
+    * is mapped into.
+    */
+   struct permutation {
+      permutation() : v(NULL), size(0) {}
+
+      /** Permutation of \p n atoms with every entry initialized to zero. */
+      permutation(unsigned n) : v(new unsigned[n]()), size(n) {}
+
+      /** Deep copy; skips memcpy() on a NULL buffer, which would be UB. */
+      permutation(const permutation &p) :
+         v(p.v ? new unsigned[p.size] : NULL), size(p.size)
+      {
+         if (v)
+            memcpy(v, p.v, p.size * sizeof(unsigned));
+      }
+
+      ~permutation() { delete[] v; }
+
+      /** Copy-and-swap: the destructor of \p p frees the old buffer. */
+      permutation &
+      operator=(permutation p)
+      {
+         SWAP(v, p.v);
+         SWAP(size, p.size);
+         return *this;
+      }
+
+      unsigned *v;       /**< Start GRF offset each atom maps to, or NULL. */
+      unsigned size;     /**< Number of entries in \p v. */
+   };
+
+   /**
+    * Return an identity permutation of GRF atoms.
     */
-   std::vector<unsigned>
+   permutation
    identity_reg_permutation(const partitioning &p)
    {
-      std::vector<unsigned> map(p.num_atoms());
+      permutation map(p.num_atoms());
 
-      for (unsigned r = 0; r < map.size(); r++)
-         map[r] = p.reg_of_atom(r);
+      for (unsigned r = 0; r < map.size; r++)
+         map.v[r] = p.reg_of_atom(r);
 
       return map;
    }
@@ -671,18 +770,18 @@ namespace {
     * characteristic function of each bank, if you regard it as a set
     * containing all atoms assigned to it according to the \p map array.
     */
-   std::array<weight_vector_type, 4>
-   bank_characteristics(const std::vector<unsigned> &map)
+   weight_vector_type *
+   bank_characteristics(const permutation &map)
    {
-      std::array<weight_vector_type, 4> banks;
+      weight_vector_type *banks = new weight_vector_type[4];
 
-      for (unsigned b = 0; b < banks.size(); b++) {
-         banks[b].resize(DIV_ROUND_UP(2 * map.size(), vector_width));
+      for (unsigned b = 0; b < 4; b++) {
+         banks[b] = weight_vector_type(2 * map.size);
 
-         for (unsigned j = 0; j < map.size(); j++) {
+         for (unsigned j = 0; j < map.size; j++) {
             for (unsigned p = 0; p < 2; p++)
                set(banks[b], j, p,
-                   (b ^ p) == bank_of(map[j]) ? -1 : 0);
+                   (b ^ p) == bank_of(map.v[j]) ? -1 : 0);
          }
       }
 
@@ -697,24 +796,24 @@ namespace {
     * may allow it to do a better job in some cases -- It simply reorders
     * existing atoms in the GRF space without affecting their identity.
     */
-   std::vector<unsigned>
+   permutation
    optimize_reg_permutation(const partitioning &p,
-                            const std::vector<bool> &constrained,
-                            const std::vector<weight_vector_type> &conflicts,
-                            std::vector<unsigned> map)
+                            const bool *constrained,
+                            const weight_vector_type *conflicts,
+                            permutation map)
    {
-      const std::vector<bool> any_conflicts = have_any_conflicts(conflicts);
-      std::array<weight_vector_type, 4> banks = bank_characteristics(map);
+      const bool *any_conflicts = have_any_conflicts(p, conflicts);
+      weight_vector_type *banks = bank_characteristics(map);
 
-      for (unsigned r = 0; r < map.size(); r++) {
-         const unsigned bank_r = bank_of(map[r]);
+      for (unsigned r = 0; r < map.size; r++) {
+         const unsigned bank_r = bank_of(map.v[r]);
 
          if (!constrained[r]) {
             unsigned best_s = r;
             int best_benefit = 0;
 
-            for (unsigned s = 0; s < map.size(); s++) {
-               const unsigned bank_s = bank_of(map[s]);
+            for (unsigned s = 0; s < map.size; s++) {
+               const unsigned bank_s = bank_of(map.v[s]);
 
                if (bank_r != bank_s && !constrained[s] &&
                    p.size_of_atom(r) == p.size_of_atom(s) &&
@@ -731,16 +830,18 @@ namespace {
             }
 
             if (best_s != r) {
-               for (unsigned b = 0; b < banks.size(); b++) {
+               for (unsigned b = 0; b < 4; b++) {
                   for (unsigned p = 0; p < 2; p++)
                      swap(banks[b], r, p, best_s, p);
                }
 
-               std::swap(map[r], map[best_s]);
+               SWAP(map.v[r], map.v[best_s]);
             }
          }
       }
 
+      delete[] banks;
+      delete[] any_conflicts;
       return map;
    }
 
@@ -749,13 +850,12 @@ namespace {
     * return the result.
     */
    fs_reg
-   transform(const partitioning &p, const std::vector<unsigned> &map,
-             fs_reg r)
+   transform(const partitioning &p, const permutation &map, fs_reg r)
    {
       if (r.file == VGRF) {
          const unsigned reg = reg_of(r);
          const unsigned s = p.atom_of_reg(reg);
-         r.nr = map[s] + reg - p.reg_of_atom(s);
+         r.nr = map.v[s] + reg - p.reg_of_atom(s);
          r.offset = r.offset % REG_SIZE;
       }
 
@@ -773,10 +873,10 @@ fs_visitor::opt_bank_conflicts()
       return false;
 
    const partitioning p = shader_reg_partitioning(this);
-   const std::vector<bool> constrained = shader_reg_constraints(this, p);
-   const std::vector<weight_vector_type> conflicts =
+   const bool *constrained = shader_reg_constraints(this, p);
+   const weight_vector_type *conflicts =
       shader_conflict_weight_matrix(this, p);
-   const std::vector<unsigned> map =
+   const permutation map =
       optimize_reg_permutation(p, constrained, conflicts,
                                identity_reg_permutation(p));
 
@@ -787,5 +887,7 @@ fs_visitor::opt_bank_conflicts()
          inst->src[i] = transform(p, map, inst->src[i]);
    }
 
+   delete[] conflicts;
+   delete[] constrained;
    return true;
 }
-- 
2.10.2