[Mesa-dev] [PATCH] gallium/swr: update rasterizer (532172)
Tim Rowley
timothy.o.rowley at intel.com
Tue Mar 22 19:45:48 UTC 2016
Highlights include:
* code style fixes
* start removing win32 types
* switch DC/DS rings to ringbuffer data structure
* rdtsc bucket support for shaders
* address some coverity issues
* user clip planes
* global arena
* support llvm-svn
---
src/gallium/docs/source/drivers/openswr/knobs.rst | 18 +-
src/gallium/drivers/swr/Makefile.sources-arch | 2 +-
.../drivers/swr/rasterizer/common/containers.hpp | 270 +++----
src/gallium/drivers/swr/rasterizer/common/os.h | 44 +-
.../swr/rasterizer/common/rdtsc_buckets.cpp | 18 +-
.../drivers/swr/rasterizer/common/rdtsc_buckets.h | 9 +-
.../swr/rasterizer/common/rdtsc_buckets_shared.h | 4 +-
.../drivers/swr/rasterizer/common/simdintrin.h | 805 +++++++++++++--------
src/gallium/drivers/swr/rasterizer/core/api.cpp | 308 ++++----
src/gallium/drivers/swr/rasterizer/core/api.h | 56 +-
src/gallium/drivers/swr/rasterizer/core/arena.cpp | 166 -----
src/gallium/drivers/swr/rasterizer/core/arena.h | 310 +++++++-
.../drivers/swr/rasterizer/core/backend.cpp | 241 ++----
src/gallium/drivers/swr/rasterizer/core/backend.h | 173 ++++-
src/gallium/drivers/swr/rasterizer/core/clip.cpp | 3 +
src/gallium/drivers/swr/rasterizer/core/clip.h | 98 ++-
src/gallium/drivers/swr/rasterizer/core/context.h | 45 +-
.../drivers/swr/rasterizer/core/depthstencil.h | 6 +-
src/gallium/drivers/swr/rasterizer/core/fifo.hpp | 6 +-
.../swr/rasterizer/core/format_conversion.h | 4 +-
.../drivers/swr/rasterizer/core/format_types.h | 32 +-
.../drivers/swr/rasterizer/core/frontend.cpp | 95 ++-
src/gallium/drivers/swr/rasterizer/core/frontend.h | 13 +-
.../drivers/swr/rasterizer/core/knobs_init.h | 5 +
src/gallium/drivers/swr/rasterizer/core/pa.h | 92 +--
.../drivers/swr/rasterizer/core/rasterizer.cpp | 78 +-
.../drivers/swr/rasterizer/core/ringbuffer.h | 102 +++
src/gallium/drivers/swr/rasterizer/core/state.h | 10 +-
.../drivers/swr/rasterizer/core/threads.cpp | 222 +-----
src/gallium/drivers/swr/rasterizer/core/threads.h | 6 +-
.../drivers/swr/rasterizer/core/tilemgr.cpp | 298 +++++++-
src/gallium/drivers/swr/rasterizer/core/tilemgr.h | 121 +---
src/gallium/drivers/swr/rasterizer/core/utils.cpp | 5 +
src/gallium/drivers/swr/rasterizer/core/utils.h | 51 +-
.../drivers/swr/rasterizer/jitter/JitManager.cpp | 4 +
.../drivers/swr/rasterizer/jitter/JitManager.h | 8 +-
.../drivers/swr/rasterizer/jitter/blend_jit.cpp | 8 +-
.../drivers/swr/rasterizer/jitter/builder.cpp | 16 +-
.../drivers/swr/rasterizer/jitter/builder.h | 6 +
.../drivers/swr/rasterizer/jitter/builder_misc.cpp | 172 ++++-
.../drivers/swr/rasterizer/jitter/builder_misc.h | 8 +-
.../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 72 +-
.../jitter/scripts/gen_llvm_ir_macros.py | 21 +-
.../rasterizer/jitter/scripts/gen_llvm_types.py | 2 +-
.../swr/rasterizer/jitter/streamout_jit.cpp | 8 +-
.../drivers/swr/rasterizer/memory/ClearTile.cpp | 14 +-
.../drivers/swr/rasterizer/memory/Convert.h | 14 +-
.../drivers/swr/rasterizer/memory/tilingtraits.h | 58 +-
.../drivers/swr/rasterizer/scripts/gen_knobs.py | 2 +-
.../drivers/swr/rasterizer/scripts/knob_defs.py | 73 +-
.../rasterizer/scripts/templates/knobs.template | 8 +-
src/gallium/drivers/swr/swr_context.cpp | 1 -
52 files changed, 2464 insertions(+), 1747 deletions(-)
delete mode 100644 src/gallium/drivers/swr/rasterizer/core/arena.cpp
create mode 100644 src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
diff --git a/src/gallium/docs/source/drivers/openswr/knobs.rst b/src/gallium/docs/source/drivers/openswr/knobs.rst
index 06f228a..c26581d 100644
--- a/src/gallium/docs/source/drivers/openswr/knobs.rst
+++ b/src/gallium/docs/source/drivers/openswr/knobs.rst
@@ -4,10 +4,6 @@
OpenSWR has a number of environment variables which control its
operation, in addition to the normal Mesa and gallium controls.
-.. envvar:: KNOB_ENABLE_ASSERT_DIALOGS <bool> (true)
-
-Use dialogs when asserts fire. Asserts are only enabled in debug builds
-
.. envvar:: KNOB_SINGLE_THREADED <bool> (false)
If enabled will perform all rendering on the API thread. This is useful mainly for debugging purposes.
@@ -52,7 +48,7 @@ Frame at which to stop saving buckets data. NOTE: KNOB_ENABLE_RDTSC must be ena
Number of spin-loop iterations worker threads will perform before going to sleep when waiting for work
-.. envvar:: KNOB_MAX_DRAWS_IN_FLIGHT <uint32_t> (160)
+.. envvar:: KNOB_MAX_DRAWS_IN_FLIGHT <uint32_t> (96)
Maximum number of draws outstanding before API thread blocks.
@@ -64,18 +60,6 @@ Maximum primitives in a single Draw(). Larger primitives are split into smaller
Maximum primitives in a single Draw() with tessellation enabled. Larger primitives are split into smaller Draw calls. Should be a multiple of (vectorWidth).
-.. envvar:: KNOB_MAX_FRAC_ODD_TESS_FACTOR <float> (63.0f)
-
-(DEBUG) Maximum tessellation factor for fractional-odd partitioning.
-
-.. envvar:: KNOB_MAX_FRAC_EVEN_TESS_FACTOR <float> (64.0f)
-
-(DEBUG) Maximum tessellation factor for fractional-even partitioning.
-
-.. envvar:: KNOB_MAX_INTEGER_TESS_FACTOR <uint32_t> (64)
-
-(DEBUG) Maximum tessellation factor for integer partitioning.
-
.. envvar:: KNOB_BUCKETS_ENABLE_THREADVIZ <bool> (false)
Enable threadviz output.
diff --git a/src/gallium/drivers/swr/Makefile.sources-arch b/src/gallium/drivers/swr/Makefile.sources-arch
index 6c105f4..a04b120 100644
--- a/src/gallium/drivers/swr/Makefile.sources-arch
+++ b/src/gallium/drivers/swr/Makefile.sources-arch
@@ -59,7 +59,6 @@ COMMON_CXX_SOURCES := \
CORE_CXX_SOURCES := \
rasterizer/core/api.cpp \
rasterizer/core/api.h \
- rasterizer/core/arena.cpp \
rasterizer/core/arena.h \
rasterizer/core/backend.cpp \
rasterizer/core/backend.h \
@@ -83,6 +82,7 @@ CORE_CXX_SOURCES := \
rasterizer/core/rasterizer.h \
rasterizer/core/rdtsc_core.cpp \
rasterizer/core/rdtsc_core.h \
+ rasterizer/core/ringbuffer.h \
rasterizer/core/state.h \
rasterizer/core/threads.cpp \
rasterizer/core/threads.h \
diff --git a/src/gallium/drivers/swr/rasterizer/common/containers.hpp b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
index bc96c5f..f3c0597 100644
--- a/src/gallium/drivers/swr/rasterizer/common/containers.hpp
+++ b/src/gallium/drivers/swr/rasterizer/common/containers.hpp
@@ -33,137 +33,137 @@ namespace SWRL
template <typename T, int NUM_ELEMENTS>
struct UncheckedFixedVector
{
- UncheckedFixedVector() : mSize(0)
- {
- }
-
- UncheckedFixedVector(std::size_t size, T const& exemplar)
- {
- this->mSize = 0;
- for (std::size_t i = 0; i < size; ++i)
- this->push_back(exemplar);
- }
-
- template <typename Iter>
- UncheckedFixedVector(Iter fst, Iter lst)
- {
- this->mSize = 0;
- for ( ; fst != lst; ++fst)
- this->push_back(*fst);
- }
-
- UncheckedFixedVector(UncheckedFixedVector const& UFV)
- {
- this->mSize = 0;
- for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
- (*this)[i] = UFV[i];
- this->mSize = UFV.size();
- }
-
- UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
- {
- for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
- (*this)[i] = UFV[i];
- this->mSize = UFV.size();
- return *this;
- }
-
- T* begin() { return &this->mElements[0]; }
- T* end() { return &this->mElements[0] + this->mSize; }
- T const* begin() const { return &this->mElements[0]; }
- T const* end() const { return &this->mElements[0] + this->mSize; }
-
- friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
- {
- if (L.size() != R.size()) return false;
- for (std::size_t i = 0, N = L.size(); i < N; ++i)
- {
- if (L[i] != R[i]) return false;
- }
- return true;
- }
-
- friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
- {
- if (L.size() != R.size()) return true;
- for (std::size_t i = 0, N = L.size(); i < N; ++i)
- {
- if (L[i] != R[i]) return true;
- }
- return false;
- }
-
- T& operator[](std::size_t idx)
- {
- return this->mElements[idx];
- }
- T const& operator[](std::size_t idx) const
- {
- return this->mElements[idx];
- }
- void push_back(T const& t)
- {
- this->mElements[this->mSize] = t;
- ++this->mSize;
- }
- void pop_back()
- {
- SWR_ASSERT(this->mSize > 0);
- --this->mSize;
- }
- T& back()
- {
- return this->mElements[this->mSize-1];
- }
- T const& back() const
- {
- return this->mElements[this->mSize-1];
- }
- bool empty() const
- {
- return this->mSize == 0;
- }
- std::size_t size() const
- {
- return this->mSize;
- }
- void resize(std::size_t sz)
- {
- this->mSize = sz;
- }
- void clear()
- {
- this->resize(0);
- }
+ UncheckedFixedVector() : mSize(0)
+ {
+ }
+
+ UncheckedFixedVector(std::size_t size, T const& exemplar)
+ {
+ this->mSize = 0;
+ for (std::size_t i = 0; i < size; ++i)
+ this->push_back(exemplar);
+ }
+
+ template <typename Iter>
+ UncheckedFixedVector(Iter fst, Iter lst)
+ {
+ this->mSize = 0;
+ for ( ; fst != lst; ++fst)
+ this->push_back(*fst);
+ }
+
+ UncheckedFixedVector(UncheckedFixedVector const& UFV)
+ {
+ this->mSize = 0;
+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+ (*this)[i] = UFV[i];
+ this->mSize = UFV.size();
+ }
+
+ UncheckedFixedVector& operator=(UncheckedFixedVector const& UFV)
+ {
+ for (std::size_t i = 0, N = UFV.size(); i < N; ++i)
+ (*this)[i] = UFV[i];
+ this->mSize = UFV.size();
+ return *this;
+ }
+
+ T* begin() { return &this->mElements[0]; }
+ T* end() { return &this->mElements[0] + this->mSize; }
+ T const* begin() const { return &this->mElements[0]; }
+ T const* end() const { return &this->mElements[0] + this->mSize; }
+
+ friend bool operator==(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+ {
+ if (L.size() != R.size()) return false;
+ for (std::size_t i = 0, N = L.size(); i < N; ++i)
+ {
+ if (L[i] != R[i]) return false;
+ }
+ return true;
+ }
+
+ friend bool operator!=(UncheckedFixedVector const& L, UncheckedFixedVector const& R)
+ {
+ if (L.size() != R.size()) return true;
+ for (std::size_t i = 0, N = L.size(); i < N; ++i)
+ {
+ if (L[i] != R[i]) return true;
+ }
+ return false;
+ }
+
+ T& operator[](std::size_t idx)
+ {
+ return this->mElements[idx];
+ }
+ T const& operator[](std::size_t idx) const
+ {
+ return this->mElements[idx];
+ }
+ void push_back(T const& t)
+ {
+ this->mElements[this->mSize] = t;
+ ++this->mSize;
+ }
+ void pop_back()
+ {
+ SWR_ASSERT(this->mSize > 0);
+ --this->mSize;
+ }
+ T& back()
+ {
+ return this->mElements[this->mSize-1];
+ }
+ T const& back() const
+ {
+ return this->mElements[this->mSize-1];
+ }
+ bool empty() const
+ {
+ return this->mSize == 0;
+ }
+ std::size_t size() const
+ {
+ return this->mSize;
+ }
+ void resize(std::size_t sz)
+ {
+ this->mSize = sz;
+ }
+ void clear()
+ {
+ this->resize(0);
+ }
private:
- std::size_t mSize;
- T mElements[NUM_ELEMENTS];
+ std::size_t mSize{ 0 };
+ T mElements[NUM_ELEMENTS];
};
template <typename T, int NUM_ELEMENTS>
struct FixedStack : UncheckedFixedVector<T, NUM_ELEMENTS>
{
- FixedStack() {}
-
- void push(T const& t)
- {
- this->push_back(t);
- }
-
- void pop()
- {
- this->pop_back();
- }
-
- T& top()
- {
- return this->back();
- }
-
- T const& top() const
- {
- return this->back();
- }
+ FixedStack() {}
+
+ void push(T const& t)
+ {
+ this->push_back(t);
+ }
+
+ void pop()
+ {
+ this->pop_back();
+ }
+
+ T& top()
+ {
+ return this->back();
+ }
+
+ T const& top() const
+ {
+ return this->back();
+ }
};
template <typename T>
@@ -190,16 +190,16 @@ namespace std
template <typename T, int N>
struct hash<SWRL::UncheckedFixedVector<T, N>>
{
- size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
- {
- if (v.size() == 0) return 0;
- std::hash<T> H;
- size_t x = H(v[0]);
- if (v.size() == 1) return x;
- for (size_t i = 1; i < v.size(); ++i)
- x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
- return x;
- }
+ size_t operator() (SWRL::UncheckedFixedVector<T, N> const& v) const
+ {
+ if (v.size() == 0) return 0;
+ std::hash<T> H;
+ size_t x = H(v[0]);
+ if (v.size() == 1) return x;
+ for (size_t i = 1; i < v.size(); ++i)
+ x ^= H(v[i]) + 0x9e3779b9 + (x<<6) + (x>>2);
+ return x;
+ }
};
diff --git a/src/gallium/drivers/swr/rasterizer/common/os.h b/src/gallium/drivers/swr/rasterizer/common/os.h
index 522ae0d..5794f3f 100644
--- a/src/gallium/drivers/swr/rasterizer/common/os.h
+++ b/src/gallium/drivers/swr/rasterizer/common/os.h
@@ -47,16 +47,18 @@
#define DEBUGBREAK __debugbreak()
#define PRAGMA_WARNING_PUSH_DISABLE(...) \
- __pragma(warning(push));\
- __pragma(warning(disable:__VA_ARGS__));
+ __pragma(warning(push));\
+ __pragma(warning(disable:__VA_ARGS__));
#define PRAGMA_WARNING_POP() __pragma(warning(pop))
#if defined(_WIN32)
#if defined(_WIN64)
+#define BitScanReverseSizeT BitScanReverse64
#define BitScanForwardSizeT BitScanForward64
#define _mm_popcount_sizeT _mm_popcnt_u64
#else
+#define BitScanReverseSizeT BitScanReverse
#define BitScanForwardSizeT BitScanForward
#define _mm_popcount_sizeT _mm_popcnt_u32
#endif
@@ -68,29 +70,20 @@
#include <stdlib.h>
#include <string.h>
-#include <X11/Xmd.h>
#include <x86intrin.h>
#include <stdint.h>
#include <sys/types.h>
#include <unistd.h>
#include <sys/stat.h>
+#include <stdio.h>
-typedef void VOID;
+typedef void VOID;
typedef void* LPVOID;
-typedef CARD8 BOOL;
-typedef wchar_t WCHAR;
-typedef uint16_t UINT16;
-typedef int INT;
-typedef unsigned int UINT;
-typedef uint32_t UINT32;
-typedef uint64_t UINT64;
-typedef int64_t INT64;
-typedef void* HANDLE;
-typedef float FLOAT;
-typedef int LONG;
-typedef CARD8 BYTE;
-typedef unsigned char UCHAR;
-typedef unsigned int DWORD;
+typedef int INT;
+typedef unsigned int UINT;
+typedef void* HANDLE;
+typedef int LONG;
+typedef unsigned int DWORD;
#undef FALSE
#define FALSE 0
@@ -104,8 +97,11 @@ typedef unsigned int DWORD;
#define INLINE __inline
#endif
#define DEBUGBREAK asm ("int $3")
+#if !defined(__CYGWIN__)
#define __cdecl
+#define __stdcall
#define __declspec(X)
+#endif
#define GCC_VERSION (__GNUC__ * 10000 \
+ __GNUC_MINOR__ * 100 \
@@ -180,21 +176,13 @@ unsigned char _bittest(const LONG *a, LONG b)
#define CreateDirectory(name, pSecurity) mkdir(name, 0777)
-#if defined(_WIN32)
-static inline
-unsigned int _mm_popcnt_u32(unsigned int v)
-{
- return __builtin_popcount(v);
-}
-#endif
-
#define _aligned_free free
#define InterlockedCompareExchange(Dest, Exchange, Comparand) __sync_val_compare_and_swap(Dest, Comparand, Exchange)
#define InterlockedExchangeAdd(Addend, Value) __sync_fetch_and_add(Addend, Value)
#define InterlockedDecrement(Append) __sync_sub_and_fetch(Append, 1)
+#define InterlockedDecrement64(Append) __sync_sub_and_fetch(Append, 1)
#define InterlockedIncrement(Append) __sync_add_and_fetch(Append, 1)
#define _ReadWriteBarrier() asm volatile("" ::: "memory")
-#define __stdcall
#define PRAGMA_WARNING_PUSH_DISABLE(...)
#define PRAGMA_WARNING_POP()
@@ -206,7 +194,7 @@ unsigned int _mm_popcnt_u32(unsigned int v)
#endif
// Universal types
-typedef BYTE KILOBYTE[1024];
+typedef uint8_t KILOBYTE[1024];
typedef KILOBYTE MEGABYTE[1024];
typedef MEGABYTE GIGABYTE[1024];
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
index 454641b..c6768b4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.cpp
@@ -64,12 +64,14 @@ void BucketManager::RegisterThread(const std::string& name)
+// Appends desc to mBuckets and returns the index it was stored at as the
+// bucket id. The size read and the push_back run as one critical section
+// under mThreadMutex.
+// NOTE(review): lock()/unlock() are paired by hand, so an exception thrown by
+// push_back would leave mThreadMutex locked -- consider a scoped guard.
UINT BucketManager::RegisterBucket(const BUCKET_DESC& desc)
{
+ mThreadMutex.lock();
size_t id = mBuckets.size();
mBuckets.push_back(desc);
+ mThreadMutex.unlock();
return (UINT)id;
}
-void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket)
+void BucketManager::PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket)
{
const char *arrows[] = {
"",
@@ -88,7 +90,7 @@ void BucketManager::PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64
float percentParent = (float)((double)bucket.elapsed / (double)parentCycles * 100.0);
// compute average cycle count per invocation
- UINT64 CPE = bucket.elapsed / bucket.count;
+ uint64_t CPE = bucket.elapsed / bucket.count;
BUCKET_DESC &desc = mBuckets[bucket.id];
@@ -127,7 +129,7 @@ void BucketManager::PrintThread(FILE* f, const BUCKET_THREAD& thread)
// compute thread level total cycle counts across all buckets from root
const BUCKET& root = thread.root;
- UINT64 totalCycles = 0;
+ uint64_t totalCycles = 0;
for (const BUCKET& child : root.children)
{
totalCycles += child.elapsed;
@@ -186,3 +188,13 @@ void BucketManager::PrintReport(const std::string& filename)
fclose(f);
}
}
+
+// Free-function wrapper that forwards to pBucketMgr->StartBucket(id).
+// pBucketMgr is dereferenced without a null check.
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+ pBucketMgr->StartBucket(id);
+}
+
+// Free-function wrapper that forwards to pBucketMgr->StopBucket(id).
+// pBucketMgr is dereferenced without a null check.
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id)
+{
+ pBucketMgr->StopBucket(id);
+}
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
index 99cb10e..9dfa7f6 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets.h
@@ -70,7 +70,9 @@ public:
// removes all registered buckets
void ClearBuckets()
{
+ // mBuckets is emptied while holding mThreadMutex, the same mutex
+ // RegisterBucket takes around its push_back.
+ mThreadMutex.lock();
mBuckets.clear();
+ mThreadMutex.unlock();
}
/// Registers a new thread with the manager.
@@ -209,7 +211,7 @@ public:
}
private:
- void PrintBucket(FILE* f, UINT level, UINT64 threadCycles, UINT64 parentCycles, const BUCKET& bucket);
+ void PrintBucket(FILE* f, UINT level, uint64_t threadCycles, uint64_t parentCycles, const BUCKET& bucket);
void PrintThread(FILE* f, const BUCKET_THREAD& thread);
// list of active threads that have registered with this manager
@@ -227,3 +229,8 @@ private:
bool mThreadViz{ false };
std::string mThreadVizDir;
};
+
+
+// C helpers for jitter
+void BucketManager_StartBucket(BucketManager* pBucketMgr, uint32_t id);
+void BucketManager_StopBucket(BucketManager* pBucketMgr, uint32_t id);
diff --git a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
index 41c6d5d..34c322e 100644
--- a/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
+++ b/src/gallium/drivers/swr/rasterizer/common/rdtsc_buckets_shared.h
@@ -64,13 +64,13 @@ struct BUCKET_THREAD
std::string name;
// id for this thread, assigned by the thread manager
- uint32_t id;
+ uint32_t id{ 0 };
// root of the bucket hierarchy for this thread
BUCKET root;
// currently executing bucket somewhere in the hierarchy
- BUCKET* pCurrent;
+ BUCKET* pCurrent{ nullptr };
// currently executing hierarchy level
uint32_t level{ 0 };
diff --git a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
index 8fa6d9e..fa792b4 100644
--- a/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
+++ b/src/gallium/drivers/swr/rasterizer/common/simdintrin.h
@@ -43,14 +43,14 @@ typedef uint8_t simdmask;
// simd vector
OSALIGNSIMD(union) simdvector
{
- simdscalar v[4];
- struct
- {
- simdscalar x, y, z, w;
- };
-
- simdscalar& operator[] (const int i) { return v[i]; }
- const simdscalar& operator[] (const int i) const { return v[i]; }
+ simdscalar v[4];
+ struct
+ {
+ simdscalar x, y, z, w;
+ };
+
+ simdscalar& operator[] (const int i) { return v[i]; }
+ const simdscalar& operator[] (const int i) const { return v[i]; }
};
#if KNOB_SIMD_WIDTH == 8
@@ -59,8 +59,8 @@ OSALIGNSIMD(union) simdvector
#define _simd_load1_ps _mm256_broadcast_ss
#define _simd_loadu_ps _mm256_loadu_ps
#define _simd_setzero_ps _mm256_setzero_ps
-#define _simd_set1_ps _mm256_set1_ps
-#define _simd_blend_ps _mm256_blend_ps
+#define _simd_set1_ps _mm256_set1_ps
+#define _simd_blend_ps _mm256_blend_ps
#define _simd_blendv_ps _mm256_blendv_ps
#define _simd_store_ps _mm256_store_ps
#define _simd_mul_ps _mm256_mul_ps
@@ -100,21 +100,156 @@ OSALIGNSIMD(union) simdvector
INLINE \
__m256i func(__m256i a, __m256i b)\
{\
- __m128i aHi = _mm256_extractf128_si256(a, 1);\
- __m128i bHi = _mm256_extractf128_si256(b, 1);\
- __m128i aLo = _mm256_castsi256_si128(a);\
- __m128i bLo = _mm256_castsi256_si128(b);\
+ __m128i aHi = _mm256_extractf128_si256(a, 1);\
+ __m128i bHi = _mm256_extractf128_si256(b, 1);\
+ __m128i aLo = _mm256_castsi256_si128(a);\
+ __m128i bLo = _mm256_castsi256_si128(b);\
\
- __m128i subLo = intrin(aLo, bLo);\
- __m128i subHi = intrin(aHi, bHi);\
+ __m128i subLo = intrin(aLo, bLo);\
+ __m128i subHi = intrin(aHi, bHi);\
\
- __m256i result = _mm256_castsi128_si256(subLo);\
- result = _mm256_insertf128_si256(result, subHi, 1);\
+ __m256i result = _mm256_castsi128_si256(subLo);\
+ result = _mm256_insertf128_si256(result, subHi, 1);\
\
- return result;\
+ return result;\
}
#if (KNOB_ARCH == KNOB_ARCH_AVX)
+// AVX fallback bound to _simd_permute_ps when KNOB_ARCH == KNOB_ARCH_AVX (the
+// other branch maps that name to _mm256_permutevar8x32_ps). For index values
+// 0..7, output lane i receives a[b[i]].
+//
+// a: eight packed floats to select from.
+// b: eight 32-bit lane indices.
+//
+// NOTE(review): the source half is chosen with a signed "index > 3" compare
+// and the position inside that half with "index & 3"; indices outside 0..7 are
+// not reduced modulo 8 first -- confirm callers only pass 0..7.
+INLINE
+__m256 _simdemu_permute_ps(__m256 a, __m256i b)
+{
+ // split both operands into their 128-bit halves
+ __m128 aHi = _mm256_extractf128_ps(a, 1);
+ __m128i bHi = _mm256_extractf128_si256(b, 1);
+ __m128 aLo = _mm256_castps256_ps128(a);
+ __m128i bLo = _mm256_castsi256_si128(b);
+
+ // output lanes 0-3: permute both halves of a by the low indices, then take
+ // the high-half result wherever the index is greater than 3
+ __m128i indexHi = _mm_cmpgt_epi32(bLo, _mm_set1_epi32(3));
+ __m128 resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
+ __m128 resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bLo, _mm_set1_epi32(0x3)));
+ __m128 blendLowRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
+
+ // output lanes 4-7: same selection driven by the high indices
+ indexHi = _mm_cmpgt_epi32(bHi, _mm_set1_epi32(3));
+ resLow = _mm_permutevar_ps(aLo, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
+ resHi = _mm_permutevar_ps(aHi, _mm_and_si128(bHi, _mm_set1_epi32(0x3)));
+ __m128 blendHiRes = _mm_blendv_ps(resLow, resHi, _mm_castsi128_ps(indexHi));
+
+ // reassemble the two 128-bit results into one 256-bit vector
+ __m256 result = _mm256_castps128_ps256(blendLowRes);
+ result = _mm256_insertf128_ps(result, blendHiRes, 1);
+
+ return result;
+}
+
+// AVX fallback for a per-lane logical right shift of eight 32-bit values.
+// Bound to _simd_srlv_epi32 when KNOB_ARCH == KNOB_ARCH_AVX; the other branch
+// maps that name to _mm256_srlv_epi32.
+//
+// vA:     eight packed 32-bit values to shift.
+// vCount: per-lane shift counts, interpreted as unsigned.
+// Returns each lane of vA shifted right by the matching lane of vCount with
+// zero fill; lanes whose count is 32 or more become 0.
+INLINE
+__m256i _simdemu_srlv_epi32(__m256i vA, __m256i vCount)
+{
+    // Spill to memory so the lanes can be handled in a plain loop;
+    // _mm_extract_epi32/_mm_insert_epi32 need compile-time lane indices.
+    uint32_t values[8];
+    uint32_t counts[8];
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(values), vA);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);
+
+    for (uint32_t lane = 0; lane < 8; ++lane)
+    {
+        // Unsigned operands make this a logical (zero-filling) shift rather
+        // than a sign-extending one. A C++ shift by >= 32 is undefined, so
+        // produce the 0 that _mm256_srlv_epi32 defines for such counts.
+        values[lane] = (counts[lane] < 32) ? (values[lane] >> counts[lane]) : 0;
+    }
+
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
+}
+
+
+// AVX fallback for a per-lane left shift of eight 32-bit values.
+// Bound to _simd_sllv_epi32 when KNOB_ARCH == KNOB_ARCH_AVX; the other branch
+// maps that name to _mm256_sllv_epi32.
+//
+// vA:     eight packed 32-bit values to shift.
+// vCount: per-lane shift counts, interpreted as unsigned.
+// Returns each lane of vA shifted left by the matching lane of vCount with
+// zero fill; lanes whose count is 32 or more become 0.
+INLINE
+__m256i _simdemu_sllv_epi32(__m256i vA, __m256i vCount)
+{
+    // Spill to memory so the lanes can be handled in a plain loop;
+    // _mm_extract_epi32/_mm_insert_epi32 need compile-time lane indices.
+    uint32_t values[8];
+    uint32_t counts[8];
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(values), vA);
+    _mm256_storeu_si256(reinterpret_cast<__m256i*>(counts), vCount);
+
+    for (uint32_t lane = 0; lane < 8; ++lane)
+    {
+        // Shift as unsigned: left-shifting a negative int32_t is not
+        // well-defined, and a C++ shift by >= 32 is undefined, so produce the
+        // 0 that _mm256_sllv_epi32 defines for such counts.
+        values[lane] = (counts[lane] < 32) ? (values[lane] << counts[lane]) : 0;
+    }
+
+    return _mm256_loadu_si256(reinterpret_cast<const __m256i*>(values));
+}
+
#define _simd_mul_epi32 _simdemu_mul_epi32
#define _simd_mullo_epi32 _simdemu_mullo_epi32
#define _simd_sub_epi32 _simdemu_sub_epi32
@@ -136,7 +271,14 @@ __m256i func(__m256i a, __m256i b)\
#define _simd_add_epi8 _simdemu_add_epi8
#define _simd_cmpeq_epi64 _simdemu_cmpeq_epi64
#define _simd_cmpgt_epi64 _simdemu_cmpgt_epi64
+#define _simd_cmpgt_epi8 _simdemu_cmpgt_epi8
+#define _simd_cmpeq_epi8 _simdemu_cmpeq_epi8
+#define _simd_cmpgt_epi16 _simdemu_cmpgt_epi16
+#define _simd_cmpeq_epi16 _simdemu_cmpeq_epi16
#define _simd_movemask_epi8 _simdemu_movemask_epi8
+#define _simd_permute_ps _simdemu_permute_ps
+#define _simd_srlv_epi32 _simdemu_srlv_epi32
+#define _simd_sllv_epi32 _simdemu_sllv_epi32
SIMD_EMU_EPI(_simdemu_mul_epi32, _mm_mul_epi32)
SIMD_EMU_EPI(_simdemu_mullo_epi32, _mm_mullo_epi32)
@@ -158,6 +300,10 @@ SIMD_EMU_EPI(_simdemu_subs_epu8, _mm_subs_epu8)
SIMD_EMU_EPI(_simdemu_add_epi8, _mm_add_epi8)
SIMD_EMU_EPI(_simdemu_cmpeq_epi64, _mm_cmpeq_epi64)
SIMD_EMU_EPI(_simdemu_cmpgt_epi64, _mm_cmpgt_epi64)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi8, _mm_cmpgt_epi8)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi8, _mm_cmpeq_epi8)
+SIMD_EMU_EPI(_simdemu_cmpgt_epi16, _mm_cmpgt_epi16)
+SIMD_EMU_EPI(_simdemu_cmpeq_epi16, _mm_cmpeq_epi16)
#define _simd_unpacklo_epi32(a, b) _mm256_castps_si256(_mm256_unpacklo_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
#define _simd_unpackhi_epi32(a, b) _mm256_castps_si256(_mm256_unpackhi_ps(_mm256_castsi256_ps(a), _mm256_castsi256_ps(b)))
@@ -176,25 +322,25 @@ SIMD_EMU_EPI(_simdemu_shuffle_epi8, _mm_shuffle_epi8)
INLINE
__m128 _mm_fmaddemu_ps(__m128 a, __m128 b, __m128 c)
{
- __m128 res = _mm_mul_ps(a, b);
- res = _mm_add_ps(res, c);
- return res;
+ __m128 res = _mm_mul_ps(a, b);
+ res = _mm_add_ps(res, c);
+ return res;
}
INLINE
__m256 _mm_fmaddemu256_ps(__m256 a, __m256 b, __m256 c)
{
- __m256 res = _mm256_mul_ps(a, b);
- res = _mm256_add_ps(res, c);
- return res;
+ __m256 res = _mm256_mul_ps(a, b);
+ res = _mm256_add_ps(res, c);
+ return res;
}
INLINE
__m256 _mm_fmsubemu256_ps(__m256 a, __m256 b, __m256 c)
{
- __m256 res = _mm256_mul_ps(a, b);
- res = _mm256_sub_ps(res, c);
- return res;
+ __m256 res = _mm256_mul_ps(a, b);
+ res = _mm256_sub_ps(res, c);
+ return res;
}
INLINE
@@ -295,7 +441,14 @@ int _simdemu_movemask_epi8(__m256i a)
#define _simd_cmpeq_epi64 _mm256_cmpeq_epi64
#define _simd_cmpgt_epi64 _mm256_cmpgt_epi64
+#define _simd_cmpgt_epi8 _mm256_cmpgt_epi8
+#define _simd_cmpeq_epi8 _mm256_cmpeq_epi8
+#define _simd_cmpgt_epi16 _mm256_cmpgt_epi16
+#define _simd_cmpeq_epi16 _mm256_cmpeq_epi16
#define _simd_movemask_epi8 _mm256_movemask_epi8
+#define _simd_permute_ps _mm256_permutevar8x32_ps
+#define _simd_srlv_epi32 _mm256_srlv_epi32
+#define _simd_sllv_epi32 _mm256_sllv_epi32
#endif
#define _simd_shuffleps_epi32(vA, vB, imm) _mm256_castps_si256(_mm256_shuffle_ps(_mm256_castsi256_ps(vA), _mm256_castsi256_ps(vB), imm))
@@ -343,30 +496,30 @@ void _simd_mov(simdscalar &r, unsigned int rlane, simdscalar& s, unsigned int sl
INLINE __m256i _simdemu_slli_epi32(__m256i a, uint32_t i)
{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
+ __m128i aHi = _mm256_extractf128_si256(a, 1);
+ __m128i aLo = _mm256_castsi256_si128(a);
- __m128i resHi = _mm_slli_epi32(aHi, i);
- __m128i resLo = _mm_slli_epi32(aLo, i);
+ __m128i resHi = _mm_slli_epi32(aHi, i);
+ __m128i resLo = _mm_slli_epi32(aLo, i);
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
+ __m256i result = _mm256_castsi128_si256(resLo);
+ result = _mm256_insertf128_si256(result, resHi, 1);
- return result;
+ return result;
}
INLINE __m256i _simdemu_srai_epi32(__m256i a, uint32_t i)
{
- __m128i aHi = _mm256_extractf128_si256(a, 1);
- __m128i aLo = _mm256_castsi256_si128(a);
+ __m128i aHi = _mm256_extractf128_si256(a, 1);
+ __m128i aLo = _mm256_castsi256_si128(a);
- __m128i resHi = _mm_srai_epi32(aHi, i);
- __m128i resLo = _mm_srai_epi32(aLo, i);
+ __m128i resHi = _mm_srai_epi32(aHi, i);
+ __m128i resLo = _mm_srai_epi32(aLo, i);
- __m256i result = _mm256_castsi128_si256(resLo);
- result = _mm256_insertf128_si256(result, resHi, 1);
+ __m256i result = _mm256_castsi128_si256(resLo);
+ result = _mm256_insertf128_si256(result, resHi, 1);
- return result;
+ return result;
}
INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
@@ -386,7 +539,7 @@ INLINE __m256i _simdemu_srli_epi32(__m256i a, uint32_t i)
INLINE
void _simdvec_transpose(simdvector &v)
{
- SWR_ASSERT(false, "Need to implement 8 wide version");
+ SWR_ASSERT(false, "Need to implement 8 wide version");
}
#else
@@ -397,132 +550,132 @@ void _simdvec_transpose(simdvector &v)
INLINE
void _simdvec_load_ps(simdvector& r, const float *p)
{
- r[0] = _simd_set1_ps(p[0]);
- r[1] = _simd_set1_ps(p[1]);
- r[2] = _simd_set1_ps(p[2]);
- r[3] = _simd_set1_ps(p[3]);
+ r[0] = _simd_set1_ps(p[0]);
+ r[1] = _simd_set1_ps(p[1]);
+ r[2] = _simd_set1_ps(p[2]);
+ r[3] = _simd_set1_ps(p[3]);
}
INLINE
void _simdvec_mov(simdvector& r, const simdscalar& s)
{
- r[0] = s;
- r[1] = s;
- r[2] = s;
- r[3] = s;
+ r[0] = s;
+ r[1] = s;
+ r[2] = s;
+ r[3] = s;
}
INLINE
void _simdvec_mov(simdvector& r, const simdvector& v)
{
- r[0] = v[0];
- r[1] = v[1];
- r[2] = v[2];
- r[3] = v[3];
+ r[0] = v[0];
+ r[1] = v[1];
+ r[2] = v[2];
+ r[3] = v[3];
}
// just move a lane from the source simdvector to dest simdvector
INLINE
void _simdvec_mov(simdvector &r, unsigned int rlane, simdvector& s, unsigned int slane)
{
- _simd_mov(r[0], rlane, s[0], slane);
- _simd_mov(r[1], rlane, s[1], slane);
- _simd_mov(r[2], rlane, s[2], slane);
- _simd_mov(r[3], rlane, s[3], slane);
+ _simd_mov(r[0], rlane, s[0], slane);
+ _simd_mov(r[1], rlane, s[1], slane);
+ _simd_mov(r[2], rlane, s[2], slane);
+ _simd_mov(r[3], rlane, s[3], slane);
}
+// Three-component dot product: writes v0.x*v1.x + v0.y*v1.y + v0.z*v1.z to r.
+// Components [0..2] of each simdvector are used; component [3] is ignored.
INLINE
void _simdvec_dp3_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
- simdscalar tmp;
- r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
+ simdscalar tmp;
+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
- tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
- tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
}
INLINE
void _simdvec_dp4_ps(simdscalar& r, const simdvector& v0, const simdvector& v1)
{
- simdscalar tmp;
- r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
+ simdscalar tmp;
+ r = _simd_mul_ps(v0[0], v1[0]); // (v0.x*v1.x)
- tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
+ tmp = _simd_mul_ps(v0[1], v1[1]); // (v0.y*v1.y)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y)
- tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+ tmp = _simd_mul_ps(v0[2], v1[2]); // (v0.z*v1.z)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
- tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
- r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z)
+ tmp = _simd_mul_ps(v0[3], v1[3]); // (v0.w*v1.w)
+ r = _simd_add_ps(r, tmp); // (v0.x*v1.x) + (v0.y*v1.y) + (v0.z*v1.z) + (v0.w*v1.w)
}
INLINE
simdscalar _simdvec_rcp_length_ps(const simdvector& v)
{
- simdscalar length;
- _simdvec_dp4_ps(length, v, v);
- return _simd_rsqrt_ps(length);
+ simdscalar length;
+ _simdvec_dp4_ps(length, v, v);
+ return _simd_rsqrt_ps(length);
}
INLINE
void _simdvec_normalize_ps(simdvector& r, const simdvector& v)
{
- simdscalar vecLength;
- vecLength = _simdvec_rcp_length_ps(v);
+ simdscalar vecLength;
+ vecLength = _simdvec_rcp_length_ps(v);
- r[0] = _simd_mul_ps(v[0], vecLength);
- r[1] = _simd_mul_ps(v[1], vecLength);
- r[2] = _simd_mul_ps(v[2], vecLength);
- r[3] = _simd_mul_ps(v[3], vecLength);
+ r[0] = _simd_mul_ps(v[0], vecLength);
+ r[1] = _simd_mul_ps(v[1], vecLength);
+ r[2] = _simd_mul_ps(v[2], vecLength);
+ r[3] = _simd_mul_ps(v[3], vecLength);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v, const simdscalar& s)
{
- r[0] = _simd_mul_ps(v[0], s);
- r[1] = _simd_mul_ps(v[1], s);
- r[2] = _simd_mul_ps(v[2], s);
- r[3] = _simd_mul_ps(v[3], s);
+ r[0] = _simd_mul_ps(v[0], s);
+ r[1] = _simd_mul_ps(v[1], s);
+ r[2] = _simd_mul_ps(v[2], s);
+ r[3] = _simd_mul_ps(v[3], s);
}
INLINE
void _simdvec_mul_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
- r[0] = _simd_mul_ps(v0[0], v1[0]);
- r[1] = _simd_mul_ps(v0[1], v1[1]);
- r[2] = _simd_mul_ps(v0[2], v1[2]);
- r[3] = _simd_mul_ps(v0[3], v1[3]);
+ r[0] = _simd_mul_ps(v0[0], v1[0]);
+ r[1] = _simd_mul_ps(v0[1], v1[1]);
+ r[2] = _simd_mul_ps(v0[2], v1[2]);
+ r[3] = _simd_mul_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_add_ps(simdvector& r, const simdvector& v0, const simdvector& v1)
{
- r[0] = _simd_add_ps(v0[0], v1[0]);
- r[1] = _simd_add_ps(v0[1], v1[1]);
- r[2] = _simd_add_ps(v0[2], v1[2]);
- r[3] = _simd_add_ps(v0[3], v1[3]);
+ r[0] = _simd_add_ps(v0[0], v1[0]);
+ r[1] = _simd_add_ps(v0[1], v1[1]);
+ r[2] = _simd_add_ps(v0[2], v1[2]);
+ r[3] = _simd_add_ps(v0[3], v1[3]);
}
INLINE
void _simdvec_min_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
- r[0] = _simd_min_ps(v0[0], s);
- r[1] = _simd_min_ps(v0[1], s);
- r[2] = _simd_min_ps(v0[2], s);
- r[3] = _simd_min_ps(v0[3], s);
+ r[0] = _simd_min_ps(v0[0], s);
+ r[1] = _simd_min_ps(v0[1], s);
+ r[2] = _simd_min_ps(v0[2], s);
+ r[3] = _simd_min_ps(v0[3], s);
}
INLINE
void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
{
- r[0] = _simd_max_ps(v0[0], s);
- r[1] = _simd_max_ps(v0[1], s);
- r[2] = _simd_max_ps(v0[2], s);
- r[3] = _simd_max_ps(v0[3], s);
+ r[0] = _simd_max_ps(v0[0], s);
+ r[1] = _simd_max_ps(v0[1], s);
+ r[2] = _simd_max_ps(v0[2], s);
+ r[3] = _simd_max_ps(v0[3], s);
}
// Matrix4x4 * Vector4
@@ -532,65 +685,65 @@ void _simdvec_max_ps(simdvector& r, const simdvector& v0, const simdscalar& s)
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * v.w)
INLINE
void _simd_mat4x4_vec4_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[2] = r0;
-
- m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
- r1 = _simd_mul_ps(m, v[3]); // (m3 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * v.w)
- result[3] = r0;
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[2] = r0;
+
+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
+ r1 = _simd_mul_ps(m, v[3]); // (m3 * v.w)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * v.w)
+ result[3] = r0;
}
// Matrix4x4 * Vector3 - Direction Vector where w = 0.
@@ -600,45 +753,45 @@ void _simd_mat4x4_vec4_multiply(
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 0)
INLINE
void _simd_mat3x3_vec3_w0_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- result[2] = r0;
-
- result[3] = _simd_setzero_ps();
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ result[2] = r0;
+
+ result[3] = _simd_setzero_ps();
}
// Matrix4x4 * Vector3 - Position vector where w = 1.
@@ -648,108 +801,108 @@ void _simd_mat3x3_vec3_w0_multiply(
// outVec.w = (m30 * v.x) + (m31 * v.y) + (m32 * v.z) + (m33 * 1)
INLINE
void _simd_mat4x4_vec3_w1_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
-
- m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
- result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[2] = r0;
+
+ m = _simd_load1_ps(pMatrix + 3*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 3*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 3*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 3*4 + 3); // m[row][3]
+ result[3] = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
}
INLINE
void _simd_mat4x3_vec3_w1_multiply(
- simdvector& result,
- const float *pMatrix,
- const simdvector& v)
-{
- simdscalar m;
- simdscalar r0;
- simdscalar r1;
-
- m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[0] = r0;
-
- m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[1] = r0;
-
- m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
- r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
- m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
- r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
- m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
- r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
- r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
- m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
- r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m2 * 1)
- result[2] = r0;
- result[3] = _simd_set1_ps(1.0f);
+ simdvector& result,
+ const float *pMatrix,
+ const simdvector& v)
+{
+ simdscalar m;
+ simdscalar r0;
+ simdscalar r1;
+
+ m = _simd_load1_ps(pMatrix + 0*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 0*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 0*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 0*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[0] = r0;
+
+ m = _simd_load1_ps(pMatrix + 1*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 1*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 1*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 1*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[1] = r0;
+
+ m = _simd_load1_ps(pMatrix + 2*4 + 0); // m[row][0]
+ r0 = _simd_mul_ps(m, v[0]); // (m00 * v.x)
+ m = _simd_load1_ps(pMatrix + 2*4 + 1); // m[row][1]
+ r1 = _simd_mul_ps(m, v[1]); // (m1 * v.y)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y)
+ m = _simd_load1_ps(pMatrix + 2*4 + 2); // m[row][2]
+ r1 = _simd_mul_ps(m, v[2]); // (m2 * v.z)
+ r0 = _simd_add_ps(r0, r1); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z)
+ m = _simd_load1_ps(pMatrix + 2*4 + 3); // m[row][3]
+ r0 = _simd_add_ps(r0, m); // (m0 * v.x) + (m1 * v.y) + (m2 * v.z) + (m3 * 1)
+ result[2] = r0;
+ result[3] = _simd_set1_ps(1.0f);
}
//////////////////////////////////////////////////////////////////////////
@@ -783,5 +936,61 @@ static INLINE simdscalar InterpolateComponent(simdscalar vI, simdscalar vJ, cons
return vplaneps(vA, vB, vC, vI, vJ);
}
+//////////////////////////////////////////////////////////////////////////
+/// @brief Parallel bit deposit. Takes the low-order bits of a, lowest
+///        first, and places each one at the position of the next set
+///        bit of mask (also lowest first). Calls the _pdep_u32 intrinsic
+///        when KNOB_ARCH is KNOB_ARCH_AVX2; otherwise uses a software
+///        loop driven by _BitScanForward.
+/// @param a - source value; its i-th bit feeds the i-th lowest set bit
+///            of mask.
+/// @param mask - selects the destination bit positions.
+/// @return the deposited bits; every bit not set in mask is 0.
+/// @note the software path assumes UINT is a 32-bit unsigned type, as
+///       the _u32 suffix suggests.
+INLINE
+UINT pdep_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+    return _pdep_u32(a, mask);
+#else
+    UINT result = 0;
+
+    // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
+    // using bsf instead of funky loop
+    DWORD maskIndex;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask; shift an unsigned 1 so that
+        //    maskIndex == 31 never shifts into a signed int's sign bit
+        const UINT lowest = (UINT)1 << maskIndex;
+
+        // 2. populate LSB from src: all ones if bit 0 of a is set, else 0.
+        //    Unsigned negation avoids relying on arithmetic right shift
+        //    of a negative int.
+        const UINT LSB = (UINT)0 - (a & (UINT)1);
+
+        // 3. copy bit from mask
+        result |= LSB & lowest;
+
+        // 4. clear lowest bit
+        mask &= ~lowest;
+
+        // 5. prepare for next iteration
+        a >>= 1;
+    }
+
+    return result;
+#endif
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Parallel bit extract. Visits the set bits of mask from lowest
+///        to highest, reads the bit of a at each of those positions and
+///        packs the results contiguously into the low-order bits of the
+///        return value. Calls the _pext_u32 intrinsic when KNOB_ARCH is
+///        KNOB_ARCH_AVX2; otherwise uses a software loop driven by
+///        _BitScanForward.
+/// @param a - source value to extract bits from.
+/// @param mask - selects which bit positions of a are extracted.
+/// @return the extracted bits packed from bit 0 upward.
+/// @note the software path assumes UINT is a 32-bit unsigned type, as
+///       the _u32 suffix suggests.
+INLINE
+UINT pext_u32(UINT a, UINT mask)
+{
+#if KNOB_ARCH==KNOB_ARCH_AVX2
+    return _pext_u32(a, mask);
+#else
+    UINT result = 0;
+    DWORD maskIndex;
+    uint32_t currentBit = 0;
+    while (_BitScanForward(&maskIndex, mask))
+    {
+        // 1. isolate lowest set bit of mask; shift an unsigned 1 so that
+        //    maskIndex == 31 never shifts into a signed int's sign bit
+        const UINT lowest = (UINT)1 << maskIndex;
+
+        // 2. copy the selected bit of src into the next free result bit;
+        //    kept unsigned because currentBit reaches 31 for a full mask
+        const UINT bit = (a & lowest) ? (UINT)1 : (UINT)0;
+        result |= bit << currentBit;
+        ++currentBit;
+
+        // 3. clear lowest bit
+        mask &= ~lowest;
+    }
+    return result;
+#endif
+}
#endif//__SWR_SIMDINTRIN_H__
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.cpp b/src/gallium/drivers/swr/rasterizer/core/api.cpp
index fccccab..6ebb3f8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/api.cpp
@@ -49,7 +49,7 @@ void SetupDefaultState(SWR_CONTEXT *pContext);
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
HANDLE SwrCreateContext(
- const SWR_CREATECONTEXT_INFO* pCreateInfo)
+ SWR_CREATECONTEXT_INFO* pCreateInfo)
{
RDTSC_RESET();
RDTSC_INIT(0);
@@ -61,27 +61,16 @@ HANDLE SwrCreateContext(
pContext->driverType = pCreateInfo->driver;
pContext->privateStateSize = pCreateInfo->privateStateSize;
- pContext->dcRing = (DRAW_CONTEXT*)_aligned_malloc(sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dcRing, 0, sizeof(DRAW_CONTEXT)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
- pContext->dsRing = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT, 64);
- memset(pContext->dsRing, 0, sizeof(DRAW_STATE)*KNOB_MAX_DRAWS_IN_FLIGHT);
-
- pContext->numSubContexts = pCreateInfo->maxSubContexts;
- if (pContext->numSubContexts > 1)
- {
- pContext->subCtxSave = (DRAW_STATE*)_aligned_malloc(sizeof(DRAW_STATE) * pContext->numSubContexts, 64);
- memset(pContext->subCtxSave, 0, sizeof(DRAW_STATE) * pContext->numSubContexts);
- }
+ pContext->dcRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
+ pContext->dsRing.Init(KNOB_MAX_DRAWS_IN_FLIGHT);
for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
{
- pContext->dcRing[dc].pArena = new Arena();
- pContext->dcRing[dc].inUse = false;
+ pContext->dcRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
pContext->dcRing[dc].pTileMgr = new MacroTileMgr(*(pContext->dcRing[dc].pArena));
pContext->dcRing[dc].pDispatch = new DispatchQueue(); /// @todo Could lazily allocate this if Dispatch seen.
- pContext->dsRing[dc].pArena = new Arena();
+ pContext->dsRing[dc].pArena = new CachingArena(pContext->cachingArenaAllocator);
}
if (!KNOB_SINGLE_THREADED)
@@ -108,9 +97,6 @@ HANDLE SwrCreateContext(
pContext->pScratch[i] = (uint8_t*)_aligned_malloc((32 * 1024), KNOB_SIMD_WIDTH * 4);
}
- pContext->nextDrawId = 1;
- pContext->DrawEnqueued = 1;
-
// State setup AFTER context is fully initialized
SetupDefaultState(pContext);
@@ -125,6 +111,13 @@ HANDLE SwrCreateContext(
pContext->pfnStoreTile = pCreateInfo->pfnStoreTile;
pContext->pfnClearTile = pCreateInfo->pfnClearTile;
+ // pass pointer to bucket manager back to caller
+#ifdef KNOB_ENABLE_RDTSC
+ pCreateInfo->pBucketMgr = &gBucketMgr;
+#endif
+
+ pCreateInfo->contextSaveSize = sizeof(API_STATE);
+
return (HANDLE)pContext;
}
@@ -148,10 +141,6 @@ void SwrDestroyContext(HANDLE hContext)
_aligned_free(pContext->pScratch[i]);
}
- _aligned_free(pContext->dcRing);
- _aligned_free(pContext->dsRing);
- _aligned_free(pContext->subCtxSave);
-
delete(pContext->pHotTileMgr);
pContext->~SWR_CONTEXT();
@@ -168,49 +157,28 @@ void WakeAllThreads(SWR_CONTEXT *pContext)
pContext->FifosNotEmpty.notify_all();
}
-bool StillDrawing(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC)
+template<bool IsDraw>
+void QueueWork(SWR_CONTEXT *pContext)
{
- // For single thread nothing should still be drawing.
- if (KNOB_SINGLE_THREADED) { return false; }
-
- if (pDC->isCompute)
+ if (IsDraw)
{
- if (pDC->doneCompute)
- {
- pDC->inUse = false;
- return false;
- }
+ // Each worker thread looks at a DC for both FE and BE work at different times and so we
+ // multiply threadDone by 2. When the threadDone counter has reached 0 then all workers
+ // have moved past this DC. (i.e. Each worker has checked this DC for both FE and BE work and
+ // then moved on if all work is done.)
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads * 2 : 2;
}
-
- // Check if backend work is done. First make sure all triangles have been binned.
- if (pDC->doneFE == true)
+ else
{
- // ensure workers have all moved passed this draw
- if (pDC->threadsDoneFE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- if (pDC->threadsDoneBE != pContext->NumWorkerThreads)
- {
- return true;
- }
-
- pDC->inUse = false; // all work is done.
+ pContext->pCurDrawContext->threadsDone =
+ pContext->NumWorkerThreads ? pContext->NumWorkerThreads : 1;
}
- return pDC->inUse;
-}
-
-void QueueDraw(SWR_CONTEXT *pContext)
-{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
_ReadWriteBarrier();
{
std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
+ pContext->dcRing.Enqueue();
}
if (KNOB_SINGLE_THREADED)
@@ -219,10 +187,24 @@ void QueueDraw(SWR_CONTEXT *pContext)
uint32_t mxcsr = _mm_getcsr();
_mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
- std::unordered_set<uint32_t> lockedTiles;
- uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
- WorkOnFifoFE(pContext, 0, curDraw[0], 0);
- WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ if (IsDraw)
+ {
+ static TileSet lockedTiles;
+ uint64_t curDraw[2] = { pContext->pCurDrawContext->drawId, pContext->pCurDrawContext->drawId };
+ WorkOnFifoFE(pContext, 0, curDraw[0], 0);
+ WorkOnFifoBE(pContext, 0, curDraw[1], lockedTiles);
+ }
+ else
+ {
+ uint64_t curDispatch = pContext->pCurDrawContext->drawId;
+ WorkOnCompute(pContext, 0, curDispatch);
+ }
+
+ // Dequeue the work here, if not already done, since we're single threaded (i.e. no workers).
+ if (!pContext->dcRing.IsEmpty())
+ {
+ pContext->dcRing.Dequeue();
+ }
// restore csr
_mm_setcsr(mxcsr);
@@ -239,40 +221,14 @@ void QueueDraw(SWR_CONTEXT *pContext)
pContext->pCurDrawContext = nullptr;
}
-///@todo Combine this with QueueDraw
-void QueueDispatch(SWR_CONTEXT *pContext)
+INLINE void QueueDraw(SWR_CONTEXT* pContext)
{
- SWR_ASSERT(pContext->pCurDrawContext->inUse == false);
- pContext->pCurDrawContext->inUse = true;
-
- _ReadWriteBarrier();
- {
- std::unique_lock<std::mutex> lock(pContext->WaitLock);
- pContext->DrawEnqueued++;
- }
-
- if (KNOB_SINGLE_THREADED)
- {
- // flush denormals to 0
- uint32_t mxcsr = _mm_getcsr();
- _mm_setcsr(mxcsr | _MM_FLUSH_ZERO_ON | _MM_DENORMALS_ZERO_ON);
-
- uint64_t curDispatch = pContext->pCurDrawContext->drawId;
- WorkOnCompute(pContext, 0, curDispatch);
-
- // restore csr
- _mm_setcsr(mxcsr);
- }
- else
- {
- RDTSC_START(APIDrawWakeAllThreads);
- WakeAllThreads(pContext);
- RDTSC_STOP(APIDrawWakeAllThreads, 1, 0);
- }
+ QueueWork<true>(pContext);
+}
- // Set current draw context to NULL so that next state call forces a new draw context to be created and populated.
- pContext->pPrevDrawContext = pContext->pCurDrawContext;
- pContext->pCurDrawContext = nullptr;
+INLINE void QueueDispatch(SWR_CONTEXT* pContext)
+{
+ QueueWork<false>(pContext);
}
DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
@@ -281,22 +237,22 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
// If current draw context is null then need to obtain a new draw context to use from ring.
if (pContext->pCurDrawContext == nullptr)
{
- uint32_t dcIndex = pContext->nextDrawId % KNOB_MAX_DRAWS_IN_FLIGHT;
-
- DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
- pContext->pCurDrawContext = pCurDrawContext;
-
- // Need to wait until this draw context is available to use.
- while (StillDrawing(pContext, pCurDrawContext))
+ // Need to wait for a free entry.
+ while (pContext->dcRing.IsFull())
{
_mm_pause();
}
+ uint32_t dcIndex = pContext->dcRing.GetHead() % KNOB_MAX_DRAWS_IN_FLIGHT;
+
+ DRAW_CONTEXT* pCurDrawContext = &pContext->dcRing[dcIndex];
+ pContext->pCurDrawContext = pCurDrawContext;
+
// Assign next available entry in DS ring to this DC.
uint32_t dsIndex = pContext->curStateId % KNOB_MAX_DRAWS_IN_FLIGHT;
pCurDrawContext->pState = &pContext->dsRing[dsIndex];
- Arena& stateArena = *(pCurDrawContext->pState->pArena);
+ auto& stateArena = *(pCurDrawContext->pState->pArena);
// Copy previous state to current state.
if (pContext->pPrevDrawContext)
@@ -332,18 +288,15 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
pCurDrawContext->pArena->Reset();
pCurDrawContext->pContext = pContext;
pCurDrawContext->isCompute = false; // Dispatch has to set this to true.
- pCurDrawContext->inUse = false;
- pCurDrawContext->doneCompute = false;
pCurDrawContext->doneFE = false;
pCurDrawContext->FeLock = 0;
- pCurDrawContext->threadsDoneFE = 0;
- pCurDrawContext->threadsDoneBE = 0;
+ pCurDrawContext->threadsDone = 0;
pCurDrawContext->pTileMgr->initialize();
// Assign unique drawId for this DC
- pCurDrawContext->drawId = pContext->nextDrawId++;
+ pCurDrawContext->drawId = pContext->dcRing.GetHead();
}
else
{
@@ -354,38 +307,36 @@ DRAW_CONTEXT* GetDrawContext(SWR_CONTEXT *pContext, bool isSplitDraw = false)
return pContext->pCurDrawContext;
}
-void SWR_API SwrSetActiveSubContext(
- HANDLE hContext,
- uint32_t subContextIndex)
+API_STATE* GetDrawState(SWR_CONTEXT *pContext)
{
- SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
- if (subContextIndex >= pContext->numSubContexts)
- {
- return;
- }
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+ SWR_ASSERT(pDC->pState != nullptr);
- if (subContextIndex != pContext->curSubCtxId)
- {
- // Save and restore draw state
- DRAW_CONTEXT* pDC = GetDrawContext(pContext);
- CopyState(
- pContext->subCtxSave[pContext->curSubCtxId],
- *(pDC->pState));
+ return &pDC->pState->state;
+}
- CopyState(
- *(pDC->pState),
- pContext->subCtxSave[subContextIndex]);
+void SWR_API SwrSaveState(
+ HANDLE hContext,
+ void* pOutputStateBlock,
+ size_t memSize)
+{
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ auto pSrc = GetDrawState(pContext);
+ SWR_ASSERT(pOutputStateBlock && memSize >= sizeof(*pSrc));
- pContext->curSubCtxId = subContextIndex;
- }
+ memcpy(pOutputStateBlock, pSrc, sizeof(*pSrc));
}
-API_STATE* GetDrawState(SWR_CONTEXT *pContext)
+void SWR_API SwrRestoreState(
+ HANDLE hContext,
+ const void* pStateBlock,
+ size_t memSize)
{
- DRAW_CONTEXT* pDC = GetDrawContext(pContext);
- SWR_ASSERT(pDC->pState != nullptr);
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ auto pDst = GetDrawState(pContext);
+ SWR_ASSERT(pStateBlock && memSize >= sizeof(*pDst));
- return &pDC->pState->state;
+ memcpy(pDst, pStateBlock, sizeof(*pDst));
}
void SetupDefaultState(SWR_CONTEXT *pContext)
@@ -431,16 +382,12 @@ void SwrWaitForIdle(HANDLE hContext)
SWR_CONTEXT *pContext = GetContext(hContext);
RDTSC_START(APIWaitForIdle);
- // Wait for all work to complete.
- for (uint32_t dc = 0; dc < KNOB_MAX_DRAWS_IN_FLIGHT; ++dc)
- {
- DRAW_CONTEXT *pDC = &pContext->dcRing[dc];
- while (StillDrawing(pContext, pDC))
- {
- _mm_pause();
- }
+ while (!pContext->dcRing.IsEmpty())
+ {
+ _mm_pause();
}
+
RDTSC_STOP(APIWaitForIdle, 1, 0);
}
@@ -770,16 +717,25 @@ void SetupMacroTileScissors(DRAW_CONTEXT *pDC)
pState->scissorInFixedPoint.bottom = bottom * FIXED_POINT_SCALE - 1;
}
}
-
+// templated backend function tables
+extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
+extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
+extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
+extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS + 1][SWR_MULTISAMPLE_TYPE_MAX];
+extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
+extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
+extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
void SetupPipeline(DRAW_CONTEXT *pDC)
{
DRAW_STATE* pState = pDC->pState;
const SWR_RASTSTATE &rastState = pState->state.rastState;
+ const SWR_PS_STATE &psState = pState->state.psState;
BACKEND_FUNCS& backendFuncs = pState->backendFuncs;
const uint32_t forcedSampleCount = (rastState.bForcedSampleCount) ? 1 : 0;
// setup backend
- if (pState->state.psState.pfnPixelShader == nullptr)
+ if (psState.pfnPixelShader == nullptr)
{
backendFuncs.pfnBackend = gBackendNullPs[pState->state.rastState.sampleCount];
// always need to generate I & J per sample for Z interpolation
@@ -788,41 +744,40 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
else
{
const bool bMultisampleEnable = ((rastState.sampleCount > SWR_MULTISAMPLE_1X) || rastState.bForcedSampleCount) ? 1 : 0;
- const uint32_t centroid = ((pState->state.psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
+ const uint32_t centroid = ((psState.barycentricsMask & SWR_BARYCENTRIC_CENTROID_MASK) > 0) ? 1 : 0;
// currently only support 'normal' input coverage
- SWR_ASSERT(pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
- pState->state.psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
+ SWR_ASSERT(psState.inputCoverage == SWR_INPUT_COVERAGE_NORMAL ||
+ psState.inputCoverage == SWR_INPUT_COVERAGE_NONE);
- SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)pState->state.psState.barycentricsMask;
+ SWR_BARYCENTRICS_MASK barycentricsMask = (SWR_BARYCENTRICS_MASK)psState.barycentricsMask;
// select backend function
- switch(pState->state.psState.shadingRate)
+ switch(psState.shadingRate)
{
case SWR_SHADING_RATE_PIXEL:
if(bMultisampleEnable)
{
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][pState->state.psState.inputCoverage][centroid][forcedSampleCount];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendPixelRateTable[rastState.sampleCount][rastState.samplePattern][psState.inputCoverage][centroid][forcedSampleCount];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
}
else
{
// always need to generate I & J per pixel for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_PIXEL_MASK);
- backendFuncs.pfnBackend = gBackendSingleSample[pState->state.psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][SWR_MULTISAMPLE_1X];
+ backendFuncs.pfnBackend = gBackendSingleSample[psState.inputCoverage][centroid];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][SWR_MULTISAMPLE_1X];
}
break;
case SWR_SHADING_RATE_SAMPLE:
SWR_ASSERT(rastState.samplePattern == SWR_MSAA_STANDARD_PATTERN);
// always need to generate I & J per sample for Z interpolation
barycentricsMask = (SWR_BARYCENTRICS_MASK)(barycentricsMask | SWR_BARYCENTRIC_PER_SAMPLE_MASK);
- backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][pState->state.psState.inputCoverage][centroid];
- backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[pState->state.psState.numRenderTargets][pState->state.blendState.sampleCount];
+ backendFuncs.pfnBackend = gBackendSampleRateTable[rastState.sampleCount][psState.inputCoverage][centroid];
+ backendFuncs.pfnOutputMerger = gBackendOutputMergerTable[psState.numRenderTargets][pState->state.blendState.sampleCount];
break;
- case SWR_SHADING_RATE_COARSE:
default:
SWR_ASSERT(0 && "Invalid shading rate");
break;
@@ -913,7 +868,7 @@ void SetupPipeline(DRAW_CONTEXT *pDC)
uint32_t numRTs = pState->state.psState.numRenderTargets;
pState->state.colorHottileEnable = 0;
- if(pState->state.psState.pfnPixelShader != nullptr)
+ if (psState.pfnPixelShader != nullptr)
{
for (uint32_t rt = 0; rt < numRTs; ++rt)
{
@@ -1005,6 +960,11 @@ uint32_t MaxVertsPerDraw(
}
break;
+ // The Primitive Assembly code can only handle 1 RECT at a time.
+ case TOP_RECT_LIST:
+ vertsPerDraw = 3;
+ break;
+
default:
// We are not splitting up draws for other topologies.
break;
@@ -1305,7 +1265,10 @@ void SwrDrawIndexedInstanced(
DrawIndexedInstance(hContext, topology, numIndices, indexOffset, baseVertex, numInstances, startInstance);
}
-// Attach surfaces to pipeline
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrInvalidateTiles
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to invalidate.
void SwrInvalidateTiles(
HANDLE hContext,
uint32_t attachmentMask)
@@ -1313,10 +1276,39 @@ void SwrInvalidateTiles(
SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+ pDC->FeWork.type = DISCARDINVALIDATETILES;
+ pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+ pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+ memset(&pDC->FeWork.desc.discardInvalidateTiles.rect, 0, sizeof(SWR_RECT));
+ pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_INVALID;
+ pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = false;
+ pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = false;
+
+ //enqueue
+ QueueDraw(pContext);
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SwrDiscardRect(
+ HANDLE hContext,
+ uint32_t attachmentMask,
+ SWR_RECT rect)
+{
+ SWR_CONTEXT *pContext = (SWR_CONTEXT*)hContext;
+ DRAW_CONTEXT* pDC = GetDrawContext(pContext);
+
// Queue a load to the hottile
- pDC->FeWork.type = INVALIDATETILES;
- pDC->FeWork.pfnWork = ProcessInvalidateTiles;
- pDC->FeWork.desc.invalidateTiles.attachmentMask = attachmentMask;
+ pDC->FeWork.type = DISCARDINVALIDATETILES;
+ pDC->FeWork.pfnWork = ProcessDiscardInvalidateTiles;
+ pDC->FeWork.desc.discardInvalidateTiles.attachmentMask = attachmentMask;
+ pDC->FeWork.desc.discardInvalidateTiles.rect = rect;
+ pDC->FeWork.desc.discardInvalidateTiles.newTileState = SWR_TILE_RESOLVED;
+ pDC->FeWork.desc.discardInvalidateTiles.createNewTiles = true;
+ pDC->FeWork.desc.discardInvalidateTiles.fullTilesOnly = true;
//enqueue
QueueDraw(pContext);
@@ -1391,7 +1383,7 @@ void SwrClearRenderTarget(
uint32_t clearMask,
const float clearColor[4],
float z,
- BYTE stencil)
+ uint8_t stencil)
{
RDTSC_START(APIClearRenderTarget);
diff --git a/src/gallium/drivers/swr/rasterizer/core/api.h b/src/gallium/drivers/swr/rasterizer/core/api.h
index 72fae8b..90c2f03 100644
--- a/src/gallium/drivers/swr/rasterizer/core/api.h
+++ b/src/gallium/drivers/swr/rasterizer/core/api.h
@@ -53,7 +53,7 @@ typedef void(SWR_API *PFN_CALLBACK_FUNC)(uint64_t data, uint64_t data2, uint64_t
/// @param pDstHotTile - pointer to the hot tile surface
typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
- uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pDstHotTile);
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pDstHotTile);
//////////////////////////////////////////////////////////////////////////
/// @brief Function signature for store hot tiles
@@ -65,7 +65,7 @@ typedef void(SWR_API *PFN_LOAD_TILE)(HANDLE hPrivateContext, SWR_FORMAT dstForma
/// @param pSrcHotTile - pointer to the hot tile surface
typedef void(SWR_API *PFN_STORE_TILE)(HANDLE hPrivateContext, SWR_FORMAT srcFormat,
SWR_RENDERTARGET_ATTACHMENT renderTargetIndex,
- uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, BYTE *pSrcHotTile);
+ uint32_t x, uint32_t y, uint32_t renderTargetArrayIndex, uint8_t *pSrcHotTile);
/// @brief Function signature for clearing from the hot tiles clear value
/// @param hPrivateContext - handle to private data
@@ -77,6 +77,8 @@ typedef void(SWR_API *PFN_CLEAR_TILE)(HANDLE hPrivateContext,
SWR_RENDERTARGET_ATTACHMENT rtIndex,
uint32_t x, uint32_t y, const float* pClearColor);
+class BucketManager;
+
//////////////////////////////////////////////////////////////////////////
/// SWR_CREATECONTEXT_INFO
/////////////////////////////////////////////////////////////////////////
@@ -88,13 +90,17 @@ struct SWR_CREATECONTEXT_INFO
// Use SwrGetPrivateContextState() to access private state.
uint32_t privateStateSize;
- // Each SWR context can have multiple sets of active state
- uint32_t maxSubContexts;
-
- // tile manipulation functions
+ // Tile manipulation functions
PFN_LOAD_TILE pfnLoadTile;
PFN_STORE_TILE pfnStoreTile;
PFN_CLEAR_TILE pfnClearTile;
+
+ // Pointer to rdtsc buckets mgr returned to the caller.
+ // Only populated when KNOB_ENABLE_RDTSC is set
+ BucketManager* pBucketMgr;
+
+ // Output: size of the memory block that must be passed to SwrSaveState / SwrRestoreState
+ size_t contextSaveSize;
};
//////////////////////////////////////////////////////////////////////////
@@ -112,7 +118,7 @@ struct SWR_RECT
/// @brief Create SWR Context.
/// @param pCreateInfo - pointer to creation info.
HANDLE SWR_API SwrCreateContext(
- const SWR_CREATECONTEXT_INFO* pCreateInfo);
+ SWR_CREATECONTEXT_INFO* pCreateInfo);
//////////////////////////////////////////////////////////////////////////
/// @brief Destroys SWR Context.
@@ -121,12 +127,24 @@ void SWR_API SwrDestroyContext(
HANDLE hContext);
//////////////////////////////////////////////////////////////////////////
-/// @brief Set currently active state context
-/// @param subContextIndex - value from 0 to
-/// SWR_CREATECONTEXT_INFO.maxSubContexts. Defaults to 0.
-void SWR_API SwrSetActiveSubContext(
+/// @brief Saves API state associated with hContext
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pOutputStateBlock - Memory block to receive API state data
+/// @param memSize - Size of memory pointed to by pOutputStateBlock
+void SWR_API SwrSaveState(
HANDLE hContext,
- uint32_t subContextIndex);
+ void* pOutputStateBlock,
+ size_t memSize);
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief Restores API state to hContext previously saved with SwrSaveState
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param pStateBlock - Memory block to read API state data from
+/// @param memSize - Size of memory pointed to by pStateBlock
+void SWR_API SwrRestoreState(
+ HANDLE hContext,
+ const void* pStateBlock,
+ size_t memSize);
//////////////////////////////////////////////////////////////////////////
/// @brief Sync cmd. Executes the callback func when all rendering up to this sync
@@ -391,6 +409,16 @@ void SWR_API SwrInvalidateTiles(
uint32_t attachmentMask);
//////////////////////////////////////////////////////////////////////////
+/// @brief SwrDiscardRect
+/// @param hContext - Handle passed back from SwrCreateContext
+/// @param attachmentMask - The mask specifies which surfaces attached to the hottiles to discard.
+/// @param rect - if rect is all zeros, the entire attachment surface will be discarded
+void SWR_API SwrDiscardRect(
+ HANDLE hContext,
+ uint32_t attachmentMask,
+ SWR_RECT rect);
+
+//////////////////////////////////////////////////////////////////////////
/// @brief SwrDispatch
/// @param hContext - Handle passed back from SwrCreateContext
/// @param threadGroupCountX - Number of thread groups dispatched in X direction
@@ -419,9 +447,9 @@ void SWR_API SwrStoreTiles(
void SWR_API SwrClearRenderTarget(
HANDLE hContext,
uint32_t clearMask,
- const FLOAT clearColor[4],
+ const float clearColor[4],
float z,
- BYTE stencil);
+ uint8_t stencil);
void SWR_API SwrSetRastState(
HANDLE hContext,
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.cpp b/src/gallium/drivers/swr/rasterizer/core/arena.cpp
deleted file mode 100644
index 8184c8d..0000000
--- a/src/gallium/drivers/swr/rasterizer/core/arena.cpp
+++ /dev/null
@@ -1,166 +0,0 @@
-/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
-*
-* Permission is hereby granted, free of charge, to any person obtaining a
-* copy of this software and associated documentation files (the "Software"),
-* to deal in the Software without restriction, including without limitation
-* the rights to use, copy, modify, merge, publish, distribute, sublicense,
-* and/or sell copies of the Software, and to permit persons to whom the
-* Software is furnished to do so, subject to the following conditions:
-*
-* The above copyright notice and this permission notice (including the next
-* paragraph) shall be included in all copies or substantial portions of the
-* Software.
-*
-* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
-* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
-* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
-* IN THE SOFTWARE.
-*
-* @file arena.cpp
-*
-* @brief Arena memory manager
-* The arena is convenient and fast for managing allocations for any of
-* our allocations that are associated with operations and can all be freed
-* once when their operation has completed. Allocations are cheap since
-* most of the time its simply an increment of an offset. Also, no need to
-* free individual allocations. All of the arena memory can be freed at once.
-*
-******************************************************************************/
-
-#include "context.h"
-#include "arena.h"
-
-#include <cmath>
-
-Arena::Arena()
- : m_pCurBlock(nullptr), m_size(0)
-{
- m_pMutex = new std::mutex();
-}
-
-Arena::~Arena()
-{
- Reset(); // Reset just in case to avoid leaking memory.
-
- if (m_pCurBlock)
- {
- _aligned_free(m_pCurBlock->pMem);
- delete m_pCurBlock;
- }
-
- delete m_pMutex;
-}
-
-///@todo Remove this when all users have stopped using this.
-void Arena::Init()
-{
- m_size = 0;
- m_pCurBlock = nullptr;
-
- m_pMutex = new std::mutex();
-}
-
-void* Arena::AllocAligned(size_t size, size_t align)
-{
- if (m_pCurBlock)
- {
- ArenaBlock* pCurBlock = m_pCurBlock;
- pCurBlock->offset = AlignUp(pCurBlock->offset, align);
-
- if ((pCurBlock->offset + size) <= pCurBlock->blockSize)
- {
- void* pMem = PtrAdd(pCurBlock->pMem, pCurBlock->offset);
- pCurBlock->offset += size;
- m_size += size;
- return pMem;
- }
-
- // Not enough memory in this block, fall through to allocate
- // a new block
- }
-
- static const size_t ArenaBlockSize = 1024*1024;
- size_t blockSize = std::max(m_size + ArenaBlockSize, std::max(size, ArenaBlockSize));
- blockSize = AlignUp(blockSize, KNOB_SIMD_WIDTH*4);
-
- void *pMem = _aligned_malloc(blockSize, KNOB_SIMD_WIDTH*4); // Arena blocks are always simd byte aligned.
- SWR_ASSERT(pMem != nullptr);
-
- ArenaBlock* pNewBlock = new (std::nothrow) ArenaBlock();
- SWR_ASSERT(pNewBlock != nullptr);
-
- if (pNewBlock != nullptr)
- {
- pNewBlock->pNext = m_pCurBlock;
-
- m_pCurBlock = pNewBlock;
- m_pCurBlock->pMem = pMem;
- m_pCurBlock->blockSize = blockSize;
-
- }
-
- return AllocAligned(size, align);
-}
-
-void* Arena::Alloc(size_t size)
-{
- return AllocAligned(size, 1);
-}
-
-void* Arena::AllocAlignedSync(size_t size, size_t align)
-{
- void* pAlloc = nullptr;
-
- SWR_ASSERT(m_pMutex != nullptr);
-
- m_pMutex->lock();
- pAlloc = AllocAligned(size, align);
- m_pMutex->unlock();
-
- return pAlloc;
-}
-
-void* Arena::AllocSync(size_t size)
-{
- void* pAlloc = nullptr;
-
- SWR_ASSERT(m_pMutex != nullptr);
-
- m_pMutex->lock();
- pAlloc = Alloc(size);
- m_pMutex->unlock();
-
- return pAlloc;
-}
-
-void Arena::Reset(bool removeAll)
-{
- if (m_pCurBlock)
- {
- m_pCurBlock->offset = 0;
-
- ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
- m_pCurBlock->pNext = nullptr;
- while(pUsedBlocks)
- {
- ArenaBlock* pBlock = pUsedBlocks;
- pUsedBlocks = pBlock->pNext;
-
- _aligned_free(pBlock->pMem);
- delete pBlock;
- }
-
- if (removeAll)
- {
- _aligned_free(m_pCurBlock->pMem);
- delete m_pCurBlock;
- m_pCurBlock = nullptr;
- }
- }
-
- m_size = 0;
-}
diff --git a/src/gallium/drivers/swr/rasterizer/core/arena.h b/src/gallium/drivers/swr/rasterizer/core/arena.h
index 76eee11..7f635b8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/arena.h
+++ b/src/gallium/drivers/swr/rasterizer/core/arena.h
@@ -33,37 +33,307 @@
#pragma once
#include <mutex>
+#include <algorithm>
+#include <atomic>
+#include "core/utils.h"
-class Arena
+class DefaultAllocator
{
public:
- Arena();
- ~Arena();
+ void* AllocateAligned(size_t size, size_t align)
+ {
+ void* p = _aligned_malloc(size, align);
+ return p;
+ }
+ void Free(void* pMem)
+ {
+ _aligned_free(pMem);
+ }
+};
- void Init();
+static const size_t ARENA_BLOCK_SHIFT = 5;
+static const size_t ARENA_BLOCK_ALIGN = KNOB_SIMD_WIDTH * 4;
+static_assert((1U << ARENA_BLOCK_SHIFT) == ARENA_BLOCK_ALIGN,
+ "Invalid value for ARENA_BLOCK_ALIGN/SHIFT");
- void* AllocAligned(size_t size, size_t align);
- void* Alloc(size_t size);
+struct ArenaBlock
+{
+ size_t blockSize = 0;
+ ArenaBlock* pNext = nullptr;
+};
+static_assert(sizeof(ArenaBlock) <= ARENA_BLOCK_ALIGN,
+ "Increase BLOCK_ALIGN size");
- void* AllocAlignedSync(size_t size, size_t align);
- void* AllocSync(size_t size);
+// Caching Allocator for Arena
+template<uint32_t NumBucketsT = 1, uint32_t StartBucketBitT = 20>
+struct CachingAllocatorT : DefaultAllocator
+{
+ static uint32_t GetBucketId(size_t blockSize)
+ {
+ uint32_t bucketId = 0;
- void Reset(bool removeAll = false);
- size_t Size() { return m_size; }
+#if defined(BitScanReverseSizeT)
+ BitScanReverseSizeT((unsigned long*)&bucketId, blockSize >> CACHE_START_BUCKET_BIT);
+ bucketId = std::min<uint32_t>(bucketId, CACHE_NUM_BUCKETS - 1);
+#endif
-private:
+ return bucketId;
+ }
+
+ void* AllocateAligned(size_t size, size_t align)
+ {
+ SWR_ASSERT(size >= sizeof(ArenaBlock));
+ SWR_ASSERT(size <= uint32_t(-1));
+
+ size_t blockSize = size - ARENA_BLOCK_ALIGN;
+
+ {
+ // search cached blocks
+ std::lock_guard<std::mutex> l(m_mutex);
+ ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(blockSize)];
+ ArenaBlock* pBlock = pPrevBlock->pNext;
+ ArenaBlock* pPotentialBlock = nullptr;
+ ArenaBlock* pPotentialPrev = nullptr;
+
+ while (pBlock)
+ {
+ if (pBlock->blockSize >= blockSize)
+ {
+ if (pBlock == AlignUp(pBlock, align))
+ {
+ if (pBlock->blockSize == blockSize)
+ {
+ // Won't find a better match
+ break;
+ }
+
+ // We could use this as it is larger than we wanted, but
+ // continue to search for a better match
+ pPotentialBlock = pBlock;
+ pPotentialPrev = pPrevBlock;
+ }
+ }
+ else
+ {
+ // Blocks are sorted by size (biggest first)
+ // So, if we get here, there are no blocks
+ // large enough, fall through to allocation.
+ pBlock = nullptr;
+ break;
+ }
+
+ pPrevBlock = pBlock;
+ pBlock = pBlock->pNext;
+ }
+
+ if (!pBlock)
+ {
+ // Couldn't find an exact match, use next biggest size
+ pBlock = pPotentialBlock;
+ pPrevBlock = pPotentialPrev;
+ }
+
+ if (pBlock)
+ {
+ SWR_ASSERT(pPrevBlock && pPrevBlock->pNext == pBlock);
+ pPrevBlock->pNext = pBlock->pNext;
+ pBlock->pNext = nullptr;
- struct ArenaBlock
+ return pBlock;
+ }
+
+ m_totalAllocated += size;
+
+#if 0
+ {
+ static uint32_t count = 0;
+ char buf[128];
+ sprintf_s(buf, "Arena Alloc %d 0x%llx bytes - 0x%llx total\n", ++count, uint64_t(size), uint64_t(m_totalAllocated));
+ OutputDebugStringA(buf);
+ }
+#endif
+ }
+
+ return this->DefaultAllocator::AllocateAligned(size, align);
+ }
+
+ void Free(void* pMem)
+ {
+ if (pMem)
+ {
+ ArenaBlock* pNewBlock = reinterpret_cast<ArenaBlock*>(pMem);
+ SWR_ASSERT(pNewBlock->blockSize >= 0);
+
+ std::unique_lock<std::mutex> l(m_mutex);
+ ArenaBlock* pPrevBlock = &m_cachedBlocks[GetBucketId(pNewBlock->blockSize)];
+ ArenaBlock* pBlock = pPrevBlock->pNext;
+
+ while (pBlock)
+ {
+ if (pNewBlock->blockSize >= pBlock->blockSize)
+ {
+ // Insert here
+ break;
+ }
+ pPrevBlock = pBlock;
+ pBlock = pBlock->pNext;
+ }
+
+ // Insert into list
+ SWR_ASSERT(pPrevBlock);
+ pPrevBlock->pNext = pNewBlock;
+ pNewBlock->pNext = pBlock;
+ }
+ }
+
+ ~CachingAllocatorT()
{
- void* pMem = nullptr;
- size_t blockSize = 0;
- size_t offset = 0;
- ArenaBlock* pNext = nullptr;
- };
+ // Free all cached blocks
+ for (uint32_t i = 0; i < CACHE_NUM_BUCKETS; ++i)
+ {
+ ArenaBlock* pBlock = m_cachedBlocks[i].pNext;
+ while (pBlock)
+ {
+ ArenaBlock* pNext = pBlock->pNext;
+ this->DefaultAllocator::Free(pBlock);
+ pBlock = pNext;
+ }
+ }
+ }
- ArenaBlock* m_pCurBlock = nullptr;
- size_t m_size = 0;
+ // buckets, for block sizes < (1 << (start+1)), < (1 << (start+2)), ...
+ static const uint32_t CACHE_NUM_BUCKETS = NumBucketsT;
+ static const uint32_t CACHE_START_BUCKET_BIT = StartBucketBitT;
+
+ ArenaBlock m_cachedBlocks[CACHE_NUM_BUCKETS];
+ std::mutex m_mutex;
+
+ size_t m_totalAllocated = 0;
+};
+typedef CachingAllocatorT<> CachingAllocator;
+
+template<typename T = DefaultAllocator, size_t BlockSizeT = (128 * 1024)>
+class TArena
+{
+public:
+ TArena(T& in_allocator) : m_allocator(in_allocator) {}
+ TArena() : m_allocator(m_defAllocator) {}
+ ~TArena()
+ {
+ Reset(true);
+ }
+
+ void* AllocAligned(size_t size, size_t align)
+ {
+ SWR_ASSERT(size);
+ SWR_ASSERT(align <= ARENA_BLOCK_ALIGN);
+
+ if (m_pCurBlock)
+ {
+ ArenaBlock* pCurBlock = m_pCurBlock;
+ size_t offset = AlignUp(m_offset, align);
+
+ if ((offset + size) <= pCurBlock->blockSize)
+ {
+ void* pMem = PtrAdd(pCurBlock, offset + ARENA_BLOCK_ALIGN);
+ m_offset = offset + size;
+ return pMem;
+ }
+
+ // Not enough memory in this block, fall through to allocate
+ // a new block
+ }
+
+ static const size_t ArenaBlockSize = BlockSizeT - ARENA_BLOCK_ALIGN;
+ size_t blockSize = std::max(size, ArenaBlockSize);
+
+ // Round the payload up to BLOCK_ALIGN; one extra BLOCK_ALIGN unit is added to the allocation below to hold the ArenaBlock header.
+ blockSize = AlignUp(blockSize, ARENA_BLOCK_ALIGN);
+
+ void *pMem = m_allocator.AllocateAligned(blockSize + ARENA_BLOCK_ALIGN, ARENA_BLOCK_ALIGN); // Arena blocks are always simd byte aligned.
+ SWR_ASSERT(pMem != nullptr);
+
+ ArenaBlock* pNewBlock = new (pMem) ArenaBlock();
+
+ if (pNewBlock != nullptr)
+ {
+ m_offset = 0;
+ pNewBlock->pNext = m_pCurBlock;
+
+ m_pCurBlock = pNewBlock;
+ m_pCurBlock->blockSize = blockSize;
+ }
+
+ return AllocAligned(size, align);
+ }
+
+ void* Alloc(size_t size)
+ {
+ return AllocAligned(size, 1);
+ }
+
+ void* AllocAlignedSync(size_t size, size_t align)
+ {
+ void* pAlloc = nullptr;
+
+ m_mutex.lock();
+ pAlloc = AllocAligned(size, align);
+ m_mutex.unlock();
+
+ return pAlloc;
+ }
+
+ void* AllocSync(size_t size)
+ {
+ void* pAlloc = nullptr;
+
+ m_mutex.lock();
+ pAlloc = Alloc(size);
+ m_mutex.unlock();
+
+ return pAlloc;
+ }
+
+ void Reset(bool removeAll = false)
+ {
+ m_offset = 0;
+
+ if (m_pCurBlock)
+ {
+ ArenaBlock *pUsedBlocks = m_pCurBlock->pNext;
+ m_pCurBlock->pNext = nullptr;
+ while (pUsedBlocks)
+ {
+ ArenaBlock* pBlock = pUsedBlocks;
+ pUsedBlocks = pBlock->pNext;
+
+ m_allocator.Free(pBlock);
+ }
+
+ if (removeAll)
+ {
+ m_allocator.Free(m_pCurBlock);
+ m_pCurBlock = nullptr;
+ }
+ }
+ }
+
+ bool IsEmpty()
+ {
+ return (m_pCurBlock == nullptr) || (m_offset == 0 && m_pCurBlock->pNext == nullptr);
+ }
+
+private:
+
+ ArenaBlock* m_pCurBlock = nullptr;
+ size_t m_offset = 0;
/// @note Mutex is only used by sync allocation functions.
- std::mutex* m_pMutex;
+ std::mutex m_mutex;
+
+ DefaultAllocator m_defAllocator;
+ T& m_allocator;
};
+
+using StdArena = TArena<DefaultAllocator>;
+using CachingArena = TArena<CachingAllocator>;
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.cpp b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
index 4a472bc..95110af 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.cpp
@@ -156,7 +156,7 @@ void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTil
}
template<SWR_FORMAT format>
-void ClearRasterTile(BYTE *pTileBuffer, simdvector &value)
+void ClearRasterTile(uint8_t *pTileBuffer, simdvector &value)
{
auto lambda = [&](int comp)
{
@@ -299,10 +299,10 @@ void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, vo
/// @todo clear data should come in as RGBA32_FLOAT
DWORD clearData[4];
float clearFloat[4];
- clearFloat[0] = ((BYTE*)(&pClear->clearRTColor))[0] / 255.0f;
- clearFloat[1] = ((BYTE*)(&pClear->clearRTColor))[1] / 255.0f;
- clearFloat[2] = ((BYTE*)(&pClear->clearRTColor))[2] / 255.0f;
- clearFloat[3] = ((BYTE*)(&pClear->clearRTColor))[3] / 255.0f;
+ clearFloat[0] = ((uint8_t*)(&pClear->clearRTColor))[0] / 255.0f;
+ clearFloat[1] = ((uint8_t*)(&pClear->clearRTColor))[1] / 255.0f;
+ clearFloat[2] = ((uint8_t*)(&pClear->clearRTColor))[2] / 255.0f;
+ clearFloat[3] = ((uint8_t*)(&pClear->clearRTColor))[3] / 255.0f;
clearData[0] = *(DWORD*)&clearFloat[0];
clearData[1] = *(DWORD*)&clearFloat[1];
clearData[2] = *(DWORD*)&clearFloat[2];
@@ -399,30 +399,32 @@ void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile
}
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData)
{
- INVALIDATE_TILES_DESC *pDesc = (INVALIDATE_TILES_DESC*)pData;
+ DISCARD_INVALIDATE_TILES_DESC *pDesc = (DISCARD_INVALIDATE_TILES_DESC *)pData;
SWR_CONTEXT *pContext = pDC->pContext;
+ const int numSamples = GetNumSamples(pDC->pState->state.rastState.sampleCount);
+
for (uint32_t i = 0; i < SWR_NUM_ATTACHMENTS; ++i)
{
if (pDesc->attachmentMask & (1 << i))
{
- HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTile(pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, false);
+ HOTTILE *pHotTile = pContext->pHotTileMgr->GetHotTileNoLoad(
+ pContext, pDC, macroTile, (SWR_RENDERTARGET_ATTACHMENT)i, pDesc->createNewTiles, numSamples);
if (pHotTile)
{
- pHotTile->state = HOTTILE_INVALID;
+ pHotTile->state = (HOTTILE_STATE)pDesc->newTileState;
}
}
}
}
#if KNOB_SIMD_WIDTH == 8
-const __m256 vQuadCenterOffsetsX = { 0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5 };
-const __m256 vQuadCenterOffsetsY = { 0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5 };
-const __m256 vQuadULOffsetsX ={0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
-const __m256 vQuadULOffsetsY ={0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
-#define MASK 0xff
+const __m256 vCenterOffsetsX = {0.5, 1.5, 0.5, 1.5, 2.5, 3.5, 2.5, 3.5};
+const __m256 vCenterOffsetsY = {0.5, 0.5, 1.5, 1.5, 0.5, 0.5, 1.5, 1.5};
+const __m256 vULOffsetsX = {0.0, 1.0, 0.0, 1.0, 2.0, 3.0, 2.0, 3.0};
+const __m256 vULOffsetsY = {0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 1.0, 1.0};
#else
#error Unsupported vector width
#endif
@@ -457,155 +459,6 @@ simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscala
return _simd_movemask_ps(vClipMask);
}
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
-{
-
- // will need to update for avx512
- assert(KNOB_SIMD_WIDTH == 8);
-
- __m256i mask[2];
- __m256i sampleCoverage[2];
- if(bIsStandardPattern)
- {
- __m256i src = _mm256_set1_epi32(0);
- __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
-
- if(MultisampleTraits<sampleCountT>::numSamples == 1)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 2)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 4)
- {
- mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 8)
- {
- mask[0] = _mm256_set1_epi32(-1);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 16)
- {
- mask[0] = _mm256_set1_epi32(-1);
- mask[1] = _mm256_set1_epi32(-1);
- index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
- }
-
- // gather coverage for samples 0-7
- sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // gather coverage for samples 8-15
- sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
- }
- }
- else
- {
- // center coverage is the same for all samples; just broadcast to the sample slots
- uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
- if(MultisampleTraits<sampleCountT>::numSamples == 1)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 2)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 4)
- {
- sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 8)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- }
- else if(MultisampleTraits<sampleCountT>::numSamples == 16)
- {
- sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
- sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
- }
- }
-
- mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
- // pull out the the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
- __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
-
- __m256i packedCoverage1;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // pull out the the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
- packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
- }
-
-#if (KNOB_ARCH == KNOB_ARCH_AVX)
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
- __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
-
- __m256i packedSampleCoverage;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
- shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
- shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
- packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
- packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
-#else
- __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
- // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
- packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
-
- __m256i packedSampleCoverage;
- if(MultisampleTraits<sampleCountT>::numSamples > 8)
- {
- permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
- // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
- packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
-
- // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
- packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
- }
- else
- {
- packedSampleCoverage = packedCoverage0;
- }
-#endif
-
- for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
- {
- // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
- inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
-
- if(!bForcedSampleCount)
- {
- // input coverage has to be anded with sample mask if MSAA isn't forced on
- inputMask[i] &= sampleMask;
- }
-
- // shift to the next pixel in the 4x2
- packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
- }
-}
-
-template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
-INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
-{
- uint32_t inputMask[KNOB_SIMD_WIDTH];
- generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
- inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
-}
-
template<bool perspMask>
INLINE void CalcPixelBarycentrics(const BarycentricCoeffs& coeffs, SWR_PS_CONTEXT &psContext)
{
@@ -766,6 +619,8 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
// type safety guaranteed from template instantiation in BEChooser<>::GetFunc
static const SWR_MULTISAMPLE_COUNT sampleCount = (SWR_MULTISAMPLE_COUNT)sampleCountT;
uint32_t rasterTileColorOffset = MultisampleTraits<sampleCount>::RasterTileColorOffset(sample);
+ simdvector blendOut;
+
for(uint32_t rt = 0; rt < NumRT; ++rt)
{
uint8_t *pColorSample;
@@ -779,6 +634,9 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
}
const SWR_RENDER_TARGET_BLEND_STATE *pRTBlend = &pBlendState->renderTarget[rt];
+ // pfnBlendFunc may not update all channels. Initialize with PS output.
+ /// TODO: move this into the blend JIT.
+ blendOut = psContext.shaded[rt];
// Blend outputs and update coverage mask for alpha test
if(pfnBlendFunc[rt] != nullptr)
@@ -789,7 +647,7 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
psContext.shaded[1],
sample,
pColorSample,
- psContext.shaded[rt],
+ blendOut,
&psContext.oMask,
(simdscalari*)&coverageMask);
}
@@ -805,19 +663,19 @@ void OutputMerger(SWR_PS_CONTEXT &psContext, uint8_t* (&pColorBase)[SWR_NUM_REND
// store with color mask
if(!pRTBlend->writeDisableRed)
{
- _simd_maskstore_ps((float*)pColorSample, outputMask, psContext.shaded[rt].x);
+ _simd_maskstore_ps((float*)pColorSample, outputMask, blendOut.x);
}
if(!pRTBlend->writeDisableGreen)
{
- _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, psContext.shaded[rt].y);
+ _simd_maskstore_ps((float*)(pColorSample + simd), outputMask, blendOut.y);
}
if(!pRTBlend->writeDisableBlue)
{
- _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, psContext.shaded[rt].z);
+ _simd_maskstore_ps((float*)(pColorSample + simd * 2), outputMask, blendOut.z);
}
if(!pRTBlend->writeDisableAlpha)
{
- _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, psContext.shaded[rt].w);
+ _simd_maskstore_ps((float*)(pColorSample + simd * 3), outputMask, blendOut.w);
}
}
}
@@ -884,9 +742,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
// pixel center
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
@@ -898,9 +756,9 @@ void BackendSingleSample(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint3
if(coverageMask & MASK)
{
RDTSC_START(BEBarycentric);
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
@@ -1077,15 +935,15 @@ void BackendSampleRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
// pixel center
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// pixel center
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
RDTSC_START(BEBarycentric);
backendFuncs.pfnCalcPixelBarycentrics(coeffs, psContext);
@@ -1313,14 +1171,14 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
for(uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
- psContext.vY.UL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
- psContext.vY.center = _simd_add_ps(vQuadCenterOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.UL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
+ psContext.vY.center = _simd_add_ps(vCenterOffsetsY, _simd_set1_ps((float)yy));
for(uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
- simdscalar vZ[MultisampleTraits<sampleCount>::numSamples];
- psContext.vX.UL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ simdscalar vZ[MultisampleTraits<sampleCount>::numSamples]{ 0 };
+ psContext.vX.UL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// set pixel center positions
- psContext.vX.center = _simd_add_ps(vQuadCenterOffsetsX, _simd_set1_ps((float)xx));
+ psContext.vX.center = _simd_add_ps(vCenterOffsetsX, _simd_set1_ps((float)xx));
if (bInputCoverage)
{
@@ -1353,7 +1211,7 @@ void BackendPixelRate(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t
}
else
{
- psContext.activeMask = _simd_set1_epi32(-1);
+ psContext.activeMask = _simd_set1_epi32(-1);
}
// need to declare enough space for all samples
@@ -1555,6 +1413,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
SWR_CONTEXT *pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
const BACKEND_FUNCS& backendFuncs = pDC->pState->backendFuncs;
+ const SWR_RASTSTATE& rastState = pDC->pState->state.rastState;
// broadcast scalars
BarycentricCoeffs coeffs;
@@ -1572,7 +1431,7 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
coeffs.vRecipDet = _simd_broadcast_ss(&work.recipDet);
- BYTE *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
+ uint8_t *pDepthBase = renderBuffers.pDepth, *pStencilBase = renderBuffers.pStencil;
RDTSC_STOP(BESetup, 0, 0);
@@ -1580,12 +1439,12 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
for (uint32_t yy = y; yy < y + KNOB_TILE_Y_DIM; yy += SIMD_TILE_Y_DIM)
{
// UL pixel corner
- simdscalar vYSamplePosUL = _simd_add_ps(vQuadULOffsetsY, _simd_set1_ps((float)yy));
+ simdscalar vYSamplePosUL = _simd_add_ps(vULOffsetsY, _simd_set1_ps((float)yy));
for (uint32_t xx = x; xx < x + KNOB_TILE_X_DIM; xx += SIMD_TILE_X_DIM)
{
// UL pixel corners
- simdscalar vXSamplePosUL = _simd_add_ps(vQuadULOffsetsX, _simd_set1_ps((float)xx));
+ simdscalar vXSamplePosUL = _simd_add_ps(vULOffsetsX, _simd_set1_ps((float)xx));
// iterate over active samples
unsigned long sample = 0;
@@ -1593,7 +1452,8 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
while (_BitScanForward(&sample, sampleMask))
{
sampleMask &= ~(1 << sample);
- if (work.coverageMask[sample] & MASK)
+ simdmask coverageMask = work.coverageMask[sample] & MASK;
+ if (coverageMask)
{
RDTSC_START(BEBarycentric);
// calculate per sample positions
@@ -1607,7 +1467,14 @@ void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y,
RDTSC_STOP(BEBarycentric, 0, 0);
- simdscalar vCoverageMask = vMask(work.coverageMask[sample] & MASK);
+ // interpolate user clip distance if available
+ if (rastState.clipDistanceMask)
+ {
+ coverageMask &= ~ComputeUserClipMask(rastState.clipDistanceMask, work.pUserClipBuffer,
+ psContext.vI.sample, psContext.vJ.sample);
+ }
+
+ simdscalar vCoverageMask = vMask(coverageMask);
simdscalar stencilPassMask = vCoverageMask;
// offset depth/stencil buffers current sample
diff --git a/src/gallium/drivers/swr/rasterizer/core/backend.h b/src/gallium/drivers/swr/rasterizer/core/backend.h
index 53089e5..2fa1895 100644
--- a/src/gallium/drivers/swr/rasterizer/core/backend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/backend.h
@@ -29,16 +29,20 @@
#pragma once
#include "common/os.h"
-#include "core/context.h"
+#include "core/context.h"
+#include "core/multisample.h"
void ProcessComputeBE(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t threadGroupId);
void ProcessSyncBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessQueryStatsBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessClearBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pUserData);
void ProcessStoreTileBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
-void ProcessInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
+void ProcessDiscardInvalidateTilesBE(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t macroTile, void *pData);
void BackendNullPS(DRAW_CONTEXT *pDC, uint32_t workerId, uint32_t x, uint32_t y, SWR_TRIANGLE_DESC &work, RenderOutputBuffers &renderBuffers);
void InitClearTilesTable();
+simdmask ComputeUserClipMask(uint8_t clipMask, float* pUserClipBuffer, simdscalar vI, simdscalar vJ);
+void InitBackendFuncTables();
+void InitCPSFuncTables();
enum SWR_BACKEND_FUNCS
{
@@ -47,13 +51,160 @@ enum SWR_BACKEND_FUNCS
SWR_BACKEND_MSAA_SAMPLE_RATE,
SWR_BACKEND_FUNCS_MAX,
};
-void InitBackendFuncTables();
-extern PFN_BACKEND_FUNC gBackendNullPs[SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_BACKEND_FUNC gBackendSingleSample[2][2];
-extern PFN_BACKEND_FUNC gBackendPixelRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_MSAA_SAMPLE_PATTERN_MAX][SWR_INPUT_COVERAGE_MAX][2][2];
-extern PFN_BACKEND_FUNC gBackendSampleRateTable[SWR_MULTISAMPLE_TYPE_MAX][SWR_INPUT_COVERAGE_MAX][2];
-extern PFN_OUTPUT_MERGER gBackendOutputMergerTable[SWR_NUM_RENDERTARGETS+1][SWR_MULTISAMPLE_TYPE_MAX];
-extern PFN_CALC_PIXEL_BARYCENTRICS gPixelBarycentricTable[2];
-extern PFN_CALC_SAMPLE_BARYCENTRICS gSampleBarycentricTable[2];
-extern PFN_CALC_CENTROID_BARYCENTRICS gCentroidBarycentricTable[SWR_MULTISAMPLE_TYPE_MAX][2][2][2];
+#if KNOB_SIMD_WIDTH == 8
+extern const __m256 vCenterOffsetsX;
+extern const __m256 vCenterOffsetsY;
+extern const __m256 vULOffsetsX;
+extern const __m256 vULOffsetsY;
+#define MASK 0xff
+#endif
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, uint32_t (&inputMask)[KNOB_SIMD_WIDTH], const uint32_t sampleMask)
+{
+
+ // will need to update for avx512
+ assert(KNOB_SIMD_WIDTH == 8);
+
+ __m256i mask[2];
+ __m256i sampleCoverage[2];
+ if(bIsStandardPattern)
+ {
+ __m256i src = _mm256_set1_epi32(0);
+ __m256i index0 = _mm256_set_epi32(7, 6, 5, 4, 3, 2, 1, 0), index1;
+
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ mask[0] = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ mask[0] = _mm256_set1_epi32(-1);
+ mask[1] = _mm256_set1_epi32(-1);
+ index1 = _mm256_set_epi32(15, 14, 13, 12, 11, 10, 9, 8);
+ }
+
+ // gather coverage for samples 0-7
+ sampleCoverage[0] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index0, _mm256_castsi256_ps(mask[0]), 8));
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // gather coverage for samples 8-15
+ sampleCoverage[1] = _mm256_castps_si256(_simd_mask_i32gather_ps(_mm256_castsi256_ps(src), (const float*)coverageMask, index1, _mm256_castsi256_ps(mask[1]), 8));
+ }
+ }
+ else
+ {
+ // center coverage is the same for all samples; just broadcast to the sample slots
+ uint32_t centerCoverage = ((uint32_t)(*coverageMask) & MASK);
+ if(MultisampleTraits<sampleCountT>::numSamples == 1)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, 0, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 2)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, 0, 0, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 4)
+ {
+ sampleCoverage[0] = _mm256_set_epi32(0, 0, 0, 0, centerCoverage, centerCoverage, centerCoverage, centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 8)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ }
+ else if(MultisampleTraits<sampleCountT>::numSamples == 16)
+ {
+ sampleCoverage[0] = _mm256_set1_epi32(centerCoverage);
+ sampleCoverage[1] = _mm256_set1_epi32(centerCoverage);
+ }
+ }
+
+ mask[0] = _mm256_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0,
+ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0xC, 0x8, 0x4, 0x0);
+ // pull out the 8bit 4x2 coverage for samples 0-7 into the lower 32 bits of each 128bit lane
+ __m256i packedCoverage0 = _simd_shuffle_epi8(sampleCoverage[0], mask[0]);
+
+ __m256i packedCoverage1;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pull out the 8bit 4x2 coverage for samples 8-15 into the lower 32 bits of each 128bit lane
+ packedCoverage1 = _simd_shuffle_epi8(sampleCoverage[1], mask[0]);
+ }
+
+#if (KNOB_ARCH == KNOB_ARCH_AVX)
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ __m256i hiToLow = _mm256_permute2f128_si256(packedCoverage0, packedCoverage0, 0x83);
+ __m256 shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ packedCoverage0 = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), shufRes, 0xFE));
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ hiToLow = _mm256_permute2f128_si256(packedCoverage1, packedCoverage1, 0x83);
+ shufRes = _mm256_shuffle_ps(_mm256_castsi256_ps(hiToLow), _mm256_castsi256_ps(hiToLow), _MM_SHUFFLE(1, 1, 0, 1));
+ shufRes = _mm256_blend_ps(_mm256_castsi256_ps(packedCoverage1), shufRes, 0xFE);
+ packedCoverage1 = _mm256_castps_si256(_mm256_castpd_ps(_mm256_shuffle_pd(_mm256_castps_pd(shufRes), _mm256_castps_pd(shufRes), 0x01)));
+ packedSampleCoverage = _mm256_castps_si256(_mm256_blend_ps(_mm256_castsi256_ps(packedCoverage0), _mm256_castsi256_ps(packedCoverage1), 0xFC));
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#else
+ __m256i permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x7, 0x7, 0x4, 0x0);
+ // pack lower 32 bits of each 128 bit lane into lower 64 bits of single 128 bit lane
+ packedCoverage0 = _mm256_permutevar8x32_epi32(packedCoverage0, permMask);
+
+ __m256i packedSampleCoverage;
+ if(MultisampleTraits<sampleCountT>::numSamples > 8)
+ {
+ permMask = _mm256_set_epi32(0x7, 0x7, 0x7, 0x7, 0x4, 0x0, 0x7, 0x7);
+ // pack lower 32 bits of each 128 bit lane into upper 64 bits of single 128 bit lane
+ packedCoverage1 = _mm256_permutevar8x32_epi32(packedCoverage1, permMask);
+
+ // blend coverage masks for samples 0-7 and samples 8-15 into single 128 bit lane
+ packedSampleCoverage = _mm256_blend_epi32(packedCoverage0, packedCoverage1, 0x0C);
+ }
+ else
+ {
+ packedSampleCoverage = packedCoverage0;
+ }
+#endif
+
+ for(int32_t i = KNOB_SIMD_WIDTH - 1; i >= 0; i--)
+ {
+ // convert packed sample coverage masks into single coverage masks for all samples for each pixel in the 4x2
+ inputMask[i] = _simd_movemask_epi8(packedSampleCoverage);
+
+ if(!bForcedSampleCount)
+ {
+ // input coverage has to be anded with sample mask if MSAA isn't forced on
+ inputMask[i] &= sampleMask;
+ }
+
+ // shift to the next pixel in the 4x2
+ packedSampleCoverage = _simd_slli_epi32(packedSampleCoverage, 1);
+ }
+}
+
+template<SWR_MULTISAMPLE_COUNT sampleCountT, bool bIsStandardPattern, bool bForcedSampleCount>
+INLINE void generateInputCoverage(const uint64_t *const coverageMask, __m256 &inputCoverage, const uint32_t sampleMask)
+{
+ uint32_t inputMask[KNOB_SIMD_WIDTH];
+ generateInputCoverage<sampleCountT, bIsStandardPattern, bForcedSampleCount>(coverageMask, inputMask, sampleMask);
+ inputCoverage = _simd_castsi_ps(_mm256_set_epi32(inputMask[7], inputMask[6], inputMask[5], inputMask[4], inputMask[3], inputMask[2], inputMask[1], inputMask[0]));
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.cpp b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
index ce27bf7..3a2a8b3 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.cpp
@@ -31,6 +31,9 @@
#include "common/os.h"
#include "core/clip.h"
+// Temp storage used by the clipper
+THREAD simdvertex tlsTempVertices[7];
+
float ComputeInterpFactor(float boundaryCoord0, float boundaryCoord1)
{
return (boundaryCoord0 / (boundaryCoord0 - boundaryCoord1));
diff --git a/src/gallium/drivers/swr/rasterizer/core/clip.h b/src/gallium/drivers/swr/rasterizer/core/clip.h
index 49494a4..ba5870a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/clip.h
+++ b/src/gallium/drivers/swr/rasterizer/core/clip.h
@@ -32,6 +32,9 @@
#include "core/pa.h"
#include "rdtsc_core.h"
+// Temp storage used by the clipper
+extern THREAD simdvertex tlsTempVertices[7];
+
enum SWR_CLIPCODES
{
// Shift clip codes out of the mantissa to prevent denormalized values when used in float compare.
@@ -354,6 +357,25 @@ public:
}
}
+ // assemble user clip distances if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ pa.Assemble(VERTEX_CLIPCULL_DIST_LO_SLOT, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT] = tmpVector[i];
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ pa.Assemble(VERTEX_CLIPCULL_DIST_HI_SLOT, tmpVector);
+ for (uint32_t i = 0; i < NumVertsPerPrim; ++i)
+ {
+ vertices[i].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT] = tmpVector[i];
+ }
+ }
+
uint32_t numAttribs = maxSlot + 1;
simdscalari vNumClippedVerts = ClipPrims((float*)&vertices[0], vPrimMask, vClipMask, numAttribs);
@@ -436,6 +458,27 @@ public:
}
}
+ // transpose user clip distances if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT]) + sizeof(float) * inputPrim;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_LO_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+ pBase += sizeof(simdscalar);
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ pBase = (uint8_t*)(&vertices[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT]) + sizeof(float) * inputPrim;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ transposedPrims[0].attrib[VERTEX_CLIPCULL_DIST_HI_SLOT][c] = _simd_mask_i32gather_ps(_mm256_undefined_ps(), (const float*)pBase, vOffsets, vMask, 1);
+ pBase += sizeof(simdscalar);
+ }
+ }
+
PA_STATE_OPT clipPa(this->pDC, numEmittedPrims, (uint8_t*)&transposedPrims[0], numEmittedVerts, true, clipTopology);
while (clipPa.GetNextStreamOutput())
@@ -630,6 +673,31 @@ private:
ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
}
}
+
+ // interpolate clip distance if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib0 = GatherComponent(pInVerts, attribSlot, vActiveMask, s, c);
+ simdscalar vAttrib1 = GatherComponent(pInVerts, attribSlot, vActiveMask, p, c);
+ simdscalar vOutAttrib = _simd_fmadd_ps(_simd_sub_ps(vAttrib1, vAttrib0), t, vAttrib0);
+ ScatterComponent(pOutVerts, attribSlot, vActiveMask, outIndex, c, vOutAttrib);
+ }
+ }
}
template<SWR_CLIPCODES ClippingPlane>
@@ -700,6 +768,27 @@ private:
}
}
+ // store clip distance if enabled
+ if (this->state.rastState.clipDistanceMask & 0xf)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_LO_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+ }
+ }
+
+ if (this->state.rastState.clipDistanceMask & 0xf0)
+ {
+ uint32_t attribSlot = VERTEX_CLIPCULL_DIST_HI_SLOT;
+ for (uint32_t c = 0; c < 4; ++c)
+ {
+ simdscalar vAttrib = GatherComponent(pInVerts, attribSlot, s_in, s, c);
+ ScatterComponent(pOutVerts, attribSlot, s_in, vOutIndex, c, vAttrib);
+ }
+ }
+
// increment outIndex
vOutIndex = _simd_blendv_epi32(vOutIndex, _simd_add_epi32(vOutIndex, _simd_set1_epi32(1)), s_in);
}
@@ -818,8 +907,7 @@ private:
simdscalari ClipPrims(float* pVertices, const simdscalar& vPrimMask, const simdscalar& vClipMask, int numAttribs)
{
// temp storage
- simdvertex tempVertices[7];
- float* pTempVerts = (float*)&tempVertices[0];
+ float* pTempVerts = (float*)&tlsTempVertices[0];
// zero out num input verts for non-active lanes
simdscalari vNumInPts = _simd_set1_epi32(NumVertsPerPrim);
@@ -854,9 +942,9 @@ private:
return vNumOutPts;
}
- const uint32_t workerId;
- const DRIVER_TYPE driverType;
- DRAW_CONTEXT* pDC;
+ const uint32_t workerId{ 0 };
+ const DRIVER_TYPE driverType{ DX };
+ DRAW_CONTEXT* pDC{ nullptr };
const API_STATE& state;
simdscalar clipCodes[NumVertsPerPrim];
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/context.h b/src/gallium/drivers/swr/rasterizer/core/context.h
index 4a214af..b8f15ca 100644
--- a/src/gallium/drivers/swr/rasterizer/core/context.h
+++ b/src/gallium/drivers/swr/rasterizer/core/context.h
@@ -41,6 +41,7 @@
#include "core/knobs.h"
#include "common/simdintrin.h"
#include "core/threads.h"
+#include "ringbuffer.h"
// x.8 fixed point precision values
#define FIXED_POINT_SHIFT 8
@@ -82,6 +83,7 @@ struct SWR_TRIANGLE_DESC
float *pUserClipBuffer;
uint64_t coverageMask[SWR_MAX_NUM_MULTISAMPLES];
+ uint64_t anyCoveredSamples;
TRI_FLAGS triFlags;
};
@@ -109,12 +111,16 @@ struct CLEAR_DESC
CLEAR_FLAGS flags;
float clearRTColor[4]; // RGBA_32F
float clearDepth; // [0..1]
- BYTE clearStencil;
+ uint8_t clearStencil;
};
-struct INVALIDATE_TILES_DESC
+struct DISCARD_INVALIDATE_TILES_DESC
{
uint32_t attachmentMask;
+ SWR_RECT rect;
+ SWR_TILE_STATE newTileState;
+ bool createNewTiles;
+ bool fullTilesOnly;
};
struct SYNC_DESC
@@ -150,7 +156,7 @@ enum WORK_TYPE
SYNC,
DRAW,
CLEAR,
- INVALIDATETILES,
+ DISCARDINVALIDATETILES,
STORETILES,
QUERYSTATS,
};
@@ -164,7 +170,7 @@ struct BE_WORK
SYNC_DESC sync;
TRIANGLE_WORK_DESC tri;
CLEAR_DESC clear;
- INVALIDATE_TILES_DESC invalidateTiles;
+ DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
QUERY_DESC queryStats;
} desc;
@@ -201,7 +207,7 @@ struct FE_WORK
SYNC_DESC sync;
DRAW_WORK draw;
CLEAR_DESC clear;
- INVALIDATE_TILES_DESC invalidateTiles;
+ DISCARD_INVALIDATE_TILES_DESC discardInvalidateTiles;
STORE_TILES_DESC storeTiles;
QUERY_DESC queryStats;
} desc;
@@ -354,6 +360,7 @@ struct BACKEND_FUNCS
PFN_OUTPUT_MERGER pfnOutputMerger;
};
+
// Draw State
struct DRAW_STATE
{
@@ -365,7 +372,7 @@ struct DRAW_STATE
BACKEND_FUNCS backendFuncs;
PFN_PROCESS_PRIMS pfnProcessPrims;
- Arena* pArena; // This should only be used by API thread.
+ CachingArena* pArena; // This should only be used by API thread.
};
// Draw Context
@@ -381,23 +388,18 @@ struct DRAW_CONTEXT
FE_WORK FeWork;
volatile OSALIGNLINE(uint32_t) FeLock;
- volatile OSALIGNLINE(bool) inUse;
volatile OSALIGNLINE(bool) doneFE; // Is FE work done for this draw?
-
- // Have all worker threads moved past draw in DC ring?
- volatile OSALIGNLINE(uint32_t) threadsDoneFE;
- volatile OSALIGNLINE(uint32_t) threadsDoneBE;
+ volatile OSALIGNLINE(int64_t) threadsDone;
uint64_t dependency;
MacroTileMgr* pTileMgr;
// The following fields are valid if isCompute is true.
- volatile OSALIGNLINE(bool) doneCompute; // Is this dispatch done? (isCompute)
DispatchQueue* pDispatch; // Queue for thread groups. (isCompute)
DRAW_STATE* pState;
- Arena* pArena;
+ CachingArena* pArena;
uint8_t* pSpillFill[KNOB_MAX_NUM_THREADS]; // Scratch space used for spill fills.
};
@@ -438,7 +440,7 @@ struct SWR_CONTEXT
// 3. State - When an applications sets state after draw
// a. Same as step 1.
// b. State is copied from prev draw context to current.
- DRAW_CONTEXT* dcRing;
+ RingBuffer<DRAW_CONTEXT> dcRing;
DRAW_CONTEXT *pCurDrawContext; // This points to DC entry in ring for an unsubmitted draw.
DRAW_CONTEXT *pPrevDrawContext; // This points to DC entry for the previous context submitted that we can copy state from.
@@ -448,14 +450,10 @@ struct SWR_CONTEXT
// These split draws all have identical state. So instead of storing the state directly
// in the Draw Context (DC) we instead store it in a Draw State (DS). This allows multiple DCs
// to reference a single entry in the DS ring.
- DRAW_STATE* dsRing;
+ RingBuffer<DRAW_STATE> dsRing;
uint32_t curStateId; // Current index to the next available entry in the DS ring.
- DRAW_STATE* subCtxSave; // Save area for inactive contexts.
- uint32_t curSubCtxId; // Current index for active state subcontext.
- uint32_t numSubContexts; // Number of available subcontexts
-
uint32_t NumWorkerThreads;
THREAD_POOL threadPool; // Thread pool associated with this context
@@ -463,13 +461,6 @@ struct SWR_CONTEXT
std::condition_variable FifosNotEmpty;
std::mutex WaitLock;
- // Draw Contexts will get a unique drawId generated from this
- uint64_t nextDrawId;
-
- // most recent draw id enqueued by the API thread
- // written by api thread, read by multiple workers
- OSALIGNLINE(volatile uint64_t) DrawEnqueued;
-
DRIVER_TYPE driverType;
uint32_t privateStateSize;
@@ -486,6 +477,8 @@ struct SWR_CONTEXT
// Scratch space for workers.
uint8_t* pScratch[KNOB_MAX_NUM_THREADS];
+
+ CachingAllocator cachingArenaAllocator;
};
void WaitForDependencies(SWR_CONTEXT *pContext, uint64_t drawId);
diff --git a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
index 4f245c8..2cc9d40 100644
--- a/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
+++ b/src/gallium/drivers/swr/rasterizer/core/depthstencil.h
@@ -82,7 +82,7 @@ void StencilOp(SWR_STENCILOP op, simdscalar mask, simdscalar stencilRefps, simds
INLINE
simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
- bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, simdscalar coverageMask, BYTE *pStencilBase,
+ bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, simdscalar coverageMask, uint8_t *pStencilBase,
simdscalar* pStencilMask)
{
static_assert(KNOB_DEPTH_HOT_TILE_FORMAT == R32_FLOAT, "Unsupported depth hot tile format");
@@ -177,8 +177,8 @@ simdscalar DepthStencilTest(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENC
INLINE
void DepthStencilWrite(const SWR_VIEWPORT* pViewport, const SWR_DEPTH_STENCIL_STATE* pDSState,
- bool frontFacing, simdscalar interpZ, BYTE* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
- BYTE *pStencilBase, const simdscalar& stencilMask)
+ bool frontFacing, simdscalar interpZ, uint8_t* pDepthBase, const simdscalar& depthMask, const simdscalar& coverageMask,
+ uint8_t *pStencilBase, const simdscalar& stencilMask)
{
if (pDSState->depthWriteEnable)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
index 7e55601..ccf0b70 100644
--- a/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
+++ b/src/gallium/drivers/swr/rasterizer/core/fifo.hpp
@@ -49,7 +49,8 @@ struct QUEUE
static const uint32_t mBlockSizeShift = 6;
static const uint32_t mBlockSize = 1 << mBlockSizeShift;
- void clear(Arena& arena)
+ template <typename ArenaT>
+ void clear(ArenaT& arena)
{
mHead = 0;
mTail = 0;
@@ -102,7 +103,8 @@ struct QUEUE
mNumEntries --;
}
- bool enqueue_try_nosync(Arena& arena, const T* entry)
+ template <typename ArenaT>
+ bool enqueue_try_nosync(ArenaT& arena, const T* entry)
{
memcpy(&mCurBlock[mTail], entry, sizeof(T));
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
index 83d85fc..344758e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_conversion.h
@@ -34,7 +34,7 @@
/// @param pSrc - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT SrcFormat>
-INLINE void LoadSOA(const BYTE *pSrc, simdvector &dst)
+INLINE void LoadSOA(const uint8_t *pSrc, simdvector &dst)
{
// fast path for float32
if ((FormatTraits<SrcFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<SrcFormat>::GetBPC(0) == 32))
@@ -141,7 +141,7 @@ INLINE simdscalar Normalize(simdscalar vComp, uint32_t Component)
/// @param src - source data in SOA form
/// @param dst - output data in SOA form
template<SWR_FORMAT DstFormat>
-INLINE void StoreSOA(const simdvector &src, BYTE *pDst)
+INLINE void StoreSOA(const simdvector &src, uint8_t *pDst)
{
// fast path for float32
if ((FormatTraits<DstFormat>::GetType(0) == SWR_TYPE_FLOAT) && (FormatTraits<DstFormat>::GetBPC(0) == 32))
diff --git a/src/gallium/drivers/swr/rasterizer/core/format_types.h b/src/gallium/drivers/swr/rasterizer/core/format_types.h
index aa35025..9acf846 100644
--- a/src/gallium/drivers/swr/rasterizer/core/format_types.h
+++ b/src/gallium/drivers/swr/rasterizer/core/format_types.h
@@ -34,8 +34,8 @@ template <uint32_t NumBits, bool Signed = false>
struct PackTraits
{
static const uint32_t MyNumBits = NumBits;
- static simdscalar loadSOA(const BYTE *pSrc) = delete;
- static void storeSOA(BYTE *pDst, simdscalar src) = delete;
+ static simdscalar loadSOA(const uint8_t *pSrc) = delete;
+ static void storeSOA(uint8_t *pDst, simdscalar src) = delete;
static simdscalar unpack(simdscalar &in) = delete;
static simdscalar pack(simdscalar &in) = delete;
};
@@ -48,8 +48,8 @@ struct PackTraits<0, false>
{
static const uint32_t MyNumBits = 0;
- static simdscalar loadSOA(const BYTE *pSrc) { return _simd_setzero_ps(); }
- static void storeSOA(BYTE *pDst, simdscalar src) { return; }
+ static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_setzero_ps(); }
+ static void storeSOA(uint8_t *pDst, simdscalar src) { return; }
static simdscalar unpack(simdscalar &in) { return _simd_setzero_ps(); }
static simdscalar pack(simdscalar &in) { return _simd_setzero_ps(); }
};
@@ -63,7 +63,7 @@ struct PackTraits<8, false>
{
static const uint32_t MyNumBits = 8;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -74,7 +74,7 @@ struct PackTraits<8, false>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
@@ -125,7 +125,7 @@ struct PackTraits<8, true>
{
static const uint32_t MyNumBits = 8;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -136,7 +136,7 @@ struct PackTraits<8, true>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
// store simd bytes
#if KNOB_SIMD_WIDTH == 8
@@ -188,7 +188,7 @@ struct PackTraits<16, false>
{
static const uint32_t MyNumBits = 16;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -199,7 +199,7 @@ struct PackTraits<16, false>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
@@ -249,7 +249,7 @@ struct PackTraits<16, true>
{
static const uint32_t MyNumBits = 16;
- static simdscalar loadSOA(const BYTE *pSrc)
+ static simdscalar loadSOA(const uint8_t *pSrc)
{
#if KNOB_SIMD_WIDTH == 8
__m256 result = _mm256_setzero_ps();
@@ -260,7 +260,7 @@ struct PackTraits<16, true>
#endif
}
- static void storeSOA(BYTE *pDst, simdscalar src)
+ static void storeSOA(uint8_t *pDst, simdscalar src)
{
#if KNOB_SIMD_WIDTH == 8
// store 16B (2B * 8)
@@ -311,8 +311,8 @@ struct PackTraits<32, false>
{
static const uint32_t MyNumBits = 32;
- static simdscalar loadSOA(const BYTE *pSrc) { return _simd_load_ps((const float*)pSrc); }
- static void storeSOA(BYTE *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
+ static simdscalar loadSOA(const uint8_t *pSrc) { return _simd_load_ps((const float*)pSrc); }
+ static void storeSOA(uint8_t *pDst, simdscalar src) { _simd_store_ps((float*)pDst, src); }
static simdscalar unpack(simdscalar &in) { return in; }
static simdscalar pack(simdscalar &in) { return in; }
};
@@ -984,7 +984,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::fromFloat();
}
- INLINE static simdscalar loadSOA(uint32_t comp, const BYTE* pSrc)
+ INLINE static simdscalar loadSOA(uint32_t comp, const uint8_t* pSrc)
{
switch (comp)
{
@@ -1001,7 +1001,7 @@ struct ComponentTraits
return TypeTraits<X, NumBitsX>::loadSOA(pSrc);
}
- INLINE static void storeSOA(uint32_t comp, BYTE *pDst, simdscalar src)
+ INLINE static void storeSOA(uint32_t comp, uint8_t *pDst, simdscalar src)
{
switch (comp)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
index f43a672..36721e0 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.cpp
@@ -193,35 +193,71 @@ void ProcessStoreTiles(
/// @param workerId - thread's worker id. Even thread has a unique id.
/// @param pUserData - Pointer to user data passed back to callback.
/// @todo This should go away when we switch this to use compute threading.
-void ProcessInvalidateTiles(
+void ProcessDiscardInvalidateTiles(
SWR_CONTEXT *pContext,
DRAW_CONTEXT *pDC,
uint32_t workerId,
void *pUserData)
{
RDTSC_START(FEProcessInvalidateTiles);
- INVALIDATE_TILES_DESC *pInv = (INVALIDATE_TILES_DESC*)pUserData;
+ DISCARD_INVALIDATE_TILES_DESC *pInv = (DISCARD_INVALIDATE_TILES_DESC*)pUserData;
MacroTileMgr *pTileMgr = pDC->pTileMgr;
- const API_STATE& state = GetApiState(pDC);
+ SWR_RECT rect;
+
+ if (pInv->rect.top | pInv->rect.bottom | pInv->rect.right | pInv->rect.left)
+ {
+ // Valid rect
+ rect = pInv->rect;
+ }
+ else
+ {
+ // Use viewport dimensions
+ const API_STATE& state = GetApiState(pDC);
+
+ rect.left = (uint32_t)state.vp[0].x;
+ rect.right = (uint32_t)(state.vp[0].x + state.vp[0].width);
+ rect.top = (uint32_t)state.vp[0].y;
+ rect.bottom = (uint32_t)(state.vp[0].y + state.vp[0].height);
+ }
// queue a store to each macro tile
// compute macro tile bounds for the current render target
uint32_t macroWidth = KNOB_MACROTILE_X_DIM;
uint32_t macroHeight = KNOB_MACROTILE_Y_DIM;
- uint32_t numMacroTilesX = ((uint32_t)state.vp[0].width + (uint32_t)state.vp[0].x + (macroWidth - 1)) / macroWidth;
- uint32_t numMacroTilesY = ((uint32_t)state.vp[0].height + (uint32_t)state.vp[0].y + (macroHeight - 1)) / macroHeight;
+ // Setup region assuming full tiles
+ uint32_t macroTileStartX = (rect.left + (macroWidth - 1)) / macroWidth;
+ uint32_t macroTileStartY = (rect.top + (macroHeight - 1)) / macroHeight;
+
+ uint32_t macroTileEndX = rect.right / macroWidth;
+ uint32_t macroTileEndY = rect.bottom / macroHeight;
+
+ if (pInv->fullTilesOnly == false)
+ {
+ // include partial tiles
+ macroTileStartX = rect.left / macroWidth;
+ macroTileStartY = rect.top / macroHeight;
+
+ macroTileEndX = (rect.right + macroWidth - 1) / macroWidth;
+ macroTileEndY = (rect.bottom + macroHeight - 1) / macroHeight;
+ }
+
+ SWR_ASSERT(macroTileEndX <= KNOB_NUM_HOT_TILES_X);
+ SWR_ASSERT(macroTileEndY <= KNOB_NUM_HOT_TILES_Y);
+
+ macroTileEndX = std::min<uint32_t>(macroTileEndX, KNOB_NUM_HOT_TILES_X);
+ macroTileEndY = std::min<uint32_t>(macroTileEndY, KNOB_NUM_HOT_TILES_Y);
// load tiles
BE_WORK work;
- work.type = INVALIDATETILES;
- work.pfnWork = ProcessInvalidateTilesBE;
- work.desc.invalidateTiles = *pInv;
+ work.type = DISCARDINVALIDATETILES;
+ work.pfnWork = ProcessDiscardInvalidateTilesBE;
+ work.desc.discardInvalidateTiles = *pInv;
- for (uint32_t x = 0; x < numMacroTilesX; ++x)
+ for (uint32_t x = macroTileStartX; x < macroTileEndX; ++x)
{
- for (uint32_t y = 0; y < numMacroTilesY; ++y)
+ for (uint32_t y = macroTileStartY; y < macroTileEndY; ++y)
{
pTileMgr->enqueue(x, y, &work);
}
@@ -630,6 +666,8 @@ void ProcessStreamIdBuffer(uint32_t stream, uint8_t* pStreamIdBase, uint32_t num
}
}
+THREAD SWR_GS_CONTEXT tlsGsContext;
+
//////////////////////////////////////////////////////////////////////////
/// @brief Implements GS stage.
/// @param pDC - pointer to draw context.
@@ -651,7 +689,6 @@ static void GeometryShaderStage(
{
RDTSC_START(FEGeometryShader);
- SWR_GS_CONTEXT gsContext;
SWR_CONTEXT* pContext = pDC->pContext;
const API_STATE& state = GetApiState(pDC);
@@ -660,9 +697,9 @@ static void GeometryShaderStage(
SWR_ASSERT(pGsOut != nullptr, "GS output buffer should be initialized");
SWR_ASSERT(pCutBuffer != nullptr, "GS output cut buffer should be initialized");
- gsContext.pStream = (uint8_t*)pGsOut;
- gsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
- gsContext.PrimitiveID = primID;
+ tlsGsContext.pStream = (uint8_t*)pGsOut;
+ tlsGsContext.pCutOrStreamIdBuffer = (uint8_t*)pCutBuffer;
+ tlsGsContext.PrimitiveID = primID;
uint32_t numVertsPerPrim = NumVertsPerPrim(pa.binTopology, true);
simdvector attrib[MAX_ATTRIBUTES];
@@ -675,7 +712,7 @@ static void GeometryShaderStage(
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- gsContext.vert[i].attrib[attribSlot] = attrib[i];
+ tlsGsContext.vert[i].attrib[attribSlot] = attrib[i];
}
}
@@ -683,7 +720,7 @@ static void GeometryShaderStage(
pa.Assemble(VERTEX_POSITION_SLOT, attrib);
for (uint32_t i = 0; i < numVertsPerPrim; ++i)
{
- gsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
+ tlsGsContext.vert[i].attrib[VERTEX_POSITION_SLOT] = attrib[i];
}
const uint32_t vertexStride = sizeof(simdvertex);
@@ -710,14 +747,14 @@ static void GeometryShaderStage(
for (uint32_t instance = 0; instance < pState->instanceCount; ++instance)
{
- gsContext.InstanceID = instance;
- gsContext.mask = GenerateMask(numInputPrims);
+ tlsGsContext.InstanceID = instance;
+ tlsGsContext.mask = GenerateMask(numInputPrims);
// execute the geometry shader
- state.pfnGsFunc(GetPrivateState(pDC), &gsContext);
+ state.pfnGsFunc(GetPrivateState(pDC), &tlsGsContext);
- gsContext.pStream += instanceStride;
- gsContext.pCutOrStreamIdBuffer += cutInstanceStride;
+ tlsGsContext.pStream += instanceStride;
+ tlsGsContext.pCutOrStreamIdBuffer += cutInstanceStride;
}
// set up new binner and state for the GS output topology
@@ -736,7 +773,7 @@ static void GeometryShaderStage(
// foreach input prim:
// - setup a new PA based on the emitted verts for that prim
// - loop over the new verts, calling PA to assemble each prim
- uint32_t* pVertexCount = (uint32_t*)&gsContext.vertexCount;
+ uint32_t* pVertexCount = (uint32_t*)&tlsGsContext.vertexCount;
uint32_t* pPrimitiveId = (uint32_t*)&primID;
uint32_t totalPrimsGenerated = 0;
@@ -844,7 +881,7 @@ static void GeometryShaderStage(
static INLINE void AllocateGsBuffers(DRAW_CONTEXT* pDC, const API_STATE& state, void** ppGsOut, void** ppCutBuffer,
void **ppStreamCutBuffer)
{
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
SWR_ASSERT(state.gsState.gsEnable);
// allocate arena space to hold GS output verts
@@ -1186,7 +1223,7 @@ void ProcessDraw(
// if the entire index buffer isn't being consumed, set the last index
// so that fetches < a SIMD wide will be masked off
- fetchInfo.pLastIndex = (const int32_t*)(((BYTE*)state.indexBuffer.pIndices) + state.indexBuffer.size);
+ fetchInfo.pLastIndex = (const int32_t*)(((uint8_t*)state.indexBuffer.pIndices) + state.indexBuffer.size);
if (pLastRequestedIndex < fetchInfo.pLastIndex)
{
fetchInfo.pLastIndex = pLastRequestedIndex;
@@ -1362,7 +1399,7 @@ void ProcessDraw(
i += KNOB_SIMD_WIDTH;
if (IsIndexedT)
{
- fetchInfo.pIndices = (int*)((BYTE*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
+ fetchInfo.pIndices = (int*)((uint8_t*)fetchInfo.pIndices + KNOB_SIMD_WIDTH * indexSize);
}
else
{
@@ -1776,7 +1813,7 @@ void BinTriangles(
work.pfnWork = gRasterizerTable[rastState.scissorEnable][SWR_MULTISAMPLE_1X];
}
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
@@ -1948,7 +1985,7 @@ void BinPoints(
work.pfnWork = RasterizeSimplePoint;
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store attributes
@@ -2082,7 +2119,7 @@ void BinPoints(
work.pfnWork = RasterizeTriPoint;
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
@@ -2299,7 +2336,7 @@ void BinLines(
work.pfnWork = RasterizeLine;
- Arena* pArena = pDC->pArena;
+ auto pArena = pDC->pArena;
SWR_ASSERT(pArena != nullptr);
// store active attribs
diff --git a/src/gallium/drivers/swr/rasterizer/core/frontend.h b/src/gallium/drivers/swr/rasterizer/core/frontend.h
index acb935f..f92f88c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/frontend.h
+++ b/src/gallium/drivers/swr/rasterizer/core/frontend.h
@@ -146,14 +146,13 @@ float calcDeterminantInt(const __m128i vA, const __m128i vB)
//vMul = [A1*B2 - B1*A2]
vMul = _mm_sub_epi64(vMul, vMul2);
- // According to emmintrin.h __mm_store1_pd(), address must be 16-byte aligned
- OSALIGN(int64_t, 16) result;
- _mm_store1_pd((double*)&result, _mm_castsi128_pd(vMul));
+ int64_t result;
+ _mm_store_sd((double*)&result, _mm_castsi128_pd(vMul));
- double fResult = (double)result;
- fResult = fResult * (1.0 / FIXED_POINT16_SCALE);
+ double dResult = (double)result;
+ dResult = dResult * (1.0 / FIXED_POINT16_SCALE);
- return (float)fResult;
+ return (float)dResult;
}
INLINE
@@ -316,7 +315,7 @@ void ProcessDraw(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, vo
void ProcessClear(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessStoreTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
-void ProcessInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
+void ProcessDiscardInvalidateTiles(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessSync(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
void ProcessQueryStats(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint32_t workerId, void *pUserData);
diff --git a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
index 3f19555..adf738c 100644
--- a/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
+++ b/src/gallium/drivers/swr/rasterizer/core/knobs_init.h
@@ -80,6 +80,11 @@ static inline void ConvertEnvToKnob(const char* pOverride, float& knobValue)
}
}
+static inline void ConvertEnvToKnob(const char* pOverride, std::string& knobValue) // string-valued knob overload: stores the override text unchanged
+{
+ knobValue = pOverride; // NOTE(review): no null check in this overload; assigning a null const char* to std::string is undefined behavior -- confirm callers never pass nullptr
+}
+
template <typename T>
static inline void InitKnob(T& knob)
{
diff --git a/src/gallium/drivers/swr/rasterizer/core/pa.h b/src/gallium/drivers/swr/rasterizer/core/pa.h
index 2028d9f..f8f1a33 100644
--- a/src/gallium/drivers/swr/rasterizer/core/pa.h
+++ b/src/gallium/drivers/swr/rasterizer/core/pa.h
@@ -34,12 +34,12 @@
struct PA_STATE
{
- DRAW_CONTEXT *pDC; // draw context
- uint8_t* pStreamBase; // vertex stream
- uint32_t streamSizeInVerts; // total size of the input stream in verts
+ DRAW_CONTEXT *pDC{ nullptr }; // draw context
+ uint8_t* pStreamBase{ nullptr }; // vertex stream
+ uint32_t streamSizeInVerts{ 0 }; // total size of the input stream in verts
// The topology the binner will use. In some cases the FE changes the topology from the api state.
- PRIMITIVE_TOPOLOGY binTopology;
+ PRIMITIVE_TOPOLOGY binTopology{ TOP_UNKNOWN };
PA_STATE() {}
PA_STATE(DRAW_CONTEXT *in_pDC, uint8_t* in_pStreamBase, uint32_t in_streamSizeInVerts) :
@@ -76,37 +76,37 @@ struct PA_STATE
// cuts
struct PA_STATE_OPT : public PA_STATE
{
- simdvertex leadingVertex; // For tri-fan
- uint32_t numPrims; // Total number of primitives for draw.
- uint32_t numPrimsComplete; // Total number of complete primitives.
+ simdvertex leadingVertex; // For tri-fan
+ uint32_t numPrims{ 0 }; // Total number of primitives for draw.
+ uint32_t numPrimsComplete{ 0 }; // Total number of complete primitives.
- uint32_t numSimdPrims; // Number of prims in current simd.
+ uint32_t numSimdPrims{ 0 }; // Number of prims in current simd.
- uint32_t cur; // index to current VS output.
- uint32_t prev; // index to prev VS output. Not really needed in the state.
- uint32_t first; // index to first VS output. Used for trifan.
+ uint32_t cur{ 0 }; // index to current VS output.
+ uint32_t prev{ 0 }; // index to prev VS output. Not really needed in the state.
+ uint32_t first{ 0 }; // index to first VS output. Used for trifan.
- uint32_t counter; // state counter
- bool reset; // reset state
+ uint32_t counter{ 0 }; // state counter
+ bool reset{ false }; // reset state
- uint32_t primIDIncr; // how much to increment for each vector (typically vector / {1, 2})
+ uint32_t primIDIncr{ 0 }; // how much to increment for each vector (typically vector / {1, 2})
simdscalari primID;
typedef bool(*PFN_PA_FUNC)(PA_STATE_OPT& state, uint32_t slot, simdvector verts[]);
typedef void(*PFN_PA_SINGLE_FUNC)(PA_STATE_OPT& pa, uint32_t slot, uint32_t primIndex, __m128 verts[]);
- PFN_PA_FUNC pfnPaFunc; // PA state machine function for assembling 4 triangles.
- PFN_PA_SINGLE_FUNC pfnPaSingleFunc; // PA state machine function for assembling single triangle.
- PFN_PA_FUNC pfnPaFuncReset; // initial state to set on reset
+ PFN_PA_FUNC pfnPaFunc{ nullptr }; // PA state machine function for assembling 4 triangles.
+ PFN_PA_SINGLE_FUNC pfnPaSingleFunc{ nullptr }; // PA state machine function for assembling single triangle.
+ PFN_PA_FUNC pfnPaFuncReset{ nullptr }; // initial state to set on reset
// state used to advance the PA when Next is called
- PFN_PA_FUNC pfnPaNextFunc;
- uint32_t nextNumSimdPrims;
- uint32_t nextNumPrimsIncrement;
- bool nextReset;
- bool isStreaming;
+ PFN_PA_FUNC pfnPaNextFunc{ nullptr };
+ uint32_t nextNumSimdPrims{ 0 };
+ uint32_t nextNumPrimsIncrement{ 0 };
+ bool nextReset{ false };
+ bool isStreaming{ false };
- simdmask tmpIndices; // temporary index store for unused virtual function
+ simdmask tmpIndices{ 0 }; // temporary index store for unused virtual function
PA_STATE_OPT() {}
PA_STATE_OPT(DRAW_CONTEXT* pDC, uint32_t numPrims, uint8_t* pStream, uint32_t streamSizeInVerts,
@@ -333,33 +333,33 @@ INLINE __m128 swizzleLaneN(const simdvector &a, int lane)
// Cut-aware primitive assembler.
struct PA_STATE_CUT : public PA_STATE
{
- simdmask* pCutIndices; // cut indices buffer, 1 bit per vertex
- uint32_t numVerts; // number of vertices available in buffer store
- uint32_t numAttribs; // number of attributes
- int32_t numRemainingVerts; // number of verts remaining to be assembled
- uint32_t numVertsToAssemble; // total number of verts to assemble for the draw
+ simdmask* pCutIndices{ nullptr }; // cut indices buffer, 1 bit per vertex
+ uint32_t numVerts{ 0 }; // number of vertices available in buffer store
+ uint32_t numAttribs{ 0 }; // number of attributes
+ int32_t numRemainingVerts{ 0 }; // number of verts remaining to be assembled
+ uint32_t numVertsToAssemble{ 0 }; // total number of verts to assemble for the draw
OSALIGNSIMD(uint32_t) indices[MAX_NUM_VERTS_PER_PRIM][KNOB_SIMD_WIDTH]; // current index buffer for gather
simdscalari vOffsets[MAX_NUM_VERTS_PER_PRIM]; // byte offsets for currently assembling simd
- uint32_t numPrimsAssembled; // number of primitives that are fully assembled
- uint32_t headVertex; // current unused vertex slot in vertex buffer store
- uint32_t tailVertex; // beginning vertex currently assembling
- uint32_t curVertex; // current unprocessed vertex
- uint32_t startPrimId; // starting prim id
- simdscalari vPrimId; // vector of prim ID
- bool needOffsets; // need to compute gather offsets for current SIMD
- uint32_t vertsPerPrim;
- simdvertex tmpVertex; // temporary simdvertex for unimplemented API
- bool processCutVerts; // vertex indices with cuts should be processed as normal, otherwise they
- // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
- // while the GS sends valid verts for every index
+ uint32_t numPrimsAssembled{ 0 }; // number of primitives that are fully assembled
+ uint32_t headVertex{ 0 }; // current unused vertex slot in vertex buffer store
+ uint32_t tailVertex{ 0 }; // beginning vertex currently assembling
+ uint32_t curVertex{ 0 }; // current unprocessed vertex
+ uint32_t startPrimId{ 0 }; // starting prim id
+ simdscalari vPrimId; // vector of prim ID
+ bool needOffsets{ false }; // need to compute gather offsets for current SIMD
+ uint32_t vertsPerPrim{ 0 };
+ simdvertex tmpVertex; // temporary simdvertex for unimplemented API
+ bool processCutVerts{ false }; // vertex indices with cuts should be processed as normal, otherwise they
+ // are ignored. Fetch shader sends invalid verts on cuts that should be ignored
+ // while the GS sends valid verts for every index
// Topology state tracking
uint32_t vert[MAX_NUM_VERTS_PER_PRIM];
- uint32_t curIndex;
- bool reverseWinding; // indicates reverse winding for strips
- int32_t adjExtraVert; // extra vert uses for tristrip w/ adj
+ uint32_t curIndex{ 0 };
+ bool reverseWinding{ false }; // indicates reverse winding for strips
+ int32_t adjExtraVert{ 0 }; // extra vert uses for tristrip w/ adj
typedef void(PA_STATE_CUT::* PFN_PA_FUNC)(uint32_t vert, bool finish);
- PFN_PA_FUNC pfnPa; // per-topology function that processes a single vert
+ PFN_PA_FUNC pfnPa{ nullptr }; // per-topology function that processes a single vert
PA_STATE_CUT() {}
PA_STATE_CUT(DRAW_CONTEXT* pDC, uint8_t* in_pStream, uint32_t in_streamSizeInVerts, simdmask* in_pIndices, uint32_t in_numVerts,
@@ -1199,9 +1199,9 @@ struct PA_FACTORY
PA_STATE_OPT paOpt;
PA_STATE_CUT paCut;
- bool cutPA;
+ bool cutPA{ false };
- PRIMITIVE_TOPOLOGY topo;
+ PRIMITIVE_TOPOLOGY topo{ TOP_UNKNOWN };
simdvertex vertexStore[MAX_NUM_VERTS_PER_PRIM];
simdmask indexStore[MAX_NUM_VERTS_PER_PRIM];
diff --git a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
index 587e336..52fb7c8 100644
--- a/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/rasterizer.cpp
@@ -690,9 +690,10 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// Evaluate edge equations at sample positions of each of the 4 corners of a raster tile
// used to for testing if entire raster tile is inside a triangle
- vEdgeFix16[0] = _mm256_add_pd(vEdgeFix16[0], rastEdges[0].vRasterTileOffsets);
- vEdgeFix16[1] = _mm256_add_pd(vEdgeFix16[1], rastEdges[1].vRasterTileOffsets);
- vEdgeFix16[2] = _mm256_add_pd(vEdgeFix16[2], rastEdges[2].vRasterTileOffsets);
+ for (uint32_t e = 0; e < numEdges; ++e)
+ {
+ vEdgeFix16[e] = _mm256_add_pd(vEdgeFix16[e], rastEdges[e].vRasterTileOffsets);
+ }
// at this point vEdge has been evaluated at the UL pixel corners of raster tile bbox
// step sample positions to the raster tile bbox of multisample points
@@ -700,7 +701,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// | |
// | |
// min(xSamples),max(ySamples) ------ max(xSamples),max(ySamples)
- __m256d vEdge0TileBbox, vEdge1TileBbox, vEdge2TileBbox;
+ __m256d vEdgeTileBbox[3];
if (sampleCount > SWR_MULTISAMPLE_1X)
{
__m128i vTileSampleBBoxXh = MultisampleTraits<sampleCount>::TileSampleOffsetsX();
@@ -711,17 +712,12 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// step edge equation tests from Tile
// used to for testing if entire raster tile is inside a triangle
- __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vTileSampleBBoxXFix8);
- __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vTileSampleBBoxYFix8);
- vEdge0TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vTileSampleBBoxXFix8);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vTileSampleBBoxYFix8);
- vEdge1TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vTileSampleBBoxXFix8);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vTileSampleBBoxYFix8);
- vEdge2TileBbox = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ for (uint32_t e = 0; e < 3; ++e)
+ {
+ __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vTileSampleBBoxXFix8);
+ __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vTileSampleBBoxYFix8);
+ vEdgeTileBbox[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ }
}
RDTSC_STOP(BEStepSetup, 0, pDC->drawId);
@@ -756,7 +752,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
for (uint32_t tileX = tX; tileX <= maxX; ++tileX)
{
- uint64_t anyCoveredSamples = 0;
+ triDesc.anyCoveredSamples = 0;
// is the corner of the edge outside of the raster tile? (vEdge < 0)
int mask0, mask1, mask2;
@@ -770,9 +766,9 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
{
__m256d vSampleBboxTest0, vSampleBboxTest1, vSampleBboxTest2;
// evaluate edge equations at the tile multisample bounding box
- vSampleBboxTest0 = _mm256_add_pd(vEdge0TileBbox, vEdgeFix16[0]);
- vSampleBboxTest1 = _mm256_add_pd(vEdge1TileBbox, vEdgeFix16[1]);
- vSampleBboxTest2 = _mm256_add_pd(vEdge2TileBbox, vEdgeFix16[2]);
+ vSampleBboxTest0 = _mm256_add_pd(vEdgeTileBbox[0], vEdgeFix16[0]);
+ vSampleBboxTest1 = _mm256_add_pd(vEdgeTileBbox[1], vEdgeFix16[1]);
+ vSampleBboxTest2 = _mm256_add_pd(vEdgeTileBbox[2], vEdgeFix16[2]);
mask0 = _mm256_movemask_pd(vSampleBboxTest0);
mask1 = _mm256_movemask_pd(vSampleBboxTest1);
mask2 = _mm256_movemask_pd(vSampleBboxTest2);
@@ -789,20 +785,21 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
triDesc.coverageMask[sampleNum] = 0xffffffffffffffffULL;
if ((mask0 & mask1 & mask2) == 0xf)
{
- anyCoveredSamples = triDesc.coverageMask[sampleNum];
+ triDesc.anyCoveredSamples = triDesc.coverageMask[sampleNum];
// trivial accept, all 4 corners of all 3 edges are negative
// i.e. raster tile completely inside triangle
RDTSC_EVENT(BETrivialAccept, 1, 0);
}
else
{
- __m256d vEdge0AtSample, vEdge1AtSample, vEdge2AtSample;
+ __m256d vEdgeAtSample[numEdges];
if(sampleCount == SWR_MULTISAMPLE_1X)
{
// should get optimized out for single sample case (global value numbering or copy propagation)
- vEdge0AtSample = vEdgeFix16[0];
- vEdge1AtSample = vEdgeFix16[1];
- vEdge2AtSample = vEdgeFix16[2];
+ for (uint32_t e = 0; e < numEdges; ++e)
+ {
+ vEdgeAtSample[e] = vEdgeFix16[e];
+ }
}
else
{
@@ -815,31 +812,20 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
// for each edge and broadcasts it before offsetting to individual pixel quads
// step edge equation tests from UL tile corner to pixel sample position
- __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].a), vSampleOffsetX);
- __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[0].b), vSampleOffsetY);
- vEdge0AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdge0AtSample = _mm256_add_pd(vEdgeFix16[0], vEdge0AtSample);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].a), vSampleOffsetX);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[1].b), vSampleOffsetY);
- vEdge1AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdge1AtSample = _mm256_add_pd(vEdgeFix16[1], vEdge1AtSample);
-
- vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].a), vSampleOffsetX);
- vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[2].b), vSampleOffsetY);
- vEdge2AtSample = _mm256_add_pd(vResultAxFix16, vResultByFix16);
- vEdge2AtSample = _mm256_add_pd(vEdgeFix16[2], vEdge2AtSample);
+ for (uint32_t e = 0; e < numEdges; ++e)
+ {
+ __m256d vResultAxFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].a), vSampleOffsetX);
+ __m256d vResultByFix16 = _mm256_mul_pd(_mm256_set1_pd(rastEdges[e].b), vSampleOffsetY);
+ vEdgeAtSample[e] = _mm256_add_pd(vResultAxFix16, vResultByFix16);
+ vEdgeAtSample[e] = _mm256_add_pd(vEdgeFix16[e], vEdgeAtSample[e]);
+ }
}
double startQuadEdges[numEdges];
const __m256i vLane0Mask = _mm256_set_epi32(0, 0, 0, 0, 0, 0, -1, -1);
- _mm256_maskstore_pd(&startQuadEdges[0], vLane0Mask, vEdge0AtSample);
- _mm256_maskstore_pd(&startQuadEdges[1], vLane0Mask, vEdge1AtSample);
- _mm256_maskstore_pd(&startQuadEdges[2], vLane0Mask, vEdge2AtSample);
-
- for (uint32_t e = 3; e < numEdges; ++e)
+ for (uint32_t e = 0; e < numEdges; ++e)
{
- _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeFix16[e]);
+ _mm256_maskstore_pd(&startQuadEdges[e], vLane0Mask, vEdgeAtSample[e]);
}
// not trivial accept or reject, must rasterize full tile
@@ -854,7 +840,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
}
RDTSC_STOP(BERasterizePartial, 0, 0);
- anyCoveredSamples |= triDesc.coverageMask[sampleNum];
+ triDesc.anyCoveredSamples |= triDesc.coverageMask[sampleNum];
}
}
else
@@ -875,7 +861,7 @@ void RasterizeTriangle(DRAW_CONTEXT* pDC, uint32_t workerId, uint32_t macroTile,
}
else
#endif
- if(anyCoveredSamples)
+ if(triDesc.anyCoveredSamples)
{
RDTSC_START(BEPixelBackend);
backendFuncs.pfnBackend(pDC, workerId, tileX << KNOB_TILE_X_DIM_SHIFT, tileY << KNOB_TILE_Y_DIM_SHIFT, triDesc, renderBuffers);
diff --git a/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
new file mode 100644
index 0000000..7ff109d
--- /dev/null
+++ b/src/gallium/drivers/swr/rasterizer/core/ringbuffer.h
@@ -0,0 +1,102 @@
+/****************************************************************************
+* Copyright (C) 2016 Intel Corporation. All Rights Reserved.
+*
+* Permission is hereby granted, free of charge, to any person obtaining a
+* copy of this software and associated documentation files (the "Software"),
+* to deal in the Software without restriction, including without limitation
+* the rights to use, copy, modify, merge, publish, distribute, sublicense,
+* and/or sell copies of the Software, and to permit persons to whom the
+* Software is furnished to do so, subject to the following conditions:
+*
+* The above copyright notice and this permission notice (including the next
+* paragraph) shall be included in all copies or substantial portions of the
+* Software.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+* IN THE SOFTWARE.
+*
+* @file ringbuffer.h
+*
+* @brief RingBuffer
+* The RingBuffer class manages all aspects of the ring buffer including
+* the head/tail indices, etc.
+*
+******************************************************************************/
+#pragma once
+
+template<typename T>
+class RingBuffer // Array of T entries plus separate head (enqueue) and tail (dequeue) counters.
+{
+public:
+ RingBuffer() // Starts with no storage; Init() must be called before entries are accessed.
+ : mpRingBuffer(nullptr), mNumEntries(0), mRingHead(0), mRingTail(0)
+ {
+ }
+
+ ~RingBuffer() // Frees the storage through Destroy().
+ {
+ Destroy();
+ }
+
+ void Init(uint32_t numEntries) // Allocates storage for numEntries elements (alignment argument 64) and zero-fills it.
+ {
+ SWR_ASSERT(numEntries > 0);
+ mNumEntries = numEntries;
+ mpRingBuffer = (T*)_aligned_malloc(sizeof(T)*numEntries, 64);
+ SWR_ASSERT(mpRingBuffer != nullptr);
+ memset(mpRingBuffer, 0, sizeof(T)*numEntries); // raw byte fill; no T constructor is invoked here
+ }
+
+ void Destroy() // Frees the storage and nulls the pointer; the counters and mNumEntries are left untouched.
+ {
+ _aligned_free(mpRingBuffer);
+ mpRingBuffer = nullptr;
+ }
+
+ T& operator[](const uint32_t index) // Direct slot access; index is not wrapped here, so callers must pass index < mNumEntries.
+ {
+ SWR_ASSERT(index < mNumEntries);
+ return mpRingBuffer[index];
+ }
+
+ INLINE void Enqueue() // Advances the head counter with a plain (non-interlocked) increment.
+ {
+ mRingHead++; // There's only one producer.
+ }
+
+ INLINE void Dequeue() // Advances the tail counter with an interlocked increment.
+ {
+ InterlockedIncrement(&mRingTail); // There are multiple consumers.
+ }
+
+ INLINE bool IsEmpty() // True when head and tail counters are equal.
+ {
+ return (GetHead() == GetTail());
+ }
+
+ INLINE bool IsFull() // True when (head - tail) equals the capacity passed to Init().
+ {
+ ///@note We don't handle wrap case due to using 64-bit indices.
+ /// It would take 11 million years to wrap at 50,000 DCs per sec.
+ /// If we used 32-bit indices then it's about 23 hours to wrap.
+ uint64_t numEnqueued = GetHead() - GetTail();
+ SWR_ASSERT(numEnqueued <= mNumEntries);
+
+ return (numEnqueued == mNumEntries);
+ }
+
+ INLINE volatile uint64_t GetTail() { return mRingTail; } // current value of the tail (dequeue) counter
+ INLINE volatile uint64_t GetHead() { return mRingHead; } // current value of the head (enqueue) counter
+
+protected:
+ T* mpRingBuffer; // storage allocated by Init(), released by Destroy()
+ uint32_t mNumEntries; // capacity passed to Init()
+
+ OSALIGNLINE(volatile uint64_t) mRingHead; // Producer counter (incremented by Enqueue)
+ OSALIGNLINE(volatile uint64_t) mRingTail; // Consumer counter (incremented by Dequeue)
+};
diff --git a/src/gallium/drivers/swr/rasterizer/core/state.h b/src/gallium/drivers/swr/rasterizer/core/state.h
index 2758555..5752094 100644
--- a/src/gallium/drivers/swr/rasterizer/core/state.h
+++ b/src/gallium/drivers/swr/rasterizer/core/state.h
@@ -307,6 +307,8 @@ struct PixelPositions
simdscalar centroid;
};
+#define SWR_MAX_NUM_MULTISAMPLES 16
+
//////////////////////////////////////////////////////////////////////////
/// SWR_PS_CONTEXT
/// @brief Input to pixel shader.
@@ -338,6 +340,7 @@ struct SWR_PS_CONTEXT
uint32_t frontFace; // IN: front- 1, back- 0
uint32_t primID; // IN: primitive ID
uint32_t sampleIndex; // IN: sampleIndex
+
};
//////////////////////////////////////////////////////////////////////////
@@ -748,7 +751,6 @@ struct SWR_RENDER_TARGET_BLEND_STATE
};
static_assert(sizeof(SWR_RENDER_TARGET_BLEND_STATE) == 1, "Invalid SWR_RENDER_TARGET_BLEND_STATE size");
-#define SWR_MAX_NUM_MULTISAMPLES 16
enum SWR_MULTISAMPLE_COUNT
{
SWR_MULTISAMPLE_1X = 0,
@@ -786,7 +788,8 @@ typedef void(__cdecl *PFN_GS_FUNC)(HANDLE hPrivateData, SWR_GS_CONTEXT* pGsConte
typedef void(__cdecl *PFN_CS_FUNC)(HANDLE hPrivateData, SWR_CS_CONTEXT* pCsContext);
typedef void(__cdecl *PFN_SO_FUNC)(SWR_STREAMOUT_CONTEXT& soContext);
typedef void(__cdecl *PFN_PIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
-typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, BYTE*, simdvector&, simdscalari*, simdscalari*);
+typedef void(__cdecl *PFN_CPIXEL_KERNEL)(HANDLE hPrivateData, SWR_PS_CONTEXT *pContext);
+typedef void(__cdecl *PFN_BLEND_JIT_FUNC)(const SWR_BLEND_STATE*, simdvector&, simdvector&, uint32_t, uint8_t*, simdvector&, simdscalari*, simdscalari*);
//////////////////////////////////////////////////////////////////////////
/// FRONTEND_STATE
@@ -941,6 +944,7 @@ struct SWR_BACKEND_STATE
uint8_t numComponents[KNOB_NUM_ATTRIBUTES];
};
+
union SWR_DEPTH_STENCIL_STATE
{
struct
@@ -980,7 +984,6 @@ enum SWR_SHADING_RATE
{
SWR_SHADING_RATE_PIXEL,
SWR_SHADING_RATE_SAMPLE,
- SWR_SHADING_RATE_COARSE,
SWR_SHADING_RATE_MAX,
};
@@ -1024,4 +1027,5 @@ struct SWR_PS_STATE
uint32_t barycentricsMask : 3; // which type(s) of barycentric coords does the PS interpolate attributes with
uint32_t usesUAV : 1; // pixel shader accesses UAV
uint32_t forceEarlyZ : 1; // force execution of early depth/stencil test
+
};
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.cpp b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
index 24c5588..ce8646f 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.cpp
@@ -24,7 +24,6 @@
#include <stdio.h>
#include <thread>
#include <algorithm>
-#include <unordered_set>
#include <float.h>
#include <vector>
#include <utility>
@@ -44,7 +43,6 @@
#include "rasterizer.h"
#include "rdtsc_core.h"
#include "tilemgr.h"
-#include "core/multisample.h"
@@ -265,9 +263,7 @@ void bindThread(uint32_t threadId, uint32_t procGroupId = 0, bool bindProcGroup=
INLINE
uint64_t GetEnqueuedDraw(SWR_CONTEXT *pContext)
{
- //uint64_t result = _InterlockedCompareExchange64((volatile __int64*)&pContext->DrawEnqueued, 0, 0);
- //return result;
- return pContext->DrawEnqueued;
+ return pContext->dcRing.GetHead();
}
INLINE
@@ -283,169 +279,21 @@ bool CheckDependency(SWR_CONTEXT *pContext, DRAW_CONTEXT *pDC, uint64_t lastReti
return (pDC->dependency > lastRetiredDraw);
}
-void ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
-{
- // Load clear color into SIMD register...
- float *pClearData = (float*)(pHotTile->clearData);
- simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
- simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
- simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
- simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
-
- float *pfBuf = (float*)pHotTile->pBuffer;
- uint32_t numSamples = pHotTile->numSamples;
- for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
- {
- for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
- {
- for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
- {
- _simd_store_ps(pfBuf, valR);
- pfBuf += KNOB_SIMD_WIDTH;
- _simd_store_ps(pfBuf, valG);
- pfBuf += KNOB_SIMD_WIDTH;
- _simd_store_ps(pfBuf, valB);
- pfBuf += KNOB_SIMD_WIDTH;
- _simd_store_ps(pfBuf, valA);
- pfBuf += KNOB_SIMD_WIDTH;
- }
- }
- }
-}
-void ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
+INLINE void CompleteDrawContext(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC)
{
- // Load clear color into SIMD register...
- float *pClearData = (float*)(pHotTile->clearData);
- simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
+ int64_t result = InterlockedDecrement64(&pDC->threadsDone);
- float *pfBuf = (float*)pHotTile->pBuffer;
- uint32_t numSamples = pHotTile->numSamples;
-
- for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ if (result == 0)
{
- for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
- {
- for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
- {
- _simd_store_ps(pfBuf, valZ);
- pfBuf += KNOB_SIMD_WIDTH;
- }
- }
- }
-}
-
-void ClearStencilHotTile(const HOTTILE* pHotTile)
-{
- // convert from F32 to U8.
- uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
- //broadcast 32x into __m256i...
- simdscalari valS = _simd_set1_epi8(clearVal);
-
- simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
- uint32_t numSamples = pHotTile->numSamples;
-
- for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
- {
- for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
- {
- // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
- for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
- {
- _simd_store_si(pBuf, valS);
- pBuf += 1;
- }
- }
- }
-}
-
-// for draw calls, we initialize the active hot tiles and perform deferred
-// load on them if tile is in invalid state. we do this in the outer thread loop instead of inside
-// the draw routine itself mainly for performance, to avoid unnecessary setup
-// every triangle
-// @todo support deferred clear
-INLINE
-void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, const TRIANGLE_WORK_DESC* pWork)
-{
- const API_STATE& state = GetApiState(pDC);
- HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
-
- uint32_t x, y;
- MacroTileMgr::getTileIndices(macroID, x, y);
- x *= KNOB_MACROTILE_X_DIM;
- y *= KNOB_MACROTILE_Y_DIM;
-
- uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
-
- // check RT if enabled
- unsigned long rtSlot = 0;
- uint32_t colorHottileEnableMask = state.colorHottileEnable;
- while(_BitScanForward(&rtSlot, colorHottileEnableMask))
- {
- HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
-
- if (pHotTile->state == HOTTILE_INVALID)
- {
- RDTSC_START(BELoadTiles);
- // invalid hottile before draw requires a load from surface before we can draw to it
- pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- else if (pHotTile->state == HOTTILE_CLEAR)
- {
- RDTSC_START(BELoadTiles);
- // Clear the tile.
- ClearColorHotTile(pHotTile);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- colorHottileEnableMask &= ~(1 << rtSlot);
- }
+ _ReadWriteBarrier();
- // check depth if enabled
- if (state.depthHottileEnable)
- {
- HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
- if (pHotTile->state == HOTTILE_INVALID)
- {
- RDTSC_START(BELoadTiles);
- // invalid hottile before draw requires a load from surface before we can draw to it
- pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- else if (pHotTile->state == HOTTILE_CLEAR)
- {
- RDTSC_START(BELoadTiles);
- // Clear the tile.
- ClearDepthHotTile(pHotTile);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- }
+ // Cleanup memory allocations
+ pDC->pArena->Reset(true);
+ pDC->pTileMgr->initialize();
- // check stencil if enabled
- if (state.stencilHottileEnable)
- {
- HOTTILE* pHotTile = pHotTileMgr->GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
- if (pHotTile->state == HOTTILE_INVALID)
- {
- RDTSC_START(BELoadTiles);
- // invalid hottile before draw requires a load from surface before we can draw to it
- pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
- else if (pHotTile->state == HOTTILE_CLEAR)
- {
- RDTSC_START(BELoadTiles);
- // Clear the tile.
- ClearStencilHotTile(pHotTile);
- pHotTile->state = HOTTILE_DIRTY;
- RDTSC_STOP(BELoadTiles, 0, 0);
- }
+ pContext->dcRing.Dequeue(); // Remove from tail
}
}
@@ -466,7 +314,7 @@ INLINE bool FindFirstIncompleteDraw(SWR_CONTEXT* pContext, uint64_t& curDrawBE)
if (isWorkComplete)
{
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
}
else
{
@@ -496,7 +344,7 @@ void WorkOnFifoBE(
SWR_CONTEXT *pContext,
uint32_t workerId,
uint64_t &curDrawBE,
- std::unordered_set<uint32_t>& lockedTiles)
+ TileSet& lockedTiles)
{
// Find the first incomplete draw that has pending work. If no such draw is found then
// return. FindFirstIncompleteDraw is responsible for incrementing the curDrawBE.
@@ -558,7 +406,7 @@ void WorkOnFifoBE(
SWR_ASSERT(pWork);
if (pWork->type == DRAW)
{
- InitializeHotTiles(pContext, pDC, tileID, (const TRIANGLE_WORK_DESC*)&pWork->desc);
+ pContext->pHotTileMgr->InitializeHotTiles(pContext, pDC, tileID);
}
}
@@ -579,7 +427,7 @@ void WorkOnFifoBE(
{
// We can increment the current BE and safely move to next draw since we know this draw is complete.
curDrawBE++;
- InterlockedIncrement(&pDC->threadsDoneBE);
+ CompleteDrawContext(pContext, pDC);
lastRetiredDraw++;
@@ -598,7 +446,7 @@ void WorkOnFifoBE(
}
}
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode)
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode)
{
// Try to grab the next DC from the ring
uint64_t drawEnqueued = GetEnqueuedDraw(pContext);
@@ -608,8 +456,8 @@ void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE,
DRAW_CONTEXT *pDC = &pContext->dcRing[dcSlot];
if (pDC->isCompute || pDC->doneFE || pDC->FeLock)
{
+ CompleteDrawContext(pContext, pDC);
curDrawFE++;
- InterlockedIncrement(&pDC->threadsDoneFE);
}
else
{
@@ -673,22 +521,12 @@ void WorkOnCompute(
// Is there any work remaining?
if (queue.getNumQueued() > 0)
{
- bool lastToComplete = false;
-
uint32_t threadGroupId = 0;
while (queue.getWork(threadGroupId))
{
ProcessComputeBE(pDC, workerId, threadGroupId);
- lastToComplete = queue.finishedWork();
- }
-
- _ReadWriteBarrier();
-
- if (lastToComplete)
- {
- SWR_ASSERT(queue.isWorkComplete() == true);
- pDC->doneCompute = true;
+ queue.finishedWork();
}
}
}
@@ -711,7 +549,7 @@ DWORD workerThreadMain(LPVOID pData)
// Track tiles locked by other threads. If we try to lock a macrotile and find its already
// locked then we'll add it to this list so that we don't try and lock it again.
- std::unordered_set<uint32_t> lockedTiles;
+ TileSet lockedTiles;
// each worker has the ability to work on any of the queued draws as long as certain
// conditions are met. the data associated
@@ -732,10 +570,10 @@ DWORD workerThreadMain(LPVOID pData)
// the worker can safely increment its oldestDraw counter and move on to the next draw.
std::unique_lock<std::mutex> lock(pContext->WaitLock, std::defer_lock);
- auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->DrawEnqueued; };
+ auto threadHasWork = [&](uint64_t curDraw) { return curDraw != pContext->dcRing.GetHead(); };
- uint64_t curDrawBE = 1;
- uint64_t curDrawFE = 1;
+ uint64_t curDrawBE = 0;
+ uint64_t curDrawFE = 0;
while (pContext->threadPool.inThreadShutdown == false)
{
@@ -853,9 +691,12 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
numThreads, KNOB_MAX_NUM_THREADS);
}
+ uint32_t numAPIReservedThreads = 1;
+
+
if (numThreads == 1)
{
- // If only 1 worker thread, try to move it to an available
+ // If only 1 worker thread, try to move it to an available
// HW thread. If that fails, use the API thread.
if (numCoresPerNode < numHWCoresPerNode)
{
@@ -878,8 +719,15 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
}
else
{
- // Save a HW thread for the API thread.
- numThreads--;
+ // Save HW threads for the API if we can
+ if (numThreads > numAPIReservedThreads)
+ {
+ numThreads -= numAPIReservedThreads;
+ }
+ else
+ {
+ numAPIReservedThreads = 0;
+ }
}
pPool->numThreads = numThreads;
@@ -918,9 +766,9 @@ void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool)
auto& core = node.cores[c];
for (uint32_t t = 0; t < numHyperThreads; ++t)
{
- if (c == 0 && n == 0 && t == 0)
+ if (numAPIReservedThreads)
{
- // Skip core 0, thread0 on node 0 to reserve for API thread
+ --numAPIReservedThreads;
continue;
}
diff --git a/src/gallium/drivers/swr/rasterizer/core/threads.h b/src/gallium/drivers/swr/rasterizer/core/threads.h
index 0fa7196..6b37e3a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/threads.h
+++ b/src/gallium/drivers/swr/rasterizer/core/threads.h
@@ -54,10 +54,12 @@ struct THREAD_POOL
THREAD_DATA *pThreadData;
};
+typedef std::unordered_set<uint32_t> TileSet;
+
void CreateThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
void DestroyThreadPool(SWR_CONTEXT *pContext, THREAD_POOL *pPool);
// Expose FE and BE worker functions to the API thread if single threaded
-void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, UCHAR numaNode);
-void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, std::unordered_set<uint32_t> &usedTiles);
+void WorkOnFifoFE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawFE, int numaNode);
+void WorkOnFifoBE(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE, TileSet &usedTiles);
void WorkOnCompute(SWR_CONTEXT *pContext, uint32_t workerId, uint64_t &curDrawBE);
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
index 8603936..89c779e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.cpp
@@ -29,7 +29,9 @@
#include <unordered_map>
#include "fifo.hpp"
-#include "tilemgr.h"
+#include "core/tilemgr.h"
+#include "core/multisample.h"
+#include "rdtsc_core.h"
#define TILE_ID(x,y) ((x << 16 | y))
@@ -54,24 +56,21 @@ void DispatchQueue::operator delete(void *p)
_aligned_free(p);
}
-MacroTileMgr::MacroTileMgr(Arena& arena) : mArena(arena)
+MacroTileMgr::MacroTileMgr(CachingArena& arena) : mArena(arena)
{
}
-void MacroTileMgr::initialize()
-{
- mWorkItemsProduced = 0;
- mWorkItemsConsumed = 0;
-
- mDirtyTiles.clear();
-}
-
void MacroTileMgr::enqueue(uint32_t x, uint32_t y, BE_WORK *pWork)
{
// Should not enqueue more then what we have backing for in the hot tile manager.
SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+ if ((x & ~(KNOB_NUM_HOT_TILES_X-1)) | (y & ~(KNOB_NUM_HOT_TILES_Y-1)))
+ {
+ return;
+ }
+
uint32_t id = TILE_ID(x, y);
MacroTileQueue &tile = mTiles[id];
@@ -103,3 +102,282 @@ void MacroTileMgr::markTileComplete(uint32_t id)
tile.mWorkItemsFE = 0;
tile.mWorkItemsBE = 0;
}
+
+HOTTILE* HotTileMgr::GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples,
+ uint32_t renderTargetArrayIndex)
+{
+ uint32_t x, y;
+ MacroTileMgr::getTileIndices(macroID, x, y);
+
+ SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+ SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+ HotTileSet &tile = mHotTiles[x][y];
+ HOTTILE& hotTile = tile.Attachment[attachment];
+ if (hotTile.pBuffer == NULL)
+ {
+ if (create)
+ {
+ uint32_t size = numSamples * mHotTileSize[attachment];
+ hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ hotTile.state = HOTTILE_INVALID;
+ hotTile.numSamples = numSamples;
+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+ }
+ else
+ {
+ return NULL;
+ }
+ }
+ else
+ {
+ // free the old tile and create a new one with enough space to hold all samples
+ if (numSamples > hotTile.numSamples)
+ {
+ // tile should be either uninitialized or resolved if we're deleting and switching to a
+ // new sample count
+ SWR_ASSERT((hotTile.state == HOTTILE_INVALID) ||
+ (hotTile.state == HOTTILE_RESOLVED) ||
+ (hotTile.state == HOTTILE_CLEAR));
+ _aligned_free(hotTile.pBuffer);
+
+ uint32_t size = numSamples * mHotTileSize[attachment];
+ hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ hotTile.state = HOTTILE_INVALID;
+ hotTile.numSamples = numSamples;
+ }
+
+ // if requested render target array index isn't currently loaded, need to store out the current hottile
+ // and load the requested array slice
+ if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
+ {
+ SWR_FORMAT format;
+ switch (attachment)
+ {
+ case SWR_ATTACHMENT_COLOR0:
+ case SWR_ATTACHMENT_COLOR1:
+ case SWR_ATTACHMENT_COLOR2:
+ case SWR_ATTACHMENT_COLOR3:
+ case SWR_ATTACHMENT_COLOR4:
+ case SWR_ATTACHMENT_COLOR5:
+ case SWR_ATTACHMENT_COLOR6:
+ case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+ case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
+ case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
+ default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
+ }
+
+ if (hotTile.state == HOTTILE_DIRTY)
+ {
+ pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
+ }
+
+ pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
+ x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
+
+ hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
+ hotTile.state = HOTTILE_DIRTY;
+ }
+ }
+ return &tile.Attachment[attachment];
+}
+
+HOTTILE* HotTileMgr::GetHotTileNoLoad(
+ SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID,
+ SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples)
+{
+ uint32_t x, y;
+ MacroTileMgr::getTileIndices(macroID, x, y);
+
+ SWR_ASSERT(x < KNOB_NUM_HOT_TILES_X);
+ SWR_ASSERT(y < KNOB_NUM_HOT_TILES_Y);
+
+ HotTileSet &tile = mHotTiles[x][y];
+ HOTTILE& hotTile = tile.Attachment[attachment];
+ if (hotTile.pBuffer == NULL)
+ {
+ if (create)
+ {
+ uint32_t size = numSamples * mHotTileSize[attachment];
+ hotTile.pBuffer = (uint8_t*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
+ hotTile.state = HOTTILE_INVALID;
+ hotTile.numSamples = numSamples;
+ hotTile.renderTargetArrayIndex = 0;
+ }
+ else
+ {
+ return NULL;
+ }
+ }
+
+ return &hotTile;
+}
+
+void HotTileMgr::ClearColorHotTile(const HOTTILE* pHotTile) // clear a macro tile from float4 clear data.
+{
+ // Load clear color into SIMD register...
+ float *pClearData = (float*)(pHotTile->clearData);
+ simdscalar valR = _simd_broadcast_ss(&pClearData[0]);
+ simdscalar valG = _simd_broadcast_ss(&pClearData[1]);
+ simdscalar valB = _simd_broadcast_ss(&pClearData[2]);
+ simdscalar valA = _simd_broadcast_ss(&pClearData[3]);
+
+ float *pfBuf = (float*)pHotTile->pBuffer;
+ uint32_t numSamples = pHotTile->numSamples;
+
+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM) //SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM); si++)
+ {
+ _simd_store_ps(pfBuf, valR);
+ pfBuf += KNOB_SIMD_WIDTH;
+ _simd_store_ps(pfBuf, valG);
+ pfBuf += KNOB_SIMD_WIDTH;
+ _simd_store_ps(pfBuf, valB);
+ pfBuf += KNOB_SIMD_WIDTH;
+ _simd_store_ps(pfBuf, valA);
+ pfBuf += KNOB_SIMD_WIDTH;
+ }
+ }
+ }
+}
+
+void HotTileMgr::ClearDepthHotTile(const HOTTILE* pHotTile) // clear a macro tile from its depth clear value.
+{
+ // Load clear depth into SIMD register...
+ float *pClearData = (float*)(pHotTile->clearData);
+ simdscalar valZ = _simd_broadcast_ss(&pClearData[0]);
+
+ float *pfBuf = (float*)pHotTile->pBuffer;
+ uint32_t numSamples = pHotTile->numSamples;
+
+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM)
+ {
+ _simd_store_ps(pfBuf, valZ);
+ pfBuf += KNOB_SIMD_WIDTH;
+ }
+ }
+ }
+}
+
+void HotTileMgr::ClearStencilHotTile(const HOTTILE* pHotTile)
+{
+ // convert from F32 to U8.
+ uint8_t clearVal = (uint8_t)(pHotTile->clearData[0]);
+ //broadcast 32x into __m256i...
+ simdscalari valS = _simd_set1_epi8(clearVal);
+
+ simdscalari* pBuf = (simdscalari*)pHotTile->pBuffer;
+ uint32_t numSamples = pHotTile->numSamples;
+
+ for (uint32_t row = 0; row < KNOB_MACROTILE_Y_DIM; row += KNOB_TILE_Y_DIM)
+ {
+ for (uint32_t col = 0; col < KNOB_MACROTILE_X_DIM; col += KNOB_TILE_X_DIM)
+ {
+ // We're putting 4 pixels in each of the 32-bit slots, so increment 4 times as quickly.
+ for (uint32_t si = 0; si < (KNOB_TILE_X_DIM * KNOB_TILE_Y_DIM * numSamples); si += SIMD_TILE_X_DIM * SIMD_TILE_Y_DIM * 4)
+ {
+ _simd_store_si(pBuf, valS);
+ pBuf += 1;
+ }
+ }
+ }
+}
+
+//////////////////////////////////////////////////////////////////////////
+/// @brief InitializeHotTiles
+/// for draw calls, we initialize the active hot tiles and perform deferred
+/// load on them if tile is in invalid state. we do this in the outer thread
+/// loop instead of inside the draw routine itself mainly for performance,
+/// to avoid unnecessary setup every triangle
+/// @todo support deferred clear
+/// @param pContext,pDC,macroID - SWR context, draw context, and ID of the macrotile whose hot tiles are initialized.
+void HotTileMgr::InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID)
+{
+ const API_STATE& state = GetApiState(pDC);
+ HotTileMgr *pHotTileMgr = pContext->pHotTileMgr;
+
+ uint32_t x, y;
+ MacroTileMgr::getTileIndices(macroID, x, y);
+ x *= KNOB_MACROTILE_X_DIM;
+ y *= KNOB_MACROTILE_Y_DIM;
+
+ uint32_t numSamples = GetNumSamples(state.rastState.sampleCount);
+
+ // check RT if enabled
+ unsigned long rtSlot = 0;
+ uint32_t colorHottileEnableMask = state.colorHottileEnable;
+ while (_BitScanForward(&rtSlot, colorHottileEnableMask))
+ {
+ HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), true, numSamples);
+
+ if (pHotTile->state == HOTTILE_INVALID)
+ {
+ RDTSC_START(BELoadTiles);
+ // invalid hottile before draw requires a load from surface before we can draw to it
+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_COLOR_HOT_TILE_FORMAT, (SWR_RENDERTARGET_ATTACHMENT)(SWR_ATTACHMENT_COLOR0 + rtSlot), x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+ pHotTile->state = HOTTILE_DIRTY;
+ RDTSC_STOP(BELoadTiles, 0, 0);
+ }
+ else if (pHotTile->state == HOTTILE_CLEAR)
+ {
+ RDTSC_START(BELoadTiles);
+ // Clear the tile.
+ ClearColorHotTile(pHotTile);
+ pHotTile->state = HOTTILE_DIRTY;
+ RDTSC_STOP(BELoadTiles, 0, 0);
+ }
+ colorHottileEnableMask &= ~(1 << rtSlot);
+ }
+
+ // check depth if enabled
+ if (state.depthHottileEnable)
+ {
+ HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_DEPTH, true, numSamples);
+ if (pHotTile->state == HOTTILE_INVALID)
+ {
+ RDTSC_START(BELoadTiles);
+ // invalid hottile before draw requires a load from surface before we can draw to it
+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_DEPTH_HOT_TILE_FORMAT, SWR_ATTACHMENT_DEPTH, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+ pHotTile->state = HOTTILE_DIRTY;
+ RDTSC_STOP(BELoadTiles, 0, 0);
+ }
+ else if (pHotTile->state == HOTTILE_CLEAR)
+ {
+ RDTSC_START(BELoadTiles);
+ // Clear the tile.
+ ClearDepthHotTile(pHotTile);
+ pHotTile->state = HOTTILE_DIRTY;
+ RDTSC_STOP(BELoadTiles, 0, 0);
+ }
+ }
+
+ // check stencil if enabled
+ if (state.stencilHottileEnable)
+ {
+ HOTTILE* pHotTile = GetHotTile(pContext, pDC, macroID, SWR_ATTACHMENT_STENCIL, true, numSamples);
+ if (pHotTile->state == HOTTILE_INVALID)
+ {
+ RDTSC_START(BELoadTiles);
+ // invalid hottile before draw requires a load from surface before we can draw to it
+ pContext->pfnLoadTile(GetPrivateState(pDC), KNOB_STENCIL_HOT_TILE_FORMAT, SWR_ATTACHMENT_STENCIL, x, y, pHotTile->renderTargetArrayIndex, pHotTile->pBuffer);
+ pHotTile->state = HOTTILE_DIRTY;
+ RDTSC_STOP(BELoadTiles, 0, 0);
+ }
+ else if (pHotTile->state == HOTTILE_CLEAR)
+ {
+ RDTSC_START(BELoadTiles);
+ // Clear the tile.
+ ClearStencilHotTile(pHotTile);
+ pHotTile->state = HOTTILE_DIRTY;
+ RDTSC_STOP(BELoadTiles, 0, 0);
+ }
+ }
+}
diff --git a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
index 9137941..cf9d2fe 100644
--- a/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
+++ b/src/gallium/drivers/swr/rasterizer/core/tilemgr.h
@@ -59,7 +59,8 @@ struct MacroTileQueue
//////////////////////////////////////////////////////////////////////////
/// @brief Clear fifo and unlock it.
- void clear(Arena& arena)
+ template <typename ArenaT>
+ void clear(ArenaT& arena)
{
mFifo.clear(arena);
}
@@ -71,7 +72,8 @@ struct MacroTileQueue
return mFifo.peek();
}
- bool enqueue_try_nosync(Arena& arena, const BE_WORK* entry)
+ template <typename ArenaT>
+ bool enqueue_try_nosync(ArenaT& arena, const BE_WORK* entry)
{
return mFifo.enqueue_try_nosync(arena, entry);
}
@@ -104,7 +106,7 @@ private:
class MacroTileMgr
{
public:
- MacroTileMgr(Arena& arena);
+ MacroTileMgr(CachingArena& arena);
~MacroTileMgr()
{
for (auto &tile : mTiles)
@@ -113,7 +115,14 @@ public:
}
}
- void initialize();
+ INLINE void initialize()
+ {
+ mWorkItemsProduced = 0;
+ mWorkItemsConsumed = 0;
+
+ mDirtyTiles.clear();
+ }
+
INLINE std::vector<uint32_t>& getDirtyTiles() { return mDirtyTiles; }
INLINE MacroTileQueue& getMacroTileQueue(uint32_t id) { return mTiles[id]; }
void markTileComplete(uint32_t id);
@@ -135,15 +144,14 @@ public:
void operator delete (void *p);
private:
- Arena& mArena;
- SWR_FORMAT mFormat;
+ CachingArena& mArena;
std::unordered_map<uint32_t, MacroTileQueue> mTiles;
// Any tile that has work queued to it is a dirty tile.
std::vector<uint32_t> mDirtyTiles;
- OSALIGNLINE(LONG) mWorkItemsProduced;
- OSALIGNLINE(volatile LONG) mWorkItemsConsumed;
+ OSALIGNLINE(LONG) mWorkItemsProduced { 0 };
+ OSALIGNLINE(volatile LONG) mWorkItemsConsumed { 0 };
};
//////////////////////////////////////////////////////////////////////////
@@ -224,7 +232,7 @@ public:
void *operator new(size_t size);
void operator delete (void *p);
- void* mpTaskData; // The API thread will set this up and the callback task function will interpet this.
+ void* mpTaskData{ nullptr }; // The API thread will set this up and the callback task function will interpret this.
OSALIGNLINE(volatile LONG) mTasksAvailable{ 0 };
OSALIGNLINE(volatile LONG) mTasksOutstanding{ 0 };
@@ -241,7 +249,7 @@ enum HOTTILE_STATE
struct HOTTILE
{
- BYTE *pBuffer;
+ uint8_t *pBuffer;
HOTTILE_STATE state;
DWORD clearData[4]; // May need to change based on pfnClearTile implementation. Reorder for alignment?
uint32_t numSamples;
@@ -293,95 +301,16 @@ public:
}
}
- HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
- uint32_t renderTargetArrayIndex = 0)
- {
- uint32_t x, y;
- MacroTileMgr::getTileIndices(macroID, x, y);
+ void InitializeHotTiles(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID);
- assert(x < KNOB_NUM_HOT_TILES_X);
- assert(y < KNOB_NUM_HOT_TILES_Y);
+ HOTTILE *GetHotTile(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1,
+ uint32_t renderTargetArrayIndex = 0);
- HotTileSet &tile = mHotTiles[x][y];
- HOTTILE& hotTile = tile.Attachment[attachment];
- if (hotTile.pBuffer == NULL)
- {
- if (create)
- {
- uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
- hotTile.state = HOTTILE_INVALID;
- hotTile.numSamples = numSamples;
- hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
- }
- else
- {
- return NULL;
- }
- }
- else
- {
- // free the old tile and create a new one with enough space to hold all samples
- if (numSamples > hotTile.numSamples)
- {
- // tile should be either uninitialized or resolved if we're deleting and switching to a
- // new sample count
- assert((hotTile.state == HOTTILE_INVALID) ||
- (hotTile.state == HOTTILE_RESOLVED) ||
- (hotTile.state == HOTTILE_CLEAR));
- _aligned_free(hotTile.pBuffer);
-
- uint32_t size = numSamples * mHotTileSize[attachment];
- hotTile.pBuffer = (BYTE*)_aligned_malloc(size, KNOB_SIMD_WIDTH * 4);
- hotTile.state = HOTTILE_INVALID;
- hotTile.numSamples = numSamples;
- }
+ HOTTILE *GetHotTileNoLoad(SWR_CONTEXT* pContext, DRAW_CONTEXT* pDC, uint32_t macroID, SWR_RENDERTARGET_ATTACHMENT attachment, bool create, uint32_t numSamples = 1);
- // if requested render target array index isn't currently loaded, need to store out the current hottile
- // and load the requested array slice
- if (renderTargetArrayIndex != hotTile.renderTargetArrayIndex)
- {
- SWR_FORMAT format;
- switch (attachment)
- {
- case SWR_ATTACHMENT_COLOR0:
- case SWR_ATTACHMENT_COLOR1:
- case SWR_ATTACHMENT_COLOR2:
- case SWR_ATTACHMENT_COLOR3:
- case SWR_ATTACHMENT_COLOR4:
- case SWR_ATTACHMENT_COLOR5:
- case SWR_ATTACHMENT_COLOR6:
- case SWR_ATTACHMENT_COLOR7: format = KNOB_COLOR_HOT_TILE_FORMAT; break;
- case SWR_ATTACHMENT_DEPTH: format = KNOB_DEPTH_HOT_TILE_FORMAT; break;
- case SWR_ATTACHMENT_STENCIL: format = KNOB_STENCIL_HOT_TILE_FORMAT; break;
- default: SWR_ASSERT(false, "Unknown attachment: %d", attachment); format = KNOB_COLOR_HOT_TILE_FORMAT; break;
- }
-
- if (hotTile.state == HOTTILE_DIRTY)
- {
- pContext->pfnStoreTile(GetPrivateState(pDC), format, attachment,
- x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, hotTile.renderTargetArrayIndex, hotTile.pBuffer);
- }
-
- pContext->pfnLoadTile(GetPrivateState(pDC), format, attachment,
- x * KNOB_MACROTILE_X_DIM, y * KNOB_MACROTILE_Y_DIM, renderTargetArrayIndex, hotTile.pBuffer);
-
- hotTile.renderTargetArrayIndex = renderTargetArrayIndex;
- hotTile.state = HOTTILE_DIRTY;
- }
- }
- return &tile.Attachment[attachment];
- }
-
- HotTileSet &GetHotTile(uint32_t macroID)
- {
- uint32_t x, y;
- MacroTileMgr::getTileIndices(macroID, x, y);
- assert(x < KNOB_NUM_HOT_TILES_X);
- assert(y < KNOB_NUM_HOT_TILES_Y);
-
- return mHotTiles[x][y];
- }
+ static void ClearColorHotTile(const HOTTILE* pHotTile);
+ static void ClearDepthHotTile(const HOTTILE* pHotTile);
+ static void ClearStencilHotTile(const HOTTILE* pHotTile);
private:
HotTileSet mHotTiles[KNOB_NUM_HOT_TILES_X][KNOB_NUM_HOT_TILES_Y];
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.cpp b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
index f36452f..a1d665e 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.cpp
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.cpp
@@ -27,6 +27,11 @@
******************************************************************************/
#if defined(_WIN32)
+#if defined(NOMINMAX)
+// GDI Plus requires non-std min / max macros be defined :(
+#undef NOMINMAX
+#endif
+
#include<Windows.h>
#include <Gdiplus.h>
#include <Gdiplusheaders.h>
diff --git a/src/gallium/drivers/swr/rasterizer/core/utils.h b/src/gallium/drivers/swr/rasterizer/core/utils.h
index b9dc48c..60a3a6a 100644
--- a/src/gallium/drivers/swr/rasterizer/core/utils.h
+++ b/src/gallium/drivers/swr/rasterizer/core/utils.h
@@ -46,8 +46,7 @@ void OpenBitmapFromFile(
uint32_t *height);
#endif
-/// @todo assume linux is always 64 bit
-#if defined(_WIN64) || defined(__linux__) || defined(__gnu_linux__)
+#if defined(_WIN64) || defined(__x86_64__)
#define _MM_INSERT_EPI64 _mm_insert_epi64
#define _MM_EXTRACT_EPI64 _mm_extract_epi64
#else
@@ -89,7 +88,10 @@ INLINE __m128i _MM_INSERT_EPI64(__m128i a, INT64 b, const int32_t ndx)
OSALIGNLINE(struct) BBOX
{
- int top, bottom, left, right;
+ int top{ 0 };
+ int bottom{ 0 };
+ int left{ 0 };
+ int right{ 0 };
BBOX() {}
BBOX(int t, int b, int l, int r) : top(t), bottom(b), left(l), right(r) {}
@@ -110,7 +112,10 @@ OSALIGNLINE(struct) BBOX
struct simdBBox
{
- simdscalari top, bottom, left, right;
+ simdscalari top;
+ simdscalari bottom;
+ simdscalari left;
+ simdscalari right;
};
INLINE
@@ -271,7 +276,7 @@ struct TransposeSingleComponent
/// @brief Pass-thru for single component.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
memcpy(pDst, pSrc, (bpp * KNOB_SIMD_WIDTH) / 8);
}
@@ -286,7 +291,7 @@ struct Transpose8_8_8_8
/// @brief Performs an SOA to AOS conversion for packed 8_8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
#if KNOB_SIMD_WIDTH == 8
@@ -325,7 +330,7 @@ struct Transpose8_8_8
/// @brief Performs an SOA to AOS conversion for packed 8_8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -337,7 +342,7 @@ struct Transpose8_8
/// @brief Performs an SOA to AOS conversion for packed 8_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalari src = _simd_load_si((const simdscalari*)pSrc);
@@ -361,7 +366,7 @@ struct Transpose32_32_32_32
/// @brief Performs an SOA to AOS conversion for packed 32_32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -394,7 +399,7 @@ struct Transpose32_32_32
/// @brief Performs an SOA to AOS conversion for packed 32_32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalar src0 = _simd_load_ps((const float*)pSrc);
@@ -426,7 +431,7 @@ struct Transpose32_32
/// @brief Performs an SOA to AOS conversion for packed 32_32 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
const float* pfSrc = (const float*)pSrc;
__m128 src_r0 = _mm_load_ps(pfSrc + 0);
@@ -456,7 +461,7 @@ struct Transpose16_16_16_16
/// @brief Performs an SOA to AOS conversion for packed 16_16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -496,7 +501,7 @@ struct Transpose16_16_16
/// @brief Performs an SOA to AOS conversion for packed 16_16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
#if KNOB_SIMD_WIDTH == 8
simdscalari src_rg = _simd_load_si((const simdscalari*)pSrc);
@@ -535,7 +540,7 @@ struct Transpose16_16
/// @brief Performs an SOA to AOS conversion for packed 16_16 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- INLINE static void Transpose(const BYTE* pSrc, BYTE* pDst)
+ INLINE static void Transpose(const uint8_t* pSrc, uint8_t* pDst)
{
simdscalar src = _simd_load_ps((const float*)pSrc);
@@ -566,7 +571,7 @@ struct Transpose24_8
/// @brief Performs an SOA to AOS conversion for packed 24_8 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -578,7 +583,7 @@ struct Transpose32_8_24
/// @brief Performs an SOA to AOS conversion for packed 32_8_24 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
@@ -592,7 +597,7 @@ struct Transpose4_4_4_4
/// @brief Performs an SOA to AOS conversion for packed 4_4_4_4 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -604,7 +609,7 @@ struct Transpose5_6_5
/// @brief Performs an SOA to AOS conversion for packed 5_6_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -616,7 +621,7 @@ struct Transpose9_9_9_5
/// @brief Performs an SOA to AOS conversion for packed 9_9_9_5 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -628,7 +633,7 @@ struct Transpose5_5_5_1
/// @brief Performs an SOA to AOS conversion for packed 5_5_5_1 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -640,7 +645,7 @@ struct Transpose10_10_10_2
/// @brief Performs an SOA to AOS conversion for packed 10_10_10_2 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
//////////////////////////////////////////////////////////////////////////
@@ -652,7 +657,7 @@ struct Transpose11_11_10
/// @brief Performs an SOA to AOS conversion for packed 11_11_10 data.
/// @param pSrc - source data in SOA form
/// @param pDst - output data in AOS form
- static void Transpose(const BYTE* pSrc, BYTE* pDst) = delete;
+ static void Transpose(const uint8_t* pSrc, uint8_t* pDst) = delete;
};
// helper function to unroll loops
@@ -694,7 +699,7 @@ uint32_t ComputeCRC(uint32_t crc, const void *pData, uint32_t size)
}
#endif
- BYTE* pRemainderBytes = (BYTE*)pDataWords;
+ uint8_t* pRemainderBytes = (uint8_t*)pDataWords;
for (uint32_t i = 0; i < sizeRemainderBytes; ++i)
{
crc = _mm_crc32_u8(crc, *pRemainderBytes++);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
index 734c897..de856c4 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.cpp
@@ -47,6 +47,10 @@
#include "llvm/Analysis/CFGPrinter.h"
#include "llvm/IRReader/IRReader.h"
+#if LLVM_USE_INTEL_JITEVENTS
+#include "llvm/ExecutionEngine/JITEventListener.h"
+#endif
+
#include "core/state.h"
#include "common/containers.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
index c974a61..4ffb0fb 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/JitManager.h
@@ -53,6 +53,10 @@
#include "llvm/Config/config.h"
#endif
+#ifndef HAVE_LLVM // pack major.minor as 0xMMmm (3.6 -> 0x306) for the '#if HAVE_LLVM == 0x306' checks
+#define HAVE_LLVM ((LLVM_VERSION_MAJOR << 8) | LLVM_VERSION_MINOR) // bitwise |, parenthesized so '==' applies to the whole value
+#endif
+
#include "llvm/IR/Verifier.h"
#include "llvm/ExecutionEngine/MCJIT.h"
#include "llvm/Support/FileSystem.h"
@@ -60,11 +64,10 @@
#include "llvm/Analysis/Passes.h"
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
#include "llvm/PassManager.h"
#else
#include "llvm/IR/LegacyPassManager.h"
-using namespace llvm::legacy;
#endif
#include "llvm/CodeGen/Passes.h"
@@ -166,7 +169,6 @@ struct JitManager
FunctionType* mTrinaryFPTy;
FunctionType* mUnaryIntTy;
FunctionType* mBinaryIntTy;
- FunctionType* mTrinaryIntTy;
Type* mSimtFP32Ty;
Type* mSimtInt32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
index 954524a..2fed2bf 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/blend_jit.cpp
@@ -717,7 +717,13 @@ struct BlendJit : public Builder
JitManager::DumpToFile(blendFunc, "");
- FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ passes(JM()->mpCurrentModule);
+
passes.add(createBreakCriticalEdgesPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index c15bdf1..757ea3f 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -38,6 +38,8 @@ using namespace llvm;
Builder::Builder(JitManager *pJitMgr)
: mpJitMgr(pJitMgr)
{
+ mVWidth = pJitMgr->mVWidth;
+
mpIRBuilder = &pJitMgr->mBuilder;
mVoidTy = Type::getVoidTy(pJitMgr->mContext);
@@ -48,14 +50,18 @@ Builder::Builder(JitManager *pJitMgr)
mInt8Ty = Type::getInt8Ty(pJitMgr->mContext);
mInt16Ty = Type::getInt16Ty(pJitMgr->mContext);
mInt32Ty = Type::getInt32Ty(pJitMgr->mContext);
+ mInt8PtrTy = PointerType::get(mInt8Ty, 0);
+ mInt16PtrTy = PointerType::get(mInt16Ty, 0);
+ mInt32PtrTy = PointerType::get(mInt32Ty, 0);
mInt64Ty = Type::getInt64Ty(pJitMgr->mContext);
mV4FP32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mFP32Ty), false); // vector4 float type (represented as structure)
mV4Int32Ty = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mInt32Ty), false); // vector4 int type
- mSimdInt16Ty = VectorType::get(mInt16Ty, mpJitMgr->mVWidth);
- mSimdInt32Ty = VectorType::get(mInt32Ty, mpJitMgr->mVWidth);
- mSimdInt64Ty = VectorType::get(mInt64Ty, mpJitMgr->mVWidth);
- mSimdFP16Ty = VectorType::get(mFP16Ty, mpJitMgr->mVWidth);
- mSimdFP32Ty = VectorType::get(mFP32Ty, mpJitMgr->mVWidth);
+ mSimdInt16Ty = VectorType::get(mInt16Ty, mVWidth);
+ mSimdInt32Ty = VectorType::get(mInt32Ty, mVWidth);
+ mSimdInt64Ty = VectorType::get(mInt64Ty, mVWidth);
+ mSimdFP16Ty = VectorType::get(mFP16Ty, mVWidth);
+ mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
+ mSimdVectorTy = StructType::get(pJitMgr->mContext, std::vector<Type*>(4, mSimdFP32Ty), false);
if (sizeof(uint32_t*) == 4)
{
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 4921661..239ef2a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -43,6 +43,8 @@ struct Builder
JitManager* mpJitMgr;
IRBuilder<>* mpIRBuilder;
+ uint32_t mVWidth;
+
// Built in types.
Type* mVoidTy;
Type* mInt1Ty;
@@ -54,12 +56,16 @@ struct Builder
Type* mFP16Ty;
Type* mFP32Ty;
Type* mDoubleTy;
+ Type* mInt8PtrTy;
+ Type* mInt16PtrTy;
+ Type* mInt32PtrTy;
Type* mSimdFP16Ty;
Type* mSimdFP32Ty;
Type* mSimdInt16Ty;
Type* mSimdInt32Ty;
Type* mSimdInt64Ty;
Type* mSimdIntPtrTy;
+ Type* mSimdVectorTy;
StructType* mV4FP32Ty;
StructType* mV4Int32Ty;
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 5394fc7..c6cf793 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -28,6 +28,8 @@
*
******************************************************************************/
#include "builder.h"
+#include "common/rdtsc_buckets.h"
+
#include "llvm/Support/DynamicLibrary.h"
void __cdecl CallPrint(const char* fmt, ...);
@@ -189,32 +191,32 @@ Constant *Builder::PRED(bool pred)
Value *Builder::VIMMED1(int i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
Value *Builder::VIMMED1(uint32_t i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
Value *Builder::VIMMED1(float i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantFP>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantFP>(C(i)));
}
Value *Builder::VIMMED1(bool i)
{
- return ConstantVector::getSplat(JM()->mVWidth, cast<ConstantInt>(C(i)));
+ return ConstantVector::getSplat(mVWidth, cast<ConstantInt>(C(i)));
}
Value *Builder::VUNDEF_IPTR()
{
- return UndefValue::get(VectorType::get(PointerType::get(mInt32Ty, 0),JM()->mVWidth));
+ return UndefValue::get(VectorType::get(mInt32PtrTy,mVWidth));
}
Value *Builder::VUNDEF_I()
{
- return UndefValue::get(VectorType::get(mInt32Ty, JM()->mVWidth));
+ return UndefValue::get(VectorType::get(mInt32Ty, mVWidth));
}
Value *Builder::VUNDEF(Type *ty, uint32_t size)
@@ -224,15 +226,15 @@ Value *Builder::VUNDEF(Type *ty, uint32_t size)
Value *Builder::VUNDEF_F()
{
- return UndefValue::get(VectorType::get(mFP32Ty, JM()->mVWidth));
+ return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
}
Value *Builder::VUNDEF(Type* t)
{
- return UndefValue::get(VectorType::get(t, JM()->mVWidth));
+ return UndefValue::get(VectorType::get(t, mVWidth));
}
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Value *Builder::VINSERT(Value *vec, Value *val, uint64_t index)
{
return VINSERT(vec, val, C((int64_t)index));
@@ -247,7 +249,7 @@ Value *Builder::VBROADCAST(Value *src)
return src;
}
- return VECTOR_SPLAT(JM()->mVWidth, src);
+ return VECTOR_SPLAT(mVWidth, src);
}
uint32_t Builder::IMMED(Value* v)
@@ -257,6 +259,13 @@ uint32_t Builder::IMMED(Value* v)
return pValConst->getZExtValue();
}
+int32_t Builder::S_IMMED(Value* v)
+{
+ SWR_ASSERT(isa<ConstantInt>(v)); // only integer constants carry an immediate
+ // read the constant sign-extended, then narrow to the 32-bit return type
+ return static_cast<int32_t>(cast<ConstantInt>(v)->getSExtValue());
+}
+
Value *Builder::GEP(Value* ptr, const std::initializer_list<Value*> &indexList)
{
std::vector<Value*> indices;
@@ -342,8 +351,8 @@ Value *Builder::MASKLOADD(Value* src,Value* mask)
else
{
Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule,Intrinsic::x86_avx_maskload_ps_256);
- Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,JM()->mVWidth));
- vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,JM()->mVWidth));
+ Value* fMask = BITCAST(mask,VectorType::get(mFP32Ty,mVWidth));
+ vResult = BITCAST(CALL(func,{src,fMask}), VectorType::get(mInt32Ty,mVWidth));
}
return vResult;
}
@@ -512,7 +521,7 @@ CallInst *Builder::PRINT(const std::string &printStr,const std::initializer_list
// get a pointer to the first character in the constant string array
std::vector<Constant*> geplist{C(0),C(0)};
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Constant *strGEP = ConstantExpr::getGetElementPtr(gvPtr,geplist,false);
#else
Constant *strGEP = ConstantExpr::getGetElementPtr(nullptr, gvPtr,geplist,false);
@@ -575,7 +584,7 @@ Value *Builder::GATHERPS(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
Value *vScaleVec = VBROADCAST(Z_EXT(scale,mInt32Ty));
Value *vOffsets = MUL(vIndices,vScaleVec);
Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets,C(i));
@@ -625,7 +634,7 @@ Value *Builder::GATHERDD(Value* vSrc, Value* pBase, Value* vIndices, Value* vMas
Value *vScaleVec = VBROADCAST(Z_EXT(scale, mInt32Ty));
Value *vOffsets = MUL(vIndices, vScaleVec);
Value *mask = MASK(vMask);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
// single component byte index
Value *offset = VEXTRACT(vOffsets, C(i));
@@ -774,12 +783,61 @@ Value *Builder::PERMD(Value* a, Value* idx)
}
else
{
- res = VSHUFFLE(a, a, idx);
+ if (isa<Constant>(idx)) // constant indices: a single shuffle of 'a' with itself
+ {
+ res = VSHUFFLE(a, a, idx);
+ }
+ else // runtime indices: build the result one lane at a time
+ {
+ res = VUNDEF_I();
+ for (uint32_t l = 0; l < mVWidth; ++l)
+ {
+ Value* pIndex = VEXTRACT(idx, C(l));
+ Value* pVal = VEXTRACT(a, pIndex);
+ res = VINSERT(res, pVal, C(l));
+ }
+ }
}
return res;
}
//////////////////////////////////////////////////////////////////////////
+/// @brief Generate a VPERMPS operation (shuffle 32 bit float values across
+/// 128 bit lanes) in LLVM IR. If not supported on the underlying platform, emulate it
+/// @param a - 256bit SIMD lane(8x32bit) of float values.
+/// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
+/// @return SIMD of float values where lane l holds a[idx[l]]
+Value *Builder::PERMPS(Value* a, Value* idx)
+{
+ Value* res;
+ // use avx2 permute instruction if available
+ if (JM()->mArch.AVX2())
+ {
+ // index vector is passed first (llvm 3.6.0 swapped the order of the args to vpermd)
+ res = VPERMPS(idx, a);
+ }
+ else
+ {
+ if (isa<Constant>(idx)) // constant indices: a single shuffle of 'a' with itself
+ {
+ res = VSHUFFLE(a, a, idx);
+ }
+ else // runtime indices: build the result one lane at a time
+ {
+ res = VUNDEF_F();
+ for (uint32_t l = 0; l < mVWidth; ++l)
+ {
+ Value* pIndex = VEXTRACT(idx, C(l));
+ Value* pVal = VEXTRACT(a, pIndex);
+ res = VINSERT(res, pVal, C(l));
+ }
+ }
+ }
+
+ return res;
+}
+
+//////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
@@ -800,7 +858,7 @@ Value *Builder::CVTPH2PS(Value* a)
}
Value* pResult = UndefValue::get(mSimdFP32Ty);
- for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for (uint32_t i = 0; i < mVWidth; ++i)
{
Value* pSrc = VEXTRACT(a, C(i));
Value* pConv = CALL(pCvtPh2Ps, std::initializer_list<Value*>{pSrc});
@@ -833,7 +891,7 @@ Value *Builder::CVTPS2PH(Value* a, Value* rounding)
}
Value* pResult = UndefValue::get(mSimdInt16Ty);
- for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for (uint32_t i = 0; i < mVWidth; ++i)
{
Value* pSrc = VEXTRACT(a, C(i));
Value* pConv = CALL(pCvtPs2Ph, std::initializer_list<Value*>{pSrc});
@@ -1085,8 +1143,8 @@ void Builder::GATHER4DD(const SWR_FORMAT_INFO &info, Value* pSrcBase, Value* byt
void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[2], Value* vGatherOutput[4], bool bPackedOutput)
{
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// input could either be float or int vector; do shuffle work in int
vGatherInput[0] = BITCAST(vGatherInput[0], mSimdInt32Ty);
@@ -1094,7 +1152,7 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
if(bPackedOutput)
{
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
@@ -1179,12 +1237,12 @@ void Builder::Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInp
void Builder::Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput)
{
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
if(bPackedOutput)
{
- Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15,
0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15});
@@ -1286,16 +1344,18 @@ void Builder::SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask)
{
Value* pStack = STACKSAVE();
+ Type* pSrcTy = vSrc->getType()->getVectorElementType();
+
// allocate tmp stack for masked off lanes
- Value* vTmpPtr = ALLOCA(vSrc->getType()->getVectorElementType());
+ Value* vTmpPtr = ALLOCA(pSrcTy);
Value *mask = MASK(vMask);
- for (uint32_t i = 0; i < JM()->mVWidth; ++i)
+ for (uint32_t i = 0; i < mVWidth; ++i)
{
Value *offset = VEXTRACT(vOffsets, C(i));
// byte pointer to component
Value *storeAddress = GEP(pDst, offset);
- storeAddress = BITCAST(storeAddress, PointerType::get(mFP32Ty, 0));
+ storeAddress = BITCAST(storeAddress, PointerType::get(pSrcTy, 0));
Value *selMask = VEXTRACT(mask, C(i));
Value *srcElem = VEXTRACT(vSrc, C(i));
// switch in a safe address to load if we're trying to access a vertex
@@ -1349,7 +1409,7 @@ Value *Builder::FCLAMP(Value* src, float low, float high)
Value* Builder::STACKSAVE()
{
Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
return CALL(pfnStackSave);
#else
return CALLA(pfnStackSave);
@@ -1401,11 +1461,13 @@ void __cdecl CallPrint(const char* fmt, ...)
vsnprintf_s(strBuf, _TRUNCATE, fmt, args);
OutputDebugString(strBuf);
#endif
+
+ va_end(args);
}
Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Function *func =
Intrinsic::getDeclaration(JM()->mpCurrentModule,
Intrinsic::x86_avx_vextractf128_si_256);
@@ -1413,8 +1475,8 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
#else
bool flag = !imm8->isZeroValue();
SmallVector<Constant*,8> idx;
- for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
- idx.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+ for (unsigned i = 0; i < mVWidth / 2; i++) {
+ idx.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, VUNDEF_I(), ConstantVector::get(idx));
#endif
@@ -1422,7 +1484,7 @@ Value *Builder::VEXTRACTI128(Value* a, Constant* imm8)
Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
{
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Function *func =
Intrinsic::getDeclaration(JM()->mpCurrentModule,
Intrinsic::x86_avx_vinsertf128_si_256);
@@ -1430,18 +1492,54 @@ Value *Builder::VINSERTI128(Value* a, Value* b, Constant* imm8)
#else
bool flag = !imm8->isZeroValue();
SmallVector<Constant*,8> idx;
- for (unsigned i = 0; i < JM()->mVWidth; i++) {
+ for (unsigned i = 0; i < mVWidth; i++) {
idx.push_back(C(i));
}
Value *inter = VSHUFFLE(b, VUNDEF_I(), ConstantVector::get(idx));
SmallVector<Constant*,8> idx2;
- for (unsigned i = 0; i < JM()->mVWidth / 2; i++) {
- idx2.push_back(C(flag ? i : i + JM()->mVWidth));
+ for (unsigned i = 0; i < mVWidth / 2; i++) {
+ idx2.push_back(C(flag ? i : i + mVWidth));
}
- for (unsigned i = JM()->mVWidth / 2; i < JM()->mVWidth; i++) {
- idx2.push_back(C(flag ? i + JM()->mVWidth / 2 : i));
+ for (unsigned i = mVWidth / 2; i < mVWidth; i++) {
+ idx2.push_back(C(flag ? i + mVWidth / 2 : i));
}
return VSHUFFLE(a, inter, ConstantVector::get(idx2));
#endif
}
+
+// rdtsc buckets macros
+void Builder::RDTSC_START(Value* pBucketMgr, Value* pId)
+{
+ // callee signature: void(i32* pBucketMgr, i32 id)
+ std::vector<Type*> paramTys{ mInt32PtrTy, mInt32Ty };
+ FunctionType* pHookTy = FunctionType::get(mVoidTy, paramTys, false);
+
+ // declare (or reuse) the callee in the module currently being built
+ const char* pHookName = "BucketManager_StartBucket";
+ Function* pHook = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction(pHookName, pHookTy));
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol(pHookName) == nullptr)
+ {
+ // no symbol of that name is known yet: register the C++ function's address under it
+ sys::DynamicLibrary::AddSymbol(pHookName, (void*)&BucketManager_StartBucket);
+ }
+ CALL(pHook, { pBucketMgr, pId });
+}
+
+void Builder::RDTSC_STOP(Value* pBucketMgr, Value* pId)
+{
+ // callee signature: void(i32* pBucketMgr, i32 id)
+ std::vector<Type*> paramTys{ mInt32PtrTy, mInt32Ty };
+ FunctionType* pHookTy = FunctionType::get(mVoidTy, paramTys, false);
+
+ // declare (or reuse) the callee in the module currently being built
+ const char* pHookName = "BucketManager_StopBucket";
+ Function* pHook = cast<Function>(JM()->mpCurrentModule->getOrInsertFunction(pHookName, pHookTy));
+ if (sys::DynamicLibrary::SearchForAddressOfSymbol(pHookName) == nullptr)
+ {
+ // no symbol of that name is known yet: register the C++ function's address under it
+ sys::DynamicLibrary::AddSymbol(pHookName, (void*)&BucketManager_StopBucket);
+ }
+ CALL(pHook, { pBucketMgr, pId });
+}
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 48e0558..f43ef69 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -59,7 +59,7 @@ Value *VUNDEF_F();
Value *VUNDEF_I();
Value *VUNDEF(Type* ty, uint32_t size);
Value *VUNDEF_IPTR();
-#if LLVM_VERSION_MAJOR == 3 && LLVM_VERSION_MINOR == 6
+#if HAVE_LLVM == 0x306
Value *VINSERT(Value *vec, Value *val, uint64_t index);
#endif
Value *VBROADCAST(Value *src);
@@ -67,6 +67,7 @@ Value *VRCP(Value *va);
Value *VPLANEPS(Value* vA, Value* vB, Value* vC, Value* &vX, Value* &vY);
uint32_t IMMED(Value* i);
+int32_t S_IMMED(Value* i);
Value *GEP(Value* ptr, const std::initializer_list<Value*> &indexList);
Value *GEP(Value* ptr, const std::initializer_list<uint32_t> &indexList);
@@ -115,6 +116,7 @@ Value *PSHUFB(Value* a, Value* b);
Value *PMOVSXBD(Value* a);
Value *PMOVSXWD(Value* a);
Value *PERMD(Value* a, Value* idx);
+Value *PERMPS(Value* a, Value* idx);
Value *CVTPH2PS(Value* a);
Value *CVTPS2PH(Value* a, Value* rounding);
Value *PMAXSD(Value* a, Value* b);
@@ -147,3 +149,7 @@ Value* INT3() { return INTERRUPT(C((uint8_t)3)); }
Value *VEXTRACTI128(Value* a, Constant* imm8);
Value *VINSERTI128(Value* a, Value* b, Constant* imm8);
+
+// rdtsc buckets macros
+void RDTSC_START(Value* pBucketMgr, Value* pId);
+void RDTSC_STOP(Value* pBucketMgr, Value* pId);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index c5a180e..2c2c56b 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -105,7 +105,7 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
std::vector<Value*> vtxInputIndices(2, C(0));
// GEP
pVtxOut = GEP(pVtxOut, C(0));
- pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, JM()->mVWidth), 0));
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
// SWR_FETCH_CONTEXT::pStreams
Value* streams = LOAD(fetchInfo,{0, SWR_FETCH_CONTEXT_pStreams});
@@ -174,7 +174,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
verifyFunction(*fetch);
- FunctionPassManager setupPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ setupPasses(JM()->mpCurrentModule);
///@todo We don't need the CFG passes for fetch. (e.g. BreakCriticalEdges and CFGSimplification)
setupPasses.add(createBreakCriticalEdgesPass());
@@ -186,7 +191,12 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
JitManager::DumpToFile(fetch, "se");
- FunctionPassManager optPasses(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ optPasses(JM()->mpCurrentModule);
///@todo Haven't touched these either. Need to remove some of these and add others.
optPasses.add(createCFGSimplificationPass());
@@ -220,8 +230,8 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
SWRL::UncheckedFixedVector<Value*, 16> vectors;
- std::vector<Constant*> pMask(JM()->mVWidth);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ std::vector<Constant*> pMask(mVWidth);
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
pMask[i] = (C(i < 4 ? i : 4));
}
@@ -254,7 +264,7 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
Value* startVertexOffset = MUL(Z_EXT(startVertex, mInt64Ty), stride);
// Load from the stream.
- for(uint32_t lane = 0; lane < JM()->mVWidth; ++lane)
+ for(uint32_t lane = 0; lane < mVWidth; ++lane)
{
// Get index
Value* index = VEXTRACT(vIndices, C(lane));
@@ -380,44 +390,44 @@ void FetchJit::JitLoadVertices(const FETCH_COMPILE_STATE &fetchState, Value* fet
vectors.push_back(wvec);
}
- std::vector<Constant*> v01Mask(JM()->mVWidth);
- std::vector<Constant*> v23Mask(JM()->mVWidth);
- std::vector<Constant*> v02Mask(JM()->mVWidth);
- std::vector<Constant*> v13Mask(JM()->mVWidth);
+ std::vector<Constant*> v01Mask(mVWidth);
+ std::vector<Constant*> v23Mask(mVWidth);
+ std::vector<Constant*> v02Mask(mVWidth);
+ std::vector<Constant*> v13Mask(mVWidth);
// Concatenate the vectors together.
elements[0] = VUNDEF_F();
elements[1] = VUNDEF_F();
elements[2] = VUNDEF_F();
elements[3] = VUNDEF_F();
- for(uint32_t b = 0, num4Wide = JM()->mVWidth / 4; b < num4Wide; ++b)
+ for(uint32_t b = 0, num4Wide = mVWidth / 4; b < num4Wide; ++b)
{
v01Mask[4 * b + 0] = C(0 + 4 * b);
v01Mask[4 * b + 1] = C(1 + 4 * b);
- v01Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
- v01Mask[4 * b + 3] = C(1 + 4 * b + JM()->mVWidth);
+ v01Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+ v01Mask[4 * b + 3] = C(1 + 4 * b + mVWidth);
v23Mask[4 * b + 0] = C(2 + 4 * b);
v23Mask[4 * b + 1] = C(3 + 4 * b);
- v23Mask[4 * b + 2] = C(2 + 4 * b + JM()->mVWidth);
- v23Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+ v23Mask[4 * b + 2] = C(2 + 4 * b + mVWidth);
+ v23Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
v02Mask[4 * b + 0] = C(0 + 4 * b);
v02Mask[4 * b + 1] = C(2 + 4 * b);
- v02Mask[4 * b + 2] = C(0 + 4 * b + JM()->mVWidth);
- v02Mask[4 * b + 3] = C(2 + 4 * b + JM()->mVWidth);
+ v02Mask[4 * b + 2] = C(0 + 4 * b + mVWidth);
+ v02Mask[4 * b + 3] = C(2 + 4 * b + mVWidth);
v13Mask[4 * b + 0] = C(1 + 4 * b);
v13Mask[4 * b + 1] = C(3 + 4 * b);
- v13Mask[4 * b + 2] = C(1 + 4 * b + JM()->mVWidth);
- v13Mask[4 * b + 3] = C(3 + 4 * b + JM()->mVWidth);
+ v13Mask[4 * b + 2] = C(1 + 4 * b + mVWidth);
+ v13Mask[4 * b + 3] = C(3 + 4 * b + mVWidth);
- std::vector<Constant*> iMask(JM()->mVWidth);
- for(uint32_t i = 0; i < JM()->mVWidth; ++i)
+ std::vector<Constant*> iMask(mVWidth);
+ for(uint32_t i = 0; i < mVWidth; ++i)
{
if(((4 * b) <= i) && (i < (4 * (b + 1))))
{
- iMask[i] = C(i % 4 + JM()->mVWidth);
+ iMask[i] = C(i % 4 + mVWidth);
}
else
{
@@ -805,7 +815,7 @@ Value* FetchJit::GetSimdValid8bitIndices(Value* pIndices, Value* pLastIndex)
STORE(C((uint8_t)0), pZeroIndex);
// Load a SIMD of index pointers
- for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+ for(int64_t lane = 0; lane < mVWidth; lane++)
{
// Calculate the address of the requested index
Value *pIndex = GEP(pIndices, C(lane));
@@ -840,7 +850,7 @@ Value* FetchJit::GetSimdValid16bitIndices(Value* pIndices, Value* pLastIndex)
STORE(C((uint16_t)0), pZeroIndex);
// Load a SIMD of index pointers
- for(int64_t lane = 0; lane < JM()->mVWidth; lane++)
+ for(int64_t lane = 0; lane < mVWidth; lane++)
{
// Calculate the address of the requested index
Value *pIndex = GEP(pIndices, C(lane));
@@ -925,13 +935,13 @@ void FetchJit::Shuffle8bpcGatherd(Shuffle8bpcArgs &args)
const uint32_t (&swizzle)[4] = std::get<9>(args);
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4 ); // vwidth is units of 32 bits
+ Type* vGatherTy = mSimdInt32Ty;
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4 ); // vwidth is units of 32 bits
// have to do extra work for sign extending
if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)){
- Type* v16x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 2); // 8x16bit ints in a 128bit lane
- Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v16x8Ty = VectorType::get(mInt8Ty, mVWidth * 2); // 8x16bit ints in a 128bit lane
+ Type* v128Ty = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask, including any swizzling
const char x = (char)swizzle[0]; const char y = (char)swizzle[1];
@@ -1138,8 +1148,8 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
Value* (&vVertexElements)[4] = std::get<8>(args);
// cast types
- Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), JM()->mVWidth);
- Type* v32x8Ty = VectorType::get(mInt8Ty, JM()->mVWidth * 4); // vwidth is units of 32 bits
+ Type* vGatherTy = VectorType::get(IntegerType::getInt32Ty(JM()->mContext), mVWidth);
+ Type* v32x8Ty = VectorType::get(mInt8Ty, mVWidth * 4); // vwidth is units of 32 bits
// have to do extra work for sign extending
if ((extendType == Instruction::CastOps::SExt) || (extendType == Instruction::CastOps::SIToFP)||
@@ -1149,7 +1159,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
bool bFP = (extendType == Instruction::CastOps::FPExt) ? true : false;
Type* v8x16Ty = VectorType::get(mInt16Ty, 8); // 8x16bit in a 128bit lane
- Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), JM()->mVWidth / 4); // vwidth is units of 32 bits
+ Type* v128bitTy = VectorType::get(IntegerType::getIntNTy(JM()->mContext, 128), mVWidth / 4); // vwidth is units of 32 bits
// shuffle mask
Value* vConstMask = C<char>({0, 1, 4, 5, 8, 9, 12, 13, 2, 3, 6, 7, 10, 11, 14, 15,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
index 1814b7c..e73b232 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_ir_macros.py
@@ -27,7 +27,7 @@ import json as JSON
import operator
header = r"""/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
@@ -84,16 +84,16 @@ inst_aliases = {
}
intrinsics = [
- ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
+ ["VGATHERPS", "x86_avx2_gather_d_ps_256", ["src", "pBase", "indices", "mask", "scale"]],
["VGATHERDD", "x86_avx2_gather_d_d_256", ["src", "pBase", "indices", "mask", "scale"]],
- ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
- ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
- ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
- ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
- ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
- ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
- ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
- ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
+ ["VSQRTPS", "x86_avx_sqrt_ps_256", ["a"]],
+ ["VRSQRTPS", "x86_avx_rsqrt_ps_256", ["a"]],
+ ["VRCPPS", "x86_avx_rcp_ps_256", ["a"]],
+ ["VMINPS", "x86_avx_min_ps_256", ["a", "b"]],
+ ["VMAXPS", "x86_avx_max_ps_256", ["a", "b"]],
+ ["VPMINSD", "x86_avx2_pmins_d", ["a", "b"]],
+ ["VPMAXSD", "x86_avx2_pmaxs_d", ["a", "b"]],
+ ["VROUND", "x86_avx_round_ps_256", ["a", "rounding"]],
["VCMPPS", "x86_avx_cmp_ps_256", ["a", "b", "cmpop"]],
["VBLENDVPS", "x86_avx_blendv_ps_256", ["a", "b", "mask"]],
["BEXTR_32", "x86_bmi_bextr_32", ["src", "control"]],
@@ -103,6 +103,7 @@ intrinsics = [
["VPMOVSXBD", "x86_avx2_pmovsxbd", ["a"]], # sign extend packed 8bit components
["VPMOVSXWD", "x86_avx2_pmovsxwd", ["a"]], # sign extend packed 16bit components
["VPERMD", "x86_avx2_permd", ["idx", "a"]],
+ ["VPERMPS", "x86_avx2_permps", ["idx", "a"]],
["VCVTPH2PS", "x86_vcvtph2ps_256", ["a"]],
["VCVTPS2PH", "x86_vcvtps2ph_256", ["a", "round"]],
["VHSUBPS", "x86_avx_hsub_ps_256", ["a", "b"]],
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
index 7bba435..0b53a92 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
+++ b/src/gallium/drivers/swr/rasterizer/jitter/scripts/gen_llvm_types.py
@@ -28,7 +28,7 @@ import operator
header = r"""
/****************************************************************************
-* Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+* Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index 6c5f22b..36baa8d 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -293,7 +293,13 @@ struct StreamOutJit : public Builder
JitManager::DumpToFile(soFunc, "SoFunc");
- FunctionPassManager passes(JM()->mpCurrentModule);
+#if HAVE_LLVM == 0x306
+ FunctionPassManager
+#else
+ llvm::legacy::FunctionPassManager
+#endif
+ passes(JM()->mpCurrentModule);
+
passes.add(createBreakCriticalEdgesPass());
passes.add(createCFGSimplificationPass());
passes.add(createEarlyCSEPass());
diff --git a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
index ad73cd8..d001cb6 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
+++ b/src/gallium/drivers/swr/rasterizer/memory/ClearTile.cpp
@@ -33,7 +33,7 @@
#include "memory/tilingtraits.h"
#include "memory/Convert.h"
-typedef void(*PFN_STORE_TILES_CLEAR)(const FLOAT*, SWR_SURFACE_STATE*, UINT, UINT);
+typedef void(*PFN_STORE_TILES_CLEAR)(const float*, SWR_SURFACE_STATE*, UINT, UINT);
//////////////////////////////////////////////////////////////////////////
/// Clear Raster Tile Function Tables.
@@ -54,17 +54,17 @@ struct StoreRasterTileClear
/// @param pDstSurface - Destination surface state
/// @param x, y - Coordinates to raster tile.
INLINE static void StoreClear(
- const BYTE* dstFormattedColor,
+ const uint8_t* dstFormattedColor,
UINT dstBytesPerPixel,
SWR_SURFACE_STATE* pDstSurface,
UINT x, UINT y) // (x, y) pixel coordinate to start of raster tile.
{
// Compute destination address for raster tile.
- BYTE* pDstTile = (BYTE*)pDstSurface->pBaseAddress +
+ uint8_t* pDstTile = (uint8_t*)pDstSurface->pBaseAddress +
(y * pDstSurface->pitch) + (x * dstBytesPerPixel);
// start of first row
- BYTE* pDst = pDstTile;
+ uint8_t* pDst = pDstTile;
UINT dstBytesPerRow = 0;
// For each raster tile pixel in row 0 (rx, 0)
@@ -104,15 +104,15 @@ struct StoreMacroTileClear
/// @param pDstSurface - Destination surface state
/// @param x, y - Coordinates to macro tile
static void StoreClear(
- const FLOAT *pColor,
+ const float *pColor,
SWR_SURFACE_STATE* pDstSurface,
UINT x, UINT y)
{
UINT dstBytesPerPixel = (FormatTraits<DstFormat>::bpp / 8);
- BYTE dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
+ uint8_t dstFormattedColor[16]; // max bpp is 128, so 16 is all we need here for one pixel
- FLOAT srcColor[4];
+ float srcColor[4];
for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
{
diff --git a/src/gallium/drivers/swr/rasterizer/memory/Convert.h b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
index 0f9e0ad..7c185e5 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/Convert.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/Convert.h
@@ -227,10 +227,10 @@ static uint16_t Convert32To16Float(float val)
/// @param srcPixel - Pointer to source pixel (pre-swizzled according to dest).
template<SWR_FORMAT DstFormat>
static void ConvertPixelFromFloat(
- BYTE* pDstPixel,
+ uint8_t* pDstPixel,
const float srcPixel[4])
{
- UINT outColor[4]; // typeless bits
+ uint32_t outColor[4] = { 0 }; // typeless bits
// Store component
for (UINT comp = 0; comp < FormatTraits<DstFormat>::numComps; ++comp)
@@ -390,9 +390,9 @@ static void ConvertPixelFromFloat(
template<SWR_FORMAT SrcFormat>
INLINE static void ConvertPixelToFloat(
float dstPixel[4],
- const BYTE* pSrc)
+ const uint8_t* pSrc)
{
- UINT srcColor[4]; // typeless bits
+ uint32_t srcColor[4]; // typeless bits
// unpack src pixel
typename FormatTraits<SrcFormat>::FormatT* pPixel = (typename FormatTraits<SrcFormat>::FormatT*)pSrc;
@@ -421,11 +421,11 @@ INLINE static void ConvertPixelToFloat(
}
// Convert components
- for (UINT comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
+ for (uint32_t comp = 0; comp < FormatTraits<SrcFormat>::numComps; ++comp)
{
SWR_TYPE type = FormatTraits<SrcFormat>::GetType(comp);
- UINT src = srcColor[comp];
+ uint32_t src = srcColor[comp];
switch (type)
{
@@ -486,7 +486,7 @@ INLINE static void ConvertPixelToFloat(
}
case SWR_TYPE_UINT:
{
- UINT dst = (UINT)src;
+ uint32_t dst = (uint32_t)src;
dstPixel[FormatTraits<SrcFormat>::swizzle(comp)] = *(float*)&dst;
break;
}
diff --git a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
index 50f8e57..381ac89 100644
--- a/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
+++ b/src/gallium/drivers/swr/rasterizer/memory/tilingtraits.h
@@ -28,6 +28,7 @@
#pragma once
#include "core/state.h"
+#include "common/simdintrin.h"
template<SWR_TILE_MODE mode, int>
struct TilingTraits
@@ -130,63 +131,6 @@ template<int X> struct TilingTraits <SWR_TILE_MODE_WMAJOR, X>
static UINT GetPdepY() { return 0x1ea; }
};
-INLINE
-UINT pdep_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
- return _pdep_u32(a, mask);
-#else
- UINT result = 0;
-
- // copied from http://wm.ite.pl/articles/pdep-soft-emu.html
- // using bsf instead of funky loop
- DWORD maskIndex;
- while (_BitScanForward(&maskIndex, mask))
- {
- // 1. isolate lowest set bit of mask
- const UINT lowest = 1 << maskIndex;
-
- // 2. populate LSB from src
- const UINT LSB = (UINT)((int)(a << 31) >> 31);
-
- // 3. copy bit from mask
- result |= LSB & lowest;
-
- // 4. clear lowest bit
- mask &= ~lowest;
-
- // 5. prepare for next iteration
- a >>= 1;
- }
-
- return result;
-#endif
-}
-
-INLINE
-UINT pext_u32(UINT a, UINT mask)
-{
-#if KNOB_ARCH==KNOB_ARCH_AVX2
- return _pext_u32(a, mask);
-#else
- UINT result = 0;
- DWORD maskIndex;
- uint32_t currentBit = 0;
- while (_BitScanForward(&maskIndex, mask))
- {
- // 1. isolate lowest set bit of mask
- const UINT lowest = 1 << maskIndex;
-
- // 2. copy bit from mask
- result |= ((a & lowest) > 0) << currentBit++;
-
- // 3. clear lowest bit
- mask &= ~lowest;
- }
- return result;
-#endif
-}
-
//////////////////////////////////////////////////////////////////////////
/// @brief Computes the tileID for 2D tiled surfaces
/// @param pitch - surface pitch in bytes
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
index 44ab698..3d003fb 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/gen_knobs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
index 8c51e1e..0f3ded6 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
+++ b/src/gallium/drivers/swr/rasterizer/scripts/knob_defs.py
@@ -1,4 +1,4 @@
-# Copyright (C) 2014-2015 Intel Corporation. All Rights Reserved.
+# Copyright (C) 2014-2016 Intel Corporation. All Rights Reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a
# copy of this software and associated documentation files (the "Software"),
@@ -21,24 +21,20 @@
# Python source
KNOBS = [
- ['ENABLE_ASSERT_DIALOGS', {
- 'type' : 'bool',
- 'default' : 'true',
- 'desc' : ['Use dialogs when asserts fire.',
- 'Asserts are only enabled in debug builds'],
- }],
['SINGLE_THREADED', {
'type' : 'bool',
'default' : 'false',
'desc' : ['If enabled will perform all rendering on the API thread.',
'This is useful mainly for debugging purposes.'],
+ 'category' : 'debug',
}],
['DUMP_SHADER_IR', {
- 'type' : 'bool',
- 'default' : 'false',
- 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+ 'type' : 'bool',
+ 'default' : 'false',
+ 'desc' : ['Dumps shader LLVM IR at various stages of jit compilation.'],
+ 'category' : 'debug',
}],
['USE_GENERIC_STORETILE', {
@@ -46,6 +42,7 @@ KNOBS = [
'default' : 'false',
'desc' : ['Always use generic function for performing StoreTile.',
'Will be slightly slower than using optimized (jitted) path'],
+ 'category' : 'debug',
}],
['FAST_CLEAR', {
@@ -53,6 +50,7 @@ KNOBS = [
'default' : 'true',
'desc' : ['Replace 3D primitive execute with a SWRClearRT operation and',
'defer clear execution to first backend op on hottile, or hottile store'],
+ 'category' : 'perf',
}],
['MAX_NUMA_NODES', {
@@ -61,6 +59,7 @@ KNOBS = [
'desc' : ['Maximum # of NUMA-nodes per system used for worker threads',
' 0 == ALL NUMA-nodes in the system',
' N == Use at most N NUMA-nodes for rendering'],
+ 'category' : 'perf',
}],
['MAX_CORES_PER_NUMA_NODE', {
@@ -69,6 +68,7 @@ KNOBS = [
'desc' : ['Maximum # of cores per NUMA-node used for worker threads.',
' 0 == ALL non-API thread cores per NUMA-node',
' N == Use at most N cores per NUMA-node'],
+ 'category' : 'perf',
}],
['MAX_THREADS_PER_CORE', {
@@ -77,6 +77,7 @@ KNOBS = [
'desc' : ['Maximum # of (hyper)threads per physical core used for worker threads.',
' 0 == ALL hyper-threads per core',
' N == Use at most N hyper-threads per physical core'],
+ 'category' : 'perf',
}],
['MAX_WORKER_THREADS', {
@@ -87,6 +88,7 @@ KNOBS = [
'IMPORTANT: If this is non-zero, no worker threads will be bound to',
'specific HW threads. They will all be "floating" SW threads.',
'In this case, the above 3 KNOBS will be ignored.'],
+ 'category' : 'perf',
}],
['BUCKETS_START_FRAME', {
@@ -96,6 +98,7 @@ KNOBS = [
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
+ 'category' : 'perf',
}],
['BUCKETS_END_FRAME', {
@@ -105,6 +108,7 @@ KNOBS = [
'',
'NOTE: KNOB_ENABLE_RDTSC must be enabled in core/knobs.h',
'for this to have an effect.'],
+ 'category' : 'perf',
}],
['WORKER_SPIN_LOOP_COUNT', {
@@ -112,46 +116,32 @@ KNOBS = [
'default' : '5000',
'desc' : ['Number of spin-loop iterations worker threads will perform',
'before going to sleep when waiting for work'],
+ 'category' : 'perf',
}],
['MAX_DRAWS_IN_FLIGHT', {
'type' : 'uint32_t',
- 'default' : '160',
+ 'default' : '96',
'desc' : ['Maximum number of draws outstanding before API thread blocks.'],
+ 'category' : 'perf',
}],
['MAX_PRIMS_PER_DRAW', {
- 'type' : 'uint32_t',
- 'default' : '2040',
- 'desc' : ['Maximum primitives in a single Draw().',
+ 'type' : 'uint32_t',
+ 'default' : '2040',
+ 'desc' : ['Maximum primitives in a single Draw().',
'Larger primitives are split into smaller Draw calls.',
'Should be a multiple of (3 * vectorWidth).'],
+ 'category' : 'perf',
}],
['MAX_TESS_PRIMS_PER_DRAW', {
- 'type' : 'uint32_t',
- 'default' : '16',
- 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
+ 'type' : 'uint32_t',
+ 'default' : '16',
+ 'desc' : ['Maximum primitives in a single Draw() with tessellation enabled.',
'Larger primitives are split into smaller Draw calls.',
'Should be a multiple of (vectorWidth).'],
- }],
-
- ['MAX_FRAC_ODD_TESS_FACTOR', {
- 'type' : 'float',
- 'default' : '63.0f',
- 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-odd partitioning.'],
- }],
-
- ['MAX_FRAC_EVEN_TESS_FACTOR', {
- 'type' : 'float',
- 'default' : '64.0f',
- 'desc' : ['(DEBUG) Maximum tessellation factor for fractional-even partitioning.'],
- }],
-
- ['MAX_INTEGER_TESS_FACTOR', {
- 'type' : 'uint32_t',
- 'default' : '64',
- 'desc' : ['(DEBUG) Maximum tessellation factor for integer partitioning.'],
+ 'category' : 'perf',
}],
@@ -159,12 +149,14 @@ KNOBS = [
'type' : 'bool',
'default' : 'false',
'desc' : ['Enable threadviz output.'],
+ 'category' : 'perf',
}],
['TOSS_DRAW', {
'type' : 'bool',
'default' : 'false',
'desc' : ['Disable per-draw/dispatch execution'],
+ 'category' : 'perf',
}],
['TOSS_QUEUE_FE', {
@@ -173,6 +165,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at worker FE',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_FETCH', {
@@ -181,6 +174,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at vertex fetch',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_IA', {
@@ -189,6 +183,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at input assembler',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_VS', {
@@ -197,6 +192,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at vertex shader',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_SETUP_TRIS', {
@@ -205,6 +201,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at primitive setup',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_BIN_TRIS', {
@@ -213,6 +210,7 @@ KNOBS = [
'desc' : ['Stop per-draw execution at primitive binning',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
+ 'category' : 'perf',
}],
['TOSS_RS', {
@@ -221,6 +219,5 @@ KNOBS = [
'desc' : ['Stop per-draw execution at rasterizer',
'',
'NOTE: Requires KNOB_ENABLE_TOSS_POINTS to be enabled in core/knobs.h'],
- }],
-
-]
+ 'category' : 'perf',
+ }],]
diff --git a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
index 922117e..521346c 100644
--- a/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
+++ b/src/gallium/drivers/swr/rasterizer/scripts/templates/knobs.template
@@ -10,7 +10,7 @@
return ' '*(max_len - knob_len)
%>/******************************************************************************
*
-* Copyright 2015
+* Copyright 2015-2016
* Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
@@ -77,7 +77,11 @@ struct GlobalKnobs
% for line in knob[1]['desc']:
// ${line}
% endfor
+ % if knob[1]['type'] == 'std::string':
+ DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, "${repr(knob[1]['default'])[1:-1]}");
+ % else:
DEFINE_KNOB(${knob[0]}, ${knob[1]['type']}, ${knob[1]['default']});
+ % endif
% endfor
GlobalKnobs();
@@ -125,7 +129,7 @@ std::string GlobalKnobs::ToString(const char* optPerLinePrefix)
str << optPerLinePrefix << "KNOB_${knob[0]}:${space_knob(knob[0])}";
% if knob[1]['type'] == 'bool':
str << (KNOB_${knob[0]} ? "+\n" : "-\n");
- % elif knob[1]['type'] != 'float':
+ % elif knob[1]['type'] != 'float' and knob[1]['type'] != 'std::string':
str << std::hex << std::setw(11) << std::left << KNOB_${knob[0]};
str << std::dec << KNOB_${knob[0]} << "\n";
% else:
diff --git a/src/gallium/drivers/swr/swr_context.cpp b/src/gallium/drivers/swr/swr_context.cpp
index 78b8fdf..46c79a1 100644
--- a/src/gallium/drivers/swr/swr_context.cpp
+++ b/src/gallium/drivers/swr/swr_context.cpp
@@ -338,7 +338,6 @@ swr_create_context(struct pipe_screen *p_screen, void *priv, unsigned flags)
SWR_CREATECONTEXT_INFO createInfo;
createInfo.driver = GL;
createInfo.privateStateSize = sizeof(swr_draw_context);
- createInfo.maxSubContexts = 0;
createInfo.pfnLoadTile = swr_LoadHotTile;
createInfo.pfnStoreTile = swr_StoreHotTile;
createInfo.pfnClearTile = swr_StoreHotTileClear;
--
1.9.1
More information about the mesa-dev
mailing list