[Mesa-dev] [PATCH 02/10] swr/rast: Widen fetch shader to SIMD16
Tim Rowley
timothy.o.rowley at intel.com
Mon Nov 20 17:18:45 UTC 2017
Widen fetch shader to SIMD16, enable SIMD16 types in the jitter,
and provide EXTRACT/INSERT utility functions for SIMD8 <-> SIMD16 conversion.
---
.../drivers/swr/rasterizer/jitter/builder.cpp | 20 ++++++++
.../drivers/swr/rasterizer/jitter/builder.h | 16 ++++++
.../drivers/swr/rasterizer/jitter/builder_misc.cpp | 52 ++++++++++++++++++++
.../drivers/swr/rasterizer/jitter/builder_misc.h | 9 ++++
.../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 57 ++++++++++++++++++++--
5 files changed, 151 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
index 6a33ec265f..4b83a3204c 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.cpp
@@ -41,6 +41,9 @@ namespace SwrJit
: mpJitMgr(pJitMgr)
{
mVWidth = pJitMgr->mVWidth;
+#if USE_SIMD16_BUILDER
+ mVWidth2 = pJitMgr->mVWidth * 2;
+#endif
mpIRBuilder = &pJitMgr->mBuilder;
@@ -65,17 +68,34 @@ namespace SwrJit
mSimdFP32Ty = VectorType::get(mFP32Ty, mVWidth);
mSimdVectorTy = ArrayType::get(mSimdFP32Ty, 4);
mSimdVectorTRTy = ArrayType::get(mSimdFP32Ty, 5);
+#if USE_SIMD16_BUILDER
+ mSimd2Int1Ty = VectorType::get(mInt1Ty, mVWidth2);
+ mSimd2Int16Ty = VectorType::get(mInt16Ty, mVWidth2);
+ mSimd2Int32Ty = VectorType::get(mInt32Ty, mVWidth2);
+ mSimd2Int64Ty = VectorType::get(mInt64Ty, mVWidth2);
+ mSimd2FP16Ty = VectorType::get(mFP16Ty, mVWidth2);
+ mSimd2FP32Ty = VectorType::get(mFP32Ty, mVWidth2);
+ mSimd2VectorTy = ArrayType::get(mSimd2FP32Ty, 4);
+ mSimd2VectorTRTy = ArrayType::get(mSimd2FP32Ty, 5);
+#endif
if (sizeof(uint32_t*) == 4)
{
mIntPtrTy = mInt32Ty;
mSimdIntPtrTy = mSimdInt32Ty;
+#if USE_SIMD16_BUILDER
+ mSimd2IntPtrTy = mSimd2Int32Ty;
+#endif
}
else
{
SWR_ASSERT(sizeof(uint32_t*) == 8);
+
mIntPtrTy = mInt64Ty;
mSimdIntPtrTy = mSimdInt64Ty;
+#if USE_SIMD16_BUILDER
+ mSimd2IntPtrTy = mSimd2Int64Ty;
+#endif
}
}
}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 8210e49b18..c6ab64e06e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -32,6 +32,8 @@
#include "JitManager.h"
#include "common/formats.h"
+#define USE_SIMD16_BUILDER 0
+
namespace SwrJit
{
using namespace llvm;
@@ -45,6 +47,9 @@ namespace SwrJit
IRBuilder<>* mpIRBuilder;
uint32_t mVWidth;
+#if USE_SIMD16_BUILDER
+ uint32_t mVWidth2;
+#endif
// Built in types.
Type* mVoidTy;
@@ -70,6 +75,17 @@ namespace SwrJit
Type* mSimdIntPtrTy;
Type* mSimdVectorTy;
Type* mSimdVectorTRTy;
+#if USE_SIMD16_BUILDER
+ Type* mSimd2FP16Ty;
+ Type* mSimd2FP32Ty;
+ Type* mSimd2Int1Ty;
+ Type* mSimd2Int16Ty;
+ Type* mSimd2Int32Ty;
+ Type* mSimd2Int64Ty;
+ Type* mSimd2IntPtrTy;
+ Type* mSimd2VectorTy;
+ Type* mSimd2VectorTRTy;
+#endif
#include "gen_builder.hpp"
#include "gen_builder_x86.hpp"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 9ca36b2467..daa9cb1ec1 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -231,6 +231,13 @@ namespace SwrJit
return UndefValue::get(VectorType::get(mFP32Ty, mVWidth));
}
+#if USE_SIMD16_BUILDER
+ Value *Builder::VUNDEF2_F() // returns an undef FP32 vector that is mVWidth2 lanes wide
+ {
+ return UndefValue::get(VectorType::get(mFP32Ty, mVWidth2));
+ }
+
+#endif
Value *Builder::VUNDEF(Type* t)
{
return UndefValue::get(VectorType::get(t, mVWidth));
@@ -690,6 +697,51 @@ namespace SwrJit
return vGather;
}
+#if USE_SIMD16_BUILDER
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief copy mVWidth lanes of a2 into a new mVWidth-wide FP32 vector: lanes [0, mVWidth) when imm == 0, lanes [mVWidth, 2 * mVWidth) otherwise
+ Value *Builder::EXTRACT(Value *a2, uint32_t imm)
+ {
+ const uint32_t i0 = (imm > 0) ? mVWidth : 0;
+
+ Value *result = VUNDEF_F();
+
+ for (uint32_t i = 0; i < mVWidth; i += 1)
+ {
+ Value *temp = VEXTRACT(a2, C(i0 + i));
+
+ result = VINSERT(result, temp, C(i));
+ }
+
+ return result;
+ }
+
+ //////////////////////////////////////////////////////////////////////////
+ /// @brief write the mVWidth lanes of b into a2 (bitcast to mSimd2FP32Ty) at lanes [0, mVWidth) when imm == 0, lanes [mVWidth, 2 * mVWidth) otherwise; returns the updated vector
+ Value *Builder::INSERT(Value *a2, Value * b, uint32_t imm)
+ {
+ const uint32_t i0 = (imm > 0) ? mVWidth : 0;
+
+ Value *result = BITCAST(a2, mSimd2FP32Ty);
+
+ for (uint32_t i = 0; i < mVWidth; i += 1)
+ {
+#if 1
+ if (!b->getType()->getScalarType()->isFloatTy())
+ {
+ b = BITCAST(b, mSimdFP32Ty); // NOTE(review): loop-invariant - once b is FP32 this branch is not taken again; could be hoisted above the loop
+ }
+
+#endif
+ Value *temp = VEXTRACT(b, C(i));
+
+ result = VINSERT(result, temp, C(i0 + i));
+ }
+
+ return result;
+ }
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief convert x86 <N x float> mask to llvm <N x i1> mask
Value* Builder::MASK(Value* vmask)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 662574d638..d9ff4a2156 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -56,6 +56,9 @@ Value *VIMMED1(float i);
Value *VIMMED1(bool i);
Value *VUNDEF(Type* t);
Value *VUNDEF_F();
+#if USE_SIMD16_BUILDER
+Value *VUNDEF2_F();
+#endif
Value *VUNDEF_I();
Value *VUNDEF(Type* ty, uint32_t size);
Value *VUNDEF_IPTR();
@@ -98,6 +101,12 @@ Value *VMASK(Value* mask);
/// @brief functions that build IR to call x86 intrinsics directly, or
/// emulate them with other instructions if not available on the host
//////////////////////////////////////////////////////////////////////////
+
+#if USE_SIMD16_BUILDER
+Value *EXTRACT(Value *a, uint32_t imm);
+Value *INSERT(Value *a, Value *b, uint32_t imm);
+
+#endif
Value *MASKLOADD(Value* src, Value* mask);
void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 30dbcfc8ce..062852e2d2 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -80,6 +80,9 @@ struct FetchJit : public Builder
#endif
void StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+#if USE_SIMD16_BUILDER
+ void StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4]);
+#endif
#if USE_SIMD16_SHADERS
Value* GenerateCompCtrlVector(const ComponentControl ctrl, bool useVertexID2);
@@ -137,8 +140,8 @@ Function* FetchJit::Create(const FETCH_COMPILE_STATE& fetchState)
// GEP
pVtxOut = GEP(pVtxOut, C(0));
#if USE_SIMD16_SHADERS
-#if 0
- pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth * 2), 0));
+#if 0// USE_SIMD16_BUILDER
+ pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
#else
pVtxOut = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth), 0));
#endif
@@ -1250,9 +1253,27 @@ void FetchJit::JitGatherVertices(const FETCH_COMPILE_STATE &fetchState,
if (currentVertexElement > 3)
{
+#if USE_SIMD16_BUILDER
+ Value *pVtxSrc2[4];
+
+ // pack adjacent pairs of SIMD8s into SIMD16s
+ for (uint32_t i = 0; i < 4; i += 1)
+ {
+ pVtxSrc2[i] = VUNDEF2_F();
+
+ pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements[i], 0);
+ pVtxSrc2[i] = INSERT(pVtxSrc2[i], vVertexElements2[i], 1);
+ }
+
+ // store SIMD16s
+ Value *pVtxOut2 = BITCAST(pVtxOut, PointerType::get(VectorType::get(mFP32Ty, mVWidth2), 0));
+ StoreVertexElements2(pVtxOut2, outputElt, 4, pVtxSrc2);
+
+#else
StoreVertexElements(pVtxOut, outputElt, 4, vVertexElements);
StoreVertexElements(GEP(pVtxOut, C(1)), outputElt, 4, vVertexElements2);
+#endif
outputElt += 1;
// reset to the next vVertexElement to output
@@ -2312,7 +2333,8 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con
for(uint32_t c = 0; c < numEltsToStore; ++c)
{
// STORE expects FP32 x vWidth type, just bitcast if needed
- if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy()){
+ if(!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
+ {
#if FETCH_DUMP_VERTEX
PRINT("vVertexElements[%d]: 0x%x\n", {C(c), vVertexElements[c]});
#endif
@@ -2335,6 +2357,35 @@ void FetchJit::StoreVertexElements(Value* pVtxOut, const uint32_t outputElt, con
}
}
+#if USE_SIMD16_BUILDER
+void FetchJit::StoreVertexElements2(Value* pVtxOut, const uint32_t outputElt, const uint32_t numEltsToStore, Value* (&vVertexElements)[4])
+{
+ SWR_ASSERT(numEltsToStore <= 4, "Invalid element count.");
+
+ for (uint32_t c = 0; c < numEltsToStore; ++c)
+ {
+ // STORE expects FP32 x vWidth2 type (mSimd2FP32Ty), just bitcast if needed
+ if (!vVertexElements[c]->getType()->getScalarType()->isFloatTy())
+ {
+#if FETCH_DUMP_VERTEX
+ PRINT("vVertexElements[%d]: 0x%x\n", { C(c), vVertexElements[c] });
+#endif
+ vVertexElements[c] = BITCAST(vVertexElements[c], mSimd2FP32Ty);
+ }
+#if FETCH_DUMP_VERTEX
+ else
+ {
+ PRINT("vVertexElements[%d]: %f\n", { C(c), vVertexElements[c] });
+ }
+#endif
+ // outputElt * 4 = offsetting by the size of a simdvertex (4 rows per element; presumably a SIMD16 vertex here - confirm)
+ // + c offsets to a 32bit x vWidth2 row within the current vertex
+ Value* dest = GEP(pVtxOut, C(outputElt * 4 + c), "destGEP");
+ STORE(vVertexElements[c], dest);
+ }
+}
+
+#endif
//////////////////////////////////////////////////////////////////////////
/// @brief Generates a constant vector of values based on the
/// ComponentControl value
--
2.14.1
More information about the mesa-dev
mailing list