[Mesa-dev] [PATCH 21/45] swr/rast: Lower PERMD and PERMPS to x86.
George Kyriazis
george.kyriazis at intel.com
Fri Apr 13 19:02:01 UTC 2018
Add support for providing an emulation callback function for arch/width
combinations that don't map cleanly to an x86 intrinsic.
---
.../drivers/swr/rasterizer/jitter/builder_mem.cpp | 8 +--
.../drivers/swr/rasterizer/jitter/builder_misc.cpp | 70 ----------------------
.../drivers/swr/rasterizer/jitter/builder_misc.h | 2 -
.../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 20 +++----
4 files changed, 14 insertions(+), 86 deletions(-)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index f366864..a27f02e 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -555,7 +555,7 @@ namespace SwrJit
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -565,7 +565,7 @@ namespace SwrJit
if (info.numComps > 2)
{
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
for (uint32_t i = 0; i < 4; i++)
@@ -644,7 +644,7 @@ namespace SwrJit
// 256i - 0 1 2 3 4 5 6 7
// xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
- Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
@@ -653,7 +653,7 @@ namespace SwrJit
Value* vi128ZW = nullptr;
if (info.numComps > 2)
{
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
}
// sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 54987c7..aa9e2dd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -602,76 +602,6 @@ namespace SwrJit
}
//////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPERMD operation (shuffle 32 bit integer values
- /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
- /// platform, emulate it
- /// @param a - 256bit SIMD lane(8x32bit) of integer values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
- Value *Builder::PERMD(Value* a, Value* idx)
- {
- Value* res;
- // use avx2 permute instruction if available
- if(JM()->mArch.AVX2())
- {
- res = VPERMD(a, idx);
- }
- else
- {
- if (isa<Constant>(idx))
- {
- res = VSHUFFLE(a, a, idx);
- }
- else
- {
- res = VUNDEF_I();
- for (uint32_t l = 0; l < JM()->mVWidth; ++l)
- {
- Value* pIndex = VEXTRACT(idx, C(l));
- Value* pVal = VEXTRACT(a, pIndex);
- res = VINSERT(res, pVal, C(l));
- }
- }
- }
- return res;
- }
-
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate a VPERMPS operation (shuffle 32 bit float values
- /// across 128 bit lanes) in LLVM IR. If not supported on the underlying
- /// platform, emulate it
- /// @param a - 256bit SIMD lane(8x32bit) of float values.
- /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
- Value *Builder::PERMPS(Value* a, Value* idx)
- {
- Value* res;
- // use avx2 permute instruction if available
- if (JM()->mArch.AVX2())
- {
- // llvm 3.6.0 swapped the order of the args to vpermd
- res = VPERMPS(idx, a);
- }
- else
- {
- if (isa<Constant>(idx))
- {
- res = VSHUFFLE(a, a, idx);
- }
- else
- {
- res = VUNDEF_F();
- for (uint32_t l = 0; l < JM()->mVWidth; ++l)
- {
- Value* pIndex = VEXTRACT(idx, C(l));
- Value* pVal = VEXTRACT(a, pIndex);
- res = VINSERT(res, pVal, C(l));
- }
- }
- }
-
- return res;
- }
-
- //////////////////////////////////////////////////////////////////////////
/// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
/// in LLVM IR. If not supported on the underlying platform, emulate it
/// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 343a9b0..7308821 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -113,8 +113,6 @@ Value *JOIN_16(Value *a, Value *b);
Value *PSHUFB(Value* a, Value* b);
Value *PMOVSXBD(Value* a);
Value *PMOVSXWD(Value* a);
-Value *PERMD(Value* a, Value* idx);
-Value *PERMPS(Value* a, Value* idx);
Value *CVTPH2PS(Value* a, const llvm::Twine& name = "");
Value *CVTPS2PH(Value* a, Value* rounding);
Value *PMAXSD(Value* a, Value* b);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index f9293aa..da6d982 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1955,8 +1955,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
Value *vi128XY_hi = nullptr;
if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
{
- vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
- vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+ vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
// after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
@@ -1968,8 +1968,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
Value *vi128ZW_hi = nullptr;
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
{
- vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
- vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+ vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
}
// init denormalize variables if needed
@@ -2306,8 +2306,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
- vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
@@ -2325,8 +2325,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
- vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
- vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
// init denormalize variables if needed
@@ -2547,7 +2547,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
- vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
// after PERMD: move and pack xy components into each 128bit lane
// 256i - 0 1 2 3 4 5 6 7
// xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -2557,7 +2557,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
Value* vi128ZW = nullptr;
if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
- vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+ vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
}
// init denormalize variables if needed
--
2.7.4
More information about the mesa-dev
mailing list