[Mesa-dev] [PATCH v2 21/45] swr/rast: Lower PERMD and PERMPS to x86.

Wed Apr 18 01:32:01 UTC 2018

Add support for providing an emulation callback function for arch/width
combinations that don't map cleanly to an x86 intrinsic.
---
 .../drivers/swr/rasterizer/jitter/builder_mem.cpp  |  8 +--
 .../drivers/swr/rasterizer/jitter/builder_misc.cpp | 70 ----------------------
 .../drivers/swr/rasterizer/jitter/builder_misc.h   |  2 -
 .../drivers/swr/rasterizer/jitter/fetch_jit.cpp    | 20 +++----
 4 files changed, 14 insertions(+), 86 deletions(-)

diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index 68695c4..d8ec885 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -555,7 +555,7 @@ namespace SwrJit
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -565,7 +565,7 @@ namespace SwrJit
             if (info.numComps > 2)
             {
                 Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherInput[1], v32x8Ty), vConstMask), vGatherTy);
-                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
             }
 
             for (uint32_t i = 0; i < 4; i++)
@@ -644,7 +644,7 @@ namespace SwrJit
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx yyyy zzzz wwww xxxx yyyy zzzz wwww
 
-            Value* vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            Value* vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx dcdc dcdc yyyy yyyy dcdc dcdc (dc - don't care)
@@ -653,7 +653,7 @@ namespace SwrJit
             Value* vi128ZW = nullptr;
             if (info.numComps > 2)
             {
-                vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+                vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
             }
 
             // sign extend all enabled components. If we have a fill vVertexElements, output to current simdvertex
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 54987c7..aa9e2dd 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -602,76 +602,6 @@ namespace SwrJit
     }
 
     //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMD operation (shuffle 32 bit integer values 
-    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of integer values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMD(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if(JM()->mArch.AVX2())
-        {
-            res = VPERMD(a, idx);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_I();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-        return res;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
-    /// @brief Generate a VPERMPS operation (shuffle 32 bit float values 
-    /// across 128 bit lanes) in LLVM IR.  If not supported on the underlying 
-    /// platform, emulate it
-    /// @param a - 256bit SIMD lane(8x32bit) of float values.
-    /// @param idx - 256bit SIMD lane(8x32bit) of 3 bit lane index values
-    Value *Builder::PERMPS(Value* a, Value* idx)
-    {
-        Value* res;
-        // use avx2 permute instruction if available
-        if (JM()->mArch.AVX2())
-        {
-            // llvm 3.6.0 swapped the order of the args to vpermd
-            res = VPERMPS(idx, a);
-        }
-        else
-        {
-            if (isa<Constant>(idx))
-            {
-                res = VSHUFFLE(a, a, idx);
-            }
-            else
-            {
-                res = VUNDEF_F();
-                for (uint32_t l = 0; l < JM()->mVWidth; ++l)
-                {
-                    Value* pIndex = VEXTRACT(idx, C(l));
-                    Value* pVal = VEXTRACT(a, pIndex);
-                    res = VINSERT(res, pVal, C(l));
-                }
-            }
-        }
-
-        return res;
-    }
-
-    //////////////////////////////////////////////////////////////////////////
     /// @brief Generate a VCVTPH2PS operation (float16->float32 conversion)
     /// in LLVM IR.  If not supported on the underlying platform, emulate it
     /// @param a - 128bit SIMD lane(8x16bit) of float16 in int16 format.
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 343a9b0..7308821 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -113,8 +113,6 @@ Value *JOIN_16(Value *a, Value *b);
 Value *PSHUFB(Value* a, Value* b);
 Value *PMOVSXBD(Value* a);
 Value *PMOVSXWD(Value* a);
-Value *PERMD(Value* a, Value* idx);
-Value *PERMPS(Value* a, Value* idx);
 Value *CVTPH2PS(Value* a, const llvm::Twine& name = "");
 Value *CVTPS2PH(Value* a, Value* rounding);
 Value *PMAXSD(Value* a, Value* b);
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index f9293aa..da6d982 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1955,8 +1955,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
         Value *vi128XY_hi = nullptr;
         if (isComponentEnabled(compMask, 0) || isComponentEnabled(compMask, 1))
         {
-            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
-            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
+            vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 4, 0, 0, 1, 5, 0, 0 })), v128Ty);
 
             // after PERMD: move and pack xy and zw components in low 64 bits of each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
@@ -1968,8 +1968,8 @@ void FetchJit::Shuffle8bpcGatherd16(Shuffle8bpcArgs &args)
         Value *vi128ZW_hi = nullptr;
         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3))
         {
-            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
-            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+            vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
+            vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 2, 6, 0, 0, 3, 7, 0, 0 })), v128Ty);
         }
 
         // init denormalize variables if needed
@@ -2306,8 +2306,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            vi128XY_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-            vi128XY_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
 
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
@@ -2325,8 +2325,8 @@ void FetchJit::Shuffle16bpcGather16(Shuffle16bpcArgs &args)
             Value *vShufResult_lo = BITCAST(PSHUFB(BITCAST(vGatherResult_lo, v32x8Ty), vConstMask), vGatherTy);
             Value *vShufResult_hi = BITCAST(PSHUFB(BITCAST(vGatherResult_hi, v32x8Ty), vConstMask), vGatherTy);
 
-            vi128ZW_lo = BITCAST(PERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
-            vi128ZW_hi = BITCAST(PERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_lo = BITCAST(VPERMD(vShufResult_lo, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW_hi = BITCAST(VPERMD(vShufResult_hi, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
         }
 
         // init denormalize variables if needed
@@ -2547,7 +2547,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx yyyy yyyy xxxx xxxx yyyy yyyy
 
-            vi128XY = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128XY = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
             // after PERMD: move and pack xy components into each 128bit lane
             // 256i - 0    1    2    3    4    5    6    7
             //        xxxx xxxx xxxx xxxx yyyy yyyy yyyy yyyy
@@ -2557,7 +2557,7 @@ void FetchJit::Shuffle16bpcGather(Shuffle16bpcArgs &args)
         Value* vi128ZW = nullptr;
         if (isComponentEnabled(compMask, 2) || isComponentEnabled(compMask, 3)) {
             Value* vShufResult = BITCAST(PSHUFB(BITCAST(vGatherResult[1], v32x8Ty), vConstMask), vGatherTy);
-            vi128ZW = BITCAST(PERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
+            vi128ZW = BITCAST(VPERMD(vShufResult, C<int32_t>({ 0, 1, 4, 5, 2, 3, 6, 7 })), v128bitTy);
         }
 
         // init denormalize variables if needed
-- 
2.7.4