[Mesa-dev] [PATCH v2 10/45] swr/rast: Add autogen of helper llvm intrinsics.
George Kyriazis
george.kyriazis at intel.com
Wed Apr 18 01:31:50 UTC 2018
Replace sqrt, maskload, fp min/max, cttz, and ctlz with their LLVM
equivalents. Replace the AVX maskstore intrinsic with the LLVM masked-store
intrinsic. Add helper LLVM macros for stacksave, stackrestore, and popcnt.
---
src/gallium/drivers/swr/Makefile.am | 8 ++
src/gallium/drivers/swr/SConscript | 9 ++
src/gallium/drivers/swr/meson.build | 2 +-
.../swr/rasterizer/codegen/gen_llvm_ir_macros.py | 100 ++++++++++++++-------
.../rasterizer/codegen/templates/gen_builder.hpp | 20 ++++-
.../drivers/swr/rasterizer/jitter/builder.h | 1 +
.../drivers/swr/rasterizer/jitter/builder_mem.cpp | 50 +----------
.../drivers/swr/rasterizer/jitter/builder_mem.h | 5 --
.../drivers/swr/rasterizer/jitter/builder_misc.cpp | 13 ---
.../drivers/swr/rasterizer/jitter/builder_misc.h | 11 ---
.../drivers/swr/rasterizer/jitter/fetch_jit.cpp | 8 +-
.../drivers/swr/rasterizer/jitter/meson.build | 11 +++
.../swr/rasterizer/jitter/streamout_jit.cpp | 18 ++--
13 files changed, 130 insertions(+), 126 deletions(-)
diff --git a/src/gallium/drivers/swr/Makefile.am b/src/gallium/drivers/swr/Makefile.am
index 5ec9213..32dd9e5 100644
--- a/src/gallium/drivers/swr/Makefile.am
+++ b/src/gallium/drivers/swr/Makefile.am
@@ -81,6 +81,7 @@ BUILT_SOURCES = \
rasterizer/jitter/gen_state_llvm.h \
rasterizer/jitter/gen_builder.hpp \
rasterizer/jitter/gen_builder_x86.hpp \
+ rasterizer/jitter/gen_builder_intrin.hpp \
rasterizer/archrast/gen_ar_event.hpp \
rasterizer/archrast/gen_ar_event.cpp \
rasterizer/archrast/gen_ar_eventhandler.hpp \
@@ -140,6 +141,13 @@ rasterizer/jitter/gen_builder_x86.hpp: rasterizer/codegen/gen_llvm_ir_macros.py
--output rasterizer/jitter \
--gen_x86_h
+# Generate the LLVM-intrinsic builder wrappers by running gen_llvm_ir_macros.py with --gen_intrin_h.
+rasterizer/jitter/gen_builder_intrin.hpp: rasterizer/codegen/gen_llvm_ir_macros.py rasterizer/codegen/templates/gen_builder.hpp rasterizer/codegen/gen_common.py
+ $(MKDIR_GEN)
+ $(PYTHON_GEN) \
+ $(srcdir)/rasterizer/codegen/gen_llvm_ir_macros.py \
+ --output rasterizer/jitter \
+ --gen_intrin_h
+
rasterizer/archrast/gen_ar_event.hpp: rasterizer/codegen/gen_archrast.py rasterizer/codegen/templates/gen_ar_event.hpp rasterizer/archrast/events.proto rasterizer/archrast/events_private.proto rasterizer/codegen/gen_common.py
$(MKDIR_GEN)
$(PYTHON_GEN) \
diff --git a/src/gallium/drivers/swr/SConscript b/src/gallium/drivers/swr/SConscript
index cc4025b..5097be6 100644
--- a/src/gallium/drivers/swr/SConscript
+++ b/src/gallium/drivers/swr/SConscript
@@ -85,6 +85,15 @@ Depends('rasterizer/jitter/gen_builder.hpp',
swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
env.CodeGenerate(
+ target = 'rasterizer/jitter/gen_builder_intrin.hpp',
+ script = swrroot + 'rasterizer/codegen/gen_llvm_ir_macros.py',
+ source = '',
+ command = python_cmd + ' $SCRIPT --output ' + bldroot + '/rasterizer/jitter --gen_intrin_h'
+)
+# Regenerate the intrinsics header whenever the shared builder template changes.
+Depends('rasterizer/jitter/gen_builder_intrin.hpp',
+        swrroot + 'rasterizer/codegen/templates/gen_builder.hpp')
+
+env.CodeGenerate(
target = './gen_swr_context_llvm.h',
script = swrroot + 'rasterizer/codegen/gen_llvm_types.py',
source = 'swr_context.h',
diff --git a/src/gallium/drivers/swr/meson.build b/src/gallium/drivers/swr/meson.build
index 4bcd4f4..b28abd6 100644
--- a/src/gallium/drivers/swr/meson.build
+++ b/src/gallium/drivers/swr/meson.build
@@ -296,7 +296,7 @@ endif
libmesaswr = static_library(
'mesaswr',
[files_swr_mesa, files_swr_common, gen_knobs_h, gen_knobs_cpp,
- gen_builder_hpp, gen_builder_x86_hpp],
+ gen_builder_hpp, gen_builder_x86_hpp, gen_builder_intrin_hpp],
cpp_args : [cpp_vis_args, swr_cpp_args, swr_avx_args, swr_arch_defines],
include_directories : [inc_common, swr_incs],
dependencies : dep_llvm,
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
index 3e1fbfe..9dfc1e7 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
+++ b/src/gallium/drivers/swr/rasterizer/codegen/gen_llvm_ir_macros.py
@@ -42,32 +42,40 @@ inst_aliases = {
}
intrinsics = [
- ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
- ['VSQRTPS', 'x86_avx_sqrt_ps_256', ['a']],
- ['VRSQRTPS', 'x86_avx_rsqrt_ps_256', ['a']],
- ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
- ['VMINPS', 'x86_avx_min_ps_256', ['a', 'b']],
- ['VMAXPS', 'x86_avx_max_ps_256', ['a', 'b']],
- ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
- ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
- ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
- ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
- ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
- ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']],
- ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']],
- ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']],
- ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']],
- ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']],
- ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']],
- ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
- ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
- ['INTERRUPT', 'x86_int', ['a']],
- ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
- ]
+ ['VGATHERPD', 'x86_avx2_gather_d_pd_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
+ ['VGATHERPS', 'x86_avx2_gather_d_ps_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
+ ['VGATHERPS_16', 'x86_avx512_gather_dps_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
+ ['VGATHERDD', 'x86_avx2_gather_d_d_256', ['src', 'pBase', 'indices', 'mask', 'scale']],
+ ['VGATHERDD_16', 'x86_avx512_gather_dpi_512', ['src', 'pBase', 'indices', 'mask', 'scale']],
+ ['VRCPPS', 'x86_avx_rcp_ps_256', ['a']],
+ ['VROUND', 'x86_avx_round_ps_256', ['a', 'rounding']],
+ ['BEXTR_32', 'x86_bmi_bextr_32', ['src', 'control']],
+ ['VPSHUFB', 'x86_avx2_pshuf_b', ['a', 'b']],
+ ['VPERMD', 'x86_avx2_permd', ['a', 'idx']],
+ ['VPERMPS', 'x86_avx2_permps', ['idx', 'a']],
+ ['VCVTPD2PS', 'x86_avx_cvt_pd2_ps_256', ['a']],
+ ['VCVTPH2PS', 'x86_vcvtph2ps_256', ['a']],
+ ['VCVTPS2PH', 'x86_vcvtps2ph_256', ['a', 'round']],
+ ['VHSUBPS', 'x86_avx_hsub_ps_256', ['a', 'b']],
+ ['VPTESTC', 'x86_avx_ptestc_256', ['a', 'b']],
+ ['VPTESTZ', 'x86_avx_ptestz_256', ['a', 'b']],
+ ['VFMADDPS', 'x86_fma_vfmadd_ps_256', ['a', 'b', 'c']],
+ ['VMOVMSKPS', 'x86_avx_movmsk_ps_256', ['a']],
+ ['INTERRUPT', 'x86_int', ['a']],
+ ['VPHADDD', 'x86_avx2_phadd_d', ['a', 'b']],
+]
+
+# LLVM intrinsics wrapped by the generated gen_builder_intrin.hpp.
+# Each entry is [builder macro name, Intrinsic:: enumerator, argument names,
+# names of the arguments whose types are passed to Intrinsic::getDeclaration].
+# When the last list is empty, getDeclaration is called without a type list.
+llvm_intrinsics = [
+ ['CTTZ', 'cttz', ['a', 'flag'], ['a']],
+ ['CTLZ', 'ctlz', ['a', 'flag'], ['a']],
+ ['VSQRTPS', 'sqrt', ['a'], ['a']],
+ ['STACKSAVE', 'stacksave', [], []],
+ ['STACKRESTORE', 'stackrestore', ['a'], []],
+ ['VMINPS', 'minnum', ['a', 'b'], ['a']],
+ ['VMAXPS', 'maxnum', ['a', 'b'], ['a']],
+ ['DEBUGTRAP', 'debugtrap', [], []],
+ ['POPCNT', 'ctpop', ['a'], ['a']]
+]
this_dir = os.path.dirname(os.path.abspath(__file__))
template = os.path.join(this_dir, 'templates', 'gen_builder.hpp')
@@ -195,7 +203,7 @@ def generate_gen_h(functions, output_dir):
templfuncs.append({
'decl' : decl,
'intrin' : func['name'],
- 'args' : ', '.join(func['arg_names']),
+ 'args' : func['arg_names'],
})
MakoTemplateWriter.to_file(
@@ -205,7 +213,7 @@ def generate_gen_h(functions, output_dir):
comment='Builder IR Wrappers',
filename=filename,
functions=templfuncs,
- isX86=False)
+ isX86=False, isIntrin=False)
'''
Auto-generates macros for LLVM IR
@@ -221,8 +229,8 @@ def generate_x86_h(output_dir):
functions.append({
'decl' : 'Value* %s(%s, const llvm::Twine& name = "")' % (inst[0], declargs),
- 'args' : ', '.join(inst[2]),
'intrin' : inst[1],
+ 'args' : inst[2],
})
MakoTemplateWriter.to_file(
@@ -232,8 +240,36 @@ def generate_x86_h(output_dir):
comment='x86 intrinsics',
filename=filename,
functions=functions,
- isX86=True)
+ isX86=True, isIntrin=False)
+def generate_intrin_h(output_dir):
+    """
+    Write gen_builder_intrin.hpp into output_dir, emitting one builder
+    wrapper per entry of the llvm_intrinsics table.
+    """
+    filename = 'gen_builder_intrin.hpp'
+    output_filename = os.path.join(output_dir, filename)
+
+    def make_decl(macro_name, arg_names):
+        # Every wrapper takes a trailing, defaulted llvm::Twine name; the
+        # Value* parameters are only emitted when the intrinsic has arguments.
+        if len(arg_names) == 0:
+            return 'Value* %s(const llvm::Twine& name = "")' % (macro_name)
+        declargs = 'Value* ' + ', Value* '.join(arg_names)
+        return 'Value* %s(%s, const llvm::Twine& name = "")' % (macro_name, declargs)
+
+    functions = [
+        {
+            'decl'   : make_decl(macro_name, arg_names),
+            'intrin' : intrin,
+            'args'   : arg_names,
+            'types'  : type_args,
+        }
+        for macro_name, intrin, arg_names, type_args in llvm_intrinsics
+    ]
+
+    MakoTemplateWriter.to_file(
+        template,
+        output_filename,
+        cmdline=sys.argv,
+        comment='llvm intrinsics',
+        filename=filename,
+        functions=functions,
+        isX86=False, isIntrin=True)
'''
Function which is invoked when this script is started from a command line.
Will present and consume a set of arguments which will tell this script how
@@ -247,6 +283,7 @@ def main():
parser.add_argument('--output-dir', '-o', action='store', dest='output', help='Path to output directory', required=True)
parser.add_argument('--gen_h', help='Generate builder_gen.h', action='store_true', default=False)
parser.add_argument('--gen_x86_h', help='Generate x86 intrinsics. No input is needed.', action='store_true', default=False)
+ parser.add_argument('--gen_intrin_h', help='Generate llvm intrinsics. No input is needed.', action='store_true', default=False)
args = parser.parse_args()
if not os.path.exists(args.output):
@@ -264,6 +301,9 @@ def main():
if args.gen_x86_h:
generate_x86_h(args.output)
+ if args.gen_intrin_h:
+ generate_intrin_h(args.output)
+
if __name__ == '__main__':
main()
# END OF FILE
diff --git a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
index b6cf03e..5a47c9a 100644
--- a/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
+++ b/src/gallium/drivers/swr/rasterizer/codegen/templates/gen_builder.hpp
@@ -35,15 +35,27 @@
//============================================================================
// Auto-generated ${comment}
//============================================================================
-
%for func in functions:
+<%argList = ', '.join(func['args'])%>\
${func['decl']}
{
%if isX86:
- Function *pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
- return CALL(pFunc, std::initializer_list<Value*>{${func['args']}}, name);
+ Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
+ return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
+%elif isIntrin:
+ %if len(func['types']) != 0:
+ SmallVector<Type*, ${len(func['types'])}> args;
+ %for arg in func['types']:
+ args.push_back(${arg}->getType());
+ %endfor
+ Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']}, args);
+ return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
+ %else:
+ Function * pFunc = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::${func['intrin']});
+ return CALL(pFunc, std::initializer_list<Value*>{${argList}}, name);
+ %endif
%else:
- return IRB()->${func['intrin']}(${func['args']});
+ return IRB()->${func['intrin']}(${argList});
%endif
}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder.h b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
index 763d29f..516e872 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder.h
@@ -92,6 +92,7 @@ namespace SwrJit
#include "gen_builder.hpp"
#include "gen_builder_x86.hpp"
+#include "gen_builder_intrin.hpp"
#include "builder_misc.h"
#include "builder_math.h"
#include "builder_mem.h"
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
index cc0f897..a825434 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.cpp
@@ -129,30 +129,6 @@ namespace SwrJit
return STORE(val, GEPA(basePtr, valIndices));
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief Generate an i32 masked load operation in LLVM IR. If not
- /// supported on the underlying platform, emulate it with float masked load
- /// @param src - base address pointer for the load
- /// @param vMask - SIMD wide mask that controls whether to access memory load 0
- Value *Builder::MASKLOADD(Value* src, Value* mask)
- {
- Value* vResult;
- // use avx2 gather instruction is available
- if (JM()->mArch.AVX2())
- {
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx2_maskload_d_256);
- vResult = CALL(func, { src,mask });
- }
- else
- {
- // maskload intrinsic expects integer mask operand in llvm >= 3.8
- mask = BITCAST(mask, VectorType::get(mInt32Ty, mVWidth));
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskload_ps_256);
- vResult = BITCAST(CALL(func, { src,mask }), VectorType::get(mInt32Ty, mVWidth));
- }
- return vResult;
- }
-
Value* Builder::OFFSET_TO_NEXT_COMPONENT(Value* base, Constant *offset)
{
return GEP(base, offset);
@@ -390,9 +366,7 @@ namespace SwrJit
/// @param pVecPassthru - SIMD wide vector of values to load when lane is inactive
Value* Builder::GATHER_PTR(Value* pVecSrcPtr, Value* pVecMask, Value* pVecPassthru)
{
- Function* pMaskedGather = llvm::Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::masked_gather, { pVecPassthru->getType(), pVecSrcPtr->getType() });
-
- return CALL(pMaskedGather, { pVecSrcPtr, C(0), pVecMask, pVecPassthru });
+ return MASKED_GATHER(pVecSrcPtr, 4, pVecMask, pVecPassthru);
}
void Builder::Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
@@ -791,14 +765,11 @@ namespace SwrJit
Value* pMask = VMOVMSKPS(BITCAST(vMask, mSimdFP32Ty));
- // Get cttz function
- Function* pfnCttz = Intrinsic::getDeclaration(mpJitMgr->mpCurrentModule, Intrinsic::cttz, { mInt32Ty });
-
// Setup loop basic block
BasicBlock* pLoop = BasicBlock::Create(mpJitMgr->mContext, "Scatter_Loop", pFunc);
// compute first set bit
- Value* pIndex = CALL(pfnCttz, { pMask, C(false) });
+ Value* pIndex = CTTZ(pMask, C(false));
Value* pIsUndef = ICMP_EQ(pIndex, C(32));
@@ -835,7 +806,7 @@ namespace SwrJit
Value* pNewMask = AND(pMaskPhi, NOT(SHL(C(1), pIndexPhi)));
// Terminator
- Value* pNewIndex = CALL(pfnCttz, { pNewMask, C(false) });
+ Value* pNewIndex = CTTZ(pNewMask, C(false));
pIsUndef = ICMP_EQ(pNewIndex, C(32));
COND_BR(pIsUndef, pPostLoop, pLoop);
@@ -848,19 +819,4 @@ namespace SwrJit
IRB()->SetInsertPoint(pPostLoop, pPostLoop->begin());
}
- //////////////////////////////////////////////////////////////////////////
- /// @brief save/restore stack, providing ability to push/pop the stack and
- /// reduce overall stack requirements for temporary stack use
- Value* Builder::STACKSAVE()
- {
- Function* pfnStackSave = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stacksave);
- return CALLA(pfnStackSave);
- }
-
- void Builder::STACKRESTORE(Value* pSaved)
- {
- Function* pfnStackRestore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::stackrestore);
- CALL(pfnStackRestore, std::initializer_list<Value*>{pSaved});
- }
-
}
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
index 5a755e6..b538342 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_mem.h
@@ -60,8 +60,6 @@ LoadInst *LOADV(Value *BasePtr, const std::initializer_list<Value*> &offset, con
StoreInst *STORE(Value *Val, Value *BasePtr, const std::initializer_list<uint32_t> &offset);
StoreInst *STOREV(Value *Val, Value *BasePtr, const std::initializer_list<Value*> &offset);
-Value *MASKLOADD(Value* src, Value* mask);
-
void Gather4(const SWR_FORMAT format, Value* pSrcBase, Value* byteOffsets,
Value* mask, Value* vGatherComponents[], bool bPackedOutput, JIT_MEM_CLIENT usage = MEM_CLIENT_INTERNAL);
@@ -87,9 +85,6 @@ void SCATTERPS(Value* pDst, Value* vSrc, Value* vOffsets, Value* vMask);
void Shuffle8bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput, Value* vGatherOutput[], bool bPackedOutput);
void Shuffle16bpcGather4(const SWR_FORMAT_INFO &info, Value* vGatherInput[], Value* vGatherOutput[], bool bPackedOutput);
-Value* STACKSAVE();
-void STACKRESTORE(Value* pSaved);
-
// Static stack allocations for scatter operations
Value* pScatterStackSrc{ nullptr };
Value* pScatterStackOffsets{ nullptr };
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
index 704b0f2..c266018 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.cpp
@@ -335,13 +335,6 @@ namespace SwrJit
return CALLA(Callee, args);
}
- //////////////////////////////////////////////////////////////////////////
- Value *Builder::DEBUGTRAP()
- {
- Function *func = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::debugtrap);
- return CALL(func);
- }
-
Value *Builder::VRCP(Value *va, const llvm::Twine& name)
{
return FDIV(VIMMED1(1.0f), va, name); // 1 / a
@@ -841,12 +834,6 @@ namespace SwrJit
return vOut;
}
- Value* Builder::POPCNT(Value* a)
- {
- Function* pCtPop = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::ctpop, { a->getType() });
- return CALL(pCtPop, std::initializer_list<Value*>{a});
- }
-
//////////////////////////////////////////////////////////////////////////
/// @brief pop count on vector mask (e.g. <8 x i1>)
Value* Builder::VPOPCNT(Value* a)
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
index 549f328..343a9b0 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
+++ b/src/gallium/drivers/swr/rasterizer/jitter/builder_misc.h
@@ -124,15 +124,6 @@ Value *PMINUD(Value* a, Value* b);
Value *VABSPS(Value* a);
Value *FMADDPS(Value* a, Value* b, Value* c);
-// LLVM removed VPCMPGTD x86 intrinsic. This emulates that behavior
-Value *VPCMPGTD(Value* a, Value* b)
-{
- Value* vIndexMask = ICMP_UGT(a,b);
-
- // need to set the high bit for x86 intrinsic masks
- return S_EXT(vIndexMask,VectorType::get(mInt32Ty,JM()->mVWidth));
-}
-
Value *ICLAMP(Value* src, Value* low, Value* high, const llvm::Twine& name = "");
Value *FCLAMP(Value* src, Value* low, Value* high);
Value *FCLAMP(Value* src, float low, float high);
@@ -140,10 +131,8 @@ Value *FCLAMP(Value* src, float low, float high);
CallInst *PRINT(const std::string &printStr);
CallInst *PRINT(const std::string &printStr,const std::initializer_list<Value*> &printArgs);
-Value* POPCNT(Value* a);
Value* VPOPCNT(Value* a);
-Value* DEBUGTRAP();
Value* INT3() { return DEBUGTRAP(); }
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
index 1ee6691..5c8d813 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/fetch_jit.cpp
@@ -1884,13 +1884,11 @@ Value* FetchJit::GetSimdValid32bitIndices(Value* pIndices, Value* pLastIndex)
// vIndexMask -1-1-1-1 0 0 0 0 : offsets < max pass
// vLoadedIndices 0 1 2 3 0 0 0 0 : offsets >= max masked to 0
Value* vMaxIndex = VBROADCAST(numIndicesLeft);
- Value* vIndexMask = VPCMPGTD(vMaxIndex, vIndexOffsets);
-
- // VMASKLOAD takes an *i8 src pointer
- pIndices = BITCAST(pIndices,PointerType::get(mInt8Ty,0));
+ Value* vIndexMask = ICMP_SGT(vMaxIndex, vIndexOffsets);
// Load the indices; OOB loads 0
- return MASKLOADD(pIndices,vIndexMask);
+ pIndices = BITCAST(pIndices, PointerType::get(mSimdInt32Ty, 0));
+ return MASKED_LOAD(pIndices, 4, vIndexMask, VIMMED1(0));
}
//////////////////////////////////////////////////////////////////////////
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/meson.build b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
index dd1ddcf..4a2f46a 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/meson.build
+++ b/src/gallium/drivers/swr/rasterizer/jitter/meson.build
@@ -48,3 +48,14 @@ gen_builder_x86_hpp = custom_target(
depend_files : swr_gen_builder_depends,
)
+# Generate gen_builder_intrin.hpp by running gen_llvm_ir_macros.py with --gen_intrin_h.
+# NOTE(review): the argparse setup visible in this patch declares only
+# '--output-dir'/'-o'; '--output' presumably resolves as an abbreviation of it
+# and is then overridden by the later '--output-dir' -- confirm.
+gen_builder_intrin_hpp = custom_target(
+ 'gen_builder_intrin.hpp',
+ input : '../codegen/gen_llvm_ir_macros.py',
+ output : 'gen_builder_intrin.hpp',
+ command : [
+ prog_python2, '@INPUT0@', '--gen_intrin_h', '--output', '@OUTPUT@',
+ '--output-dir', '@OUTDIR@'
+ ],
+ depend_files : swr_gen_builder_depends,
+)
+
diff --git a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
index ac4436e..f9d8580 100644
--- a/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
+++ b/src/gallium/drivers/swr/rasterizer/jitter/streamout_jit.cpp
@@ -112,11 +112,11 @@ struct StreamOutJit : public Builder
{
if (bitmask & (1 << i))
{
- indices.push_back(C(-1.0f));
+ indices.push_back(C(true));
}
else
{
- indices.push_back(C(0.0f));
+ indices.push_back(C(false));
}
}
return ConstantVector::get(indices);
@@ -131,9 +131,6 @@ struct StreamOutJit : public Builder
// @param decl - input decl
void buildDecl(Value* pStream, Value* pOutBuffers[4], const STREAMOUT_DECL& decl)
{
- // @todo add this to x86 macros
- Function* maskStore = Intrinsic::getDeclaration(JM()->mpCurrentModule, Intrinsic::x86_avx_maskstore_ps);
-
uint32_t numComponents = _mm_popcnt_u32(decl.componentMask);
uint32_t packedMask = (1 << numComponents) - 1;
if (!decl.hole)
@@ -152,15 +149,14 @@ struct StreamOutJit : public Builder
// store to output buffer
-// cast SO buffer to i8*, needed by maskstore
+// cast SO buffer to a simd4Ty pointer, the destination type used by MASKED_STORE
- Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(mInt8Ty, 0));
+ Value* pOut = BITCAST(pOutBuffers[decl.bufferIndex], PointerType::get(simd4Ty, 0));
// cast input to <4xfloat>
Value* src = BITCAST(vpackedAttrib, simd4Ty);
- // cast mask to <4xint>
+ // cast mask to <4xi1>
Value* mask = ToMask(packedMask);
- mask = BITCAST(mask, VectorType::get(IRB()->getInt32Ty(), 4));
- CALL(maskStore, {pOut, mask, src});
+ MASKED_STORE(src, pOut, 4, mask);
}
// increment SO buffer
@@ -325,13 +321,15 @@ struct StreamOutJit : public Builder
/// @return PFN_SO_FUNC - pointer to SOS function
PFN_SO_FUNC JitStreamoutFunc(HANDLE hJitMgr, const HANDLE hFunc)
{
- const llvm::Function *func = (const llvm::Function*)hFunc;
+ llvm::Function *func = (llvm::Function*)hFunc;
JitManager* pJitMgr = reinterpret_cast<JitManager*>(hJitMgr);
PFN_SO_FUNC pfnStreamOut;
pfnStreamOut = (PFN_SO_FUNC)(pJitMgr->mpExec->getFunctionAddress(func->getName().str()));
// MCJIT finalizes modules the first time you JIT code from them. After finalized, you cannot add new IR to the module
pJitMgr->mIsModuleFinalized = true;
+ pJitMgr->DumpAsm(func, "SoFunc_optimized");
+
return pfnStreamOut;
}
--
2.7.4
More information about the mesa-dev
mailing list