[Mesa-dev] [PATCH 1/3] clover+gallium: support per-kernel limits
Rob Clark
robdclark at gmail.com
Tue Apr 24 12:29:02 UTC 2018
Some limits, such as max # of threads in a work-group, vary depending
on the resources (ie. registers) used by a kernel. OpenCL provides
clGetKernelWorkGroupInfo() for querying these kernel specific limits.
To implement this properly, we need a variant of get_compute_param()
which takes the compute-state CSO as an argument.
Signed-off-by: Rob Clark <robdclark at gmail.com>
---
src/gallium/include/pipe/p_defines.h | 2 +-
src/gallium/include/pipe/p_screen.h | 22 ++++++-
src/gallium/state_trackers/clover/api/kernel.cpp | 9 ++-
src/gallium/state_trackers/clover/core/device.cpp | 10 ++++
src/gallium/state_trackers/clover/core/device.hpp | 5 ++
src/gallium/state_trackers/clover/core/kernel.cpp | 73 ++++++++++++++++++++---
src/gallium/state_trackers/clover/core/kernel.hpp | 7 ++-
7 files changed, 115 insertions(+), 13 deletions(-)
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 2ae12f12a1e..0fa96c0d412 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -899,7 +899,7 @@ enum pipe_shader_ir
/**
* Compute-specific implementation capability. They can be queried
- * using pipe_screen::get_compute_param.
+ * using pipe_screen::get_compute_param or pipe_screen::get_kernel_param.
*/
enum pipe_compute_cap
{
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 101e229088b..cf6049bec43 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -117,7 +117,10 @@ struct pipe_screen {
enum pipe_video_cap param );
/**
- * Query a compute-specific capability/parameter/limit.
+ * Query a compute-specific capability/parameter/limit. Some parameters
+ * may have kernel specific lower limits based on the resources used by
+ * the kernel. See pipe_screen::get_kernel_param.
+ *
* \param ir_type shader IR type for which the param applies, or don't care
* if the param is not shader related
* \param param one of PIPE_COMPUTE_CAP_x
@@ -131,6 +134,23 @@ struct pipe_screen {
enum pipe_compute_cap param,
void *ret);
+ /**
+ * Query a compute kernel-specific limit. Some parameters
+ * may have kernel specific lower limits based on the resources used by
+ * the kernel. See pipe_screen::get_compute_param.
+ *
+ * \param hwcso shader state obj (as returned by create_compute_state())
+ * for the kernel whose limit is being queried
+ * \param param one of PIPE_COMPUTE_CAP_x
+ * \param ret pointer to a preallocated buffer that will be
+ * initialized to the parameter value, or NULL.
+ * \return size in bytes of the parameter value that would be
+ * returned.
+ */
+ int (*get_kernel_param)(struct pipe_screen *, void *hwcso,
+ enum pipe_compute_cap param,
+ void *ret);
+
/**
* Query a timestamp in nanoseconds. The returned value should match
* PIPE_QUERY_TIMESTAMP. This function returns immediately and doesn't
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index b665773d9ec..60ffd01c827 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -155,9 +155,12 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
if (!count(dev, kern.program().devices()))
throw error(CL_INVALID_DEVICE);
+ /* try to ensure kernel is built for kernel specific limits: */
+ kern.build(dev);
+
switch (param) {
case CL_KERNEL_WORK_GROUP_SIZE:
- buf.as_scalar<size_t>() = dev.max_threads_per_block();
+ buf.as_scalar<size_t>() = kern.max_threads_per_block(dev);
break;
case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
@@ -169,7 +172,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
break;
case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
- buf.as_scalar<size_t>() = dev.subgroup_size();
+ buf.as_scalar<size_t>() = kern.subgroup_size(dev);
break;
case CL_KERNEL_PRIVATE_MEM_SIZE:
@@ -262,7 +265,7 @@ namespace {
throw error(CL_INVALID_WORK_GROUP_SIZE);
if (fold(multiplies(), 1u, block_size) >
- q.device().max_threads_per_block())
+ kern.max_threads_per_block(q.device()))
throw error(CL_INVALID_WORK_GROUP_SIZE);
return block_size;
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 70f54c9caed..97e098f65de 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -50,6 +50,16 @@ device::device(clover::platform &platform, pipe_loader_device *ldev) :
pipe->destroy(pipe);
throw error(CL_INVALID_DEVICE);
}
+
+ uint32_t shareable_shaders =
+ pipe->get_param(pipe, PIPE_CAP_SHAREABLE_SHADERS);
+
+ if (shareable_shaders) {
+ /* create dummy context to use for compiling shaders */
+ pctx = pipe->context_create(pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
+ } else {
+ pctx = NULL;
+ }
}
device::~device() {
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index db791e8cfbe..63cf3abccc4 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -94,6 +94,11 @@ namespace clover {
clover::platform &platform;
pipe_screen *pipe;
+ /* dummy context for compiling kernels, if the driver supports
+ * shareable compute-state CSO.
+ */
+ pipe_context *pctx;
+
private:
pipe_loader_device *ldev;
};
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 9730450ceb9..424e44f4ab4 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -110,6 +110,11 @@ kernel::launch(command_queue &q,
exec.unbind();
}
+void
+kernel::build(const device &d) {
+ exec.bind_st(d, false);
+}
+
size_t
kernel::mem_local() const {
size_t sz = 0;
@@ -140,11 +145,41 @@ kernel::optimal_block_size(const command_queue &q,
grid_size);
}
+
+namespace {
+ template<typename T>
+ std::vector<T>
+ get_compute_param(pipe_screen *pipe, void *hwcso,
+ pipe_compute_cap cap) {
+ int sz = pipe->get_kernel_param(pipe, hwcso, cap, NULL);
+ std::vector<T> v(sz / sizeof(T));
+
+ pipe->get_kernel_param(pipe, hwcso, cap, &v.front());
+ return v;
+ }
+}
+
std::vector<size_t>
kernel::required_block_size() const {
return { 0, 0, 0 };
}
+size_t
+kernel::max_threads_per_block(const device &d) const {
+ if (!d.pipe->get_kernel_param || !exec.st)
+ return d.max_threads_per_block();
+ return get_compute_param<uint64_t>(d.pipe, exec.st,
+ PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK)[0];
+}
+
+cl_uint
+kernel::subgroup_size(const device &d) const {
+ if (!d.pipe->get_kernel_param || !exec.st)
+ return d.subgroup_size();
+ return get_compute_param<uint32_t>(d.pipe, exec.st,
+ PIPE_COMPUTE_CAP_SUBGROUP_SIZE)[0];
+}
+
kernel::argument_range
kernel::args() {
return map(derefs(), _args);
@@ -234,23 +269,47 @@ kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
}
}
+ const device &d = q->device();
+ return bind_st(d, (_q != q) && !d.pctx);
+}
+
+/* Try to build compute-state CSO. If the queue is not known (ie. NULL),
+ * but the device supports shareable compute-state CSO's, then compile using
+ * the device's dummy context. This case is for clGetKernelWorkGroupInfo()
+ * where we need to compile the kernel in order to get kernel specific
+ * limits.
+ */
+void *
+kernel::exec_context::bind_st(const device &_d, bool force) {
+ pipe_context *pctx = q ? q->pipe : _d.pctx;
+ bool needs_rebuild = force || !st;
+
+ if (!pctx)
+ return NULL;
+
+ if (cs.req_input_mem != input.size())
+ needs_rebuild = true;
+
+ if (cs.req_local_mem != mem_local)
+ needs_rebuild = true;
+
// Create a new compute state if anything changed.
- if (!st || q != _q ||
- cs.req_local_mem != mem_local ||
- cs.req_input_mem != input.size()) {
+ if (needs_rebuild) {
if (st)
- _q->pipe->delete_compute_state(_q->pipe, st);
+ pctx->delete_compute_state(pctx, st);
- cs.ir_type = q->device().ir_format();
+ cs.ir_type = _d.ir_format();
if (cs.ir_type == PIPE_SHADER_IR_NIR) {
// driver takes ownership of nir_shader:
- cs.prog = nir_shader_clone(NULL, (nir_shader *)kern.nir(q->device()));
+ cs.prog = nir_shader_clone(NULL, (nir_shader *)kern.nir(_d));
} else {
+ auto &m = kern.program().build(_d).binary;
+ auto msec = find(type_equals(module::section::text_executable), m.secs);
cs.prog = &(msec.data[0]);
}
cs.req_local_mem = mem_local;
cs.req_input_mem = input.size();
- st = q->pipe->create_compute_state(q->pipe, &cs);
+ st = pctx->create_compute_state(pctx, &cs);
}
return st;
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index d60b3d6af35..54eb570de92 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -48,6 +48,7 @@ namespace clover {
void *bind(intrusive_ptr<command_queue> _q,
const std::vector<size_t> &grid_offset);
+ void *bind_st(const device &d, bool force);
void unbind();
kernel &kern;
@@ -60,9 +61,9 @@ namespace clover {
std::vector<pipe_resource *> g_buffers;
std::vector<size_t> g_handles;
size_t mem_local;
+ void *st;
private:
- void *st;
pipe_compute_state cs;
};
@@ -120,6 +121,7 @@ namespace clover {
const std::vector<size_t> &grid_offset,
const std::vector<size_t> &grid_size,
const std::vector<size_t> &block_size);
+ void build(const device &d);
size_t mem_local() const;
size_t mem_private() const;
@@ -132,6 +134,9 @@ namespace clover {
std::vector<size_t>
required_block_size() const;
+ size_t max_threads_per_block(const device &d) const;
+ cl_uint subgroup_size(const device &d) const;
+
argument_range args();
const_argument_range args() const;
--
2.14.3
More information about the mesa-dev
mailing list