[Mesa-dev] [PATCH 1/3] clover+gallium: support per-kernel limits
Rob Clark
robdclark at gmail.com
Tue Apr 24 12:29:02 UTC 2018
Some limits, such as max # of threads in a work-group, vary depending
on the resources (ie. registers) used by a kernel. OpenCL provides
clGetKernelWorkGroupInfo() for querying these kernel specific limits.
To implement this properly, we need a variant of get_compute_param()
which takes the compute-state CSO as an argument.
Signed-off-by: Rob Clark <robdclark at gmail.com>
---
src/gallium/include/pipe/p_defines.h | 2 +-
src/gallium/include/pipe/p_screen.h | 22 ++++++-
src/gallium/state_trackers/clover/api/kernel.cpp | 9 ++-
src/gallium/state_trackers/clover/core/device.cpp | 10 ++++
src/gallium/state_trackers/clover/core/device.hpp | 5 ++
src/gallium/state_trackers/clover/core/kernel.cpp | 73 ++++++++++++++++++++---
src/gallium/state_trackers/clover/core/kernel.hpp | 7 ++-
7 files changed, 115 insertions(+), 13 deletions(-)
diff --git a/src/gallium/include/pipe/p_defines.h b/src/gallium/include/pipe/p_defines.h
index 2ae12f12a1e..0fa96c0d412 100644
--- a/src/gallium/include/pipe/p_defines.h
+++ b/src/gallium/include/pipe/p_defines.h
@@ -899,7 +899,7 @@ enum pipe_shader_ir
/**
* Compute-specific implementation capability. They can be queried
- * using pipe_screen::get_compute_param.
+ * using pipe_screen::get_compute_param or pipe_screen::get_kernel_param.
*/
enum pipe_compute_cap
{
diff --git a/src/gallium/include/pipe/p_screen.h b/src/gallium/include/pipe/p_screen.h
index 101e229088b..cf6049bec43 100644
--- a/src/gallium/include/pipe/p_screen.h
+++ b/src/gallium/include/pipe/p_screen.h
@@ -117,7 +117,10 @@ struct pipe_screen {
enum pipe_video_cap param );
/**
- * Query a compute-specific capability/parameter/limit.
+ * Query a compute-specific capability/parameter/limit. Some parameters
+ * may have kernel specific lower limits based on the resources used by
+ * the kernel. See pipe_screen::get_kernel_param.
+ *
* \param ir_type shader IR type for which the param applies, or don't care
* if the param is not shader related
* \param param one of PIPE_COMPUTE_CAP_x
@@ -131,6 +134,23 @@ struct pipe_screen {
enum pipe_compute_cap param,
void *ret);
+ /**
+ * Query a compute kernel-specific limit. Some parameters
+ * may have kernel specific lower limits based on the resources used by
+ * the kernel. See pipe_screen::get_compute_param.
+ *
+ * \param hwcso shader state obj (as returned by create_compute_state())
+ * for the kernel whose limit is being queried
+ * \param param one of PIPE_COMPUTE_CAP_x
+ * \param ret pointer to a preallocated buffer that will be
+ * initialized to the parameter value, or NULL.
+ * \return size in bytes of the parameter value that would be
+ * returned.
+ */
+ int (*get_kernel_param)(struct pipe_screen *, void *hwcso,
+ enum pipe_compute_cap param,
+ void *ret);
+
/**
* Query a timestamp in nanoseconds. The returned value should match
* PIPE_QUERY_TIMESTAMP. This function returns immediately and doesn't
diff --git a/src/gallium/state_trackers/clover/api/kernel.cpp b/src/gallium/state_trackers/clover/api/kernel.cpp
index b665773d9ec..60ffd01c827 100644
--- a/src/gallium/state_trackers/clover/api/kernel.cpp
+++ b/src/gallium/state_trackers/clover/api/kernel.cpp
@@ -155,9 +155,12 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
if (!count(dev, kern.program().devices()))
throw error(CL_INVALID_DEVICE);
+ /* try to ensure kernel is built for kernel specific limits: */
+ kern.build(dev);
+
switch (param) {
case CL_KERNEL_WORK_GROUP_SIZE:
- buf.as_scalar<size_t>() = dev.max_threads_per_block();
+ buf.as_scalar<size_t>() = kern.max_threads_per_block(dev);
break;
case CL_KERNEL_COMPILE_WORK_GROUP_SIZE:
@@ -169,7 +172,7 @@ clGetKernelWorkGroupInfo(cl_kernel d_kern, cl_device_id d_dev,
break;
case CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE:
- buf.as_scalar<size_t>() = dev.subgroup_size();
+ buf.as_scalar<size_t>() = kern.subgroup_size(dev);
break;
case CL_KERNEL_PRIVATE_MEM_SIZE:
@@ -262,7 +265,7 @@ namespace {
throw error(CL_INVALID_WORK_GROUP_SIZE);
if (fold(multiplies(), 1u, block_size) >
- q.device().max_threads_per_block())
+ kern.max_threads_per_block(q.device()))
throw error(CL_INVALID_WORK_GROUP_SIZE);
return block_size;
diff --git a/src/gallium/state_trackers/clover/core/device.cpp b/src/gallium/state_trackers/clover/core/device.cpp
index 70f54c9caed..97e098f65de 100644
--- a/src/gallium/state_trackers/clover/core/device.cpp
+++ b/src/gallium/state_trackers/clover/core/device.cpp
@@ -50,6 +50,16 @@ device::device(clover::platform &platform, pipe_loader_device *ldev) :
pipe->destroy(pipe);
throw error(CL_INVALID_DEVICE);
}
+
+ uint32_t shareable_shaders =
+ pipe->get_param(pipe, PIPE_CAP_SHAREABLE_SHADERS);
+
+ if (shareable_shaders) {
+ /* create dummy context to use for compiling shaders */
+ pctx = pipe->context_create(pipe, NULL, PIPE_CONTEXT_COMPUTE_ONLY);
+ } else {
+ pctx = NULL;
+ }
}
device::~device() {
diff --git a/src/gallium/state_trackers/clover/core/device.hpp b/src/gallium/state_trackers/clover/core/device.hpp
index db791e8cfbe..63cf3abccc4 100644
--- a/src/gallium/state_trackers/clover/core/device.hpp
+++ b/src/gallium/state_trackers/clover/core/device.hpp
@@ -94,6 +94,11 @@ namespace clover {
clover::platform &platform;
pipe_screen *pipe;
+ /* dummy context for compiling kernels, if the driver supports
+ * shareable compute-state CSO.
+ */
+ pipe_context *pctx;
+
private:
pipe_loader_device *ldev;
};
diff --git a/src/gallium/state_trackers/clover/core/kernel.cpp b/src/gallium/state_trackers/clover/core/kernel.cpp
index 9730450ceb9..424e44f4ab4 100644
--- a/src/gallium/state_trackers/clover/core/kernel.cpp
+++ b/src/gallium/state_trackers/clover/core/kernel.cpp
@@ -110,6 +110,11 @@ kernel::launch(command_queue &q,
exec.unbind();
}
+void
+kernel::build(const device &d) {
+ exec.bind_st(d, false);
+}
+
size_t
kernel::mem_local() const {
size_t sz = 0;
@@ -140,11 +145,41 @@ kernel::optimal_block_size(const command_queue &q,
grid_size);
}
+
+namespace {
+ template<typename T>
+ std::vector<T>
+ get_compute_param(pipe_screen *pipe, void *hwcso,
+ pipe_compute_cap cap) {
+ int sz = pipe->get_kernel_param(pipe, hwcso, cap, NULL);
+ std::vector<T> v(sz / sizeof(T));
+
+ pipe->get_kernel_param(pipe, hwcso, cap, &v.front());
+ return v;
+ }
+}
+
std::vector<size_t>
kernel::required_block_size() const {
return { 0, 0, 0 };
}
+size_t
+kernel::max_threads_per_block(const device &d) const {
+ if (!d.pipe->get_kernel_param || !exec.st)
+ return d.max_threads_per_block();
+ return get_compute_param<uint64_t>(d.pipe, exec.st,
+ PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK)[0];
+}
+
+cl_uint
+kernel::subgroup_size(const device &d) const {
+ if (!d.pipe->get_kernel_param || !exec.st)
+ return d.subgroup_size();
+ return get_compute_param<uint32_t>(d.pipe, exec.st,
+ PIPE_COMPUTE_CAP_SUBGROUP_SIZE)[0];
+}
+
kernel::argument_range
kernel::args() {
return map(derefs(), _args);
@@ -234,23 +269,47 @@ kernel::exec_context::bind(intrusive_ptr<command_queue> _q,
}
}
+ const device &d = q->device();
+ return bind_st(d, (_q != q) && !d.pctx);
+}
+
+/* Try to build compute-state CSO. If the queue is not known (ie. NULL),
+ * but the device supports shareable compute-state CSO's, then compile using
+ * the device's dummy context. This case is for clGetKernelWorkGroupInfo()
+ * where we need to compile the kernel in order to get kernel specific
+ * limits.
+ */
+void *
+kernel::exec_context::bind_st(const device &_d, bool force) {
+ pipe_context *pctx = q ? q->pipe : _d.pctx;
+ bool needs_rebuild = force || !st;
+
+ if (!pctx)
+ return NULL;
+
+ if (cs.req_input_mem != input.size())
+ needs_rebuild = true;
+
+ if (cs.req_local_mem != mem_local)
+ needs_rebuild = true;
+
// Create a new compute state if anything changed.
- if (!st || q != _q ||
- cs.req_local_mem != mem_local ||
- cs.req_input_mem != input.size()) {
+ if (needs_rebuild) {
if (st)
- _q->pipe->delete_compute_state(_q->pipe, st);
+ pctx->delete_compute_state(pctx, st);
- cs.ir_type = q->device().ir_format();
+ cs.ir_type = _d.ir_format();
if (cs.ir_type == PIPE_SHADER_IR_NIR) {
// driver takes ownership of nir_shader:
- cs.prog = nir_shader_clone(NULL, (nir_shader *)kern.nir(q->device()));
+ cs.prog = nir_shader_clone(NULL, (nir_shader *)kern.nir(_d));
} else {
+ auto &m = kern.program().build(_d).binary;
+ auto msec = find(type_equals(module::section::text_executable), m.secs);
cs.prog = &(msec.data[0]);
}
cs.req_local_mem = mem_local;
cs.req_input_mem = input.size();
- st = q->pipe->create_compute_state(q->pipe, &cs);
+ st = pctx->create_compute_state(pctx, &cs);
}
return st;
diff --git a/src/gallium/state_trackers/clover/core/kernel.hpp b/src/gallium/state_trackers/clover/core/kernel.hpp
index d60b3d6af35..54eb570de92 100644
--- a/src/gallium/state_trackers/clover/core/kernel.hpp
+++ b/src/gallium/state_trackers/clover/core/kernel.hpp
@@ -48,6 +48,7 @@ namespace clover {
void *bind(intrusive_ptr<command_queue> _q,
const std::vector<size_t> &grid_offset);
+ void *bind_st(const device &d, bool force);
void unbind();
kernel &kern;
@@ -60,9 +61,9 @@ namespace clover {
std::vector<pipe_resource *> g_buffers;
std::vector<size_t> g_handles;
size_t mem_local;
+ void *st;
private:
- void *st;
pipe_compute_state cs;
};
@@ -120,6 +121,7 @@ namespace clover {
const std::vector<size_t> &grid_offset,
const std::vector<size_t> &grid_size,
const std::vector<size_t> &block_size);
+ void build(const device &d);
size_t mem_local() const;
size_t mem_private() const;
@@ -132,6 +134,9 @@ namespace clover {
std::vector<size_t>
required_block_size() const;
+ size_t max_threads_per_block(const device &d) const;
+ cl_uint subgroup_size(const device &d) const;
+
argument_range args();
const_argument_range args() const;
--
2.14.3
More information about the mesa-dev
mailing list