[Mesa-dev] [PATCH 7/7] softpipe: add support for compute shaders.
Roland Scheidegger
sroland at vmware.com
Tue Apr 26 23:08:16 UTC 2016
Am 26.04.2016 um 23:18 schrieb Dave Airlie:
> On 27 April 2016 at 06:07, Roland Scheidegger <sroland at vmware.com> wrote:
>> Am 26.04.2016 um 06:42 schrieb Dave Airlie:
>>> From: Dave Airlie <airlied at redhat.com>
>>>
>>> This enables ARB_compute_shader on softpipe. I've only
>>> tested this with piglit so far, and I hopefully plan
>>> on integrating it with my vulkan work. I'll get to
>>> testing it with deqp more later.
>>>
>>> The basic premise is to create up to 1024 restartable
>>> TGSI machines, and execute workgroups of those machines.
>>>
>>> Signed-off-by: Dave Airlie <airlied at redhat.com>
>>> ---
>>> src/gallium/drivers/softpipe/Makefile.sources | 1 +
>>> src/gallium/drivers/softpipe/sp_compute.c | 211 +++++++++++++++++++++++++
>>> src/gallium/drivers/softpipe/sp_context.c | 3 +
>>> src/gallium/drivers/softpipe/sp_context.h | 4 +-
>>> src/gallium/drivers/softpipe/sp_screen.c | 48 +++++-
>>> src/gallium/drivers/softpipe/sp_state.h | 9 ++
>>> src/gallium/drivers/softpipe/sp_state_shader.c | 51 ++++++
>>> 7 files changed, 324 insertions(+), 3 deletions(-)
>>> create mode 100644 src/gallium/drivers/softpipe/sp_compute.c
>>>
>>> diff --git a/src/gallium/drivers/softpipe/Makefile.sources b/src/gallium/drivers/softpipe/Makefile.sources
>>> index 1d42351..d72266f 100644
>>> --- a/src/gallium/drivers/softpipe/Makefile.sources
>>> +++ b/src/gallium/drivers/softpipe/Makefile.sources
>>> @@ -4,6 +4,7 @@ C_SOURCES := \
>>> sp_clear.h \
>>> sp_context.c \
>>> sp_context.h \
>>> + sp_compute.c \
>>> sp_draw_arrays.c \
>>> sp_fence.c \
>>> sp_fence.h \
>>> diff --git a/src/gallium/drivers/softpipe/sp_compute.c b/src/gallium/drivers/softpipe/sp_compute.c
>>> new file mode 100644
>>> index 0000000..7467686
>>> --- /dev/null
>>> +++ b/src/gallium/drivers/softpipe/sp_compute.c
>>> @@ -0,0 +1,211 @@
>>> +#include "util/u_inlines.h"
>>> +#include "util/u_math.h"
>>> +#include "util/u_memory.h"
>>> +#include "util/u_pstipple.h"
>>> +#include "pipe/p_shader_tokens.h"
>>> +#include "draw/draw_context.h"
>>> +#include "draw/draw_vertex.h"
>>> +#include "sp_context.h"
>>> +#include "sp_screen.h"
>>> +#include "sp_state.h"
>>> +#include "sp_texture.h"
>>> +#include "sp_tex_sample.h"
>>> +#include "sp_tex_tile_cache.h"
>>> +#include "tgsi/tgsi_parse.h"
>>> +
>>> +static void
>>> +cs_prepare(const struct sp_compute_shader *cs,
>>> + struct tgsi_exec_machine *machine,
>>> + int w, int h, int d,
>>> + int g_w, int g_h, int g_d,
>>> + int b_w, int b_h, int b_d,
>>> + struct tgsi_sampler *sampler,
>>> + struct tgsi_image *image,
>>> + struct tgsi_buffer *buffer )
>>> +{
>>> + int j;
>>> + /*
>>> + * Bind tokens/shader to the interpreter's machine state.
>>> + */
>>> + tgsi_exec_machine_bind_shader(machine,
>>> + cs->tokens,
>>> + sampler, image, buffer);
>>> +
>>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID] != -1) {
>>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_THREAD_ID];
>>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) {
>>> + machine->SystemValue[i].xyzw[0].i[j] = w;
>>> + machine->SystemValue[i].xyzw[1].i[j] = h;
>>> + machine->SystemValue[i].xyzw[2].i[j] = d;
>>> + }
>>> + }
>>> +
>>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_GRID_SIZE] != -1) {
>>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_GRID_SIZE];
>>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) {
>>> + machine->SystemValue[i].xyzw[0].i[j] = g_w;
>>> + machine->SystemValue[i].xyzw[1].i[j] = g_h;
>>> + machine->SystemValue[i].xyzw[2].i[j] = g_d;
>>> + }
>>> + }
>>> +
>>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_SIZE] != -1) {
>>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_SIZE];
>>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) {
>>> + machine->SystemValue[i].xyzw[0].i[j] = b_w;
>>> + machine->SystemValue[i].xyzw[1].i[j] = b_h;
>>> + machine->SystemValue[i].xyzw[2].i[j] = b_d;
>>> + }
>>> + }
>>> +}
>>> +
>>> +static bool
>>> +cs_run(const struct sp_compute_shader *cs,
>>> + int g_w, int g_h, int g_d,
>>> + struct tgsi_exec_machine *machine, bool restart)
>>> +{
>>> + if (!restart) {
>>> + if (machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_ID] != -1) {
>>> + unsigned i = machine->SysSemanticToIndex[TGSI_SEMANTIC_BLOCK_ID];
>>> + int j;
>>> + for (j = 0; j < TGSI_QUAD_SIZE; j++) {
>>> + machine->SystemValue[i].xyzw[0].i[j] = g_w;
>>> + machine->SystemValue[i].xyzw[1].i[j] = g_h;
>>> + machine->SystemValue[i].xyzw[2].i[j] = g_d;
>>> + }
>>> + }
>>> + machine->NonHelperMask = (1 << 1) - 1;
>>> + }
>>> +
>>> + tgsi_exec_machine_run(machine, restart ? machine->pc : 0);
>>> +
>>> + if (machine->pc != -1)
>>> + return true;
>>> + return false;
>>> +}
>>> +
>>> +static void
>>> +run_workgroup(const struct sp_compute_shader *cs,
>>> + int g_w, int g_h, int g_d, int num_threads,
>>> + struct tgsi_exec_machine **machines)
>>> +{
>>> + int i;
>>> + bool grp_hit_barrier, restart_threads = false;
>>> +
>>> + do {
>>> + grp_hit_barrier = false;
>>> + for (i = 0; i < num_threads; i++) {
>>> + grp_hit_barrier |= cs_run(cs, g_w, g_h, g_d, machines[i], restart_threads);
>>> + }
>>> + restart_threads = false;
>>> + if (grp_hit_barrier) {
>>> + grp_hit_barrier = false;
>>> + restart_threads = true;
>>> + }
>>> + } while (restart_threads);
>>> +}
>>> +
>>> +static void
>>> +cs_delete(const struct sp_compute_shader *cs,
>>> + struct tgsi_exec_machine *machine)
>>> +{
>>> + if (machine->Tokens == cs->tokens) {
>>> + tgsi_exec_machine_bind_shader(machine, NULL, NULL, NULL, NULL);
>>> + }
>>> +}
>>> +
>>> +static void
>>> +fill_grid_size(struct pipe_context *context,
>>> + const struct pipe_grid_info *info,
>>> + uint32_t grid_size[3])
>>> +{
>>> + struct pipe_transfer *transfer;
>>> + uint32_t *params;
>>> + if (!info->indirect) {
>>> + grid_size[0] = info->grid[0];
>>> + grid_size[1] = info->grid[1];
>>> + grid_size[2] = info->grid[2];
>>> + return;
>>> + }
>>> + params = pipe_buffer_map_range(context, info->indirect,
>>> + info->indirect_offset,
>>> + 3 * sizeof(uint32_t),
>>> + PIPE_TRANSFER_READ,
>>> + &transfer);
>>> +
>>> + if (!transfer)
>>> + return;
>>> +
>>> + grid_size[0] = params[0];
>>> + grid_size[1] = params[1];
>>> + grid_size[2] = params[2];
>>> + pipe_buffer_unmap(context, transfer);
>>> +}
>>> +
>>> +void
>>> +softpipe_launch_grid(struct pipe_context *context,
>>> + const struct pipe_grid_info *info)
>>> +{
>>> + struct softpipe_context *softpipe = softpipe_context(context);
>>> + struct sp_compute_shader *cs = softpipe->cs;
>>> + int num_threads_in_group;
>>> + struct tgsi_exec_machine **machines;
>>> + int bwidth, bheight, bdepth;
>>> + int w, h, d, i;
>>> + int g_w, g_h, g_d;
>>> + uint32_t grid_size[3];
>>> + void *local_mem = NULL;
>>> +
>>> + bwidth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_WIDTH];
>>> + bheight = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_HEIGHT];
>>> + bdepth = cs->info.properties[TGSI_PROPERTY_CS_FIXED_BLOCK_DEPTH];
>>> + num_threads_in_group = bwidth * bheight * bdepth;
>>> +
>>> + fill_grid_size(context, info, grid_size);
>>> +
>>> + if (cs->shader.req_local_mem) {
>>> + local_mem = CALLOC(1, cs->shader.req_local_mem);
>>> + }
>>> +
>>> + machines = CALLOC(sizeof(struct tgsi_exec_machine *), num_threads_in_group);
>>> + if (!machines)
>>> + return;
>>> +
>>> + /* initialise machines + GRID_SIZE + THREAD_ID + BLOCK_SIZE */
>>> + for (d = 0; d < bdepth; d++) {
>>> + for (h = 0; h < bheight; h++) {
>>> + for (w = 0; w < bwidth; w++) {
>>> + int idx = w + (h * bwidth) + (d * bheight * bwidth);
>>> + machines[idx] = tgsi_exec_machine_create(PIPE_SHADER_COMPUTE);
>>> +
>>> + machines[idx]->LocalMem = local_mem;
>>> + machines[idx]->LocalMemSize = cs->shader.req_local_mem;
>>> + cs_prepare(cs, machines[idx],
>>> + w, h, d,
>>> + grid_size[0], grid_size[1], grid_size[2],
>>> + bwidth, bheight, bdepth,
>>> + (struct tgsi_sampler *)softpipe->tgsi.sampler[PIPE_SHADER_COMPUTE],
>>> + (struct tgsi_image *)softpipe->tgsi.image[PIPE_SHADER_COMPUTE],
>>> + (struct tgsi_buffer *)softpipe->tgsi.buffer[PIPE_SHADER_COMPUTE]);
>>> + tgsi_exec_set_constant_buffers(machines[idx], PIPE_MAX_CONSTANT_BUFFERS,
>>> + softpipe->mapped_constants[PIPE_SHADER_COMPUTE],
>>> + softpipe->const_buffer_size[PIPE_SHADER_COMPUTE]);
>>> + }
>>> + }
>>> + }
>>> +
>>> + for (g_d = 0; g_d < grid_size[2]; g_d++) {
>>> + for (g_h = 0; g_h < grid_size[1]; g_h++) {
>>> + for (g_w = 0; g_w < grid_size[0]; g_w++) {
>>> + run_workgroup(cs, g_w, g_h, g_d, num_threads_in_group, machines);
>>> + }
>>> + }
>>> + }
>>> +
>>> + for (i = 0; i < num_threads_in_group; i++) {
>>> + cs_delete(cs, machines[i]);
>>> + }
>>> +
>>> + FREE(local_mem);
>>> + FREE(machines);
>>> +}
>>> diff --git a/src/gallium/drivers/softpipe/sp_context.c b/src/gallium/drivers/softpipe/sp_context.c
>>> index e3ec524..1690e38 100644
>>> --- a/src/gallium/drivers/softpipe/sp_context.c
>>> +++ b/src/gallium/drivers/softpipe/sp_context.c
>>> @@ -212,6 +212,7 @@ softpipe_create_context(struct pipe_screen *screen,
>>>
>>> softpipe->dump_fs = debug_get_bool_option( "SOFTPIPE_DUMP_FS", FALSE );
>>> softpipe->dump_gs = debug_get_bool_option( "SOFTPIPE_DUMP_GS", FALSE );
>>> + softpipe->dump_cs = debug_get_bool_option( "SOFTPIPE_DUMP_CS", FALSE );
>>>
>>> softpipe->pipe.screen = screen;
>>> softpipe->pipe.destroy = softpipe_destroy;
>>> @@ -233,6 +234,8 @@ softpipe_create_context(struct pipe_screen *screen,
>>>
>>> softpipe->pipe.draw_vbo = softpipe_draw_vbo;
>>>
>>> + softpipe->pipe.launch_grid = softpipe_launch_grid;
>>> +
>>> softpipe->pipe.clear = softpipe_clear;
>>> softpipe->pipe.flush = softpipe_flush_wrapped;
>>> softpipe->pipe.texture_barrier = softpipe_texture_barrier;
>>> diff --git a/src/gallium/drivers/softpipe/sp_context.h b/src/gallium/drivers/softpipe/sp_context.h
>>> index 70d00c8..a57f587 100644
>>> --- a/src/gallium/drivers/softpipe/sp_context.h
>>> +++ b/src/gallium/drivers/softpipe/sp_context.h
>>> @@ -71,6 +71,7 @@ struct softpipe_context {
>>> struct sp_geometry_shader *gs;
>>> struct sp_velems_state *velems;
>>> struct sp_so_state *so;
>>> + struct sp_compute_shader *cs;
>>>
>>> /** Other rendering state */
>>> struct pipe_blend_color blend_color;
>>> @@ -205,10 +206,11 @@ struct softpipe_context {
>>> * XXX wouldn't it make more sense for the tile cache to just be part
>>> * of sp_sampler_view?
>>> */
>>> - struct softpipe_tex_tile_cache *tex_cache[PIPE_SHADER_GEOMETRY+1][PIPE_MAX_SHADER_SAMPLER_VIEWS];
>>> + struct softpipe_tex_tile_cache *tex_cache[PIPE_SHADER_TYPES][PIPE_MAX_SHADER_SAMPLER_VIEWS];
>>>
>>> unsigned dump_fs : 1;
>>> unsigned dump_gs : 1;
>>> + unsigned dump_cs : 1;
>>> unsigned no_rast : 1;
>>> };
>>>
>>> diff --git a/src/gallium/drivers/softpipe/sp_screen.c b/src/gallium/drivers/softpipe/sp_screen.c
>>> index d89d95c..4beeb80 100644
>>> --- a/src/gallium/drivers/softpipe/sp_screen.c
>>> +++ b/src/gallium/drivers/softpipe/sp_screen.c
>>> @@ -157,7 +157,7 @@ softpipe_get_param(struct pipe_screen *screen, enum pipe_cap param)
>>> case PIPE_CAP_QUADS_FOLLOW_PROVOKING_VERTEX_CONVENTION:
>>> return 0;
>>> case PIPE_CAP_COMPUTE:
>>> - return 0;
>>> + return 1;
>>> case PIPE_CAP_USER_VERTEX_BUFFERS:
>>> case PIPE_CAP_USER_INDEX_BUFFERS:
>>> case PIPE_CAP_USER_CONSTANT_BUFFERS:
>>> @@ -289,6 +289,8 @@ softpipe_get_shader_param(struct pipe_screen *screen, unsigned shader, enum pipe
>>> {
>>> case PIPE_SHADER_FRAGMENT:
>>> return tgsi_exec_get_shader_param(param);
>>> + case PIPE_SHADER_COMPUTE:
>>> + return tgsi_exec_get_shader_param(param);
>>> case PIPE_SHADER_VERTEX:
>>> case PIPE_SHADER_GEOMETRY:
>>> if (sp_screen->use_llvm)
>>> @@ -447,6 +449,48 @@ softpipe_get_timestamp(struct pipe_screen *_screen)
>>> return os_time_get_nano();
>>> }
>>>
>>> +static int
>>> +softpipe_get_compute_param(struct pipe_screen *_screen,
>>> + enum pipe_shader_ir ir_type,
>>> + enum pipe_compute_cap param,
>>> + void *ret)
>>> +{
>>> + switch (param) {
>>> + case PIPE_COMPUTE_CAP_IR_TARGET:
>>> + return 0;
>>> + case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
>>> + if (ret) {
>>> + uint64_t *grid_size = ret;
>>> + grid_size[0] = 65535;
>>> + grid_size[1] = 65535;
>>> + grid_size[2] = 65535;
>>> + }
>>> + return 3 * sizeof(uint64_t) ;
>>> + case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
>>> + if (ret) {
>>> + uint64_t *block_size = ret;
>>> + block_size[0] = 1024;
>>> + block_size[1] = 1024;
>>> + block_size[2] = 1024;
>>> + }
>>> + return 3 * sizeof(uint64_t);
>>> + case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
>>> + if (ret) {
>>> + uint64_t *max_threads_per_block = ret;
>>> + *max_threads_per_block = 2048;
>>> + }
>>> + return sizeof(uint64_t);
>>> + case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE:
>>> + if (ret) {
>>> + uint64_t *max_local_size = ret;
>>> + /* Value reported by the closed source driver. */
>>
>> The comment here doesn't make much sense...
>>
>> 1024 interpreted tgsi machines, all running serially - I'm sure
>> performance is going to be amazing.
>>
>> But the approach looks reasonable to me.
>>
>> I'm not really familiar with compute shaders, but what I'm wondering is
>> since tgsi exec always operates on 4 values at a time, is that somehow
>> implicit in compute shaders?
>
> So far I've set the execmask to 1 active channel; I'm contemplating
> changing that, though, and using fewer machines.
Ah yes, I think that would indeed be desirable.
>
> Any ideas how to implement this in llvm? :-) 1024 CPU threads?
I suppose 1024 is really the minimum work size you have to support?
But since things are always run 4-wide (or 8-wide) that would "only" be
256 (or 128) threads. That many threads sounds a bit suboptimal to me
(unless you really have a boatload of cpu cores), but why not - I
suppose you can always pause some of the threads, not all need to be
active at the same time.
Though I wonder what the opencl-on-cpu guys do...
Roland
More information about the mesa-dev
mailing list