[Mesa-dev] [PATCH 1/5] nv50: implement basic compute support

Samuel Pitoiset samuel.pitoiset at gmail.com
Thu Nov 12 16:04:17 PST 2015


This adds the ability to launch simple compute kernels like the one I
will use to read out MP performance counters in the upcoming patch.

This compute support is based on the work that Francisco Jerez (aka curro)
did as part of his EVoC project in 2011/2012 to get OpenCL working on
Tesla. His original work can be found here:
https://github.com/curro/mesa/commits/nv50-compute

I made some improvements to the original code, such as fixing the
simultaneous use of 3D and COMPUTE, improving the binding of global
buffers, and making the code closer to what nvc0 already does. This
compute support has been tested by Pierre Moreau and myself with some
compute kernels. This is a step towards OpenCL.

On that note, it seems that compute programs overlap fragment programs
when both are in use. To fix this, we need to re-validate fragment
programs when binding compute programs and vice versa.

Note that textures, samplers and surfaces still need to be implemented.

Signed-off-by: Samuel Pitoiset <samuel.pitoiset at gmail.com>
Tested-by: Pierre Moreau <pierre.morrow at free.fr>
---
 src/gallium/drivers/nouveau/Makefile.sources       |   1 +
 .../drivers/nouveau/codegen/nv50_ir_driver.h       |   1 +
 src/gallium/drivers/nouveau/nv50/nv50_compute.c    | 332 +++++++++++++++
 .../drivers/nouveau/nv50/nv50_compute.xml.h        | 444 +++++++++++++++++++++
 src/gallium/drivers/nouveau/nv50/nv50_context.c    |  30 +-
 src/gallium/drivers/nouveau/nv50/nv50_context.h    |  23 +-
 src/gallium/drivers/nouveau/nv50/nv50_program.c    |  24 +-
 src/gallium/drivers/nouveau/nv50/nv50_program.h    |   7 +
 src/gallium/drivers/nouveau/nv50/nv50_screen.c     |  61 ++-
 src/gallium/drivers/nouveau/nv50/nv50_screen.h     |   8 +
 src/gallium/drivers/nouveau/nv50/nv50_state.c      |  99 +++++
 11 files changed, 1021 insertions(+), 9 deletions(-)
 create mode 100644 src/gallium/drivers/nouveau/nv50/nv50_compute.c
 create mode 100644 src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h

diff --git a/src/gallium/drivers/nouveau/Makefile.sources b/src/gallium/drivers/nouveau/Makefile.sources
index 83f8113..c2ff8e9 100644
--- a/src/gallium/drivers/nouveau/Makefile.sources
+++ b/src/gallium/drivers/nouveau/Makefile.sources
@@ -64,6 +64,7 @@ NV50_C_SOURCES := \
 	nv50/nv50_3ddefs.xml.h \
 	nv50/nv50_3d.xml.h \
 	nv50/nv50_blit.h \
+	nv50/nv50_compute.c \
 	nv50/nv50_context.c \
 	nv50/nv50_context.h \
 	nv50/nv50_defs.xml.h \
diff --git a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
index b49bf9d..3cb392e 100644
--- a/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
+++ b/src/gallium/drivers/nouveau/codegen/nv50_ir_driver.h
@@ -149,6 +149,7 @@ struct nv50_ir_prog_info
       } fp;
       struct {
          uint32_t inputOffset; /* base address for user args */
+         uint32_t localOffset; /* NOTE(review): comment was copied from inputOffset; presumably base address for l[] - confirm */
          uint32_t sharedOffset; /* reserved space in s[] */
          uint32_t gridInfoBase;  /* base address for NTID,NCTAID */
       } cp;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.c b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
new file mode 100644
index 0000000..e345792
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.c
@@ -0,0 +1,332 @@
+/*
+ * Copyright 2012 Francisco Jerez
+ * Copyright 2015 Samuel Pitoiset
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the
+ * next paragraph) shall be included in all copies or substantial
+ * portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+ * IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+#include "nv50/nv50_context.h"
+#include "nv50/nv50_compute.xml.h"
+
+#include "codegen/nv50_ir_driver.h"
+
+int
+nv50_screen_compute_setup(struct nv50_screen *screen,
+                          struct nouveau_pushbuf *push)
+{
+   struct nouveau_device *dev = screen->base.device;
+   struct nouveau_object *chan = screen->base.channel;
+   struct nv04_fifo *fifo = (struct nv04_fifo *)chan->data;
+   unsigned obj_class;
+   int i, ret;
+
+   switch (dev->chipset & 0xf0) {
+   case 0x50:
+   case 0x80:
+   case 0x90:
+      obj_class = NV50_COMPUTE_CLASS;
+      break;
+   case 0xa0:
+      switch (dev->chipset) {
+      case 0xa3:
+      case 0xa5:
+      case 0xa8:
+         obj_class = NVA3_COMPUTE_CLASS;
+         break;
+      default:
+         obj_class = NV50_COMPUTE_CLASS;
+         break;
+      }
+      break;
+   default:
+      NOUVEAU_ERR("unsupported chipset: NV%02x\n", dev->chipset);
+      return -1;
+   }
+
+   ret = nouveau_object_new(chan, 0xbeef50c0, obj_class, NULL, 0,
+                            &screen->compute);
+   if (ret)
+      return ret;
+
+   BEGIN_NV04(push, SUBC_COMPUTE(NV01_SUBCHAN_OBJECT), 1);
+   PUSH_DATA (push, screen->compute->handle);
+
+   BEGIN_NV04(push, NV50_COMPUTE(UNK02A0), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_STACK), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->stack_bo->offset);
+   PUSH_DATA (push, screen->stack_bo->offset);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_SIZE_LOG), 1);
+   PUSH_DATA (push, 4);
+
+   BEGIN_NV04(push, NV50_COMPUTE(UNK0290), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(LANES32_ENABLE), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(REG_MODE), 1);
+   PUSH_DATA (push, NV50_COMPUTE_REG_MODE_STRIPED);
+   BEGIN_NV04(push, NV50_COMPUTE(UNK0384), 1);
+   PUSH_DATA (push, 0x100);
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_GLOBAL), 1);
+   PUSH_DATA (push, fifo->vram);
+
+   for (i = 0; i < 15; i++) {
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(i)), 2);
+      PUSH_DATA (push, 0);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(i)), 1);
+      PUSH_DATA (push, 0);
+      BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(i)), 1);
+      PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+   }
+
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_ADDRESS_HIGH(15)), 2);
+   PUSH_DATA (push, 0);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_LIMIT(15)), 1);
+   PUSH_DATA (push, ~0);
+   BEGIN_NV04(push, NV50_COMPUTE(GLOBAL_MODE(15)), 1);
+   PUSH_DATA (push, NV50_COMPUTE_GLOBAL_MODE_LINEAR);
+
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_LOG_ALLOC), 1);
+   PUSH_DATA (push, 7);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_WARPS_NO_CLAMP), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_LOG_ALLOC), 1);
+   PUSH_DATA (push, 7);
+   BEGIN_NV04(push, NV50_COMPUTE(STACK_WARPS_NO_CLAMP), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   PUSH_DATA (push, 0);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TEXTURE), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TEX_LIMITS), 1);
+   PUSH_DATA (push, 0x54);
+   BEGIN_NV04(push, NV50_COMPUTE(LINKED_TSC), 1);
+   PUSH_DATA (push, 0);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TIC), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TIC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset);
+   PUSH_DATA (push, screen->txc->offset);
+   PUSH_DATA (push, NV50_TIC_MAX_ENTRIES - 1);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_TSC), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(TSC_ADDRESS_HIGH), 3);
+   PUSH_DATAh(push, screen->txc->offset + 65536);
+   PUSH_DATA (push, screen->txc->offset + 65536);
+   PUSH_DATA (push, NV50_TSC_MAX_ENTRIES - 1);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_CODE_CB), 1);
+   PUSH_DATA (push, fifo->vram);
+
+   BEGIN_NV04(push, NV50_COMPUTE(DMA_LOCAL), 1);
+   PUSH_DATA (push, fifo->vram);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_ADDRESS_HIGH), 2);
+   PUSH_DATAh(push, screen->tls_bo->offset + 65536);
+   PUSH_DATA (push, screen->tls_bo->offset + 65536);
+   BEGIN_NV04(push, NV50_COMPUTE(LOCAL_SIZE_LOG), 1);
+   PUSH_DATA (push, util_logbase2((screen->max_tls_space / ONE_TEMP_SIZE) * 2));
+
+   return 0;
+}
+
+static bool
+nv50_compute_validate_program(struct nv50_context *nv50)
+{
+   struct nv50_program *prog = nv50->compprog;
+
+   if (prog->mem)
+      return true;
+
+   if (!prog->translated) {
+      prog->translated = nv50_program_translate(
+         prog, nv50->screen->base.device->chipset, NULL);
+      if (!prog->translated)
+         return false;
+   }
+   if (unlikely(!prog->code_size))
+      return false;
+
+   if (likely(prog->code_size)) {
+      if (nv50_program_upload_code(nv50, prog)) {
+         struct nouveau_pushbuf *push = nv50->base.pushbuf;
+         BEGIN_NV04(push, NV50_COMPUTE(CODE_CB_FLUSH), 1);
+         PUSH_DATA (push, 0);
+         return true;
+      }
+   }
+   return false;
+}
+
+static void
+nv50_compute_validate_globals(struct nv50_context *nv50)
+{
+   unsigned i;
+
+   for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource *res = *util_dynarray_element(
+         &nv50->global_residents, struct pipe_resource *, i);
+      if (res)
+         nv50_add_bufctx_resident(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL,
+                                  nv04_resource(res), NOUVEAU_BO_RDWR);
+   }
+}
+
+static bool
+nv50_compute_state_validate(struct nv50_context *nv50)
+{
+   if (!nv50_compute_validate_program(nv50))
+      return false;
+
+   if (nv50->dirty_cp & NV50_NEW_CP_GLOBALS)
+      nv50_compute_validate_globals(nv50);
+
+   /* TODO: validate textures, samplers, surfaces */
+
+   nv50_bufctx_fence(nv50->bufctx_cp, false);
+
+   nouveau_pushbuf_bufctx(nv50->base.pushbuf, nv50->bufctx_cp);
+   if (unlikely(nouveau_pushbuf_validate(nv50->base.pushbuf)))
+      return false;
+   if (unlikely(nv50->state.flushed))
+      nv50_bufctx_fence(nv50->bufctx_cp, true);
+
+   return true;
+}
+
+static void
+nv50_compute_upload_input(struct nv50_context *nv50, const uint32_t *input)
+{
+   struct nv50_screen *screen = nv50->screen;
+   struct nouveau_pushbuf *push = screen->base.pushbuf;
+   unsigned size = align(nv50->compprog->parm_size, 0x4);
+
+   BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM_COUNT), 1);
+   PUSH_DATA (push, (size / 4) << 8);
+
+   if (size) {
+      struct nouveau_mm_allocation *mm;
+      struct nouveau_bo *bo = NULL;
+      unsigned offset;
+
+      mm = nouveau_mm_allocate(screen->base.mm_GART, size, &bo, &offset);
+      assert(mm);
+
+      nouveau_bo_map(bo, 0, screen->base.client);
+      memcpy(bo->map + offset, input, size);
+
+//      for (i = 0; i < Elements(nv50->globals); ++i) {
+//         if (nv50->globals[i].r) {
+//            struct nv04_resource *r = nv04_resource(nv50->globals[i].r);
+//            size_t reloc_off = (char *)nv50->globals[i].handle - (char *)input;
+//            uint32_t *reloc_loc = (uint32_t *)
+//               ((uint8_t *)bo->map + offset + reloc_off);
+//
+//            if (reloc_off >= 0 && reloc_off < size)
+//               *reloc_loc = r->address;
+//         }
+//      }
+
+      nouveau_bufctx_refn(nv50->bufctx, 0, bo, NOUVEAU_BO_GART | NOUVEAU_BO_RD);
+      nouveau_pushbuf_bufctx(push, nv50->bufctx);
+      nouveau_pushbuf_validate(push);
+
+      BEGIN_NV04(push, NV50_COMPUTE(USER_PARAM(0)), size / 4);
+      nouveau_pushbuf_data(push, bo, offset, size);
+
+      nouveau_fence_work(screen->base.fence.current, nouveau_mm_free_work, mm);
+      nouveau_bo_ref(NULL, &bo);
+      nouveau_bufctx_reset(nv50->bufctx, 0);
+   }
+}
+
+static uint32_t
+nv50_compute_find_symbol(struct nv50_context *nv50, uint32_t label)
+{
+   struct nv50_program *prog = nv50->compprog;
+   const struct nv50_ir_prog_symbol *syms =
+      (const struct nv50_ir_prog_symbol *)prog->cp.syms;
+   unsigned i;
+
+   for (i = 0; i < prog->cp.num_syms; ++i) {
+      if (syms[i].label == label)
+         return prog->code_base + syms[i].offset;
+   }
+   return prog->code_base; /* no symbols or symbol not found */
+}
+
+void
+nv50_launch_grid(struct pipe_context *pipe,
+                 const uint *block_layout, const uint *grid_layout,
+                 uint32_t label, const void *input)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct nouveau_pushbuf *push = nv50->base.pushbuf;
+   unsigned block_size = block_layout[0] * block_layout[1] * block_layout[2];
+   struct nv50_program *cp = nv50->compprog;
+   bool ret;
+
+   ret = !nv50_compute_state_validate(nv50);
+   if (ret) {
+      NOUVEAU_ERR("Failed to launch grid !\n");
+      return;
+   }
+
+   nv50_compute_upload_input(nv50, input);
+
+   BEGIN_NV04(push, NV50_COMPUTE(CP_START_ID), 1);
+   PUSH_DATA (push, nv50_compute_find_symbol(nv50, label));
+
+   BEGIN_NV04(push, NV50_COMPUTE(SHARED_SIZE), 1);
+   PUSH_DATA (push, align(cp->cp.smem_size + cp->parm_size + 0x10, 0x40));
+   BEGIN_NV04(push, NV50_COMPUTE(CP_REG_ALLOC_TEMP), 1);
+   PUSH_DATA (push, cp->max_gpr);
+
+   /* grid/block setup */
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_XY), 2);
+   PUSH_DATA (push, block_layout[1] << 16 | block_layout[0]);
+   PUSH_DATA (push, block_layout[2]);
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCK_ALLOC), 1);
+   PUSH_DATA (push, 1 << 16 | block_size);
+   BEGIN_NV04(push, NV50_COMPUTE(BLOCKDIM_LATCH), 1);
+   PUSH_DATA (push, 1);
+   BEGIN_NV04(push, NV50_COMPUTE(GRIDDIM), 1);
+   PUSH_DATA (push, grid_layout[1] << 16 | grid_layout[0]);
+   BEGIN_NV04(push, NV50_COMPUTE(GRIDID), 1);
+   PUSH_DATA (push, 1);
+
+   /* kernel launching */
+   BEGIN_NV04(push, NV50_COMPUTE(LAUNCH), 1);
+   PUSH_DATA (push, 0);
+   BEGIN_NV04(push, SUBC_COMPUTE(NV50_GRAPH_SERIALIZE), 1);
+   PUSH_DATA (push, 0);
+
+   /* binding a compute shader clobbers fragment shader state */
+   nv50->dirty |= NV50_NEW_FRAGPROG;
+}
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
new file mode 100644
index 0000000..268d112
--- /dev/null
+++ b/src/gallium/drivers/nouveau/nv50/nv50_compute.xml.h
@@ -0,0 +1,444 @@
+#ifndef NV50_COMPUTE_XML
+#define NV50_COMPUTE_XML
+
+/* Autogenerated file, DO NOT EDIT manually!
+
+This file was generated by the rules-ng-ng headergen tool in this git repository:
+http://github.com/envytools/envytools/
+git clone https://github.com/envytools/envytools.git
+
+The rules-ng-ng source files this header was generated from are:
+- rnndb/graph/g80_compute.xml (  14027 bytes, from 2015-02-14 02:01:36)
+- rnndb/copyright.xml         (   6456 bytes, from 2015-02-14 02:01:36)
+- rnndb/nvchipsets.xml        (   2833 bytes, from 2015-04-28 16:28:33)
+- rnndb/fifo/nv_object.xml    (  15390 bytes, from 2015-04-22 20:36:09)
+- rnndb/g80_defs.xml          (  18210 bytes, from 2015-10-19 20:49:59)
+
+Copyright (C) 2006-2015 by the following authors:
+- Artur Huillet <arthur.huillet at free.fr> (ahuillet)
+- Ben Skeggs (darktama, darktama_)
+- B. R. <koala_br at users.sourceforge.net> (koala_br)
+- Carlos Martin <carlosmn at users.sf.net> (carlosmn)
+- Christoph Bumiller <e0425955 at student.tuwien.ac.at> (calim, chrisbmr)
+- Dawid Gajownik <gajownik at users.sf.net> (gajownik)
+- Dmitry Baryshkov
+- Dmitry Eremin-Solenikov <lumag at users.sf.net> (lumag)
+- EdB <edb_ at users.sf.net> (edb_)
+- Erik Waling <erikwailing at users.sf.net> (erikwaling)
+- Francisco Jerez <currojerez at riseup.net> (curro)
+- Ilia Mirkin <imirkin at alum.mit.edu> (imirkin)
+- jb17bsome <jb17bsome at bellsouth.net> (jb17bsome)
+- Jeremy Kolb <kjeremy at users.sf.net> (kjeremy)
+- Laurent Carlier <lordheavym at gmail.com> (lordheavy)
+- Luca Barbieri <luca at luca-barbieri.com> (lb, lb1)
+- Maarten Maathuis <madman2003 at gmail.com> (stillunknown)
+- Marcin Kościelnicki <koriakin at 0x04.net> (mwk, koriakin)
+- Mark Carey <mark.carey at gmail.com> (careym)
+- Matthieu Castet <matthieu.castet at parrot.com> (mat-c)
+- nvidiaman <nvidiaman at users.sf.net> (nvidiaman)
+- Patrice Mandin <patmandin at gmail.com> (pmandin, pmdata)
+- Pekka Paalanen <pq at iki.fi> (pq, ppaalanen)
+- Peter Popov <ironpeter at users.sf.net> (ironpeter)
+- Richard Hughes <hughsient at users.sf.net> (hughsient)
+- Rudi Cilibrasi <cilibrar at users.sf.net> (cilibrar)
+- Serge Martin
+- Simon Raffeiner
+- Stephane Loeuillet <leroutier at users.sf.net> (leroutier)
+- Stephane Marchesin <stephane.marchesin at gmail.com> (marcheu)
+- sturmflut <sturmflut at users.sf.net> (sturmflut)
+- Sylvain Munaut <tnt at 246tNt.com>
+- Victor Stinner <victor.stinner at haypocalc.com> (haypo)
+- Wladmir van der Laan <laanwj at gmail.com> (miathan6)
+- Younes Manton <younes.m at gmail.com> (ymanton)
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice (including the
+next paragraph) shall be included in all copies or substantial
+portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
+IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+*/
+
+
+
+#define NV50_COMPUTE_DMA_NOTIFY					0x00000180
+
+#define NV50_COMPUTE_DMA_GLOBAL					0x000001a0
+
+#define NV50_COMPUTE_DMA_QUERY					0x000001a4
+
+#define NV50_COMPUTE_DMA_LOCAL					0x000001b8
+
+#define NV50_COMPUTE_DMA_STACK					0x000001bc
+
+#define NV50_COMPUTE_DMA_CODE_CB					0x000001c0
+
+#define NV50_COMPUTE_DMA_TSC					0x000001c4
+
+#define NV50_COMPUTE_DMA_TIC					0x000001c8
+
+#define NV50_COMPUTE_DMA_TEXTURE					0x000001cc
+
+#define NV50_COMPUTE_UNK0200					0x00000200
+#define NV50_COMPUTE_UNK0200_UNK1__MASK				0x0000ffff
+#define NV50_COMPUTE_UNK0200_UNK1__SHIFT				0
+#define NV50_COMPUTE_UNK0200_UNK2__MASK				0x00ff0000
+#define NV50_COMPUTE_UNK0200_UNK2__SHIFT				16
+
+#define NV50_COMPUTE_UNK0204					0x00000204
+
+#define NV50_COMPUTE_UNK0208					0x00000208
+
+#define NV50_COMPUTE_UNK020C					0x0000020c
+
+#define NV50_COMPUTE_CP_ADDRESS_HIGH				0x00000210
+
+#define NV50_COMPUTE_CP_ADDRESS_LOW				0x00000214
+
+#define NV50_COMPUTE_STACK_ADDRESS_HIGH				0x00000218
+
+#define NV50_COMPUTE_STACK_ADDRESS_LOW				0x0000021c
+
+#define NV50_COMPUTE_STACK_SIZE_LOG				0x00000220
+
+#define NV50_COMPUTE_CALL_LIMIT_LOG				0x00000224
+
+#define NV50_COMPUTE_UNK0228					0x00000228
+#define NV50_COMPUTE_UNK0228_UNK0				0x00000001
+#define NV50_COMPUTE_UNK0228_UNK4__MASK				0x00000ff0
+#define NV50_COMPUTE_UNK0228_UNK4__SHIFT				4
+#define NV50_COMPUTE_UNK0228_UNK12__MASK				0x000ff000
+#define NV50_COMPUTE_UNK0228_UNK12__SHIFT			12
+
+#define NV50_COMPUTE_TSC_ADDRESS_HIGH				0x0000022c
+
+#define NV50_COMPUTE_TSC_ADDRESS_LOW				0x00000230
+#define NV50_COMPUTE_TSC_ADDRESS_LOW__ALIGN			0x00000020
+
+#define NV50_COMPUTE_TSC_LIMIT					0x00000234
+#define NV50_COMPUTE_TSC_LIMIT__MAX				0x00001fff
+
+#define NV50_COMPUTE_CB_ADDR					0x00000238
+#define NV50_COMPUTE_CB_ADDR_ID__MASK				0x003fff00
+#define NV50_COMPUTE_CB_ADDR_ID__SHIFT				8
+#define NV50_COMPUTE_CB_ADDR_BUFFER__MASK			0x0000007f
+#define NV50_COMPUTE_CB_ADDR_BUFFER__SHIFT			0
+
+#define NV50_COMPUTE_CB_DATA(i0)				       (0x0000023c + 0x4*(i0))
+#define NV50_COMPUTE_CB_DATA__ESIZE				0x00000004
+#define NV50_COMPUTE_CB_DATA__LEN				0x00000010
+
+#define NV50_COMPUTE_TSC_FLUSH					0x0000027c
+#define NV50_COMPUTE_TSC_FLUSH_SPECIFIC				0x00000001
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__MASK			0x03fffff0
+#define NV50_COMPUTE_TSC_FLUSH_ENTRY__SHIFT			4
+
+#define NV50_COMPUTE_TIC_FLUSH					0x00000280
+#define NV50_COMPUTE_TIC_FLUSH_SPECIFIC				0x00000001
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__MASK			0x03fffff0
+#define NV50_COMPUTE_TIC_FLUSH_ENTRY__SHIFT			4
+
+#define NV50_COMPUTE_DELAY1					0x00000284
+
+#define NV50_COMPUTE_WATCHDOG_TIMER				0x00000288
+
+#define NV50_COMPUTE_DELAY2					0x0000028c
+
+#define NV50_COMPUTE_UNK0290					0x00000290
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_HIGH				0x00000294
+
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW				0x00000298
+#define NV50_COMPUTE_LOCAL_ADDRESS_LOW__ALIGN			0x00000100
+
+#define NV50_COMPUTE_LOCAL_SIZE_LOG				0x0000029c
+
+#define NV50_COMPUTE_UNK02A0					0x000002a0
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_HIGH				0x000002a4
+
+#define NV50_COMPUTE_CB_DEF_ADDRESS_LOW				0x000002a8
+
+#define NV50_COMPUTE_CB_DEF_SET					0x000002ac
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__MASK			0x0000ffff
+#define NV50_COMPUTE_CB_DEF_SET_SIZE__SHIFT			0
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__MASK			0x007f0000
+#define NV50_COMPUTE_CB_DEF_SET_BUFFER__SHIFT			16
+
+#define NV50_COMPUTE_UNK02B0					0x000002b0
+
+#define NV50_COMPUTE_BLOCK_ALLOC					0x000002b4
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__MASK			0x0000ffff
+#define NV50_COMPUTE_BLOCK_ALLOC_THREADS__SHIFT			0
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__MASK			0x00ff0000
+#define NV50_COMPUTE_BLOCK_ALLOC_BARRIERS__SHIFT			16
+
+#define NV50_COMPUTE_LANES32_ENABLE				0x000002b8
+
+#define NV50_COMPUTE_UNK02BC					0x000002bc
+#define NV50_COMPUTE_UNK02BC_UNK1__MASK				0x00000007
+#define NV50_COMPUTE_UNK02BC_UNK1__SHIFT				0
+#define NV50_COMPUTE_UNK02BC_UNK2__MASK				0x00000070
+#define NV50_COMPUTE_UNK02BC_UNK2__SHIFT				4
+
+#define NV50_COMPUTE_CP_REG_ALLOC_TEMP				0x000002c0
+
+#define NV50_COMPUTE_TIC_ADDRESS_HIGH				0x000002c4
+
+#define NV50_COMPUTE_TIC_ADDRESS_LOW				0x000002c8
+
+#define NV50_COMPUTE_TIC_LIMIT					0x000002cc
+
+#define NV50_COMPUTE_MP_PM_SET(i0)			       (0x000002d0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_SET__ESIZE				0x00000004
+#define NV50_COMPUTE_MP_PM_SET__LEN				0x00000004
+
+#define NV50_COMPUTE_MP_PM_CONTROL(i0)			       (0x000002e0 + 0x4*(i0))
+#define NV50_COMPUTE_MP_PM_CONTROL__ESIZE			0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL__LEN				0x00000004
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__MASK			0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE__SHIFT			0
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP			0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_MODE_LOGOP_PULSE		0x00000001
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__MASK			0x00000070
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT__SHIFT			4
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK0			0x00000000
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK1			0x00000010
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK2			0x00000020
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK3			0x00000030
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK4			0x00000040
+#define NV50_COMPUTE_MP_PM_CONTROL_UNIT_UNK5			0x00000050
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__MASK			0x00ffff00
+#define NV50_COMPUTE_MP_PM_CONTROL_FUNC__SHIFT			8
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__MASK			0xff000000
+#define NV50_COMPUTE_MP_PM_CONTROL_SIG__SHIFT			24
+
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE			0x000002f0
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_0		0x00000001
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_1		0x00000002
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_2		0x00000004
+#define NV50_COMPUTE_MP_PM_OVERFLOW_TRAP_ENABLE_3		0x00000008
+
+#define NV50_COMPUTE_UNK02F4					0x000002f4
+
+#define NV50_COMPUTE_BLOCKDIM_LATCH				0x000002f8
+
+#define NV50_COMPUTE_LOCAL_WARPS_LOG_ALLOC			0x000002fc
+
+#define NV50_COMPUTE_LOCAL_WARPS_NO_CLAMP			0x00000300
+
+#define NV50_COMPUTE_STACK_WARPS_LOG_ALLOC			0x00000304
+
+#define NV50_COMPUTE_STACK_WARPS_NO_CLAMP			0x00000308
+
+#define NV50_COMPUTE_UNK030C					0x0000030c
+
+#define NV50_COMPUTE_QUERY_ADDRESS_HIGH				0x00000310
+
+#define NV50_COMPUTE_QUERY_ADDRESS_LOW				0x00000314
+
+#define NV50_COMPUTE_QUERY_SEQUENCE				0x00000318
+
+#define NV50_COMPUTE_QUERY_GET					0x0000031c
+#define NV50_COMPUTE_QUERY_GET_INTR				0x00000200
+#define NV50_COMPUTE_QUERY_GET_SHORT				0x00008000
+
+#define NV50_COMPUTE_COND_ADDRESS_HIGH				0x00000320
+
+#define NV50_COMPUTE_COND_ADDRESS_LOW				0x00000324
+
+#define NV50_COMPUTE_COND_MODE					0x00000328
+#define NV50_COMPUTE_COND_MODE_NEVER				0x00000000
+#define NV50_COMPUTE_COND_MODE_ALWAYS				0x00000001
+#define NV50_COMPUTE_COND_MODE_RES_NON_ZERO			0x00000002
+#define NV50_COMPUTE_COND_MODE_EQUAL				0x00000003
+#define NV50_COMPUTE_COND_MODE_NOT_EQUAL				0x00000004
+
+#define NV50_COMPUTE_UNK032C					0x0000032c
+
+#define NV50_COMPUTE_UNK0330					0x00000330
+
+#define NV50_COMPUTE_UNK0334(i0)				       (0x00000334 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0334__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0334__LEN				0x00000003
+
+#define NV50_COMPUTE_UNK0340(i0)				       (0x00000340 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0340__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0340__LEN				0x00000002
+
+#define NV50_COMPUTE_UNK0348(i0)				       (0x00000348 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0348__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0348__LEN				0x00000002
+
+#define NV50_COMPUTE_UNK0350(i0)				       (0x00000350 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0350__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0350__LEN				0x00000002
+
+#define NV50_COMPUTE_UNK0358					0x00000358
+
+#define NV50_COMPUTE_UNK035C					0x0000035c
+
+#define NV50_COMPUTE_UNK0360					0x00000360
+#define NV50_COMPUTE_UNK0360_UNK0__MASK				0x000000f0
+#define NV50_COMPUTE_UNK0360_UNK0__SHIFT				4
+#define NV50_COMPUTE_UNK0360_UNK1__MASK				0x00000f00
+#define NV50_COMPUTE_UNK0360_UNK1__SHIFT				8
+
+#define NV50_COMPUTE_UNK0364					0x00000364
+
+#define NV50_COMPUTE_LAUNCH					0x00000368
+
+#define NV50_COMPUTE_UNK036C					0x0000036c
+
+#define NV50_COMPUTE_UNK0370					0x00000370
+
+#define NV50_COMPUTE_USER_PARAM_COUNT				0x00000374
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__MASK			0x000000ff
+#define NV50_COMPUTE_USER_PARAM_COUNT_UNK0__SHIFT		0
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MASK		0x0000ff00
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__SHIFT		8
+#define NV50_COMPUTE_USER_PARAM_COUNT_COUNT__MAX			0x00000040
+
+#define NV50_COMPUTE_LINKED_TSC					0x00000378
+
+#define NV50_COMPUTE_UNK037C					0x0000037c
+#define NV50_COMPUTE_UNK037C_ALWAYS_DERIV			0x00000001
+#define NV50_COMPUTE_UNK037C_UNK16				0x00010000
+
+#define NV50_COMPUTE_CODE_CB_FLUSH				0x00000380
+
+#define NV50_COMPUTE_UNK0384					0x00000384
+
+#define NV50_COMPUTE_GRIDID					0x00000388
+
+#define NV50_COMPUTE_UNK038C(i0)				       (0x0000038c + 0x4*(i0))
+#define NV50_COMPUTE_UNK038C__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK038C__LEN				0x00000003
+
+#define NV50_COMPUTE_WRCACHE_FLUSH				0x00000398
+
+#define NV50_COMPUTE_UNK039C(i0)				       (0x0000039c + 0x4*(i0))
+#define NV50_COMPUTE_UNK039C__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK039C__LEN				0x00000002
+
+#define NV50_COMPUTE_GRIDDIM					0x000003a4
+#define NV50_COMPUTE_GRIDDIM_X__MASK				0x0000ffff
+#define NV50_COMPUTE_GRIDDIM_X__SHIFT				0
+#define NV50_COMPUTE_GRIDDIM_Y__MASK				0xffff0000
+#define NV50_COMPUTE_GRIDDIM_Y__SHIFT				16
+
+#define NV50_COMPUTE_SHARED_SIZE					0x000003a8
+#define NV50_COMPUTE_SHARED_SIZE__MAX				0x00004000
+#define NV50_COMPUTE_SHARED_SIZE__ALIGN				0x00000040
+
+#define NV50_COMPUTE_BLOCKDIM_XY					0x000003ac
+#define NV50_COMPUTE_BLOCKDIM_XY_X__MASK				0x0000ffff
+#define NV50_COMPUTE_BLOCKDIM_XY_X__SHIFT			0
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__MASK				0xffff0000
+#define NV50_COMPUTE_BLOCKDIM_XY_Y__SHIFT			16
+
+#define NV50_COMPUTE_BLOCKDIM_Z					0x000003b0
+#define NV50_COMPUTE_BLOCKDIM_Z__MIN				0x00000001
+#define NV50_COMPUTE_BLOCKDIM_Z__MAX				0x00000040
+
+#define NV50_COMPUTE_CP_START_ID					0x000003b4
+
+#define NV50_COMPUTE_REG_MODE					0x000003b8
+#define NV50_COMPUTE_REG_MODE_PACKED				0x00000001
+#define NV50_COMPUTE_REG_MODE_STRIPED				0x00000002
+
+#define NV50_COMPUTE_TEX_LIMITS					0x000003bc
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MASK		0x0000000f
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__SHIFT		0
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MIN		0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_SAMPLERS_LOG2__MAX		0x00000004
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MASK		0x000000f0
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__SHIFT		4
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MIN		0x00000000
+#define NV50_COMPUTE_TEX_LIMITS_TEXTURES_LOG2__MAX		0x00000007
+
+#define NV50_COMPUTE_BIND_TSC					0x000003c0
+#define NV50_COMPUTE_BIND_TSC_VALID				0x00000001
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__MASK			0x000000f0
+#define NV50_COMPUTE_BIND_TSC_SAMPLER__SHIFT			4
+#define NV50_COMPUTE_BIND_TSC_TSC__MASK				0x001ff000
+#define NV50_COMPUTE_BIND_TSC_TSC__SHIFT				12
+
+#define NV50_COMPUTE_BIND_TIC					0x000003c4
+#define NV50_COMPUTE_BIND_TIC_VALID				0x00000001
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__MASK			0x000001fe
+#define NV50_COMPUTE_BIND_TIC_TEXTURE__SHIFT			1
+#define NV50_COMPUTE_BIND_TIC_TIC__MASK				0x7ffffe00
+#define NV50_COMPUTE_BIND_TIC_TIC__SHIFT				9
+
+#define NV50_COMPUTE_SET_PROGRAM_CB				0x000003c8
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__MASK			0x00000f00
+#define NV50_COMPUTE_SET_PROGRAM_CB_INDEX__SHIFT			8
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__MASK			0x0007f000
+#define NV50_COMPUTE_SET_PROGRAM_CB_BUFFER__SHIFT		12
+#define NV50_COMPUTE_SET_PROGRAM_CB_VALID			0x000000ff
+
+#define NV50_COMPUTE_UNK03CC					0x000003cc
+
+#define NV50_COMPUTE_TEX_CACHE_CTL				0x000003d0
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__MASK			0x00000030
+#define NV50_COMPUTE_TEX_CACHE_CTL_UNK1__SHIFT			4
+
+#define NV50_COMPUTE_UNK03D4					0x000003d4
+
+#define NV50_COMPUTE_UNK03D8					0x000003d8
+
+#define NV50_COMPUTE_UNK03DC					0x000003dc
+
+#define NV50_COMPUTE_UNK03E0					0x000003e0
+
+#define NV50_COMPUTE_UNK03E4					0x000003e4
+
+#define NVA3_COMPUTE_TEX_MISC					0x000003e8
+#define NVA3_COMPUTE_TEX_MISC_UNK1				0x00000001
+#define NVA3_COMPUTE_TEX_MISC_SEAMLESS_CUBE_MAP		0x00000002
+
+#define NV50_COMPUTE_GLOBAL(i0)				       (0x00000400 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL__ESIZE				0x00000020
+#define NV50_COMPUTE_GLOBAL__LEN					0x00000010
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_HIGH(i0)		       (0x00000400 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_ADDRESS_LOW(i0)		       (0x00000404 + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_PITCH(i0)			       (0x00000408 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_PITCH__MAX				0x00800000
+#define NV50_COMPUTE_GLOBAL_PITCH__ALIGN				0x00000100
+
+#define NV50_COMPUTE_GLOBAL_LIMIT(i0)			       (0x0000040c + 0x20*(i0))
+
+#define NV50_COMPUTE_GLOBAL_MODE(i0)			       (0x00000410 + 0x20*(i0))
+#define NV50_COMPUTE_GLOBAL_MODE_LINEAR				0x00000001
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__MASK			0x000000f0
+#define NV50_COMPUTE_GLOBAL_MODE_UNK1__SHIFT			4
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__MASK			0x00000f00
+#define NV50_COMPUTE_GLOBAL_MODE_TILE_MODE__SHIFT		8
+
+#define NV50_COMPUTE_USER_PARAM(i0)			       (0x00000600 + 0x4*(i0))
+#define NV50_COMPUTE_USER_PARAM__ESIZE				0x00000004
+#define NV50_COMPUTE_USER_PARAM__LEN				0x00000040
+
+#define NV50_COMPUTE_UNK0700(i0)				       (0x00000700 + 0x4*(i0))
+#define NV50_COMPUTE_UNK0700__ESIZE				0x00000004
+#define NV50_COMPUTE_UNK0700__LEN				0x00000010
+
+
+#endif /* NV50_COMPUTE_XML */
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.c b/src/gallium/drivers/nouveau/nv50/nv50_context.c
index 7867c2d..f645a4d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.c
@@ -113,6 +113,7 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
 
    nouveau_bufctx_del(&nv50->bufctx_3d);
    nouveau_bufctx_del(&nv50->bufctx);
+   nouveau_bufctx_del(&nv50->bufctx_cp);
 
    util_unreference_framebuffer_state(&nv50->framebuffer);
 
@@ -131,6 +132,14 @@ nv50_context_unreference_resources(struct nv50_context *nv50)
          if (!nv50->constbuf[s][i].user)
             pipe_resource_reference(&nv50->constbuf[s][i].u.buf, NULL);
    }
+
+   for (i = 0; i < nv50->global_residents.size / sizeof(struct pipe_resource *);
+        ++i) {
+      struct pipe_resource **res = util_dynarray_element(
+         &nv50->global_residents, struct pipe_resource *, i);
+      pipe_resource_reference(res, NULL);
+   }
+   util_dynarray_fini(&nv50->global_residents);
 }
 
 static void
@@ -263,10 +272,13 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    nv50->base.pushbuf = screen->base.pushbuf;
    nv50->base.client = screen->base.client;
 
-   ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_COUNT,
-                            &nv50->bufctx_3d);
+   ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+   if (!ret)
+      ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_3D_COUNT,
+                               &nv50->bufctx_3d);
    if (!ret)
-      ret = nouveau_bufctx_new(screen->base.client, 2, &nv50->bufctx);
+      ret = nouveau_bufctx_new(screen->base.client, NV50_BIND_CP_COUNT,
+                               &nv50->bufctx_cp);
    if (ret)
       goto out_err;
 
@@ -290,6 +302,7 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
 
    pipe->draw_vbo = nv50_draw_vbo;
    pipe->clear = nv50_clear;
+   pipe->launch_grid = nv50_launch_grid;
 
    pipe->flush = nv50_flush;
    pipe->texture_barrier = nv50_texture_barrier;
@@ -335,19 +348,30 @@ nv50_create(struct pipe_screen *pscreen, void *priv, unsigned ctxflags)
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->uniforms);
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->txc);
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->stack_bo);
+   if (screen->compute) {
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->code);
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->txc);
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->stack_bo);
+   }
 
    flags = NOUVEAU_BO_GART | NOUVEAU_BO_WR;
 
    BCTX_REFN_bo(nv50->bufctx_3d, SCREEN, flags, screen->fence.bo);
    BCTX_REFN_bo(nv50->bufctx, FENCE, flags, screen->fence.bo);
+   if (screen->compute)
+      BCTX_REFN_bo(nv50->bufctx_cp, CP_SCREEN, flags, screen->fence.bo);
 
    nv50->base.scratch.bo_size = 2 << 20;
 
+   util_dynarray_init(&nv50->global_residents);
+
    return pipe;
 
 out_err:
    if (nv50->bufctx_3d)
       nouveau_bufctx_del(&nv50->bufctx_3d);
+   if (nv50->bufctx_cp)
+      nouveau_bufctx_del(&nv50->bufctx_cp);
    if (nv50->bufctx)
       nouveau_bufctx_del(&nv50->bufctx);
    FREE(nv50->blit);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_context.h b/src/gallium/drivers/nouveau/nv50/nv50_context.h
index fb74a97..fbafe02 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_context.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_context.h
@@ -49,6 +49,10 @@
 #define NV50_NEW_MIN_SAMPLES  (1 << 22)
 #define NV50_NEW_CONTEXT      (1 << 31)
 
+#define NV50_NEW_CP_PROGRAM   (1 << 0)
+#define NV50_NEW_CP_GLOBALS   (1 << 1)
+
+/* 3d bufctx (during draw_vbo, blit_3d) */
 #define NV50_BIND_FB          0
 #define NV50_BIND_VERTEX      1
 #define NV50_BIND_VERTEX_TMP  2
@@ -58,7 +62,14 @@
 #define NV50_BIND_SO         53
 #define NV50_BIND_SCREEN     54
 #define NV50_BIND_TLS        55
-#define NV50_BIND_COUNT      56
+#define NV50_BIND_3D_COUNT   56
+
+/* compute bufctx (during launch_grid) */
+#define NV50_BIND_CP_GLOBAL   0
+#define NV50_BIND_CP_SCREEN   1
+#define NV50_BIND_CP_COUNT    2
+
+/* bufctx for other operations */
 #define NV50_BIND_2D          0
 #define NV50_BIND_M2MF        0
 #define NV50_BIND_FENCE       1
@@ -101,8 +112,10 @@ struct nv50_context {
 
    struct nouveau_bufctx *bufctx_3d;
    struct nouveau_bufctx *bufctx;
+   struct nouveau_bufctx *bufctx_cp;
 
    uint32_t dirty;
+   uint32_t dirty_cp; /* dirty flags for compute state */
    bool cb_dirty;
 
    struct nv50_graph_state state;
@@ -115,6 +128,7 @@ struct nv50_context {
    struct nv50_program *vertprog;
    struct nv50_program *gmtyprog;
    struct nv50_program *fragprog;
+   struct nv50_program *compprog;
 
    struct nv50_constbuf constbuf[3][NV50_MAX_PIPE_CONSTBUFS];
    uint16_t constbuf_dirty[3];
@@ -163,6 +177,8 @@ struct nv50_context {
    uint32_t cond_condmode; /* the calculated condition */
 
    struct nv50_blitctx *blit;
+
+   struct util_dynarray global_residents;
 };
 
 static inline struct nv50_context *
@@ -302,4 +318,9 @@ struct pipe_video_buffer *
 nv98_video_buffer_create(struct pipe_context *pipe,
                          const struct pipe_video_buffer *template);
 
+/* nv50_compute.c */
+void
+nv50_launch_grid(struct pipe_context *, const uint *, const uint *,
+                 uint32_t, const void *);
+
 #endif
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.c b/src/gallium/drivers/nouveau/nv50/nv50_program.c
index 89e7a33..0e5b402 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.c
@@ -259,6 +259,8 @@ nv50_program_assign_varying_slots(struct nv50_ir_prog_info *info)
       return nv50_vertprog_assign_slots(info);
    case PIPE_SHADER_FRAGMENT:
       return nv50_fragprog_assign_slots(info);
+   case PIPE_SHADER_COMPUTE:
+      return 0;
    default:
       return -1;
    }
@@ -355,6 +357,11 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
    prog->gp.has_layer = 0;
    prog->gp.has_viewport = 0;
 
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      info->prop.cp.inputOffset = 0x10;
+      info->prop.cp.localOffset = 0x10 + prog->parm_size;
+   }
+
    info->driverPriv = prog;
 
 #ifdef DEBUG
@@ -401,6 +408,10 @@ nv50_program_translate(struct nv50_program *prog, uint16_t chipset,
          break;
       }
       prog->gp.vert_count = info->prop.gp.maxVertices;
+   } else
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      prog->cp.syms = info->bin.syms;
+      prog->cp.num_syms = info->bin.numSyms;
    }
 
    if (prog->pipe.stream_output.num_outputs)
@@ -423,11 +434,13 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
    struct nouveau_heap *heap;
    int ret;
    uint32_t size = align(prog->code_size, 0x40);
+   uint8_t prog_type;
 
    switch (prog->type) {
    case PIPE_SHADER_VERTEX:   heap = nv50->screen->vp_code_heap; break;
    case PIPE_SHADER_GEOMETRY: heap = nv50->screen->gp_code_heap; break;
    case PIPE_SHADER_FRAGMENT: heap = nv50->screen->fp_code_heap; break;
+   case PIPE_SHADER_COMPUTE:  heap = nv50->screen->fp_code_heap; break;
    default:
       assert(!"invalid program type");
       return false;
@@ -450,7 +463,14 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
          return false;
       }
    }
-   prog->code_base = prog->mem->start;
+
+   if (prog->type == PIPE_SHADER_COMPUTE) {
+      /* CP code must be uploaded in FP code segment. */
+      prog_type = 1;
+   } else {
+      prog->code_base = prog->mem->start;
+      prog_type = prog->type;
+   }
 
    ret = nv50_tls_realloc(nv50->screen, prog->tls_space);
    if (ret < 0) {
@@ -468,7 +488,7 @@ nv50_program_upload_code(struct nv50_context *nv50, struct nv50_program *prog)
                             false /* flatshade */);
 
    nv50_sifc_linear_u8(&nv50->base, nv50->screen->code,
-                       (prog->type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
+                       (prog_type << NV50_CODE_BO_SIZE_LOG2) + prog->code_base,
                        NOUVEAU_BO_VRAM, prog->code_size, prog->code);
 
    BEGIN_NV04(nv50->base.pushbuf, NV50_3D(CODE_CB_FLUSH), 1);
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_program.h b/src/gallium/drivers/nouveau/nv50/nv50_program.h
index 7a33eb1..f001670 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_program.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_program.h
@@ -98,6 +98,13 @@ struct nv50_program {
       ubyte viewportid; /* hw value of viewport index output */
    } gp;
 
+   struct {
+      uint32_t lmem_size; /* local memory (TGSI PRIVATE resource) size */
+      uint32_t smem_size; /* shared memory (TGSI LOCAL resource) size */
+      void *syms;
+      unsigned num_syms;
+   } cp;
+
    void *fixups; /* relocation records */
    void *interps; /* interpolation records */
 
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.c b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
index f47e998..0142e86 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.c
@@ -41,8 +41,6 @@
 
 #define THREADS_IN_WARP 32
 
-#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
-
 static boolean
 nv50_screen_is_format_supported(struct pipe_screen *pscreen,
                                 enum pipe_format format,
@@ -183,6 +181,7 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
    case PIPE_CAP_SHAREABLE_SHADERS:
    case PIPE_CAP_CLEAR_TEXTURE:
+   case PIPE_CAP_COMPUTE:
       return 1;
    case PIPE_CAP_SEAMLESS_CUBE_MAP:
       return 1; /* class_3d >= NVA0_3D_CLASS; */
@@ -212,7 +211,6 @@ nv50_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
    case PIPE_CAP_FAKE_SW_MSAA:
    case PIPE_CAP_TEXTURE_GATHER_OFFSETS:
    case PIPE_CAP_TGSI_VS_WINDOW_SPACE_POSITION:
-   case PIPE_CAP_COMPUTE:
    case PIPE_CAP_DRAW_INDIRECT:
    case PIPE_CAP_VERTEXID_NOBASE:
    case PIPE_CAP_MULTISAMPLE_Z_RESOLVE: /* potentially supported on some hw */
@@ -251,6 +249,7 @@ nv50_screen_get_shader_param(struct pipe_screen *pscreen, unsigned shader,
    case PIPE_SHADER_VERTEX:
    case PIPE_SHADER_GEOMETRY:
    case PIPE_SHADER_FRAGMENT:
+   case PIPE_SHADER_COMPUTE:
       break;
    default:
       return 0;
@@ -336,6 +335,52 @@ nv50_screen_get_paramf(struct pipe_screen *pscreen, enum pipe_capf param)
    return 0.0f;
 }
 
+/* Report a compute capability: copy its value(s) to data when data is not
+ * NULL and return their size in bytes; unknown caps return 0. */
+static int
+nv50_screen_get_compute_param(struct pipe_screen *pscreen,
+                              enum pipe_compute_cap param, void *data)
+{
+   struct nv50_screen *screen = nv50_screen(pscreen);
+
+#define CAP_VALUE(type, ...) do {         \
+   const type cap_[] = { __VA_ARGS__ };   \
+   if (data)                              \
+      memcpy(data, cap_, sizeof(cap_));   \
+   return sizeof(cap_);                   \
+} while (0)
+   switch (param) {
+   case PIPE_COMPUTE_CAP_GRID_DIMENSION:
+      CAP_VALUE(uint64_t, 2);
+   case PIPE_COMPUTE_CAP_MAX_GRID_SIZE:
+      CAP_VALUE(uint64_t, 65535, 65535);
+   case PIPE_COMPUTE_CAP_MAX_BLOCK_SIZE:
+      CAP_VALUE(uint64_t, 512, 512, 64);
+   case PIPE_COMPUTE_CAP_MAX_THREADS_PER_BLOCK:
+      CAP_VALUE(uint64_t, 512);
+   case PIPE_COMPUTE_CAP_MAX_GLOBAL_SIZE: /* g0-15[] */
+      CAP_VALUE(uint64_t, 1ULL << 32);
+   case PIPE_COMPUTE_CAP_MAX_LOCAL_SIZE: /* s[] */
+      CAP_VALUE(uint64_t, 16 << 10);
+   case PIPE_COMPUTE_CAP_MAX_PRIVATE_SIZE: /* l[] */
+      CAP_VALUE(uint64_t, 16 << 10);
+   case PIPE_COMPUTE_CAP_MAX_INPUT_SIZE: /* c[], arbitrary limit */
+      CAP_VALUE(uint64_t, 4096);
+   case PIPE_COMPUTE_CAP_SUBGROUP_SIZE:
+      CAP_VALUE(uint32_t, 32);
+   case PIPE_COMPUTE_CAP_MAX_MEM_ALLOC_SIZE:
+      CAP_VALUE(uint64_t, 1ULL << 40);
+   case PIPE_COMPUTE_CAP_IMAGES_SUPPORTED:
+      CAP_VALUE(uint32_t, 0);
+   case PIPE_COMPUTE_CAP_MAX_COMPUTE_UNITS:
+      CAP_VALUE(uint32_t, screen->mp_count);
+   case PIPE_COMPUTE_CAP_MAX_CLOCK_FREQUENCY:
+      CAP_VALUE(uint32_t, 512); /* FIXME: arbitrary limit */
+   default: return 0;
+   }
+#undef CAP_VALUE
+}
+
 static void
 nv50_screen_destroy(struct pipe_screen *pscreen)
 {
@@ -377,6 +422,7 @@ nv50_screen_destroy(struct pipe_screen *pscreen)
    nouveau_object_del(&screen->tesla);
    nouveau_object_del(&screen->eng2d);
    nouveau_object_del(&screen->m2mf);
+   nouveau_object_del(&screen->compute);
    nouveau_object_del(&screen->sync);
 
    nouveau_screen_fini(&screen->base);
@@ -742,6 +788,7 @@ nv50_screen_create(struct nouveau_device *dev)
    pscreen->get_param = nv50_screen_get_param;
    pscreen->get_shader_param = nv50_screen_get_shader_param;
    pscreen->get_paramf = nv50_screen_get_paramf;
+   pscreen->get_compute_param = nv50_screen_get_compute_param;
 
    nv50_screen_init_resource_functions(pscreen);
 
@@ -851,6 +898,8 @@ nv50_screen_create(struct nouveau_device *dev)
    screen->TPs = util_bitcount(value & 0xffff);
    screen->MPsInTP = util_bitcount((value >> 24) & 0xf);
 
+   screen->mp_count = screen->TPs * screen->MPsInTP;
+
    stack_size = util_next_power_of_two(screen->TPs) * screen->MPsInTP *
          STACK_WARPS_ALLOC * 64 * 8;
 
@@ -902,6 +951,12 @@ nv50_screen_create(struct nouveau_device *dev)
 
    nv50_screen_init_hwctx(screen);
 
+   ret = nv50_screen_compute_setup(screen, screen->base.pushbuf);
+   if (ret) {
+      NOUVEAU_ERR("Failed to init compute context: %d\n", ret);
+      goto fail;
+   }
+
    nouveau_fence_new(&screen->base, &screen->base.fence.current, false);
 
    return pscreen;
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_screen.h b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
index ce51f0f..153ceea 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_screen.h
+++ b/src/gallium/drivers/nouveau/nv50/nv50_screen.h
@@ -23,6 +23,10 @@ struct nv50_context;
 
 #define NV50_MAX_VIEWPORTS 16
 
+#define NV50_MAX_GLOBALS 16
+
+#define ONE_TEMP_SIZE (4/*vector*/ * sizeof(float))
+
 struct nv50_blitter;
 
 struct nv50_graph_state {
@@ -66,6 +70,7 @@ struct nv50_screen {
    unsigned MPsInTP;
    unsigned max_tls_space;
    unsigned cur_tls_space;
+   unsigned mp_count;
 
    struct nouveau_heap *vp_code_heap;
    struct nouveau_heap *gp_code_heap;
@@ -93,6 +98,7 @@ struct nv50_screen {
    struct nouveau_object *sync;
 
    struct nouveau_object *tesla;
+   struct nouveau_object *compute;
    struct nouveau_object *eng2d;
    struct nouveau_object *m2mf;
 };
@@ -109,6 +115,8 @@ void nv50_blitter_destroy(struct nv50_screen *);
 int nv50_screen_tic_alloc(struct nv50_screen *, void *);
 int nv50_screen_tsc_alloc(struct nv50_screen *, void *);
 
+int nv50_screen_compute_setup(struct nv50_screen *, struct nouveau_pushbuf *);
+
 static inline void
 nv50_resource_fence(struct nv04_resource *res, uint32_t flags)
 {
diff --git a/src/gallium/drivers/nouveau/nv50/nv50_state.c b/src/gallium/drivers/nouveau/nv50/nv50_state.c
index d27f12c..b4ea08d 100644
--- a/src/gallium/drivers/nouveau/nv50/nv50_state.c
+++ b/src/gallium/drivers/nouveau/nv50/nv50_state.c
@@ -792,6 +792,35 @@ nv50_gp_state_bind(struct pipe_context *pipe, void *hwcso)
     nv50->dirty |= NV50_NEW_GMTYPROG;
 }
 
+/* Build a compute nv50_program from cso; returns NULL if allocation fails. */
+static void *
+nv50_cp_state_create(struct pipe_context *pipe,
+                     const struct pipe_compute_state *cso)
+{
+   struct nv50_program *prog = CALLOC_STRUCT(nv50_program);
+   if (!prog)
+      return NULL;
+   prog->type = PIPE_SHADER_COMPUTE;
+   prog->cp.smem_size = cso->req_local_mem;
+   prog->cp.lmem_size = cso->req_private_mem;
+   prog->parm_size = cso->req_input_mem;
+   prog->pipe.tokens = tgsi_dup_tokens((const struct tgsi_token *)cso->prog);
+   if (!prog->pipe.tokens) { /* token copy failed: don't return half a program */
+      FREE(prog);
+      return NULL;
+   }
+   return prog;
+}
+
+/* Make hwcso the current compute program and mark that state dirty. */
+static void
+nv50_cp_state_bind(struct pipe_context *pipe, void *hwcso)
+{
+   struct nv50_context *const ctx = nv50_context(pipe);
+   ctx->dirty_cp |= NV50_NEW_CP_PROGRAM;
+   ctx->compprog = (struct nv50_program *)hwcso;
+}
+
 static void
 nv50_set_constant_buffer(struct pipe_context *pipe, uint shader, uint index,
                          struct pipe_constant_buffer *cb)
@@ -1134,6 +1163,70 @@ nv50_set_stream_output_targets(struct pipe_context *pipe,
       nv50->dirty |= NV50_NEW_STRMOUT;
 }
 
+static void
+nv50_set_compute_resources(struct pipe_context *pipe,
+                           unsigned start, unsigned nr,
+                           struct pipe_surface **resources)
+{
+   /* TODO: bind surfaces. Until then this hook ignores all its arguments. */
+}
+
+/* Store res's buffer address in *phandle, or 0 if res is NULL/out of range. */
+static inline void
+nv50_set_global_handle(uint32_t *phandle, struct pipe_resource *res)
+{
+   struct nv04_resource *buf = nv04_resource(res);
+   uint32_t handle = 0;
+
+   /* The handle is 32 bits wide, so the buffer's last byte must fit too. */
+   if (buf && (buf->address + buf->base.width0) - 1 >= (1ULL << 32)) {
+      NOUVEAU_ERR("Cannot map into TGSI_RESOURCE_GLOBAL: "
+                  "resource not contained within 32-bit address space !\n");
+   } else if (buf) {
+      handle = (uint32_t)buf->address;
+   }
+
+   *phandle = handle;
+}
+
+/* (Un)bind global buffers in slots [start, start + nr): resources == NULL
+ * unbinds, otherwise handles[i] receives the handle of resources[i]. */
+static void
+nv50_set_global_bindings(struct pipe_context *pipe,
+                         unsigned start, unsigned nr,
+                         struct pipe_resource **resources,
+                         uint32_t **handles)
+{
+   struct nv50_context *nv50 = nv50_context(pipe);
+   struct pipe_resource **ptr;
+   unsigned i;
+   const unsigned req_size = (start + nr) * sizeof(struct pipe_resource *);
+
+   /* Grow the resident array on demand; new slots start out empty (NULL). */
+   if (nv50->global_residents.size < req_size) {
+      const unsigned old_size = nv50->global_residents.size;
+      util_dynarray_resize(&nv50->global_residents, req_size);
+      memset((uint8_t *)nv50->global_residents.data + old_size, 0,
+             req_size - old_size);
+   }
+
+   ptr = util_dynarray_element(
+      &nv50->global_residents, struct pipe_resource *, start);
+   for (i = 0; i < nr; ++i) {
+      struct pipe_resource *res = resources ? resources[i] : NULL;
+
+      pipe_resource_reference(&ptr[i], res);
+      if (resources)
+         nv50_set_global_handle(handles[i], res);
+   }
+
+   nouveau_bufctx_reset(nv50->bufctx_cp, NV50_BIND_CP_GLOBAL);
+
+   /* OR the flag in: plain assignment would wipe other pending compute
+    * state such as NV50_NEW_CP_PROGRAM set by nv50_cp_state_bind(). */
+   nv50->dirty_cp |= NV50_NEW_CP_GLOBALS;
+}
+
 void
 nv50_init_state_functions(struct nv50_context *nv50)
 {
@@ -1162,12 +1255,15 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->create_vs_state = nv50_vp_state_create;
    pipe->create_fs_state = nv50_fp_state_create;
    pipe->create_gs_state = nv50_gp_state_create;
+   pipe->create_compute_state = nv50_cp_state_create;
    pipe->bind_vs_state = nv50_vp_state_bind;
    pipe->bind_fs_state = nv50_fp_state_bind;
    pipe->bind_gs_state = nv50_gp_state_bind;
+   pipe->bind_compute_state = nv50_cp_state_bind;
    pipe->delete_vs_state = nv50_sp_state_delete;
    pipe->delete_fs_state = nv50_sp_state_delete;
    pipe->delete_gs_state = nv50_sp_state_delete;
+   pipe->delete_compute_state = nv50_sp_state_delete;
 
    pipe->set_blend_color = nv50_set_blend_color;
    pipe->set_stencil_ref = nv50_set_stencil_ref;
@@ -1191,6 +1287,9 @@ nv50_init_state_functions(struct nv50_context *nv50)
    pipe->stream_output_target_destroy = nv50_so_target_destroy;
    pipe->set_stream_output_targets = nv50_set_stream_output_targets;
 
+   pipe->set_global_binding = nv50_set_global_bindings;
+   pipe->set_compute_resources = nv50_set_compute_resources;
+
    nv50->sample_mask = ~0;
    nv50->min_samples = 1;
 }
-- 
2.6.2



More information about the mesa-dev mailing list