[Nouveau] [RFC PATCH 4/5] channel timeout using repeated sched timeouts

Konsta Hölttä kholtta at nvidia.com
Wed Aug 5 04:27:52 PDT 2015


Enable the scheduling timeout error interrupt and program it with a
short period so that it fires repeatedly, since a single timeout event
can be missed in HW under certain conditions. On each interrupt,
increment a channel-specific counter in software if the channel hasn't
advanced, and abort the channel once the accumulated time exceeds the
timeout limit (detection thus happens at the periodic granularity).
The error notifier is set to NOUVEAU_GEM_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT
when this occurs.

A new KEPLER_SET_CHANNEL_TIMEOUT mthd sets the per-channel timeout
limit in milliseconds. The interrupt period is set to 100 ms.

Signed-off-by: Konsta Hölttä <kholtta at nvidia.com>
---
 drm/nouveau/include/nvif/class.h     |  8 ++++
 drm/nouveau/nvkm/engine/fifo/gk104.c | 78 +++++++++++++++++++++++++++++++-----
 2 files changed, 75 insertions(+), 11 deletions(-)
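
Note on granularity: since progress is only sampled every
GRFIFO_TIMEOUT_CHECK_PERIOD_MS, a channel is aborted when its
accumulated stall time first exceeds the limit, i.e. at the limit
rounded up to the next 100 ms tick. A minimal standalone sketch of the
accumulation logic (mirrors gk104_fifo_update_timeout(); all names
here are illustrative, not part of the patch):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define CHECK_PERIOD_MS 100 /* same role as GRFIFO_TIMEOUT_CHECK_PERIOD_MS */

    struct timeout_state {
            uint32_t sum_ms;     /* accumulated stall time */
            uint32_t limit_ms;   /* abort threshold */
            uint32_t gpfifo_get; /* last observed GET pointer */
    };

    /* One periodic check: accumulate if GET did not move, else restart. */
    static bool tick(struct timeout_state *t, uint32_t gpfifo_get)
    {
            if (gpfifo_get == t->gpfifo_get)
                    t->sum_ms += CHECK_PERIOD_MS;
            else
                    t->sum_ms = CHECK_PERIOD_MS;
            t->gpfifo_get = gpfifo_get;
            return t->sum_ms > t->limit_ms; /* true => abort the channel */
    }

    int main(void)
    {
            struct timeout_state t = { .limit_ms = 250 };
            int n = 1;

            while (!tick(&t, 0)) /* GET never moves: channel is stuck */
                    n++;
            /* prints: aborted on tick 3 after 300 ms */
            printf("aborted on tick %d after %u ms\n", n, t.sum_ms);
            return 0;
    }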

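For reference, a client would set the limit through the nvif object
method interface roughly as below (a sketch; the "chan_object" handle
and the 3000 ms value are made up for illustration):

    struct kepler_set_channel_timeout_v0 args = {
            .version = 0,
            .timeout_ms = 3000, /* abort after ~3 s without progress */
    };
    int ret;

    /* chan_object: the client's nvif object for the GPFIFO channel */
    ret = nvif_mthd(chan_object, KEPLER_SET_CHANNEL_TIMEOUT,
                    &args, sizeof(args));
    if (ret)
            return ret; /* class lacks the mthd, or unpack failed */
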
diff --git a/drm/nouveau/include/nvif/class.h b/drm/nouveau/include/nvif/class.h
index 381c72d..f9a3647 100644
--- a/drm/nouveau/include/nvif/class.h
+++ b/drm/nouveau/include/nvif/class.h
@@ -620,18 +620,26 @@ struct fermi_a_zbc_depth_v0 {
 	__u8  format;
 	__u8  index;
 	__u8  pad03[5];
 	__u32 ds;
 	__u32 l2;
 };
 
 #define KEPLER_SET_CHANNEL_PRIORITY                                        0x42 // XXX
+#define KEPLER_SET_CHANNEL_TIMEOUT                                         0x43 // XXX
+
 struct kepler_set_channel_priority_v0 {
 	__u8  version;
 #define KEPLER_SET_CHANNEL_PRIORITY_LOW                                    0x00
 #define KEPLER_SET_CHANNEL_PRIORITY_MEDIUM                                 0x01
 #define KEPLER_SET_CHANNEL_PRIORITY_HIGH                                   0x02
 	__u8 priority;
 	__u8  pad03[6];
 };
 
+struct kepler_set_channel_timeout_v0 {
+	__u8  version;
+	__u8  pad01[3];
+	__u32 timeout_ms;
+};
+
 #endif
diff --git a/drm/nouveau/nvkm/engine/fifo/gk104.c b/drm/nouveau/nvkm/engine/fifo/gk104.c
index 2bab45e..15360a6 100644
--- a/drm/nouveau/nvkm/engine/fifo/gk104.c
+++ b/drm/nouveau/nvkm/engine/fifo/gk104.c
@@ -83,18 +83,25 @@ struct gk104_fifo_base {
 struct gk104_fifo_chan {
 	struct nvkm_fifo_chan base;
 	u32 engine;
 	enum {
 		STOPPED,
 		RUNNING,
 		KILLED
 	} state;
+	struct {
+		u32 sum_ms;
+		u32 limit_ms;
+		u32 gpfifo_get;
+	} timeout;
 };
 
+#define GRFIFO_TIMEOUT_CHECK_PERIOD_MS 100
+
 /*******************************************************************************
  * FIFO channel objects
  ******************************************************************************/
 
 static void
 gk104_fifo_runlist_update(struct gk104_fifo_priv *priv, u32 engine)
 {
 	struct nvkm_bar *bar = nvkm_bar(priv);
@@ -288,16 +295,21 @@ gk104_fifo_chan_ctor(struct nvkm_object *parent, struct nvkm_object *engine,
 	nv_wo32(base, 0x94, 0x30000001);
 	nv_wo32(base, 0x9c, 0x00000100);
 	nv_wo32(base, 0xac, 0x0000001f);
 	nv_wo32(base, 0xe8, chan->base.chid);
 	nv_wo32(base, 0xb8, 0xf8000000);
 	nv_wo32(base, 0xf8, 0x10003080); /* 0x002310 */
 	nv_wo32(base, 0xfc, 0x10000010); /* 0x002350 */
 	bar->flush(bar);
+
+	chan->timeout.sum_ms = 0;
+	chan->timeout.limit_ms = -1; /* ~0U: no limit until set via mthd */
+	chan->timeout.gpfifo_get = 0;
+
 	return 0;
 }
 
 static int
 gk104_fifo_chan_init(struct nvkm_object *object)
 {
 	struct nvkm_gpuobj *base = nv_gpuobj(object->parent);
 	struct gk104_fifo_priv *priv = (void *)object->engine;
@@ -379,21 +391,39 @@ gk104_fifo_chan_set_priority(struct nvkm_object *object, void *data, u32 size)
 			return -EINVAL;
 		}
 	}
 
 	return ret;
 }
 
 int
+gk104_fifo_chan_set_timeout(struct nvkm_object *object, void *data, u32 size)
+{
+	struct gk104_fifo_chan *chan = (void *)object;
+	union {
+		struct kepler_set_channel_timeout_v0 v0;
+	} *args = data;
+	int ret;
+
+	if (nvif_unpack(args->v0, 0, 0, false)) {
+		chan->timeout.limit_ms = args->v0.timeout_ms;
+	}
+
+	return ret;
+}
+
+int
 gk104_fifo_chan_mthd(struct nvkm_object *object, u32 mthd, void *data, u32 size)
 {
 	switch (mthd) {
 	case KEPLER_SET_CHANNEL_PRIORITY:
 		return gk104_fifo_chan_set_priority(object, data, size);
+	case KEPLER_SET_CHANNEL_TIMEOUT:
+		return gk104_fifo_chan_set_timeout(object, data, size);
 	default:
 		break;
 	}
 	return -EINVAL;
 }
 
 struct nvkm_ofuncs
 gk104_fifo_chan_ofuncs = {
@@ -604,61 +634,83 @@ gk104_fifo_intr_bind(struct gk104_fifo_priv *priv)
 }
 
 static const struct nvkm_enum
 gk104_fifo_sched_reason[] = {
 	{ 0x0a, "CTXSW_TIMEOUT" },
 	{}
 };
 
+static bool
+gk104_fifo_update_timeout(struct gk104_fifo_priv *priv,
+		struct gk104_fifo_chan *chan, u32 dt)
+{
+	u32 gpfifo_get = nv_rd32(priv, 34); /* XXX: meant to be this channel's GPFIFO GET */
+	if (gpfifo_get == chan->timeout.gpfifo_get) {
+		chan->timeout.sum_ms += dt;
+	} else {
+		chan->timeout.sum_ms = dt;
+	}
+
+	chan->timeout.gpfifo_get = gpfifo_get;
+
+	return chan->timeout.sum_ms > chan->timeout.limit_ms;
+}
+
 static void
 gk104_fifo_intr_sched_ctxsw(struct gk104_fifo_priv *priv)
 {
 	struct nvkm_engine *engine;
 	struct gk104_fifo_chan *chan;
 	u32 engn;
 
 	for (engn = 0; engn < ARRAY_SIZE(fifo_engine); engn++) {
 		u32 stat = nv_rd32(priv, 0x002640 + (engn * 0x04));
 		u32 busy = (stat & 0x80000000);
 		u32 next = (stat & 0x07ff0000) >> 16;
-		u32 chsw = (stat & 0x00008000);
-		u32 save = (stat & 0x00004000);
-		u32 load = (stat & 0x00002000);
+		u32 cxsw = (stat & 0x0000e000) >> 13;
 		u32 prev = (stat & 0x000007ff);
-		u32 chid = load ? next : prev;
-		(void)save;
+		/* if loading context, take next id */
+		u32 chid = cxsw == 5 ? next : prev;
 
-		if (busy && chsw) {
+	nv_debug(priv, "ctxsw eng stat: %08x\n", stat);
+		/* doing context switch? */
+		if (busy && (cxsw >= 5 && cxsw <= 7)) {
 			if (!(chan = (void *)priv->base.channel[chid]))
 				continue;
 			if (!(engine = gk104_fifo_engine(priv, engn)))
 				continue;
 
-			nvkm_fifo_eevent(&priv->base, chid,
-					NOUVEAU_GEM_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
-
-			gk104_fifo_recover(priv, engine, chan);
+			if (gk104_fifo_update_timeout(priv, chan,
+						GRFIFO_TIMEOUT_CHECK_PERIOD_MS)) {
+				nvkm_fifo_eevent(&priv->base, chid,
+						NOUVEAU_GEM_CHANNEL_FIFO_ERROR_IDLE_TIMEOUT);
+				gk104_fifo_recover(priv, engine, chan);
+			} else {
+				nv_debug(priv, "fifo waiting for ctxsw %u ms on ch %u\n",
+						chan->timeout.sum_ms, chid);
+			}
 		}
 	}
 }
 
 static void
 gk104_fifo_intr_sched(struct gk104_fifo_priv *priv)
 {
 	u32 intr = nv_rd32(priv, 0x00254c);
 	u32 code = intr & 0x000000ff;
 	const struct nvkm_enum *en;
 	char enunk[6] = "";
 
 	en = nvkm_enum_find(gk104_fifo_sched_reason, code);
 	if (!en)
 		snprintf(enunk, sizeof(enunk), "UNK%02x", code);
 
-	nv_error(priv, "SCHED_ERROR [ %s ]\n", en ? en->name : enunk);
+	/* fires periodically while a ctxsw is pending, so don't log an error */
+	nv_debug(priv, "SCHED_ERROR [ %s ]\n", en ? en->name : enunk);
 
 	switch (code) {
 	case 0x0a:
 		gk104_fifo_intr_sched_ctxsw(priv);
 		break;
 	default:
 		break;
 	}
@@ -1131,18 +1183,22 @@ gk104_fifo_init(struct nvkm_object *object)
 	/* PBDMA[n].HCE */
 	for (i = 0; i < priv->spoon_nr; i++) {
 		nv_wr32(priv, 0x040148 + (i * 0x2000), 0xffffffff); /* INTR */
 		nv_wr32(priv, 0x04014c + (i * 0x2000), 0xffffffff); /* INTREN */
 	}
 
 	nv_wr32(priv, 0x002254, 0x10000000 | priv->user.bar.offset >> 12);
 
+	/* enable interrupts */
 	nv_wr32(priv, 0x002100, 0xffffffff);
 	nv_wr32(priv, 0x002140, 0x7fffffff);
+
+	/* ctxsw timeout: enable (bit 31) | period, converted here from ms to us */
+	nv_wr32(priv, 0x002a0c, 0x80000000 | (1000 * GRFIFO_TIMEOUT_CHECK_PERIOD_MS));
 	return 0;
 }
 
 void
 gk104_fifo_dtor(struct nvkm_object *object)
 {
 	struct gk104_fifo_priv *priv = (void *)object;
 	int i;
-- 
2.1.4
