[Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling
Daniel Vetter
daniel at ffwll.ch
Tue May 1 20:05:37 CEST 2012
On Fri, Apr 27, 2012 at 05:40:18PM -0700, Ben Widawsky wrote:
> On IVB hardware we are given an interrupt whenever a L3 parity error
> occurs in the L3 cache. The L3 cache is used by internal GPU clients
> only. This is a very rare occurrence (in fact to test this I need to
> use specially instrumented silicon).
>
> When a row in the L3 cache detects a parity error the HW generates an
> interrupt. The interrupt is masked in GTIMR until we get a chance to
> read some registers and alert userspace via a uevent. With this
> information userspace can use a sysfs interface (follow-up patch) to
> remap those rows.
>
> Way above my level of understanding, but if a given row fails, it is
> statistically more likely to fail again than a row which has not failed.
> Therefore it is desirable for an operating system to maintain a lifelong
> list of failing rows and always remap any bad rows on driver load.
> Hardware limits the number of rows that are remappable per bank/subbank,
> and should more than that many rows detect parity errors, software
> should maintain a list of the most frequent errors, and remap those
> rows.
>
> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
> ---
> drivers/gpu/drm/i915/i915_drv.h | 2 +
> drivers/gpu/drm/i915/i915_irq.c | 83 +++++++++++++++++++++++++++++++++++++++
> drivers/gpu/drm/i915/i915_reg.h | 17 ++++++++
> 3 files changed, 102 insertions(+)
>
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 69e1539..9505fc0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -804,6 +804,8 @@ typedef struct drm_i915_private {
>
> struct drm_property *broadcast_rgb_property;
> struct drm_property *force_audio_property;
> +
> + struct work_struct parity_error_work;
> } drm_i915_private_t;
>
> enum hdmi_force_audio {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ab023ca..81e5a7d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work)
> mutex_unlock(&dev_priv->dev->struct_mutex);
> }
>
> +
> +/**
> + * ivybridge_parity_work - Workqueue called when a parity error interrupt
> + * occurred.
> + *
> + * Doesn't actually do anything except notify userspace so that userspace may
> + * disable things later on.
> + */
> +static void ivybridge_parity_work(struct work_struct *work)
> +{
> + drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
> + parity_error_work);
> +
> + u32 error_status, row, bank, subbank;
> + char *parity_event[5];
> + uint32_t misccpctl;
> + unsigned long flags;
> +
> + /* We must turn off DOP level clock gating to access the L3 registers.
> + * In order to prevent a get/put style interface, acquire struct mutex
> + * any time we access those registers.
> + */
> + mutex_lock(&dev_priv->dev->struct_mutex);
> +
> + misccpctl = I915_READ(GEN7_MISCCPCTL);
> + I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE);
> + POSTING_READ(GEN7_MISCCPCTL);
> +
> + error_status = I915_READ(GEN7_L3CDERRST1);
> + row = GEN7_PARITY_ERROR_ROW(error_status);
> + bank = GEN7_PARITY_ERROR_BANK(error_status);
> + subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
> +
> + I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
> + GEN7_L3CDERRST1_ENABLE);
> + POSTING_READ(GEN7_L3CDERRST1);
> +
> + I915_WRITE(GEN7_MISCCPCTL, misccpctl);
> +
> + spin_lock_irqsave(&dev_priv->irq_lock, flags);
> + dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
> + I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
> + spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
> +
> + mutex_unlock(&dev_priv->dev->struct_mutex);
> +
> + parity_event[0] = "L3_PARITY_ERROR=1";
> + parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row);
> + parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank);
> + parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank);
> + parity_event[4] = NULL;
> +
> + kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj,
> + KOBJ_CHANGE, parity_event);
> +
> + kfree(parity_event[3]);
> + kfree(parity_event[2]);
> + kfree(parity_event[1]);
> +}
> +
> +void ivybridge_handle_parity_error(struct drm_device *dev)
> +{
> + drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
> + unsigned long flags;
> +
> + if (WARN_ON(IS_GEN6(dev)))
> + return;
> +
> + spin_lock_irqsave(&dev_priv->irq_lock, flags);
> + dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
> + I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
> + spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
> +
> + queue_work(dev_priv->wq, &dev_priv->parity_error_work);
> + DRM_INFO("Parity error interrupt. Scheduling work\n");
> +}
> +
> static void snb_gt_irq_handler(struct drm_device *dev,
> struct drm_i915_private *dev_priv,
> u32 gt_iir)
> @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev,
> DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
> i915_handle_error(dev, false);
> }
> +
> + if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT)
> + ivybridge_handle_parity_error(dev);
> }
>
> static void gen6_queue_rps_work(struct drm_i915_private *dev_priv,
> @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev)
> if (IS_GEN6(dev) || IS_IVYBRIDGE(dev))
> INIT_WORK(&dev_priv->rps_work, gen6_pm_rps_work);
>
> + if (IS_IVYBRIDGE(dev))
> + INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work);
> +
Work init has moved to intel_irq_init in dinq, and for good reasons, as
I figured out after merging the patch: _preinstall is also called on
resume, and if we're unlucky we have a work item outstanding from before
the suspend, so that we re-init a live work item. The core workqueue
code doesn't approve of that, resulting in decent hilarity (NULL deref
after suspend).
-Daniel
> I915_WRITE(HWSTAM, 0xeffe);
>
> /* XXX hotplug from PCH */
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 5ac9837..72db6a9 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -4030,6 +4030,23 @@
> #define GEN6_RC6 3
> #define GEN6_RC7 4
>
> +#define GEN7_MISCCPCTL (0x9424)
> +#define GEN7_DOP_CLOCK_GATE_ENABLE (1<<0)
> +
> +/* IVYBRIDGE DPF */
> +#define GEN7_L3CDERRST1 0xB008 /* L3CD Error Status 1 */
> +#define GEN7_L3CDERRST1_ROW_MASK (0x7ff<<14)
> +#define GEN7_PARITY_ERROR_VALID (1<<13)
> +#define GEN7_L3CDERRST1_BANK_MASK (3<<11)
> +#define GEN7_L3CDERRST1_SUBBANK_MASK (7<<8)
> +#define GEN7_PARITY_ERROR_ROW(reg) \
> + ((reg & GEN7_L3CDERRST1_ROW_MASK) >> 14)
> +#define GEN7_PARITY_ERROR_BANK(reg) \
> + ((reg & GEN7_L3CDERRST1_BANK_MASK) >> 11)
> +#define GEN7_PARITY_ERROR_SUBBANK(reg) \
> + ((reg & GEN7_L3CDERRST1_SUBBANK_MASK) >> 8)
> +#define GEN7_L3CDERRST1_ENABLE (1<<7)
> +
> #define G4X_AUD_VID_DID 0x62020
> #define INTEL_AUDIO_DEVCL 0x808629FB
> #define INTEL_AUDIO_DEVBLC 0x80862801
> --
> 1.7.10
>
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx
--
Daniel Vetter
Mail: daniel at ffwll.ch
Mobile: +41 (0)79 365 57 48
More information about the Intel-gfx
mailing list