[Intel-gfx] [PATCH 2/5] drm/i915: Dynamic Parity Detection handling

Tue May 1 20:05:37 CEST 2012

On Fri, Apr 27, 2012 at 05:40:18PM -0700, Ben Widawsky wrote:
> On IVB hardware we are given an interrupt whenever a L3 parity error
> occurs in the L3 cache. The L3 cache is used by internal GPU clients
> only.  This is a very rare occurrence (in fact to test this I need to
> use specially instrumented silicon).
> 
> When a row in the L3 cache detects a parity error the HW generates an
> interrupt. The interrupt is masked in GTIMR until we get a chance to
> read some registers and alert userspace via a uevent. With this
> information userspace can use a sysfs interface (follow-up patch) to
> remap those rows.
> 
> Way above my level of understanding, but if a given row fails, it is
> statistically more likely to fail again than a row which has not failed.
> Therefore it is desirable for an operating system to maintain a lifelong
> list of failing rows and always remap any bad rows on driver load.
> Hardware limits the number of rows that are remappable per bank/subbank,
> and should more than that many rows detect parity errors, software
> should maintain a list of the most frequent errors, and remap those
> rows.
> 
> Signed-off-by: Ben Widawsky <ben at bwidawsk.net>
> ---
>  drivers/gpu/drm/i915/i915_drv.h |    2 +
>  drivers/gpu/drm/i915/i915_irq.c |   83 +++++++++++++++++++++++++++++++++++++++
>  drivers/gpu/drm/i915/i915_reg.h |   17 ++++++++
>  3 files changed, 102 insertions(+)
> 
> diff --git a/drivers/gpu/drm/i915/i915_drv.h b/drivers/gpu/drm/i915/i915_drv.h
> index 69e1539..9505fc0 100644
> --- a/drivers/gpu/drm/i915/i915_drv.h
> +++ b/drivers/gpu/drm/i915/i915_drv.h
> @@ -804,6 +804,8 @@ typedef struct drm_i915_private {
>  
>  	struct drm_property *broadcast_rgb_property;
>  	struct drm_property *force_audio_property;
> +
> +	struct work_struct parity_error_work;
>  } drm_i915_private_t;
>  
>  enum hdmi_force_audio {
> diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c
> index ab023ca..81e5a7d 100644
> --- a/drivers/gpu/drm/i915/i915_irq.c
> +++ b/drivers/gpu/drm/i915/i915_irq.c
> @@ -430,6 +430,83 @@ static void gen6_pm_rps_work(struct work_struct *work)
>  	mutex_unlock(&dev_priv->dev->struct_mutex);
>  }
>  
> +
> +/**
> + * ivybridge_parity_work - Workqueue called when a parity error interrupt
> + * occurred.
> + *
> + * Doesn't actually do anything except notify userspace so that userspace may
> + * disable things later on.
> + */
> +static void ivybridge_parity_work(struct work_struct *work)
> +{
> +	drm_i915_private_t *dev_priv = container_of(work, drm_i915_private_t,
> +						    parity_error_work);
> +
> +	u32 error_status, row, bank, subbank;
> +	char *parity_event[5];
> +	uint32_t misccpctl;
> +	unsigned long flags;
> +
> +	/* We must turn off DOP level clock gating to access the L3 registers.
> +	 * In order to prevent a get/put style interface, acquire struct mutex
> +	 * any time we access those registers.
> +	 */
> +	mutex_lock(&dev_priv->dev->struct_mutex);
> +
> +	misccpctl = I915_READ(GEN7_MISCCPCTL);
> +	I915_WRITE(GEN7_MISCCPCTL, misccpctl & ~GEN7_DOP_CLOCK_GATE_ENABLE);
> +	POSTING_READ(GEN7_MISCCPCTL);
> +
> +	error_status = I915_READ(GEN7_L3CDERRST1);
> +	row = GEN7_PARITY_ERROR_ROW(error_status);
> +	bank = GEN7_PARITY_ERROR_BANK(error_status);
> +	subbank = GEN7_PARITY_ERROR_SUBBANK(error_status);
> +
> +	I915_WRITE(GEN7_L3CDERRST1, GEN7_PARITY_ERROR_VALID |
> +				    GEN7_L3CDERRST1_ENABLE);
> +	POSTING_READ(GEN7_L3CDERRST1);
> +
> +	I915_WRITE(GEN7_MISCCPCTL, misccpctl);
> +
> +	spin_lock_irqsave(&dev_priv->irq_lock, flags);
> +	dev_priv->gt_irq_mask &= ~GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
> +	I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
> +	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
> +
> +	mutex_unlock(&dev_priv->dev->struct_mutex);
> +
> +	parity_event[0] = "L3_PARITY_ERROR=1";
> +	parity_event[1] = kasprintf(GFP_KERNEL, "ROW=%d", row);
> +	parity_event[2] = kasprintf(GFP_KERNEL, "BANK=%d", bank);
> +	parity_event[3] = kasprintf(GFP_KERNEL, "SUBBANK=%d", subbank);
> +	parity_event[4] = NULL;
> +
> +	kobject_uevent_env(&dev_priv->dev->primary->kdev.kobj,
> +			   KOBJ_CHANGE, parity_event);
> +
> +	kfree(parity_event[3]);
> +	kfree(parity_event[2]);
> +	kfree(parity_event[1]);
> +}
> +
> +void ivybridge_handle_parity_error(struct drm_device *dev)
> +{
> +	drm_i915_private_t *dev_priv = (drm_i915_private_t *) dev->dev_private;
> +	unsigned long flags;
> +
> +	if (WARN_ON(IS_GEN6(dev)))
> +		return;
> +
> +	spin_lock_irqsave(&dev_priv->irq_lock, flags);
> +	dev_priv->gt_irq_mask |= GT_GEN7_L3_PARITY_ERROR_INTERRUPT;
> +	I915_WRITE(GTIMR, dev_priv->gt_irq_mask);
> +	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
> +
> +	queue_work(dev_priv->wq, &dev_priv->parity_error_work);
> +	DRM_INFO("Parity error interrupt. Scheduling work\n");
> +}
> +
>  static void snb_gt_irq_handler(struct drm_device *dev,
>  			       struct drm_i915_private *dev_priv,
>  			       u32 gt_iir)
> @@ -449,6 +526,9 @@ static void snb_gt_irq_handler(struct drm_device *dev,
>  		DRM_ERROR("GT error interrupt 0x%08x\n", gt_iir);
>  		i915_handle_error(dev, false);
>  	}
> +
> +	if (gt_iir & GT_GEN7_L3_PARITY_ERROR_INTERRUPT)
> +		ivybridge_handle_parity_error(dev);
>  }
>  
>  static void gen6_queue_rps_work(struct drm_i915_private *dev_priv,
> @@ -1978,6 +2058,9 @@ static void ironlake_irq_preinstall(struct drm_device *dev)
>  	if (IS_GEN6(dev) || IS_IVYBRIDGE(dev))
>  		INIT_WORK(&dev_priv->rps_work, gen6_pm_rps_work);
>  
> +	if (IS_IVYBRIDGE(dev))
> +		INIT_WORK(&dev_priv->parity_error_work, ivybridge_parity_work);
> +

work init has moved to intel_irq_init in dinq, and for good reasons, as
I've figured out after merging the patch: _preinstall is also called on
resume, and if we're unlucky we have a work item outstanding from before
the suspend, so that we re-init a live work item. The core work queue
code doesn't approve of that, resulting in decent hilarity (NULL deref
after suspend).
-Daniel

>  	I915_WRITE(HWSTAM, 0xeffe);
>  
>  	/* XXX hotplug from PCH */
> diff --git a/drivers/gpu/drm/i915/i915_reg.h b/drivers/gpu/drm/i915/i915_reg.h
> index 5ac9837..72db6a9 100644
> --- a/drivers/gpu/drm/i915/i915_reg.h
> +++ b/drivers/gpu/drm/i915/i915_reg.h
> @@ -4030,6 +4030,23 @@
>  #define   GEN6_RC6			3
>  #define   GEN6_RC7			4
>  
> +#define GEN7_MISCCPCTL			(0x9424)
> +#define   GEN7_DOP_CLOCK_GATE_ENABLE	(1<<0)
> +
> +/* IVYBRIDGE DPF */
> +#define GEN7_L3CDERRST1			0xB008 /* L3CD Error Status 1 */
> +#define   GEN7_L3CDERRST1_ROW_MASK	(0x7ff<<14)
> +#define   GEN7_PARITY_ERROR_VALID	(1<<13)
> +#define   GEN7_L3CDERRST1_BANK_MASK	(3<<11)
> +#define   GEN7_L3CDERRST1_SUBBANK_MASK	(7<<8)
> +#define GEN7_PARITY_ERROR_ROW(reg) \
> +		((reg & GEN7_L3CDERRST1_ROW_MASK) >> 14)
> +#define GEN7_PARITY_ERROR_BANK(reg) \
> +		((reg & GEN7_L3CDERRST1_BANK_MASK) >> 11)
> +#define GEN7_PARITY_ERROR_SUBBANK(reg) \
> +		((reg & GEN7_L3CDERRST1_SUBBANK_MASK) >> 8)
> +#define   GEN7_L3CDERRST1_ENABLE	(1<<7)
> +
>  #define G4X_AUD_VID_DID			0x62020
>  #define INTEL_AUDIO_DEVCL		0x808629FB
>  #define INTEL_AUDIO_DEVBLC		0x80862801
> -- 
> 1.7.10
> 
> _______________________________________________
> Intel-gfx mailing list
> Intel-gfx at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/intel-gfx

-- 
Daniel Vetter
Mail: daniel at ffwll.ch
Mobile: +41 (0)79 365 57 48