[PATCH 05/33] drm/omap: partial workaround for DRA7 DMM errata i878

Tue Feb 23 21:57:23 UTC 2016

Hi Tomi,

Thank you for the patch.

On Friday 19 February 2016 11:47:40 Tomi Valkeinen wrote:
> Errata i878 says that MPU should not be used to access RAM and DMM at
> the same time. As it's not possible to prevent MPU accessing RAM, we
> need to access DMM via a proxy.
> 
> This patch changes DMM driver to access DMM registers via sDMA. Instead
> of doing a normal readl/writel call to read/write a register, we use
> sDMA to copy 4 bytes from/to the DMM registers.
> 
> This patch provides only a partial workaround for i878, as not only DMM
> register reads/writes are affected, but also accesses to the DMM mapped
> buffers (framebuffers, usually).

Will this patch really improve the situation if the DMM mapping is accessed 
anyway ?

> Signed-off-by: Tomi Valkeinen <tomi.valkeinen at ti.com>
> ---
>  Documentation/devicetree/bindings/arm/omap/dmm.txt |   3 +-
>  drivers/gpu/drm/omapdrm/omap_dmm_priv.h            |   8 ++
>  drivers/gpu/drm/omapdrm/omap_dmm_tiler.c           | 141 +++++++++++++++++-
>  3 files changed, 149 insertions(+), 3 deletions(-)
> 
> diff --git a/Documentation/devicetree/bindings/arm/omap/dmm.txt
> b/Documentation/devicetree/bindings/arm/omap/dmm.txt index
> 8bd6d0a238a8..e5fc2eb7f4da 100644
> --- a/Documentation/devicetree/bindings/arm/omap/dmm.txt
> +++ b/Documentation/devicetree/bindings/arm/omap/dmm.txt
> @@ -8,7 +8,8 @@ translation for initiators which need contiguous dma bus
> addresses.
> 
>  Required properties:
>  - compatible:	Should contain "ti,omap4-dmm" for OMAP4 family
> -		Should contain "ti,omap5-dmm" for OMAP5 and DRA7x family
> +		Should contain "ti,omap5-dmm" for OMAP5 family
> +		Should contain "ti,dra7-dmm" for DRA7x family

Isn't it DRA7xx instead of DRA7x ?

>  - reg:		Contains DMM register address range (base address and length)
>  - interrupts:	Should contain an interrupt-specifier for DMM_IRQ.
>  - ti,hwmods:	Name of the hwmod associated to DMM, which is typically 
"dmm"
> diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
> b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h index 9f32a83ca507..4d292ba5bcca
> 100644
> --- a/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
> +++ b/drivers/gpu/drm/omapdrm/omap_dmm_priv.h
> @@ -155,10 +155,12 @@ struct refill_engine {
> 
>  struct dmm_platform_data {
>  	uint32_t cpu_cache_flags;
> +	bool errata_i878_wa;
>  };
> 
>  struct dmm {
>  	struct device *dev;
> +	u32 phys_base;
>  	void __iomem *base;
>  	int irq;
> 
> @@ -189,6 +191,12 @@ struct dmm {
>  	struct list_head alloc_head;
> 
>  	const struct dmm_platform_data *plat_data;
> +
> +	bool dmm_workaround;
> +	spinlock_t wa_lock;
> +	u32 *wa_dma_data;
> +	dma_addr_t wa_dma_handle;
> +	struct dma_chan *wa_dma_chan;
>  };
> 
>  #endif
> diff --git a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
> b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c index fe5260477b52..3ec5c6633585
> 100644
> --- a/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
> +++ b/drivers/gpu/drm/omapdrm/omap_dmm_tiler.c
> @@ -19,6 +19,7 @@
>  #include <linux/completion.h>
>  #include <linux/delay.h>
>  #include <linux/dma-mapping.h>
> +#include <linux/dmaengine.h>
>  #include <linux/errno.h>
>  #include <linux/init.h>
>  #include <linux/interrupt.h>
> @@ -79,14 +80,124 @@ static const uint32_t reg[][4] = {
>  			DMM_PAT_DESCR__2, DMM_PAT_DESCR__3},
>  };
> 
> +static int dmm_dma_copy(struct dmm *dmm, u32 src, u32 dst)

Anything wrong with using dma_addr_t ?

> +{
> +	struct dma_device *dma_dev = dmm->wa_dma_chan->device;
> +	struct dma_async_tx_descriptor *tx = NULL;

No need to initialize tx to NULL.

> +	enum dma_status status;
> +	dma_cookie_t cookie;
> +
> +	tx = dma_dev->device_prep_dma_memcpy(dmm->wa_dma_chan, dst, src, 4, 0);

Given that you're transferring between an I/O mapped register and system 
memory, I believe you should use dmaengine_prep_slave_single() instead, with a 
call to dmaengine_slave_config() to set the I/O mapped register physical 
address. You will also need to request a slave DMA channel instead of a memcpy 
DMA channel.

> +	if (!tx) {
> +		dev_err(dmm->dev, "Failed to prepare DMA memcpy\n");
> +		return -EIO;
> +	}
> +
> +	cookie = tx->tx_submit(tx);
> +	if (dma_submit_error(cookie)) {
> +		dev_err(dmm->dev, "Failed to do DMA tx_submit\n");
> +		return -EIO;
> +	}
> +
> +	dma_async_issue_pending(dmm->wa_dma_chan);
> +	status = dma_sync_wait(dmm->wa_dma_chan, cookie);
> +	if (status != DMA_COMPLETE)
> +		dev_err(dmm->dev, "i878 wa DMA copy failure\n");
> +
> +	dmaengine_terminate_all(dmm->wa_dma_chan);
> +	return 0;
> +}
> +
> +static u32 dmm_read_wa(struct dmm *dmm, u32 reg)
> +{
> +	u32 src, dst;
> +	int r;
> +
> +	src = (u32)(dmm->phys_base + reg);
> +	dst = (u32)dmm->wa_dma_handle;
> +
> +	r = dmm_dma_copy(dmm, src, dst);
> +	if (r) {
> +		dev_err(dmm->dev, "sDMA read transfer timeout\n");
> +		return readl(dmm->base + reg);
> +	}
> +
> +	return readl(dmm->wa_dma_data);

readl() has a memory barrier *after* the read, not before. Similarly writel() 
has its memory barrier *before* the write. I don't think that's what you want, 
shouldn't you use readl_relaxed() and writel_relaxed() with explicit barriers 
?

I'm also unsure whether readl and writel are the right primitives, as they 
target access to I/O memory, not system memory. Can't you use explicit 
barriers and direct access (return *dmm->wa_dma_data) ?

> +}
> +
> +static void dmm_write_wa(struct dmm *dmm, u32 val, u32 reg)
> +{
> +	u32 src, dst;
> +	int r;
> +
> +	writel(val, dmm->wa_dma_data);

Same comment here.

> +	src = (u32)dmm->wa_dma_handle;
> +	dst = (u32)(dmm->phys_base + reg);
> +
> +	r = dmm_dma_copy(dmm, src, dst);
> +	if (r) {
> +		dev_err(dmm->dev, "sDMA write transfer timeout\n");
> +		writel(val, dmm->base + reg);
> +	}
> +}
> +
>  static u32 dmm_read(struct dmm *dmm, u32 reg)
>  {
> -	return readl(dmm->base + reg);
> +	if (dmm->dmm_workaround) {
> +		u32 v;
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&dmm->wa_lock, flags);
> +		v = dmm_read_wa(dmm, reg);

dma_sync_wait() with a spinlock held doesn't seem like the best idea to me.

> +		spin_unlock_irqrestore(&dmm->wa_lock, flags);
> +
> +		return v;
> +	} else {
> +		return readl(dmm->base + reg);
> +	}
>  }
> 
>  static void dmm_write(struct dmm *dmm, u32 val, u32 reg)
>  {
> -	writel(val, dmm->base + reg);
> +	if (dmm->dmm_workaround) {
> +		unsigned long flags;
> +
> +		spin_lock_irqsave(&dmm->wa_lock, flags);
> +		dmm_write_wa(dmm, val, reg);

Ditto.

> +		spin_unlock_irqrestore(&dmm->wa_lock, flags);
> +	} else {
> +		writel(val, dmm->base + reg);
> +	}
> +}
> +
> +static int dmm_workaround_init(struct dmm *dmm)
> +{
> +	dma_cap_mask_t mask;
> +
> +	dma_cap_zero(mask);
> +	dma_cap_set(DMA_MEMCPY, mask);

Nitpicking, I'd move those two lines down right before the 
dma_request_channel() call, as they are part of the same logical block of 
code.

> +	spin_lock_init(&dmm->wa_lock);
> +
> +	dmm->wa_dma_data = dma_alloc_coherent(dmm->dev, 4, &dmm->wa_dma_handle,
> GFP_KERNEL);

Maybe a macro instead of 4 ?

> +	if (!dmm->wa_dma_data)
> +		return -ENOMEM;
> +
> +	dmm->wa_dma_chan = dma_request_channel(mask, NULL, NULL);
> +	if (!dmm->wa_dma_chan) {
> +		dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
> +		return -ENODEV;
> +	}
> +
> +	return 0;
> +}
> +
> +static void dmm_workaround_uninit(struct dmm *dmm)
> +{
> +	dma_release_channel(dmm->wa_dma_chan);
> +
> +	dma_free_coherent(dmm->dev, 4, dmm->wa_dma_data, dmm->wa_dma_handle);
>  }
> 
>  /* simple allocator to grab next 16 byte aligned memory from txn */
> @@ -606,6 +717,9 @@ static int omap_dmm_remove(struct platform_device *dev)
>  		if (omap_dmm->dummy_page)
>  			__free_page(omap_dmm->dummy_page);
> 
> +		if (omap_dmm->dmm_workaround)
> +			dmm_workaround_uninit(omap_dmm);
> +
>  		if (omap_dmm->irq > 0)
>  			free_irq(omap_dmm->irq, omap_dmm);
> 
> @@ -653,6 +767,7 @@ static int omap_dmm_probe(struct platform_device *dev)
>  		goto fail;
>  	}
> 
> +	omap_dmm->phys_base = mem->start;
>  	omap_dmm->base = ioremap(mem->start, SZ_2K);
> 
>  	if (!omap_dmm->base) {
> @@ -668,6 +783,19 @@ static int omap_dmm_probe(struct platform_device *dev)
> 
>  	omap_dmm->dev = &dev->dev;
> 
> +	omap_dmm->dmm_workaround = omap_dmm->plat_data->errata_i878_wa;
> +
> +	if (omap_dmm->dmm_workaround) {

I'd test omap_dmm->plat_data->errata_i878_wa here, omap_dmm->dmm_workaround is 
initialized to false by kzalloc().

> +		int r;
> +		r = dmm_workaround_init(omap_dmm);

Can't you use the ret variable ? I know it's pre-initialized to -EFAULT, but 
it seems better to me to set it to explicit values in the three goto fail; 
error cases that would be impacted, especially given that -EFAULT is the wrong 
error code to return (and as it would touch that code, the 
platform_get_resource() + ioremap() calls can be optimized in a 
devm_ioremap_resource() in another cleanup patch).

> +		if (r) {
> +			omap_dmm->dmm_workaround = false;
> +			dev_err(&dev->dev, "failed to initialize work-around, WA not 
used\n");

As this is a non-fatal error, maybe dev_warn() ?

> +		} else {
> +			dev_info(&dev->dev, "workaround for errata i878 in use\n");
> +		}
> +	}
> +
>  	hwinfo = dmm_read(omap_dmm, DMM_PAT_HWINFO);
>  	omap_dmm->num_engines = (hwinfo >> 24) & 0x1F;
>  	omap_dmm->num_lut = (hwinfo >> 16) & 0x1F;
> @@ -1031,6 +1159,11 @@ static const struct dmm_platform_data
> dmm_omap5_platform_data = { .cpu_cache_flags = OMAP_BO_UNCACHED,
>  };
> 
> +static const struct dmm_platform_data dmm_dra7_platform_data = {
> +	.cpu_cache_flags = OMAP_BO_UNCACHED,
> +	.errata_i878_wa = true,
> +};
> +
>  static const struct of_device_id dmm_of_match[] = {
>  	{
>  		.compatible = "ti,omap4-dmm",
> @@ -1040,6 +1173,10 @@ static const struct of_device_id dmm_of_match[] = {
>  		.compatible = "ti,omap5-dmm",
>  		.data = &dmm_omap5_platform_data,
>  	},
> +	{
> +		.compatible = "ti,dra7-dmm",
> +		.data = &dmm_dra7_platform_data,
> +	},
>  	{},
>  };
>  #endif

-- 
Regards,

Laurent Pinchart