[Freedreno] [PATCH 5/5] drm/msm/A6xx: Add support for using system cache(llc)
Sharat Masetty
smasetty at codeaurora.org
Thu Apr 5 05:25:53 UTC 2018
On 4/4/2018 2:54 AM, Jordan Crouse wrote:
> On Fri, Mar 23, 2018 at 12:49:51PM +0530, Sharat Masetty wrote:
>> The last level system cache can be partitioned to 32
>> different slices of which GPU has two slices preallocated.
>> The "gpu" slice is used for caching GPU buffers and
>> the "gpuhtw" slice is used for caching the GPU SMMU
>> pagetables. This patch talks to the core system cache
>> driver to acquire the slice handles, configure the SCIDs
>> for those slices, and activate and deactivate the slices
>> upon GPU power collapse and restore.
>>
>> Some support from the IOMMU driver is also needed to
>> make use of the system cache. IOMMU_UPSTREAM_HINT is
>> a buffer protection flag which enables caching GPU data
>> buffers in the system cache with memory attributes such
>> as outer cacheable, read-allocate, write-allocate for buffers.
>> The GPU then has the ability to override a few cacheability
>> parameters which it does to override write-allocate to
>> write-no-allocate as the GPU hardware does not benefit much
>> from it.
>> Similarly DOMAIN_ATTR_USE_UPSTREAM_HINT is another domain level
>> attribute used by the IOMMU driver to set the right attributes
>> to cache the hardware pagetables into the system cache.
>
> This has a dependency on the LLCC driver and the API to that may change (it is
> under review now). When it does, this will have to naturally change as well but
> that'll be a minor tweak and won't affect the functionality of this driver so
> pending those changes..
Thanks for the review Jordan. Vivek will also submit the SMMU changes
for the UPSTREAM_HINT support to the mailing list soon. So once the
dependencies are sorted out, I will review and submit a fresh patch set
if needed.
>
> Reviewed-by: Jordan Crouse <jcrouse at codeaurora.org>
>> Signed-off-by: Sharat Masetty <smasetty at codeaurora.org>
>> ---
>> drivers/gpu/drm/msm/adreno/a6xx_gpu.c | 162 +++++++++++++++++++++++++++++++++-
>> drivers/gpu/drm/msm/adreno/a6xx_gpu.h | 9 ++
>> drivers/gpu/drm/msm/msm_iommu.c | 13 +++
>> drivers/gpu/drm/msm/msm_mmu.h | 3 +
>> 4 files changed, 186 insertions(+), 1 deletion(-)
>>
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> index bd50674..e4554eb 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.c
>> @@ -13,6 +13,7 @@
>>
>> #include <linux/qcom_scm.h>
>> #include <linux/soc/qcom/mdt_loader.h>
>> +#include <linux/soc/qcom/llcc-qcom.h>
>>
>> #include "msm_gem.h"
>> #include "msm_mmu.h"
>> @@ -913,6 +914,154 @@ static irqreturn_t a6xx_irq(struct msm_gpu *gpu)
>> ~0
>> };
>>
>> +#define A6XX_LLC_NUM_GPU_SCIDS 5
>> +#define A6XX_GPU_LLC_SCID_NUM_BITS 5
>> +
>> +#define A6XX_GPU_LLC_SCID_MASK \
>> + ((1 << (A6XX_LLC_NUM_GPU_SCIDS * A6XX_GPU_LLC_SCID_NUM_BITS)) - 1)
>> +
>> +#define A6XX_GPUHTW_LLC_SCID_SHIFT 25
>> +#define A6XX_GPUHTW_LLC_SCID_MASK \
>> + (((1 << A6XX_GPU_LLC_SCID_NUM_BITS) - 1) << A6XX_GPUHTW_LLC_SCID_SHIFT)
>> +
>> +static inline void a6xx_gpu_cx_rmw(struct a6xx_llc *llc,
>> + u32 reg, u32 mask, u32 or)
>> +{
>> + msm_rmw(llc->mmio + (reg << 2), mask, or);
>> +}
>> +
>> +static void a6xx_llc_deactivate(struct msm_gpu *gpu)
>> +{
>> + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> + struct a6xx_llc *llc = &a6xx_gpu->llc;
>> +
>> + llcc_slice_deactivate(llc->gpu_llc_slice);
>> + llcc_slice_deactivate(llc->gpuhtw_llc_slice);
>> +}
>> +
>> +static void a6xx_llc_activate(struct msm_gpu *gpu)
>> +{
>> + struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> + struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>> + struct a6xx_llc *llc = &a6xx_gpu->llc;
>> +
>> + if (!llc->mmio)
>> + return;
>> +
>> + if (llc->gpu_llc_slice)
>> + if (!llcc_slice_activate(llc->gpu_llc_slice))
>> + /* Program the sub-cache ID for all GPU blocks */
>> + a6xx_gpu_cx_rmw(llc,
>> + REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
>> + A6XX_GPU_LLC_SCID_MASK,
>> + (llc->cntl1_regval &
>> + A6XX_GPU_LLC_SCID_MASK));
>> +
>> + if (llc->gpuhtw_llc_slice)
>> + if (!llcc_slice_activate(llc->gpuhtw_llc_slice))
>> + /* Program the sub-cache ID for GPU pagetables */
>> + a6xx_gpu_cx_rmw(llc,
>> + REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_1,
>> + A6XX_GPUHTW_LLC_SCID_MASK,
>> + (llc->cntl1_regval &
>> + A6XX_GPUHTW_LLC_SCID_MASK));
>> +
>> + /* Program cacheability overrides */
>> + a6xx_gpu_cx_rmw(llc, REG_A6XX_GPU_CX_MISC_SYSTEM_CACHE_CNTL_0, 0xF,
>> + llc->cntl0_regval);
>> +}
>> +
>> +void a6xx_llc_slices_destroy(struct a6xx_llc *llc)
>> +{
>> + if (llc->mmio) {
>> + iounmap(llc->mmio);
>> + llc->mmio = NULL;
>> + }
>> +
>> + llcc_slice_putd(llc->gpu_llc_slice);
>> + llc->gpu_llc_slice = NULL;
>> +
>> + llcc_slice_putd(llc->gpuhtw_llc_slice);
>> + llc->gpuhtw_llc_slice = NULL;
>> +}
>> +
>> +static int a6xx_llc_slices_init(struct platform_device *pdev,
>> + struct a6xx_llc *llc)
>> +{
>> + int i;
>> +
>> + /* Get the system cache slice descriptor for GPU and GPUHTWs */
>> + llc->gpu_llc_slice = llcc_slice_getd(&pdev->dev, "gpu");
>> + if (IS_ERR(llc->gpu_llc_slice))
>> + llc->gpu_llc_slice = NULL;
>> +
>> + llc->gpuhtw_llc_slice = llcc_slice_getd(&pdev->dev, "gpuhtw");
>> + if (IS_ERR(llc->gpuhtw_llc_slice))
>> + llc->gpuhtw_llc_slice = NULL;
>> +
>> + if (llc->gpu_llc_slice == NULL && llc->gpuhtw_llc_slice == NULL)
>> + return -1;
>> +
>> + /* Map registers */
>> + llc->mmio = msm_ioremap(pdev, "cx_mem", "gpu_cx");
>> + if (IS_ERR(llc->mmio)) {
>> + llc->mmio = NULL;
>> + a6xx_llc_slices_destroy(llc);
>> + return -1;
>> + }
>> +
>> + /*
>> + * Setup GPU system cache CNTL0 and CNTL1 register values.
>> + * These values will be programmed everytime GPU comes out
>> + * of power collapse as these are non-retention registers.
>> + */
>> +
>> + /*
>> + * CNTL0 provides options to override the settings for the
>> + * read and write allocation policies for the LLC. These
>> + * overrides are global for all memory transactions from
>> + * the GPU.
>> + *
>> + * 0x3: read-no-alloc-overridden = 0
>> + * read-no-alloc = 0 - Allocate lines on read miss
>> + * write-no-alloc-overridden = 1
>> + * write-no-alloc = 1 - Do not allocates lines on write miss
>> + */
>> + llc->cntl0_regval = 0x03;
>> +
>> + /*
>> + * CNTL1 is used to specify SCID for (CP, TP, VFD, CCU and UBWC
>> + * FLAG cache) GPU blocks. This value will be passed along with
>> + * the address for any memory transaction from GPU to identify
>> + * the sub-cache for that transaction.
>> + *
>> + * Currently there is only one SCID allocated for all GPU blocks
>> + * Hence set same SCID for all the blocks.
>> + */
>> +
>> + if (llc->gpu_llc_slice) {
>> + u32 gpu_scid = llcc_get_slice_id(llc->gpu_llc_slice);
>> +
>> + for (i = 0; i < A6XX_LLC_NUM_GPU_SCIDS; i++)
>> + llc->cntl1_regval |=
>> + gpu_scid << (A6XX_GPU_LLC_SCID_NUM_BITS * i);
>> + }
>> +
>> + /*
>> + * Set SCID for GPU IOMMU. This will be used to access
>> + * page tables that are cached in LLC.
>> + */
>> + if (llc->gpuhtw_llc_slice) {
>> + u32 gpuhtw_scid = llcc_get_slice_id(llc->gpuhtw_llc_slice);
>> +
>> + llc->cntl1_regval |=
>> + gpuhtw_scid << A6XX_GPUHTW_LLC_SCID_SHIFT;
>> + }
>> +
>> + return 0;
>> +}
>> +
>> static int a6xx_pm_resume(struct msm_gpu *gpu)
>> {
>> struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> @@ -923,6 +1072,9 @@ static int a6xx_pm_resume(struct msm_gpu *gpu)
>>
>> gpu->needs_hw_init = true;
>>
>> + /* Activate LLC slices */
>> + a6xx_llc_activate(gpu);
>> +
>> return ret;
>> }
>>
>> @@ -931,6 +1083,9 @@ static int a6xx_pm_suspend(struct msm_gpu *gpu)
>> struct adreno_gpu *adreno_gpu = to_adreno_gpu(gpu);
>> struct a6xx_gpu *a6xx_gpu = to_a6xx_gpu(adreno_gpu);
>>
>> + /* Deactivate LLC slices */
>> + a6xx_llc_deactivate(gpu);
>> +
>> /*
>> * Make sure the GMU is idle before continuing (because some transitions
>> * may use VBIF
>> @@ -993,6 +1148,8 @@ static void a6xx_destroy(struct msm_gpu *gpu)
>> drm_gem_object_unreference_unlocked(a6xx_gpu->sqe_bo);
>> }
>>
>> + a6xx_llc_slices_destroy(&a6xx_gpu->llc);
>> +
>> a6xx_gmu_remove(a6xx_gpu);
>>
>> adreno_gpu_cleanup(adreno_gpu);
>> @@ -1040,7 +1197,10 @@ struct msm_gpu *a6xx_gpu_init(struct drm_device *dev)
>> adreno_gpu->registers = a6xx_registers;
>> adreno_gpu->reg_offsets = a6xx_register_offsets;
>>
>> - ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 4, 0);
>> + ret = a6xx_llc_slices_init(pdev, &a6xx_gpu->llc);
>> +
>> + ret = adreno_gpu_init(dev, pdev, adreno_gpu, &funcs, 4,
>> + ret ? 0 : MMU_FEATURE_USE_SYSTEM_CACHE);
>> if (ret) {
>> a6xx_destroy(&(a6xx_gpu->base.base));
>> return ERR_PTR(ret);
>> diff --git a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> index 21ab701..392c426 100644
>> --- a/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> +++ b/drivers/gpu/drm/msm/adreno/a6xx_gpu.h
>> @@ -21,6 +21,14 @@
>>
>> extern bool hang_debug;
>>
>> +struct a6xx_llc {
>> + void __iomem *mmio;
>> + void *gpu_llc_slice;
>> + void *gpuhtw_llc_slice;
>> + u32 cntl0_regval;
>> + u32 cntl1_regval;
>> +};
>> +
>> struct a6xx_gpu {
>> struct adreno_gpu base;
>>
>> @@ -46,6 +54,7 @@ struct a6xx_gpu {
>> uint64_t scratch_iova;
>>
>> struct a6xx_gmu gmu;
>> + struct a6xx_llc llc;
>> };
>>
>> #define to_a6xx_gpu(x) container_of(x, struct a6xx_gpu, base)
>> diff --git a/drivers/gpu/drm/msm/msm_iommu.c b/drivers/gpu/drm/msm/msm_iommu.c
>> index 1ab629b..6c03eda 100644
>> --- a/drivers/gpu/drm/msm/msm_iommu.c
>> +++ b/drivers/gpu/drm/msm/msm_iommu.c
>> @@ -39,6 +39,16 @@ static int msm_iommu_attach(struct msm_mmu *mmu, const char * const *names,
>> {
>> struct msm_iommu *iommu = to_msm_iommu(mmu);
>> int ret;
>> + int gpu_htw_llc = 1;
>> +
>> + /*
>> + * This allows GPU to set the bus attributes required
>> + * to use system cache on behalf of the iommu page table
>> + * walker.
>> + */
>> + if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
>> + iommu_domain_set_attr(iommu->domain,
>> + DOMAIN_ATTR_USE_UPSTREAM_HINT, &gpu_htw_llc);
>>
>> pm_runtime_get_suppliers(mmu->dev);
>> ret = iommu_attach_device(iommu->domain, mmu->dev);
>> @@ -63,6 +73,9 @@ static int msm_iommu_map(struct msm_mmu *mmu, uint64_t iova,
>> struct msm_iommu *iommu = to_msm_iommu(mmu);
>> size_t ret;
>>
>> + if (msm_mmu_has_feature(mmu, MMU_FEATURE_USE_SYSTEM_CACHE))
>> + prot |= IOMMU_USE_UPSTREAM_HINT;
>> +
>> pm_runtime_get_suppliers(mmu->dev);
>> ret = iommu_map_sg(iommu->domain, iova, sgt->sgl, sgt->nents, prot);
>> pm_runtime_put_suppliers(mmu->dev);
>> diff --git a/drivers/gpu/drm/msm/msm_mmu.h b/drivers/gpu/drm/msm/msm_mmu.h
>> index 85df78d..257bdea 100644
>> --- a/drivers/gpu/drm/msm/msm_mmu.h
>> +++ b/drivers/gpu/drm/msm/msm_mmu.h
>> @@ -30,6 +30,9 @@ struct msm_mmu_funcs {
>> void (*destroy)(struct msm_mmu *mmu);
>> };
>>
>> +/* MMU features */
>> +#define MMU_FEATURE_USE_SYSTEM_CACHE (1 << 0)
>> +
>> struct msm_mmu {
>> const struct msm_mmu_funcs *funcs;
>> struct device *dev;
>> --
>> 1.9.1
>>
>> _______________________________________________
>> Freedreno mailing list
>> Freedreno at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/freedreno
>
--
The Qualcomm Innovation Center, Inc. is a member of the Code Aurora Forum,
Linux Foundation Collaborative Project
More information about the Freedreno
mailing list