[RESEND PATCH v2 2/2] drm/panfrost: Expose perf counters through debugfs

Boris Brezillon boris.brezillon at collabora.com
Tue May 14 13:55:03 UTC 2019


Hi Rob,

On Tue, 14 May 2019 08:31:29 -0500
Rob Herring <robh+dt at kernel.org> wrote:


> > +
> > +static void panfrost_perfcnt_setup(struct panfrost_device *pfdev)
> > +{
> > +       u32 cfg;
> > +
> > +       /*
> > +        * Always use address space 0 for now.
> > +        * FIXME: this needs to be updated when we start using different
> > +        * address space.
> > +        */
> > +       cfg = GPU_PERFCNT_CFG_AS(0);
> > +       if (panfrost_model_cmp(pfdev, 0x1000) >= 0)  
> 
> You've got a couple of these. Perhaps we should add either a
> model_is_bifrost() helper or an is_bifrost variable to use.

Oops, Alyssa already mentioned that in her review and I forgot to
address it in my v2. I'll add such a helper in v3.
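
Something along these lines (minimal sketch, the helper name is just my
guess):

static inline bool panfrost_model_is_bifrost(struct panfrost_device *pfdev)
{
        /* Bifrost GPU IDs start at 0x1000. */
        return panfrost_model_cmp(pfdev, 0x1000) >= 0;
}

and the checks above would then become panfrost_model_is_bifrost(pfdev).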

> 
> > +               cfg |= GPU_PERFCNT_CFG_SETSEL(1);
> > +
> > +       gpu_write(pfdev, GPU_PERFCNT_CFG,
> > +                 cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
> > +
> > +       if (!pfdev->perfcnt->enabled)
> > +               return;
> > +
> > +       gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0xffffffff);
> > +       gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0xffffffff);
> > +       gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0xffffffff);
> > +
> > +       /*
> > +        * Due to PRLAM-8186 we need to disable the Tiler before we enable HW
> > +        * counters.  
> 
> Seems like you could just always apply the work-around? It doesn't
> look like it would be much different.

Yes, I thought about doing that too, but I decided to keep the same
logic as mali_kbase just to be safe.
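
For the record, the unconditional variant would just be (sketch based on
the code above):

        gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
        gpu_write(pfdev, GPU_PERFCNT_CFG,
                  cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
        gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);

which is indeed not much different, but I'd rather not deviate from what
mali_kbase does here.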

> 
> > +        */
> > +       if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
> > +               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
> > +       else
> > +               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
> > +
> > +       gpu_write(pfdev, GPU_PERFCNT_CFG,
> > +                 cfg | GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_MANUAL));
> > +
> > +       if (panfrost_has_hw_issue(pfdev, HW_ISSUE_8186))
> > +               gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0xffffffff);
> > +}
> > +
> > +static int panfrost_perfcnt_dump(struct panfrost_device *pfdev)
> > +{
> > +       u64 gpuva;
> > +       int ret;
> > +
> > +       reinit_completion(&pfdev->perfcnt->dump_comp);
> > +       gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;
> > +       gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
> > +       gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
> > +       gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
> > +       ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp,
> > +                                                       msecs_to_jiffies(1000));
> > +       if (!ret)
> > +               ret = -ETIMEDOUT;
> > +       else if (ret > 0)
> > +               ret = 0;
> > +
> > +       return ret;
> > +}
> > +
> > +void panfrost_perfcnt_resume(struct panfrost_device *pfdev)  
> 
> No suspend handling needed?

Nope, unless we care about dumping the counters before suspending the
GPU, but given that such a dump would be overwritten by the next one, I
don't see the point. It would make sense if "trigger dump" and "read
dumped values" were two separate operations (a behavior I can easily
implement, BTW).
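
That split would look something like this (untested sketch, derived from
the panfrost_perfcnt_dump() above):

static void panfrost_perfcnt_dump_trigger(struct panfrost_device *pfdev)
{
        u64 gpuva = pfdev->perfcnt->bo->node.start << PAGE_SHIFT;

        reinit_completion(&pfdev->perfcnt->dump_comp);
        gpu_write(pfdev, GPU_PERFCNT_BASE_LO, gpuva);
        gpu_write(pfdev, GPU_PERFCNT_BASE_HI, gpuva >> 32);
        gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_SAMPLE);
}

static int panfrost_perfcnt_dump_wait(struct panfrost_device *pfdev)
{
        int ret;

        ret = wait_for_completion_interruptible_timeout(&pfdev->perfcnt->dump_comp,
                                                        msecs_to_jiffies(1000));
        if (!ret)
                return -ETIMEDOUT;

        return ret < 0 ? ret : 0;
}

so we could trigger a dump before suspending and let the "read" side
wait for it, if we ever decide we need that.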

> 
> > +{
> > +       if (pfdev->perfcnt)
> > +               panfrost_perfcnt_setup(pfdev);
> > +}
> > +


> > +int panfrost_perfcnt_init(struct panfrost_device *pfdev)
> > +{
> > +       struct panfrost_perfcnt *perfcnt;
> > +       struct drm_gem_shmem_object *bo;
> > +       size_t size;
> > +       u32 status;
> > +       int ret;
> > +
> > +       if (panfrost_has_hw_feature(pfdev, HW_FEATURE_V4)) {
> > +               unsigned int ncoregroups;
> > +
> > +               ncoregroups = hweight64(pfdev->features.l2_present);
> > +               size = ncoregroups * BLOCKS_PER_COREGROUP *
> > +                      COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
> > +       } else {
> > +               unsigned int nl2c, ncores;
> > +
> > +               /*
> > +                * TODO: define a macro to extract the number of l2 caches from
> > +                * mem_features.
> > +                */
> > +               nl2c = ((pfdev->features.mem_features >> 8) & GENMASK(3, 0)) + 1;
> > +
> > +               /*
> > +                * The ARM driver is grouping cores per core group and then
> > +                * only using the number of cores in group 0 to calculate the
> > +                * size. Not sure why this is done like that, but I guess
> > +                * shader_present will only show cores in the first group
> > +                * anyway.
> > +                */
> > +               ncores = hweight64(pfdev->features.shader_present);
> > +
> > +               /*
> > +                * There's always one JM and one Tiler block, hence the '+ 2'
> > +                * here.
> > +                */
> > +               size = (nl2c + ncores + 2) *
> > +                      COUNTERS_PER_BLOCK * BYTES_PER_COUNTER;
> > +       }
> > +
> > +       perfcnt = devm_kzalloc(pfdev->dev, sizeof(*perfcnt), GFP_KERNEL);
> > +       if (!perfcnt)
> > +               return -ENOMEM;
> > +
> > +       bo = drm_gem_shmem_create(pfdev->ddev, size);
> > +       if (IS_ERR(bo))
> > +               return PTR_ERR(bo);
> > +
> > +       perfcnt->bo = to_panfrost_bo(&bo->base);
> > +       perfcnt->bosize = size;
> > +
> > +       /*
> > +        * We always use the same buffer, so let's map it once and keep it
> > +        * mapped until the driver is unloaded. This might be a problem if
> > +        * we start using different AS and the perfcnt BO is not mapped at
> > +        * the same GPU virtual address.  
> 
> Per FD address space is next up on my list, so we'll need to sort this
> out soon. As this interface is not tied to any FD, that would mean it
> will need its own address space (I assume it doesn't need to be
> shared?) and would reduce the number available to other clients
> (though presumably they will use some sort of LRU swapping).
> 
> Maybe Emil's suggestion on a module param would be sufficient to go
> ahead and make this an ioctl interface instead.

Okay. I wanted to stay away from ioctl()s since this feature is supposed
to be replaced by something else eventually, but if you think it's okay
to deprecate those ioctl()s later on, I'm fine with this option.
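
A rough idea of what such an ioctl()-based interface could look like
(hypothetical sketch, struct and field names are just placeholders):

struct drm_panfrost_perfcnt_enable {
        __u32 enable;
        __u32 pad;
};

struct drm_panfrost_perfcnt_dump {
        /* user pointer to a buffer at least perfcnt->bosize bytes big */
        __u64 buf_ptr;
};

with the dump ioctl triggering a sample and copying the counter values
into the user buffer.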

> 
> > +        */
> > +       ret = panfrost_mmu_map(perfcnt->bo);
> > +       if (ret)
> > +               goto err_put_bo;
> > +
> > +       /* Disable everything. */
> > +       gpu_write(pfdev, GPU_PERFCNT_CFG,
> > +                 GPU_PERFCNT_CFG_AS(0) |
> > +                 GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF) |
> > +                 (panfrost_model_cmp(pfdev, 0x1000) >= 0 ?
> > +                  GPU_PERFCNT_CFG_SETSEL(1) : 0));
> > +       gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
> > +       gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
> > +       gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
> > +       gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);
> > +
> > +       perfcnt->buf = drm_gem_vmap(&bo->base);
> > +       if (IS_ERR(perfcnt->buf)) {
> > +               ret = PTR_ERR(perfcnt->buf);
> > +               goto err_put_bo;
> > +       }
> > +
> > +       init_completion(&perfcnt->dump_comp);
> > +       mutex_init(&perfcnt->lock);
> > +       pfdev->perfcnt = perfcnt;
> > +
> > +       /*
> > +        * Invalidate the cache and clear the counters to start from a fresh
> > +        * state.
> > +        */
> > +       gpu_write(pfdev, GPU_INT_MASK, 0);
> > +       gpu_write(pfdev, GPU_INT_CLEAR, GPU_IRQ_CLEAN_CACHES_COMPLETED);
> > +
> > +       gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
> > +       gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
> > +       ret = readl_relaxed_poll_timeout(pfdev->iomem + GPU_INT_RAWSTAT,
> > +                                        status,
> > +                                        status &
> > +                                        GPU_IRQ_CLEAN_CACHES_COMPLETED,
> > +                                        100, 10000);
> > +       if (ret)
> > +               goto err_gem_vunmap;
> > +
> > +       gpu_write(pfdev, GPU_INT_MASK, GPU_IRQ_MASK_ALL);  
> 
> We already setup GPU_INT_* registers elsewhere. We shouldn't have it
> in 2 places. This was why the register definitions were local to the
> .c files...

This was needed to prevent the irq handler from calling the
sample/clean_cache_done() functions. Now that those functions just call
complete() on the ->dump_comp field, it should be safe to get rid of
that trick and use wait_for_completion_timeout() instead of
readl_relaxed_poll_timeout().
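
In panfrost_perfcnt_init() that would become something like this
(sketch, assuming a new clean_caches_comp completion that the GPU irq
handler completes on GPU_IRQ_CLEAN_CACHES_COMPLETED):

        /* clean_caches_comp would be init_completion()'d next to dump_comp. */
        gpu_write(pfdev, GPU_CMD, GPU_CMD_PERFCNT_CLEAR);
        gpu_write(pfdev, GPU_CMD, GPU_CMD_CLEAN_INV_CACHES);
        if (!wait_for_completion_timeout(&perfcnt->clean_caches_comp,
                                         msecs_to_jiffies(100))) {
                ret = -ETIMEDOUT;
                goto err_gem_vunmap;
        }

and the GPU_INT_MASK/GPU_INT_CLEAR fiddling goes away.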

> 
> > +
> > +       return 0;
> > +
> > +err_gem_vunmap:
> > +       drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
> > +
> > +err_put_bo:
> > +       drm_gem_object_put_unlocked(&bo->base);
> > +       return ret;
> > +}
> > +
> > +void panfrost_perfcnt_fini(struct panfrost_device *pfdev)
> > +{  
> 
> Need to do some h/w clean-up?

I'll add code to disable the perf monitor.
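
Probably just mirroring the "Disable everything" block from init(), e.g.
(sketch):

        gpu_write(pfdev, GPU_PERFCNT_CFG,
                  GPU_PERFCNT_CFG_AS(0) |
                  GPU_PERFCNT_CFG_MODE(GPU_PERFCNT_CFG_MODE_OFF));
        gpu_write(pfdev, GPU_PRFCNT_JM_EN, 0);
        gpu_write(pfdev, GPU_PRFCNT_SHADER_EN, 0);
        gpu_write(pfdev, GPU_PRFCNT_MMU_L2_EN, 0);
        gpu_write(pfdev, GPU_PRFCNT_TILER_EN, 0);

before unmapping and releasing the BO.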

> 
> > +       drm_gem_vunmap(&pfdev->perfcnt->bo->base.base, pfdev->perfcnt->buf);
> > +       drm_gem_object_put_unlocked(&pfdev->perfcnt->bo->base.base);
> > +}


Thanks for the review.

Regards,

Boris

