[Intel-xe] [RFC PATCH 1/1] drm/xe: Introduce function pointers for MMIO functions

Tue Jun 20 21:15:28 UTC 2023

On Sun, Jun 18, 2023 at 11:28:20AM +0300, Oded Gabbay wrote:
>On Thu, Jun 15, 2023 at 7:34 PM Matt Roper <matthew.d.roper at intel.com> wrote:
>>
>> On Thu, Jun 15, 2023 at 04:04:18PM +0300, Oded Gabbay wrote:
>> > On Thu, Jun 15, 2023 at 3:01 AM Matt Roper <matthew.d.roper at intel.com> wrote:
>> > >
>> > > On Mon, Jun 12, 2023 at 06:31:57PM +0200, Francois Dugast wrote:
>> > > > On Thu, Jun 08, 2023 at 10:35:29AM -0700, Lucas De Marchi wrote:
>> > > > > On Fri, Jun 02, 2023 at 02:25:01PM +0000, Francois Dugast wrote:
>> > > > > > A local structure of function pointers is used as a minimal hardware
>> > > > > > abstraction layer to prepare for platform independent MMIO calls.
>> > > > > >
>> > > > > > Cc: Oded Gabbay <ogabbay at kernel.org>
>> > > > > > Cc: Ofir Bitton <ofir1.bitton at intel.com>
>> > > > > > Cc: Ohad Sharabi <ohadshar at intel.com>
>> > > > > > Signed-off-by: Francois Dugast <francois.dugast at intel.com>
>> > > > > > ---
>> > > > > > drivers/gpu/drm/xe/xe_device_types.h |  3 ++
>> > > > > > drivers/gpu/drm/xe/xe_mmio.c         | 81 ++++++++++++++++++++++++++++
>> > > > > > drivers/gpu/drm/xe/xe_mmio.h         | 35 ++++++------
>> > > > > > 3 files changed, 99 insertions(+), 20 deletions(-)
>> > > > > >
>> > > > > > diff --git a/drivers/gpu/drm/xe/xe_device_types.h b/drivers/gpu/drm/xe/xe_device_types.h
>> > > > > > index 17b6b1cc5adb..3f8fd0d8129b 100644
>> > > > > > --- a/drivers/gpu/drm/xe/xe_device_types.h
>> > > > > > +++ b/drivers/gpu/drm/xe/xe_device_types.h
>> > > > > > @@ -378,6 +378,9 @@ struct xe_device {
>> > > > > >   /** @d3cold_allowed: Indicates if d3cold is a valid device state */
>> > > > > >   bool d3cold_allowed;
>> > > > > >
>> > > > > > + /** @mmio_funcs: function pointers for MMIO related functions */
>> > > > > > + const struct xe_mmio_funcs *mmio_funcs;
>> > > > > > +
>> > > > > >   /* private: */
>> > > > > >
>> > > > > > #if IS_ENABLED(CONFIG_DRM_XE_DISPLAY)
>> > > > > > diff --git a/drivers/gpu/drm/xe/xe_mmio.c b/drivers/gpu/drm/xe/xe_mmio.c
>> > > > > > index 475b14fe4356..f3d08676a77a 100644
>> > > > > > --- a/drivers/gpu/drm/xe/xe_mmio.c
>> > > > > > +++ b/drivers/gpu/drm/xe/xe_mmio.c
>> > > > > > @@ -25,6 +25,62 @@
>> > > > > >
>> > > > > > #define BAR_SIZE_SHIFT 20
>> > > > > >
>> > > > > > +static void xe_mmio_write32_device(struct xe_gt *gt,
>> > > > > > +                            struct xe_reg reg, u32 val);
>> > > > > > +static u32 xe_mmio_read32_device(struct xe_gt *gt, struct xe_reg reg);
>> > > > > > +static void xe_mmio_write64_device(struct xe_gt *gt,
>> > > > > > +                            struct xe_reg reg, u64 val);
>> > > > > > +static u64 xe_mmio_read64_device(struct xe_gt *gt, struct xe_reg reg);
>> > > > > > +
>> > > > > > +static const struct xe_mmio_funcs xe_mmio_funcs_device = {
>> > > > > > + .write32 = xe_mmio_write32_device,
>> > > > > > + .read32 = xe_mmio_read32_device,
>> > > > > > + .write64 = xe_mmio_write64_device,
>> > > > > > + .read64 = xe_mmio_read64_device,
>> > > > > > +};
>> > > > > > +
>> > > > > > +static inline void xe_mmio_write32_device(struct xe_gt *gt,
>> > > > > > +                            struct xe_reg reg, u32 val)
>> > > > > > +{
>> > > > > > + struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > +
>> > > > > > + if (reg.addr < gt->mmio.adj_limit)
>> > > > > > +         reg.addr += gt->mmio.adj_offset;
>> > > > > > +
>> > > > > > + writel(val, tile->mmio.regs + reg.addr);
>> > > > > > +}
>> > > > > > +
>> > > > > > +static inline u32 xe_mmio_read32_device(struct xe_gt *gt, struct xe_reg reg)
>> > > > > > +{
>> > > > > > + struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > +
>> > > > > > + if (reg.addr < gt->mmio.adj_limit)
>> > > > > > +         reg.addr += gt->mmio.adj_offset;
>> > > > > > +
>> > > > > > + return readl(tile->mmio.regs + reg.addr);
>> > > > > > +}
>> > > > > > +
>> > > > > > +static inline void xe_mmio_write64_device(struct xe_gt *gt,
>> > > > > > +                            struct xe_reg reg, u64 val)
>> > > > > > +{
>> > > > > > + struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > +
>> > > > > > + if (reg.addr < gt->mmio.adj_limit)
>> > > > > > +         reg.addr += gt->mmio.adj_offset;
>> > > > > > +
>> > > > > > + writeq(val, tile->mmio.regs + reg.addr);
>> > > > > > +}
>> > > > > > +
>> > > > > > +static inline u64 xe_mmio_read64_device(struct xe_gt *gt, struct xe_reg reg)
>> > > > > > +{
>> > > > > > + struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > +
>> > > > > > + if (reg.addr < gt->mmio.adj_limit)
>> > > > > > +         reg.addr += gt->mmio.adj_offset;
>> > > > > > +
>> > > > > > + return readq(tile->mmio.regs + reg.addr);
>> > > > > > +}
>> > > > > > +
>> > > > > > static int xe_set_dma_info(struct xe_device *xe)
>> > > > > > {
>> > > > > >   unsigned int mask_size = xe->info.dma_mask_size;
>> > > > > > @@ -377,6 +433,29 @@ static void mmio_fini(struct drm_device *drm, void *arg)
>> > > > > >           iounmap(xe->mem.vram.mapping);
>> > > > > > }
>> > > > > >
>> > > > > > +static void xe_mmio_set_funcs(struct xe_device *xe)
>> > > > > > +{
>> > > > > > + /* For now all platforms use the set of MMIO functions for a
>> > > > > > +  * physical device.
>> > > > > > +  */
>> > > > >
>> > > > >
>> > > > > what is "device" in this context? that seems confusing as we always ever
>> > > > > just support reading/writing to a real device (physical here may also
>> > > > > add to the confusion when thinking about SR-IOV and VFs).
>> > > > > We shouldn't add abstractions that are then never used and all platforms
>> > > > > end up using the same. Unless it's a preparation for a follow up series
>> > > > > adding the different handling.
>> > > >
>> > > > For now "device" is meant as "in opposition to simulator" but I agree
>> > > > we can find a better name. Existing platforms all use the same
>> > > > implementation but this is preparation for platforms that require a
>> > > > different implementation.
>> > >
>> > > I agree with Lucas that this doesn't really seem to be a good candidate
>> > > for a vtable; every platform uses exactly the same logic and I can't
>> > > really envision how/why this would need to change for future platforms
>> > > either, so this seems to just be adding unnecessary complexity.
>> > > Registers being accessed at some offset into a PCI BAR isn't likely to
>> > > change going forward.  On future platforms it's more likely that we'd
>> > > need changes to the part of the code that maps the MMIO BAR rather than
>> > > the code that reads/writes an offset into a mapping that's already been
>> > > setup.
>> > I agree with that for every *real hardware* platform the
>> > implementation will probably be the same.
>> > But for simulator/emulation, there is a different implementation.
>> > And even if we don't upstream that different implementation, doing
>> > this abstraction will help us upstream the rest of the driver as we
>> > minimize the differences between upstream and downstream.
>> > And helping us upstream the driver is a good enough reason imo to add
>> > this abstraction.
>>
>> Adding extra abstraction layers to upstream code that provide no
>> upstream benefit and only come into play for some private, internal-only
>> workflow is generally frowned upon in the DRM subsystem.  Unless you
>> have some kind of public, open-source usage model, adding the extra
>> complexity and clutter here is probably a no-go for upstream.
>While I ofc agree in general with this statement, I believe it does
>not apply in this case.
>
>The intention behind changing register access to be callback functions
>is to allow the Linux driver to serve as the principal entity for
>pre-silicon validation . This is an important distinction, as the aim
>is not to obscure the Linux kernel API for portability across other
>operating systems or to reinvent mechanisms already provided by the
>kernel, which I believe is the primary objection of upstream for
>adding abstraction layers.
>
>Instead, this strategy is about utilizing the inherent advantages of
>Linux for the crucial stage of pre-silicon validation. In fact,
>adopting Linux as the main vessel for this process can help leverage a
>broader open source community for problem-solving, encourage more
>uniform testing practices, and result in higher quality drivers due to
>the transparency and scrutiny inherent in open-source development.
>
>Making the upstream driver as close as possible to the internal driver
>is advantageous for both the speed and quality of the upstreaming
>process. A driver that closely mirrors the internal one will likely
>require fewer modifications, thus streamlining the upstreaming
>process. With a closely aligned internal and upstream driver, the
>benefits of any improvements, bug fixes, and other modifications made
>in the internal driver could be quickly and easily transferred to the
>upstream driver.

The problem is the side you are bending: making the upstream one follow
the internal rather than the opposite, which requires "dead
abstractions". This can be seen as something very very close to an OS
abstraction layer at the best and at the worst as a way to have hooks
enhancing the driver with "extensions" not shared.  So abstractions that
are just a layer of indirection without really abstracting anything from
the upstream point of view will always be a grey zone to avoid.

>
>In short, this change will allow us to do a continuously faster and
>better upstream process, which imho provides a substantial benifit to
>the kernel community.

We are all very aware of the benefits of having non-divergent code
bases. We are all on the same page here. As I and Matt argued, in this
specific case with mmio, a vtable doesn't seem to be the best solution
and the reason for adding it goes to a direction where we have a dead
abstraction from the upstream POV.

There are other cases in the code where a vtable is a reasonable
approach. We've been using vtable throughout i915 for page setup, pte
encoding and more heavily on the display side (see all the encoder
hooks in ).  In the display side specifically, we can't say they are silver
bullet when we are not the only stakeholder in the driver. Hooks are
added/removed/changed which sometimes even create silent conflicts
hidden behind

	if (func->foo)
		func->foo();

And the recipe to avoid that is being involved in upstream development
like we are.

>
>>
>> Also, even if you do have an unusual form of simulation/emulation that
>> doesn't behave like real hardware, are you going to have several of them
>> that all work in different manners and need unique implementations?  If
>> we only ever expect to have two code paths total (and only one of those
>> upstream), then using a vtable seems like overkill.  A very simple
>>
>>         if (special case)
>>                 return alternate_implementation();
>>
>> would be much easier to understand and maintain.
>>
>I agree that in this case (register access), as I don't foresee more
>than two implementations, we can go ahead with your suggestion.
>i.e. in the mmio access functions we will add:
>       if (sim)
>                 return alternate_implementation();
>
>If there are no objections, we can go ahead with this approach and
>send a patch upstream.

In this specific case it does seem to be the best approach, but that
would be a downstream patch, not upstream... unless we add a minimal
simulator in upstream, akin to what mesa has with drm-shim

Lucas De Marchi

>
>Oded
>
>>
>> There are lots of other places in the Xe driver that _would_ benefit
>> from per-platform vtables; we should prioritize making changes like this
>> in the areas where they provide a clear benefit.
>>
>>
>> Matt
>>
>> >
>> >
>> > Oded
>> > >
>> > > >
>> > > > >
>> > > > > +Matt as there is still (at least) one refactor planned in this area,
>> > > > > since gt is not always the proper target for the MMIOs. He was saying in
>> > > > > his earlier series about having a mmio_view or such to abstract the
>> > > > > offset and other differences between each IO range. Not sure if this
>> > > > > series would go the rigth direction, maybe we need to think in both
>> > > > > cases together.
>> > > >
>> > > > Matt, would this series block the refactor mentioned by Lucas?
>> > > >
>> > > > In general, are there objections to introducing functions pointers for
>> > > > MMIO functions (extended to all of them, as suggested by Ohad)?
>> > >
>> > > It probably makes more sense to do vtable conversion on other parts of
>> > > the driver where we already have different implementations per platform
>> > > and where we already know that design makes sense.  We can always come
>> > > back and do this to the MMIO functions later once there are actually
>> > > multiple implementations, but it doesn't seem to serve any purpose right
>> > > now.
>> > >
>> > >
>> > > Matt
>> > >
>> > > >
>> > > > Thanks,
>> > > > Francois
>> > > >
>> > > > >
>> > > > >
>> > > > > Lucas De Marchi
>> > > > >
>> > > > > > + switch (xe->info.platform) {
>> > > > > > + case XE_TIGERLAKE:
>> > > > > > + case XE_ROCKETLAKE:
>> > > > > > + case XE_ALDERLAKE_S:
>> > > > > > + case XE_ALDERLAKE_P:
>> > > > > > + case XE_ALDERLAKE_N:
>> > > > > > + case XE_DG1:
>> > > > > > + case XE_DG2:
>> > > > > > + case XE_PVC:
>> > > > > > + case XE_METEORLAKE:
>> > > > > > +         xe->mmio_funcs = &xe_mmio_funcs_device;
>> > > > > > +         break;
>> > > > > > + default:
>> > > > > > +         DRM_ERROR("Unsupported platform\n");
>> > > > > > +         break;
>> > > > > > + }
>> > > > > > +}
>> > > > > > +
>> > > > > > int xe_mmio_init(struct xe_device *xe)
>> > > > > > {
>> > > > > >   struct xe_tile *root_tile = xe_device_get_root_tile(xe);
>> > > > > > @@ -384,6 +463,8 @@ int xe_mmio_init(struct xe_device *xe)
>> > > > > >   const int mmio_bar = 0;
>> > > > > >   int err;
>> > > > > >
>> > > > > > + xe_mmio_set_funcs(xe);
>> > > > > > +
>> > > > > >   /*
>> > > > > >    * Map the first 16MB of th BAR, which includes the registers (0-4MB),
>> > > > > >    * reserved space (4MB-8MB), and GGTT (8MB-16MB) for a single tile.
>> > > > > > diff --git a/drivers/gpu/drm/xe/xe_mmio.h b/drivers/gpu/drm/xe/xe_mmio.h
>> > > > > > index 3c547d78afba..80ce9de7aac4 100644
>> > > > > > --- a/drivers/gpu/drm/xe/xe_mmio.h
>> > > > > > +++ b/drivers/gpu/drm/xe/xe_mmio.h
>> > > > > > @@ -19,6 +19,13 @@ struct xe_device;
>> > > > > >
>> > > > > > #define GEN12_LMEM_BAR            2
>> > > > > >
>> > > > > > +struct xe_mmio_funcs {
>> > > > > > + u32 (*read32)(struct xe_gt *gt, struct xe_reg reg);
>> > > > > > + u64 (*read64)(struct xe_gt *gt, struct xe_reg reg);
>> > > > > > + void (*write32)(struct xe_gt *gt, struct xe_reg reg, u32 val);
>> > > > > > + void (*write64)(struct xe_gt *gt, struct xe_reg reg, u64 val);
>> > > > > > +};
>> > > > > > +
>> > > > > > int xe_mmio_init(struct xe_device *xe);
>> > > > > >
>> > > > > > static inline u8 xe_mmio_read8(struct xe_gt *gt, struct xe_reg reg)
>> > > > > > @@ -34,22 +41,16 @@ static inline u8 xe_mmio_read8(struct xe_gt *gt, struct xe_reg reg)
>> > > > > > static inline void xe_mmio_write32(struct xe_gt *gt,
>> > > > > >                              struct xe_reg reg, u32 val)
>> > > > > > {
>> > > > > > - struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > -
>> > > > > > - if (reg.addr < gt->mmio.adj_limit)
>> > > > > > -         reg.addr += gt->mmio.adj_offset;
>> > > > > > + struct xe_device *xe = gt_to_xe(gt);
>> > > > > >
>> > > > > > - writel(val, tile->mmio.regs + reg.addr);
>> > > > > > + xe->mmio_funcs->write32(gt, reg, val);
>> > > > > > }
>> > > > > >
>> > > > > > static inline u32 xe_mmio_read32(struct xe_gt *gt, struct xe_reg reg)
>> > > > > > {
>> > > > > > - struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > -
>> > > > > > - if (reg.addr < gt->mmio.adj_limit)
>> > > > > > -         reg.addr += gt->mmio.adj_offset;
>> > > > > > + struct xe_device *xe = gt_to_xe(gt);
>> > > > > >
>> > > > > > - return readl(tile->mmio.regs + reg.addr);
>> > > > > > + return xe->mmio_funcs->read32(gt, reg);
>> > > > > > }
>> > > > > >
>> > > > > > static inline u32 xe_mmio_rmw32(struct xe_gt *gt, struct xe_reg reg, u32 clr,
>> > > > > > @@ -67,22 +68,16 @@ static inline u32 xe_mmio_rmw32(struct xe_gt *gt, struct xe_reg reg, u32 clr,
>> > > > > > static inline void xe_mmio_write64(struct xe_gt *gt,
>> > > > > >                              struct xe_reg reg, u64 val)
>> > > > > > {
>> > > > > > - struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > -
>> > > > > > - if (reg.addr < gt->mmio.adj_limit)
>> > > > > > -         reg.addr += gt->mmio.adj_offset;
>> > > > > > + struct xe_device *xe = gt_to_xe(gt);
>> > > > > >
>> > > > > > - writeq(val, tile->mmio.regs + reg.addr);
>> > > > > > + xe->mmio_funcs->write64(gt, reg, val);
>> > > > > > }
>> > > > > >
>> > > > > > static inline u64 xe_mmio_read64(struct xe_gt *gt, struct xe_reg reg)
>> > > > > > {
>> > > > > > - struct xe_tile *tile = gt_to_tile(gt);
>> > > > > > -
>> > > > > > - if (reg.addr < gt->mmio.adj_limit)
>> > > > > > -         reg.addr += gt->mmio.adj_offset;
>> > > > > > + struct xe_device *xe = gt_to_xe(gt);
>> > > > > >
>> > > > > > - return readq(tile->mmio.regs + reg.addr);
>> > > > > > + return xe->mmio_funcs->read64(gt, reg);
>> > > > > > }
>> > > > > >
>> > > > > > static inline int xe_mmio_write32_and_verify(struct xe_gt *gt,
>> > > > > > --
>> > > > > > 2.34.1
>> > > > > >
>> > >
>> > > --
>> > > Matt Roper
>> > > Graphics Software Engineer
>> > > Linux GPU Platform Enablement
>> > > Intel Corporation
>>
>> --
>> Matt Roper
>> Graphics Software Engineer
>> Linux GPU Platform Enablement
>> Intel Corporation