[Intel-gfx] [PATCH v2] drm/i915/bxt: Broxton decoupled MMIO

Paneri, Praveen praveen.paneri at intel.com
Mon Sep 26 11:08:27 UTC 2016



On 9/23/2016 3:19 PM, Tvrtko Ursulin wrote:
>
> Hi,
>
> On 19/09/2016 18:15, Praveen Paneri wrote:
>
>> Decoupled MMIO is an alternative way to access forcewake domain
>> registers, which requires less cycles and avoids frequent software
>> forcewake.
>
> I would like to see a sentence or two on how it works here.
Sure
>
>> v2:
>> - Moved platform check out of the function and got rid of duplicate
>> functions to find out decoupled power domain.
>> - Added a check for forcewake already held and skipped decoupled access
>> - Skipped writing 64 bit register through decoupled MMIO
>>
>> Signed-off-by: Zhe Wang <zhe1.wang at intel.com>
>> Signed-off-by: Damien Lespiau <damien.lespiau at intel.com>
>> Signed-off-by: Ankitprasad Sharma <ankitprasad.r.sharma at intel.com>
>> Signed-off-by: Praveen Paneri <praveen.paneri at intel.com>
>
> How come this list of SoBs?
Will clean it up
>
>> ---
>>   drivers/gpu/drm/i915/i915_drv.h     |  11 ++++
>>   drivers/gpu/drm/i915/i915_reg.h     |   7 +++
>>   drivers/gpu/drm/i915/intel_uncore.c | 118
>> +++++++++++++++++++++++++++++++++++-
>>   3 files changed, 134 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/gpu/drm/i915/i915_drv.h
>> b/drivers/gpu/drm/i915/i915_drv.h
>> index 4dd307e..065247b 100644
>> --- a/drivers/gpu/drm/i915/i915_drv.h
>> +++ b/drivers/gpu/drm/i915/i915_drv.h
>> @@ -558,6 +558,16 @@ enum forcewake_domains {
>>   #define FW_REG_READ  (1)
>>   #define FW_REG_WRITE (2)
>>   +enum power_domains {
>
> If this is specific to decouples mmio maybe call it
> decoupled_power_domains?
sure
>
>> +    GEN9_DECOUPLED_PD_BLITTER = 0,
>> +    GEN9_DECOUPLED_PD_RENDER,
>> +    GEN9_DECOUPLED_PD_MEDIA,
>> +    GEN9_DECOUPLED_PD_ALL
>> +};
>> +
>> +#define GEN9_DECOUPLED_OP_WRITE (0)
>> +#define GEN9_DECOUPLED_OP_READ (1)
>
> Would it be more consistent to make these ones enums as well?
yes, will do that
>
>> +
>>   enum forcewake_domains
>>   intel_uncore_forcewake_for_reg(struct drm_i915_private *dev_priv,
>>                      i915_reg_t reg, unsigned int op);
>> @@ -2854,6 +2864,7 @@ struct drm_i915_cmd_table {
>>   #define GT_FREQUENCY_MULTIPLIER 50
>>   #define GEN9_FREQ_SCALER 3
>>   +#define HAS_DECOUPLED_MMIO(dev_priv) (IS_BROXTON(dev_priv) &&
>> IS_BXT_REVID(dev_priv, BXT_REVID_C0, REVID_FOREVER))
>
> There is a recent patch series from Carlos Santa which moved all these
> type of things to device info. So I think you have to make this
> compliant with that new style.
I looked into it. Could you suggest where can I add the check for BXT C0 
revision?
Will this be okay
#define HAS_DECOUPLED_MMIO(dev)   (INTEL_INFO(dev)->has_decoupled_mmio 
&& IS_BXT_REVID(to_i915(dev), BXT_REVID_C0, REVID_FOREVER))
>
>>   #include "i915_trace.h"
>>     static inline bool intel_scanout_needs_vtd_wa(struct
>> drm_i915_private *dev_priv)
>> diff --git a/drivers/gpu/drm/i915/i915_reg.h
>> b/drivers/gpu/drm/i915/i915_reg.h
>> index 70d9616..be59803 100644
>> --- a/drivers/gpu/drm/i915/i915_reg.h
>> +++ b/drivers/gpu/drm/i915/i915_reg.h
>> @@ -7394,6 +7394,13 @@ enum {
>>   #define  SKL_FUSE_PG1_DIST_STATUS              (1<<26)
>>   #define  SKL_FUSE_PG2_DIST_STATUS              (1<<25)
>>   +/* Decoupled MMIO register pair for kernel driver */
>> +#define GEN9_DECOUPLED_REG0_DW0            _MMIO(0xF00)
>> +#define GEN9_DECOUPLED_REG0_DW1            _MMIO(0xF04)
>> +#define GEN9_DECOUPLED_DW1_GO            (1<<31)
>> +#define GEN9_DECOUPLED_PD_SHIFT            28
>> +#define GEN9_DECOUPLED_OP_SHIFT            24
>> +
>>   /* Per-pipe DDI Function Control */
>>   #define _TRANS_DDI_FUNC_CTL_A        0x60400
>>   #define _TRANS_DDI_FUNC_CTL_B        0x61400
>> diff --git a/drivers/gpu/drm/i915/intel_uncore.c
>> b/drivers/gpu/drm/i915/intel_uncore.c
>> index a9b6c93..06fffba 100644
>> --- a/drivers/gpu/drm/i915/intel_uncore.c
>> +++ b/drivers/gpu/drm/i915/intel_uncore.c
>> @@ -816,6 +816,42 @@ unclaimed_reg_debug(struct drm_i915_private
>> *dev_priv,
>>       __unclaimed_reg_debug(dev_priv, reg, read, before);
>>   }
>>   +/*
>> + * Decoupled MMIO access for only 1 DWORD
>> + */
>> +static void __gen9_decoupled_mmio_access(struct drm_i915_private
>> *dev_priv,
>> +                     uint32_t reg, u32 *ptr_data,
>
> Seeing uint32_t and u32 on the same line looks a bit untidy to me. I
> think you should only use one and that in kernel u32 is preferred.
sure
>
>> +                     enum forcewake_domains fw_engine, int operation)
>> +{
>> +    enum power_domains pd_engine;
>> +    u32 ctrl_reg_data = 0;
>> +
>> +    if (operation == GEN9_DECOUPLED_OP_WRITE)
>> +        __raw_i915_write32(dev_priv,
>> +                GEN9_DECOUPLED_REG0_DW0,
>> +                *ptr_data);
>> +
>> +    pd_engine = (fw_engine == (FORCEWAKE_RENDER ||
>> FORCEWAKE_BLITTER)) ? \
>> +             !(fw_engine >> 1) : (fw_engine >> 1); \
>
> Feels that look up table would be better.
>
>> +
>> +    ctrl_reg_data |= reg;
>> +    ctrl_reg_data |= (operation << GEN9_DECOUPLED_OP_SHIFT);
>> +    ctrl_reg_data |= (pd_engine << GEN9_DECOUPLED_PD_SHIFT);
>> +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
>> ctrl_reg_data);
>> +
>> +    ctrl_reg_data |= GEN9_DECOUPLED_DW1_GO;
>> +    __raw_i915_write32(dev_priv, GEN9_DECOUPLED_REG0_DW1,
>> ctrl_reg_data);
>> +
>> +    if (wait_for_atomic((__raw_i915_read32(dev_priv,
>> +            GEN9_DECOUPLED_REG0_DW1) & GEN9_DECOUPLED_DW1_GO) == 0,
>> +            FORCEWAKE_ACK_TIMEOUT_MS))
>> +        DRM_ERROR("Decoupled MMIO wait timed out\n");
>> +
>
> Is FORCEWAKE_ACK_TIMEOUT_MS correct/relevant for decoupled MMIO? It is
> potentially quite a long atomic wait.
This is max wait time. We might not wait for that long but I can still 
check on it.
>
> Would it be worth returning some fixed value in the case of a timeout?
> Might be better than random stuff? (applies in the 64-bit read case)
This is same as forcewake implementation. If we change it, what would be 
the appropriate fixed value to be returned?
>
>> +    if (operation == GEN9_DECOUPLED_OP_READ)
>> +        *ptr_data = __raw_i915_read32(dev_priv,
>> +                GEN9_DECOUPLED_REG0_DW0);
>> +}
>> +
>>   #define GEN2_READ_HEADER(x) \
>>       u##x val = 0; \
>>       assert_rpm_wakelock_held(dev_priv);
>> @@ -892,6 +928,20 @@ static inline void __force_wake_auto(struct
>> drm_i915_private *dev_priv,
>>           dev_priv->uncore.funcs.force_wake_get(dev_priv, fw_domains);
>>   }
>>   +static inline bool __is_forcewake_active(struct drm_i915_private
>> *dev_priv,
>> +                     enum forcewake_domains fw_domains)
>> +{
>> +    struct intel_uncore_forcewake_domain *domain;
>> +
>> +    /* Ideally GCC would be constant-fold and eliminate this loop */
>> +    for_each_fw_domain_masked(domain, fw_domains, dev_priv) {
>> +        if (domain->wake_count)
>> +            fw_domains &= ~domain->mask;
>> +    }
>> +
>> +    return fw_domains ? 0 : 1;
>> +}
>> +
>>   #define __gen6_read(x) \
>>   static u##x \
>>   gen6_read##x(struct drm_i915_private *dev_priv, i915_reg_t reg, bool
>> trace) { \
>> @@ -940,6 +990,37 @@ gen9_read##x(struct drm_i915_private *dev_priv,
>> i915_reg_t reg, bool trace) { \
>>       GEN6_READ_FOOTER; \
>>   }
>>   +#define __gen9_decoupled_read(x) \
>> +static u##x \
>> +gen9_decoupled_read##x(struct drm_i915_private *dev_priv, i915_reg_t
>> reg, bool trace) { \
>> +    enum forcewake_domains fw_engine; \
>
> fw_engines
>
>> +    GEN6_READ_HEADER(x); \
>> +    fw_engine = __gen9_reg_read_fw_domains(offset); \
>> +    if (fw_engine && x%32 == 0) { \
>> +        if (__is_forcewake_active(dev_priv, fw_engine)) \
>> +            __raw_i915_write##x(dev_priv, reg, val); \
>
> Write in the read macro, I don't understand!?
typo, I will fix it.
>
> Also, would a single mmio read call be possible, something like below?
>
> if (x % 32 || !fw_engines || __is_forcewake_active()) {
>     if (fw_engines)
>         __force_wake_auto
>      __raw_i915_read
> } else {
>     ... decoupled mmio loop ...
> }
>
> I might have made an oversight, no guarantees that I haven't. :)
Looks fine. Will test this
>
>> +        else { \
>> +            unsigned i; \
>> +            u32 *ptr_data = (u32 *) &val; \
>
> Hm, curios if taking the address of val causes some issues for code
> generation. I am not set up yet to try it out myself, but would be
> interesting to see the difference between your approach and the one
> where reads are done via return value and not pointer writes. Even if it
> means refactoring the common accessor a bit.
>
>> +            for (i = 0; i < x/32; i++) \
>
> Could you check if for the 64-bit read compiler manages to unroll this
> loop?
>
>> +                __gen9_decoupled_mmio_access(dev_priv, \
>> +                        (offset + i*4), \
>
> Hopefully compiler manages to optimize, but please check and in case it
> doesn't consider offsett += sizeof(u32) in the for statement as an
> alternative.
Will check the assembly and let you know how compiler generates the code.
>
>> +                        ptr_data + i, \
>> +                        fw_engine, \
>> +                        GEN9_DECOUPLED_OP_READ); \
>> +        } \
>> +    } else { \
>> +        if (fw_engine) \
>> +            __force_wake_auto(dev_priv, fw_engine); \
>> +        val = __raw_i915_read##x(dev_priv, reg); \
>> +    } \
>> +    GEN6_READ_FOOTER; \
>> +}
>> +
>> +__gen9_decoupled_read(8)
>> +__gen9_decoupled_read(16)
>
> What is the value of defining 8 and 16-bit version of those when they
> won't use the decoupled mmio path at all? Wouldn't it be better to just
> assign a mix of gen9 mmio functions in that case?
Will fix it as per below suggestion.
>
>> +__gen9_decoupled_read(32)
>> +__gen9_decoupled_read(64)
>>   __gen9_read(8)
>>   __gen9_read(16)
>>   __gen9_read(32)
>> @@ -1107,6 +1188,34 @@ gen9_write##x(struct drm_i915_private
>> *dev_priv, i915_reg_t reg, u##x val, \
>>       GEN6_WRITE_FOOTER; \
>>   }
>>   +#define __gen9_decoupled_write(x) \
>> +static void \
>> +gen9_decoupled_write##x(struct drm_i915_private *dev_priv, i915_reg_t
>> reg, u##x val, \
>> +        bool trace) { \
>> +    enum forcewake_domains fw_engine; \
>> +    GEN6_WRITE_HEADER; \
>> +    fw_engine = __gen9_reg_write_fw_domains(offset); \
>> +    if (fw_engine && x == 32) { \
>> +        u32 *ptr_data = (u32 *) &val; \
>> +        if (__is_forcewake_active(dev_priv, fw_engine)) \
>> +            __raw_i915_write##x(dev_priv, reg, val); \
>> +        else \
>> +            __gen9_decoupled_mmio_access(dev_priv, \
>> +                offset, \
>> +                ptr_data, \
>> +                fw_engine, \
>> +                GEN9_DECOUPLED_OP_WRITE); \
>> +    } else { \
>> +        if (fw_engine) \
>> +            __force_wake_auto(dev_priv, fw_engine); \
>> +        __raw_i915_write##x(dev_priv, reg, val); \
>> +    } \
>> +    GEN6_WRITE_FOOTER; \
>> +}
>> +
>> +__gen9_decoupled_write(8)
>> +__gen9_decoupled_write(16)
>
> All same comments as for the read block.
>
>> +__gen9_decoupled_write(32)
>>   __gen9_write(8)
>>   __gen9_write(16)
>>   __gen9_write(32)
>> @@ -1328,8 +1437,13 @@ void intel_uncore_init(struct drm_i915_private
>> *dev_priv)
>>       switch (INTEL_INFO(dev_priv)->gen) {
>>       default:
>>       case 9:
>> -        ASSIGN_WRITE_MMIO_VFUNCS(gen9);
>> -        ASSIGN_READ_MMIO_VFUNCS(gen9);
>> +        if (HAS_DECOUPLED_MMIO(dev_priv)) {
>> +            ASSIGN_WRITE_MMIO_VFUNCS(gen9_decoupled);
>> +            ASSIGN_READ_MMIO_VFUNCS(gen9_decoupled);
>> +        } else {
>> +            ASSIGN_WRITE_MMIO_VFUNCS(gen9);
>> +            ASSIGN_READ_MMIO_VFUNCS(gen9);
>> +        }
>
> So with the suggestion from above this would have to be open coded to
> only override 32 and 64 bit ops with decoupled versions, but I think
> that is better than having duplicated code compiled in the module.
>
>>           break;
>>       case 8:
>>           if (IS_CHERRYVIEW(dev_priv)) {
>
> Regards,
>
> Tvrtko
>
Thanks,
Praveen


More information about the Intel-gfx mailing list