<html><head>
<meta http-equiv="Content-Type" content="text/html; charset=utf-8">
  </head>
  <body>
    <p><br>
    </p>
    <div class="moz-cite-prefix">On 2023-07-07 13:15, Felix Kuehling
      wrote:<br>
    </div>
    <blockquote type="cite" cite="mid:2f659574-3ef8-fd74-33a5-9f7c7dac7a60@amd.com">On
      2023-07-07 11:49, Philip Yang wrote:
      <br>
      <blockquote type="cite">Retry faults are delegated to soft IH ring
        and then processed by
        <br>
        deferred worker. Current soft IH ring size PAGE_SIZE can store
        128
        <br>
        entries, which may overflow and drop retry faults, causes HW
        stucks
        <br>
        because the retry fault is not recovered.
        <br>
        <br>
        Increase soft IH ring size to 8KB, enough to store 256 CAM
        entries
        <br>
        because we clear the CAM entry after handling the retry fault
        from soft
        <br>
        ring.
        <br>
        <br>
        Define macro IH_RING_SIZE and IH_SW_RING_SIZE to remove
        duplicate
        <br>
        constant.
        <br>
        <br>
        Show warning message if soft IH ring overflows because this
        should not
        <br>
        happen.
        <br>
      </blockquote>
      <br>
      It would indicate a problem with the CAM or it could happen on
      older GPUs that don't have a CAM. See below.
      <br>
      <br>
      <br>
      <blockquote type="cite">
        <br>
        Signed-off-by: Philip Yang <a class="moz-txt-link-rfc2396E" href="mailto:Philip.Yang@amd.com"><Philip.Yang@amd.com></a>
        <br>
        ---
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c  | 8 ++++++--
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h  | 7 +++++--
        <br>
          drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c | 2 +-
        <br>
          drivers/gpu/drm/amd/amdgpu/ih_v6_0.c    | 4 ++--
        <br>
          drivers/gpu/drm/amd/amdgpu/navi10_ih.c  | 4 ++--
        <br>
          drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 4 ++--
        <br>
          drivers/gpu/drm/amd/amdgpu/vega20_ih.c  | 4 ++--
        <br>
          7 files changed, 20 insertions(+), 13 deletions(-)
        <br>
        <br>
        diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
        <br>
        index fceb3b384955..51a0dbd2358a 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.c
        <br>
        @@ -138,6 +138,7 @@ void amdgpu_ih_ring_fini(struct
        amdgpu_device *adev, struct amdgpu_ih_ring *ih)
        <br>
          /**
        <br>
           * amdgpu_ih_ring_write - write IV to the ring buffer
        <br>
           *
        <br>
        + * @adev: amdgpu_device pointer
        <br>
           * @ih: ih ring to write to
        <br>
           * @iv: the iv to write
        <br>
           * @num_dw: size of the iv in dw
        <br>
        @@ -145,8 +146,8 @@ void amdgpu_ih_ring_fini(struct
        amdgpu_device *adev, struct amdgpu_ih_ring *ih)
        <br>
           * Writes an IV to the ring buffer using the CPU and increment
        the wptr.
        <br>
           * Used for testing and delegating IVs to a software ring.
        <br>
           */
        <br>
        -void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const
        uint32_t *iv,
        <br>
        -              unsigned int num_dw)
        <br>
        +void amdgpu_ih_ring_write(struct amdgpu_device *adev, struct
        amdgpu_ih_ring *ih,
        <br>
        +              const uint32_t *iv, unsigned int num_dw)
        <br>
          {
        <br>
              uint32_t wptr = le32_to_cpu(*ih->wptr_cpu) >> 2;
        <br>
              unsigned int i;
        <br>
        @@ -161,6 +162,9 @@ void amdgpu_ih_ring_write(struct
        amdgpu_ih_ring *ih, const uint32_t *iv,
        <br>
              if (wptr != READ_ONCE(ih->rptr)) {
        <br>
                  wmb();
        <br>
                  WRITE_ONCE(*ih->wptr_cpu, cpu_to_le32(wptr));
        <br>
        +    } else {
        <br>
        +        dev_warn(adev->dev, "IH soft ring buffer overflow
        0x%X, 0x%X\n",
        <br>
        +             wptr, ih->rptr);
        <br>
      </blockquote>
      <br>
      If this happens, it's probably going to flood the log. It would be
      a good idea to apply a rate-limit, or use dev_warn_once. With that
      fixed, the patch is
      <br>
    </blockquote>
    <p>Will use dev_warn_once and if adev->irq.retry_cam_enabled
      because soft IH ring overflow is fine for older GPUs without CAM.</p>
    <p>Thanks,</p>
    <p>Philip<br>
    </p>
    <blockquote type="cite" cite="mid:2f659574-3ef8-fd74-33a5-9f7c7dac7a60@amd.com">
      <br>
      Reviewed-by: Felix Kuehling <a class="moz-txt-link-rfc2396E" href="mailto:Felix.Kuehling@amd.com"><Felix.Kuehling@amd.com></a>
      <br>
      <br>
      <br>
      <blockquote type="cite">      }
        <br>
          }
        <br>
          diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
        <br>
        index dd1c2eded6b9..6c6184f0dbc1 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ih.h
        <br>
        @@ -27,6 +27,9 @@
        <br>
          /* Maximum number of IVs processed at once */
        <br>
          #define AMDGPU_IH_MAX_NUM_IVS    32
        <br>
          +#define IH_RING_SIZE    (256 * 1024)
        <br>
        +#define IH_SW_RING_SIZE    (8 * 1024)    /* enough for 256 CAM
        entries */
        <br>
        +
        <br>
          struct amdgpu_device;
        <br>
          struct amdgpu_iv_entry;
        <br>
          @@ -97,8 +100,8 @@ struct amdgpu_ih_funcs {
        <br>
          int amdgpu_ih_ring_init(struct amdgpu_device *adev, struct
        amdgpu_ih_ring *ih,
        <br>
                      unsigned ring_size, bool use_bus_addr);
        <br>
          void amdgpu_ih_ring_fini(struct amdgpu_device *adev, struct
        amdgpu_ih_ring *ih);
        <br>
        -void amdgpu_ih_ring_write(struct amdgpu_ih_ring *ih, const
        uint32_t *iv,
        <br>
        -              unsigned int num_dw);
        <br>
        +void amdgpu_ih_ring_write(struct amdgpu_device *adev, struct
        amdgpu_ih_ring *ih,
        <br>
        +              const uint32_t *iv, unsigned int num_dw);
        <br>
          int amdgpu_ih_wait_on_checkpoint_process_ts(struct
        amdgpu_device *adev,
        <br>
                                  struct amdgpu_ih_ring *ih);
        <br>
          int amdgpu_ih_process(struct amdgpu_device *adev, struct
        amdgpu_ih_ring *ih);
        <br>
        diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
        b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
        <br>
        index 5273decc5753..fa6d0adcec20 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_irq.c
        <br>
        @@ -493,7 +493,7 @@ void amdgpu_irq_delegate(struct
        amdgpu_device *adev,
        <br>
                       struct amdgpu_iv_entry *entry,
        <br>
                       unsigned int num_dw)
        <br>
          {
        <br>
        -    amdgpu_ih_ring_write(&adev->irq.ih_soft,
        entry->iv_entry, num_dw);
        <br>
        +    amdgpu_ih_ring_write(adev, &adev->irq.ih_soft,
        entry->iv_entry, num_dw);
        <br>
              schedule_work(&adev->irq.ih_soft_work);
        <br>
          }
        <br>
          diff --git a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
        b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
        <br>
        index b02e1cef78a7..980b24120080 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/ih_v6_0.c
        <br>
        @@ -535,7 +535,7 @@ static int ih_v6_0_sw_init(void *handle)
        <br>
               * use bus address for ih ring by psp bl */
        <br>
              use_bus_addr =
        <br>
                  (adev->firmware.load_type == AMDGPU_FW_LOAD_PSP) ?
        false : true;
        <br>
        -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 *
        1024, use_bus_addr);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih,
        IH_RING_SIZE, use_bus_addr);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          @@ -548,7 +548,7 @@ static int ih_v6_0_sw_init(void *handle)
        <br>
              /* initialize ih control register offset */
        <br>
              ih_v6_0_init_register_offset(adev);
        <br>
          -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        PAGE_SIZE, true);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        IH_SW_RING_SIZE, true);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          diff --git a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
        b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
        <br>
        index eec13cb5bf75..b6a8478dabf4 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/navi10_ih.c
        <br>
        @@ -565,7 +565,7 @@ static int navi10_ih_sw_init(void *handle)
        <br>
                  use_bus_addr = false;
        <br>
              else
        <br>
                  use_bus_addr = true;
        <br>
        -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 *
        1024, use_bus_addr);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih,
        IH_RING_SIZE, use_bus_addr);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          @@ -578,7 +578,7 @@ static int navi10_ih_sw_init(void *handle)
        <br>
              /* initialize ih control registers offset */
        <br>
              navi10_ih_init_register_offset(adev);
        <br>
          -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        PAGE_SIZE, true);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        IH_SW_RING_SIZE, true);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
        b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
        <br>
        index 1e83db0c5438..d364c6dd152c 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
        <br>
        @@ -485,7 +485,7 @@ static int vega10_ih_sw_init(void *handle)
        <br>
              if (r)
        <br>
                  return r;
        <br>
          -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 *
        1024, true);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih,
        IH_RING_SIZE, true);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          @@ -510,7 +510,7 @@ static int vega10_ih_sw_init(void *handle)
        <br>
              /* initialize ih control registers offset */
        <br>
              vega10_ih_init_register_offset(adev);
        <br>
          -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        PAGE_SIZE, true);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        IH_SW_RING_SIZE, true);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          diff --git a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
        b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
        <br>
        index 4d719df376a7..544ee55a22da 100644
        <br>
        --- a/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
        <br>
        +++ b/drivers/gpu/drm/amd/amdgpu/vega20_ih.c
        <br>
        @@ -539,7 +539,7 @@ static int vega20_ih_sw_init(void *handle)
        <br>
                  (adev->ip_versions[OSSSYS_HWIP][0] == IP_VERSION(4,
        4, 2)))
        <br>
                  use_bus_addr = false;
        <br>
          -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih, 256 *
        1024, use_bus_addr);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih,
        IH_RING_SIZE, use_bus_addr);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          @@ -565,7 +565,7 @@ static int vega20_ih_sw_init(void *handle)
        <br>
              /* initialize ih control registers offset */
        <br>
              vega20_ih_init_register_offset(adev);
        <br>
          -    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        PAGE_SIZE, use_bus_addr);
        <br>
        +    r = amdgpu_ih_ring_init(adev, &adev->irq.ih_soft,
        IH_SW_RING_SIZE, use_bus_addr);
        <br>
              if (r)
        <br>
                  return r;
        <br>
          </blockquote>
    </blockquote>
  </body>
</html>