[PATCH] drm/amdgpu: Handle GPUVM fault storms

Deucher, Alexander Alexander.Deucher at amd.com
Tue Sep 26 18:45:55 UTC 2017


> -----Original Message-----
> From: Kuehling, Felix
> Sent: Tuesday, September 26, 2017 2:38 PM
> To: amd-gfx at lists.freedesktop.org; Koenig, Christian; Deucher, Alexander
> Subject: Re: [PATCH] drm/amdgpu: Handle GPUVM fault storms
> 
> [ping]
> 
> Hi Christian or Alex,
> 
> Do you mind reviewing this change?

Reviewed-by: Alex Deucher <alexander.deucher at amd.com>

> 
> Thanks,
>   Felix
> 
> 
> On 2017-09-21 08:09 PM, Felix Kuehling wrote:
> > When many wavefronts cause VM faults at the same time, it can
> > overwhelm the interrupt handler and cause IH ring overflows before
> > the driver can notify or kill the faulting application.
> >
> > As a workaround I'm introducing limited per-VM fault credit. After
> > that number of VM faults have occurred, further VM faults are
> > filtered out at the prescreen stage of processing.
> >
> > This depends on the PASID in the interrupt packet, so it currently
> > only works for KFD contexts.
> >
> > Signed-off-by: Felix Kuehling <Felix.Kuehling at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 31
> +++++++++++++++++++++++++++++++
> >  drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h  |  7 ++++++-
> >  drivers/gpu/drm/amd/amdgpu/cik_ih.c     | 19 +++++++++++++++++--
> >  drivers/gpu/drm/amd/amdgpu/cz_ih.c      | 19 +++++++++++++++++--
> >  drivers/gpu/drm/amd/amdgpu/iceland_ih.c | 19 +++++++++++++++++--
> >  drivers/gpu/drm/amd/amdgpu/tonga_ih.c   | 19 +++++++++++++++++--
> >  drivers/gpu/drm/amd/amdgpu/vega10_ih.c  | 11 +++++++----
> >  7 files changed, 112 insertions(+), 13 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > index 8fcc743..c91d5c7 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
> > @@ -2682,6 +2682,7 @@ int amdgpu_vm_init(struct amdgpu_device
> *adev, struct amdgpu_vm *vm,
> >  	}
> >
> >  	INIT_KFIFO(vm->faults);
> > +	vm->fault_credit = 16;
> >
> >  	return 0;
> >
> > @@ -2776,6 +2777,36 @@ void amdgpu_vm_fini(struct amdgpu_device
> *adev, struct amdgpu_vm *vm)
> >  }
> >
> >  /**
> > + * amdgpu_vm_pasid_fault_credit - Check fault credit for given PASID
> > + *
> > + * @adev: amdgpu_device pointer
> > + * @pasid: PASID do identify the VM
> > + *
> > + * This function is expected to be called in interrupt context. Returns
> > + * true if there was fault credit, false otherwise
> > + */
> > +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
> > +				  unsigned int pasid)
> > +{
> > +	struct amdgpu_vm *vm;
> > +
> > +	spin_lock(&adev->vm_manager.pasid_lock);
> > +	vm = idr_find(&adev->vm_manager.pasid_idr, pasid);
> > +	spin_unlock(&adev->vm_manager.pasid_lock);
> > +	if (!vm)
> > +		/* VM not found, can't track fault credit */
> > +		return true;
> > +
> > +	/* No lock needed. only accessed by IRQ handler */
> > +	if (!vm->fault_credit)
> > +		/* Too many faults in this VM */
> > +		return false;
> > +
> > +	vm->fault_credit--;
> > +	return true;
> > +}
> > +
> > +/**
> >   * amdgpu_vm_manager_init - init the VM manager
> >   *
> >   * @adev: amdgpu_device pointer
> > diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> > index 447ed6e..66efbc2 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> > +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
> > @@ -165,8 +165,11 @@ struct amdgpu_vm {
> >  	/* Flag to indicate ATS support from PTE for GFX9 */
> >  	bool			pte_support_ats;
> >
> > -	/* Up to 128 pending page faults */
> > +	/* Up to 128 pending retry page faults */
> >  	DECLARE_KFIFO(faults, u64, 128);
> > +
> > +	/* Limit non-retry fault storms */
> > +	unsigned int		fault_credit;
> >  };
> >
> >  struct amdgpu_vm_id {
> > @@ -244,6 +247,8 @@ void amdgpu_vm_manager_fini(struct
> amdgpu_device *adev);
> >  int amdgpu_vm_init(struct amdgpu_device *adev, struct amdgpu_vm
> *vm,
> >  		   int vm_context, unsigned int pasid);
> >  void amdgpu_vm_fini(struct amdgpu_device *adev, struct amdgpu_vm
> *vm);
> > +bool amdgpu_vm_pasid_fault_credit(struct amdgpu_device *adev,
> > +				  unsigned int pasid);
> >  void amdgpu_vm_get_pd_bo(struct amdgpu_vm *vm,
> >  			 struct list_head *validated,
> >  			 struct amdgpu_bo_list_entry *entry);
> > diff --git a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> > index 07d3d89..a870b35 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/cik_ih.c
> > @@ -237,8 +237,23 @@ static u32 cik_ih_get_wptr(struct amdgpu_device
> *adev)
> >   */
> >  static bool cik_ih_prescreen_iv(struct amdgpu_device *adev)
> >  {
> > -	/* Process all interrupts */
> > -	return true;
> > +	u32 ring_index = adev->irq.ih.rptr >> 2;
> > +	u16 pasid;
> > +
> > +	switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> > +	case 146:
> > +	case 147:
> > +		pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> > +		if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> > +			return true;
> > +		break;
> > +	default:
> > +		/* Not a VM fault */
> > +		return true;
> > +	}
> > +
> > +	adev->irq.ih.rptr += 16;
> > +	return false;
> >  }
> >
> >   /**
> > diff --git a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> > index b6cdf4a..fa61d64 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/cz_ih.c
> > @@ -216,8 +216,23 @@ static u32 cz_ih_get_wptr(struct amdgpu_device
> *adev)
> >   */
> >  static bool cz_ih_prescreen_iv(struct amdgpu_device *adev)
> >  {
> > -	/* Process all interrupts */
> > -	return true;
> > +	u32 ring_index = adev->irq.ih.rptr >> 2;
> > +	u16 pasid;
> > +
> > +	switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> > +	case 146:
> > +	case 147:
> > +		pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> > +		if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> > +			return true;
> > +		break;
> > +	default:
> > +		/* Not a VM fault */
> > +		return true;
> > +	}
> > +
> > +	adev->irq.ih.rptr += 16;
> > +	return false;
> >  }
> >
> >  /**
> > diff --git a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> > index 65ed6d3..bd592cb 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/iceland_ih.c
> > @@ -216,8 +216,23 @@ static u32 iceland_ih_get_wptr(struct
> amdgpu_device *adev)
> >   */
> >  static bool iceland_ih_prescreen_iv(struct amdgpu_device *adev)
> >  {
> > -	/* Process all interrupts */
> > -	return true;
> > +	u32 ring_index = adev->irq.ih.rptr >> 2;
> > +	u16 pasid;
> > +
> > +	switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> > +	case 146:
> > +	case 147:
> > +		pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> > +		if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> > +			return true;
> > +		break;
> > +	default:
> > +		/* Not a VM fault */
> > +		return true;
> > +	}
> > +
> > +	adev->irq.ih.rptr += 16;
> > +	return false;
> >  }
> >
> >  /**
> > diff --git a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> > index 5ed0069..aa4e320 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/tonga_ih.c
> > @@ -227,8 +227,23 @@ static u32 tonga_ih_get_wptr(struct
> amdgpu_device *adev)
> >   */
> >  static bool tonga_ih_prescreen_iv(struct amdgpu_device *adev)
> >  {
> > -	/* Process all interrupts */
> > -	return true;
> > +	u32 ring_index = adev->irq.ih.rptr >> 2;
> > +	u16 pasid;
> > +
> > +	switch (le32_to_cpu(adev->irq.ih.ring[ring_index]) & 0xff) {
> > +	case 146:
> > +	case 147:
> > +		pasid = le32_to_cpu(adev->irq.ih.ring[ring_index + 2]) >> 16;
> > +		if (!pasid || amdgpu_vm_pasid_fault_credit(adev, pasid))
> > +			return true;
> > +		break;
> > +	default:
> > +		/* Not a VM fault */
> > +		return true;
> > +	}
> > +
> > +	adev->irq.ih.rptr += 16;
> > +	return false;
> >  }
> >
> >  /**
> > diff --git a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> > index a3b30d8..6973257 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/vega10_ih.c
> > @@ -260,15 +260,18 @@ static bool vega10_ih_prescreen_iv(struct
> amdgpu_device *adev)
> >  		return true;
> >  	}
> >
> > -	/* Not a retry fault */
> > -	if (!(dw5 & 0x80))
> > -		return true;
> > -
> >  	pasid = dw3 & 0xffff;
> >  	/* No PASID, can't identify faulting process */
> >  	if (!pasid)
> >  		return true;
> >
> > +	/* Not a retry fault, check fault credit */
> > +	if (!(dw5 & 0x80)) {
> > +		if (!amdgpu_vm_pasid_fault_credit(adev, pasid))
> > +			goto ignore_iv;
> > +		return true;
> > +	}
> > +
> >  	addr = ((u64)(dw5 & 0xf) << 44) | ((u64)dw4 << 12);
> >  	key = AMDGPU_VM_FAULT(pasid, addr);
> >  	r = amdgpu_ih_add_fault(adev, key);



More information about the amd-gfx mailing list