[RFC v1 1/1] drm/xe: Allow fault injection in vm create and vm bind IOCTLs

Matthew Brost matthew.brost at intel.com
Fri Nov 8 17:47:03 UTC 2024


On Fri, Nov 08, 2024 at 06:20:56PM +0100, Francois Dugast wrote:
> On Fri, Nov 08, 2024 at 08:16:50AM -0800, Matthew Brost wrote:
> > On Fri, Nov 08, 2024 at 05:11:56PM +0100, Francois Dugast wrote:
> > > Use the fault injection infrastructure to allow specific functions
> > > to be configured over debugfs to fail during the execution of
> > > xe_vm_create_ioctl() and xe_vm_bind_ioctl(). This allows more
> > > thorough testing from user space by exercising error handling and
> > > unwinding code paths which cannot be reached by simply passing
> > > invalid IOCTL arguments. This can help increase code robustness.
> > > 
> > 
> > Let's also add xe_pt_update_ops_prepare and xe_pt_update_ops_run if possible.
> 
> This was just to show a couple of examples but it is straightforward to
> add more, for instance this is enough to add xe_pt_update_ops_prepare and
> xe_pt_update_ops_run:
> 
> 	diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> 	index e111698abbd9..684dc075deac 100644
> 	--- a/drivers/gpu/drm/xe/xe_pt.c
> 	+++ b/drivers/gpu/drm/xe/xe_pt.c
> 	@@ -1852,6 +1852,7 @@ int xe_pt_update_ops_prepare(struct xe_tile *tile, struct xe_vma_ops *vops)
> 	 
> 	 	return 0;
> 	 }
> 	+ALLOW_ERROR_INJECTION(xe_pt_update_ops_prepare, ERRNO);
> 	 
> 	 static void bind_op_commit(struct xe_vm *vm, struct xe_tile *tile,
> 	 			   struct xe_vm_pgtable_update_ops *pt_update_ops,
> 	@@ -2132,6 +2133,7 @@ xe_pt_update_ops_run(struct xe_tile *tile, struct xe_vma_ops *vops)
> 	 
> 	 	return ERR_PTR(err);
> 	 }
> 	+ALLOW_ERROR_INJECTION(xe_pt_update_ops_run, ERRNO);
> 	 
> 	 /**
> 	  * xe_pt_update_ops_fini() - Finish PT update operations
> 
> and in IGT:
> 
> 	diff --git a/tests/intel/xe_fault_injection.c b/tests/intel/xe_fault_injection.c
> 	index 43d3a2da0..ced6edaea 100644
> 	--- a/tests/intel/xe_fault_injection.c
> 	+++ b/tests/intel/xe_fault_injection.c
> 	@@ -233,6 +233,8 @@ simple_vm_bind(int fd, uint32_t vm)
> 	  * @xe_vma_ops_alloc:			xe_vma_ops_alloc
> 	  * @vm_bind_ioctl_ops_execute:		vm_bind_ioctl_ops_execute
> 	  * @vm_bind_ioctl_ops_create:		vm_bind_ioctl_ops_create
> 	+ * @xe_pt_update_ops_prepare:	xe_pt_update_ops_prepare
> 	+ * @xe_pt_update_ops_run:	xe_pt_update_ops_run
> 	  */
> 	 static void
> 	 vm_bind_fail(int fd, const char function_name[])
> 	@@ -282,6 +284,8 @@ igt_main
> 	 		{ "xe_vma_ops_alloc" },
> 	 		{ "vm_bind_ioctl_ops_create" },
> 	 		{ "vm_bind_ioctl_ops_execute" },
> 	+		{ "xe_pt_update_ops_prepare" },
> 	+		{ "xe_pt_update_ops_run" },
> 	 		{ }
> 	 	};
> 	 
> 
> Then in the next IGT run:
> 
> 	...
> 	Starting subtest: vm-bind-fail-xe_pt_update_ops_prepare
> 	Subtest vm-bind-fail-xe_pt_update_ops_prepare: SUCCESS (0.042s)
> 	Starting subtest: vm-bind-fail-xe_pt_update_ops_run
> 	Subtest vm-bind-fail-xe_pt_update_ops_run: SUCCESS (0.048s)
> 	...
> 
> ... which triggers vm_bind_ioctl_ops_unwind().
> 
> I will wait for more feedback and bring in those changes in the next version.
> 

I haven't looked at the IGTs, but the error injection points here LGTM to
promote to a non-RFC in the next rev.

I'll likely want to update xe_vm.bind-array-conflict-error-inject to use
these points as well, and drop TEST_VM_OPS_ERROR / FORCE_OP_ERROR.
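
For reference, a minimal user-space sketch of how one of these points can
be armed through the generic fail_function debugfs interface before issuing
the failing vm-bind (assuming debugfs is mounted at /sys/kernel/debug and the
kernel has CONFIG_FUNCTION_ERROR_INJECTION / CONFIG_FAIL_FUNCTION enabled;
the actual IGT helper may look different):

	#include <stdio.h>
	#include <stdlib.h>

	#define FAIL_FUNC_DIR "/sys/kernel/debug/fail_function"

	static void write_str(const char *path, const char *val)
	{
		FILE *f = fopen(path, "w");

		if (!f || fputs(val, f) == EOF || fclose(f) == EOF) {
			perror(path);
			exit(EXIT_FAILURE);
		}
	}

	/* Arm @func so that every call fails with the injected errno. */
	static void fail_function_arm(const char *func, const char *retval)
	{
		char path[256];

		write_str(FAIL_FUNC_DIR "/inject", func);

		/*
		 * Per Documentation/fault-injection/fault-injection.rst, retval
		 * for an ERRNO-class injection is written as the (hex) two's
		 * complement of the negative errno, e.g.
		 * "0xfffffffffffffff4" for -ENOMEM on a 64-bit kernel.
		 */
		snprintf(path, sizeof(path), FAIL_FUNC_DIR "/%s/retval", func);
		write_str(path, retval);

		write_str(FAIL_FUNC_DIR "/probability", "100");
		write_str(FAIL_FUNC_DIR "/times", "-1");	/* no limit */
	}

	/* Remove @func from the injection list again. */
	static void fail_function_disarm(const char *func)
	{
		char buf[128];

		snprintf(buf, sizeof(buf), "!%s", func);
		write_str(FAIL_FUNC_DIR "/inject", buf);
	}

A subtest would then arm e.g. "xe_vma_ops_alloc", run the simple_vm_bind()
path from the IGT above expecting the bind to fail, and disarm again before
the next subtest.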

Matt

> Thanks,
> Francois
> 
> > 
> > Matt
> > 
> > > Signed-off-by: Francois Dugast <francois.dugast at intel.com>
> > > ---
> > >  drivers/gpu/drm/xe/xe_exec_queue.c | 1 +
> > >  drivers/gpu/drm/xe/xe_pt.c         | 1 +
> > >  drivers/gpu/drm/xe/xe_vm.c         | 4 ++++
> > >  3 files changed, 6 insertions(+)
> > > 
> > > diff --git a/drivers/gpu/drm/xe/xe_exec_queue.c b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > index fd0f3b3c9101..b999db5f5c19 100644
> > > --- a/drivers/gpu/drm/xe/xe_exec_queue.c
> > > +++ b/drivers/gpu/drm/xe/xe_exec_queue.c
> > > @@ -240,6 +240,7 @@ struct xe_exec_queue *xe_exec_queue_create_bind(struct xe_device *xe,
> > >  
> > >  	return q;
> > >  }
> > > +ALLOW_ERROR_INJECTION(xe_exec_queue_create_bind, ERRNO);
> > >  
> > >  void xe_exec_queue_destroy(struct kref *ref)
> > >  {
> > > diff --git a/drivers/gpu/drm/xe/xe_pt.c b/drivers/gpu/drm/xe/xe_pt.c
> > > index f27f579f4d85..e111698abbd9 100644
> > > --- a/drivers/gpu/drm/xe/xe_pt.c
> > > +++ b/drivers/gpu/drm/xe/xe_pt.c
> > > @@ -136,6 +136,7 @@ struct xe_pt *xe_pt_create(struct xe_vm *vm, struct xe_tile *tile,
> > >  	xe_pt_free(pt);
> > >  	return ERR_PTR(err);
> > >  }
> > > +ALLOW_ERROR_INJECTION(xe_pt_create, ERRNO);
> > >  
> > >  /**
> > >   * xe_pt_populate_empty() - Populate a page-table bo with scratch- or zero
> > > diff --git a/drivers/gpu/drm/xe/xe_vm.c b/drivers/gpu/drm/xe/xe_vm.c
> > > index 624133fae5f5..2e67648ed512 100644
> > > --- a/drivers/gpu/drm/xe/xe_vm.c
> > > +++ b/drivers/gpu/drm/xe/xe_vm.c
> > > @@ -740,6 +740,7 @@ static int xe_vma_ops_alloc(struct xe_vma_ops *vops, bool array_of_binds)
> > >  
> > >  	return 0;
> > >  }
> > > +ALLOW_ERROR_INJECTION(xe_vma_ops_alloc, ERRNO);
> > >  
> > >  static void xe_vma_ops_fini(struct xe_vma_ops *vops)
> > >  {
> > > @@ -1352,6 +1353,7 @@ static int xe_vm_create_scratch(struct xe_device *xe, struct xe_tile *tile,
> > >  
> > >  	return 0;
> > >  }
> > > +ALLOW_ERROR_INJECTION(xe_vm_create_scratch, ERRNO);
> > >  
> > >  static void xe_vm_free_scratch(struct xe_vm *vm)
> > >  {
> > > @@ -1978,6 +1980,7 @@ vm_bind_ioctl_ops_create(struct xe_vm *vm, struct xe_bo *bo,
> > >  
> > >  	return ops;
> > >  }
> > > +ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_create, ERRNO);
> > >  
> > >  static struct xe_vma *new_vma(struct xe_vm *vm, struct drm_gpuva_op_map *op,
> > >  			      u16 pat_index, unsigned int flags)
> > > @@ -2697,6 +2700,7 @@ static int vm_bind_ioctl_ops_execute(struct xe_vm *vm,
> > >  	drm_exec_fini(&exec);
> > >  	return err;
> > >  }
> > > +ALLOW_ERROR_INJECTION(vm_bind_ioctl_ops_execute, ERRNO);
> > >  
> > >  #define SUPPORTED_FLAGS_STUB  \
> > >  	(DRM_XE_VM_BIND_FLAG_READONLY | \
> > > -- 
> > > 2.43.0
> > > 

