[PATCH] ammdgpu fix for gfx1103 queue evict/restore crash

kernel test robot lkp at intel.com
Sat Nov 23 08:31:57 UTC 2024


Hi Mika,

kernel test robot noticed the following build warnings:

[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on drm-tip/drm-tip v6.12 next-20241122]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting a patch, we suggest using '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]

url:    https://github.com/intel-lab-lkp/linux/commits/Mika-Laitio/ammdgpu-fix-for-gfx1103-queue-evict-restore-crash/20241122-035602
base:   git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link:    https://lore.kernel.org/r/20241121195233.10679-1-lamikr%40gmail.com
patch subject: [PATCH] ammdgpu fix for gfx1103 queue evict/restore crash
config: arm64-allmodconfig (https://download.01.org/0day-ci/archive/20241123/202411231603.PMbyCkko-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 592c0fe55f6d9a811028b5f3507be91458ab2713)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241123/202411231603.PMbyCkko-lkp@intel.com/reproduce)

If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add the following tags
| Reported-by: kernel test robot <lkp@intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411231603.PMbyCkko-lkp@intel.com/

All warnings (new ones prefixed by >>):

   In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:32:
   In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_priv.h:37:
   In file included from include/linux/kfifo.h:40:
   In file included from include/linux/dma-mapping.h:11:
   In file included from include/linux/scatterlist.h:8:
   In file included from include/linux/mm.h:2213:
   include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     504 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     505 |                            item];
         |                            ~~~~
   include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     511 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     512 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
   include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
     518 |         return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
         |                               ~~~~~~~~~~~ ^ ~~~
   include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
     524 |         return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~ ^
     525 |                            NR_VM_NUMA_EVENT_ITEMS +
         |                            ~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:1354:1: warning: unused label 'out_unlock' [-Wunused-label]
    1354 | out_unlock:
         | ^~~~~~~~~~~
   5 warnings generated.


vim +/out_unlock +1354 drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c

  1292	
  1293	static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
  1294						struct qcm_process_device *qpd)
  1295	{
  1296		struct queue *q;
  1297		struct device *dev = dqm->dev->adev->dev;
  1298		struct kfd_process_device *pdd;
  1299		uint64_t eviction_duration;
  1300		int retval = 0;
  1301	
  1302		// gfx1103 APU fails to remove the queue usually after 10-50 attempts
  1303		if (dqm->dev->adev->flags & AMD_IS_APU)
  1304			goto out;
  1305		pdd = qpd_to_pdd(qpd);
  1306	
  1307		dqm_lock(dqm);
  1308		if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
  1309			goto out;
  1310		if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
  1311			qpd->evicted--;
  1312			goto out;
  1313		}
  1314	
  1315		/* The debugger creates processes that temporarily have not acquired
  1316		 * all VMs for all devices and has no VMs itself.
  1317		 * Skip queue restore on process restore.
  1318		 */
  1319		if (!pdd->drm_priv)
  1320			goto vm_not_acquired;
  1321	
  1322		pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
  1323				    pdd->process->pasid);
  1324	
  1325		/* Update PD Base in QPD */
  1326		qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
  1327		pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
  1328	
  1329		/* activate all active queues on the qpd */
  1330		list_for_each_entry(q, &qpd->queues_list, list) {
  1331			q->properties.is_evicted = false;
  1332			if (!QUEUE_IS_ACTIVE(q->properties))
  1333				continue;
  1334	
  1335			q->properties.is_active = true;
  1336			increment_queue_count(dqm, &pdd->qpd, q);
  1337	
  1338			if (dqm->dev->kfd->shared_resources.enable_mes) {
  1339				retval = add_queue_mes(dqm, q, qpd);
  1340				if (retval) {
  1341					dev_err(dev, "Failed to restore queue %d\n",
  1342						q->properties.queue_id);
  1343					goto out;
  1344				}
  1345			}
  1346		}
  1347		if (!dqm->dev->kfd->shared_resources.enable_mes)
  1348			retval = execute_queues_cpsch(dqm,
  1349						      KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
  1350		eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
  1351		atomic64_add(eviction_duration, &pdd->evict_duration_counter);
  1352	vm_not_acquired:
  1353		qpd->evicted = 0;
> 1354	out_unlock:
  1355		dqm_unlock(dqm);
  1356	out:
  1357		return retval;
  1358	}
  1359	

-- 
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki


More information about the dri-devel mailing list