[PATCH] ammdgpu fix for gfx1103 queue evict/restore crash
kernel test robot
lkp at intel.com
Sat Nov 23 08:31:57 UTC 2024
Hi Mika,
kernel test robot noticed the following build warnings:
[auto build test WARNING on drm-misc/drm-misc-next]
[also build test WARNING on drm-tip/drm-tip v6.12 next-20241122]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch#_base_tree_information]
url: https://github.com/intel-lab-lkp/linux/commits/Mika-Laitio/ammdgpu-fix-for-gfx1103-queue-evict-restore-crash/20241122-035602
base: git://anongit.freedesktop.org/drm/drm-misc drm-misc-next
patch link: https://lore.kernel.org/r/20241121195233.10679-1-lamikr%40gmail.com
patch subject: [PATCH] ammdgpu fix for gfx1103 queue evict/restore crash
config: arm64-allmodconfig (https://download.01.org/0day-ci/archive/20241123/202411231603.PMbyCkko-lkp@intel.com/config)
compiler: clang version 20.0.0git (https://github.com/llvm/llvm-project 592c0fe55f6d9a811028b5f3507be91458ab2713)
reproduce (this is a W=1 build): (https://download.01.org/0day-ci/archive/20241123/202411231603.PMbyCkko-lkp@intel.com/reproduce)
If you fix the issue in a separate patch/commit (i.e. not just a new version of
the same patch/commit), kindly add following tags
| Reported-by: kernel test robot <lkp at intel.com>
| Closes: https://lore.kernel.org/oe-kbuild-all/202411231603.PMbyCkko-lkp@intel.com/
All warnings (new ones prefixed by >>):
In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:32:
In file included from drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_priv.h:37:
In file included from include/linux/kfifo.h:40:
In file included from include/linux/dma-mapping.h:11:
In file included from include/linux/scatterlist.h:8:
In file included from include/linux/mm.h:2213:
include/linux/vmstat.h:504:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
504 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
505 | item];
| ~~~~
include/linux/vmstat.h:511:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
511 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
512 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
include/linux/vmstat.h:518:36: warning: arithmetic between different enumeration types ('enum node_stat_item' and 'enum lru_list') [-Wenum-enum-conversion]
518 | return node_stat_name(NR_LRU_BASE + lru) + 3; // skip "nr_"
| ~~~~~~~~~~~ ^ ~~~
include/linux/vmstat.h:524:43: warning: arithmetic between different enumeration types ('enum zone_stat_item' and 'enum numa_stat_item') [-Wenum-enum-conversion]
524 | return vmstat_text[NR_VM_ZONE_STAT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~ ^
525 | NR_VM_NUMA_EVENT_ITEMS +
| ~~~~~~~~~~~~~~~~~~~~~~
>> drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c:1354:1: warning: unused label 'out_unlock' [-Wunused-label]
1354 | out_unlock:
| ^~~~~~~~~~~
5 warnings generated.
vim +/out_unlock +1354 drivers/gpu/drm/amd/amdgpu/../amdkfd/kfd_device_queue_manager.c
1292
1293 static int restore_process_queues_cpsch(struct device_queue_manager *dqm,
1294 struct qcm_process_device *qpd)
1295 {
1296 struct queue *q;
1297 struct device *dev = dqm->dev->adev->dev;
1298 struct kfd_process_device *pdd;
1299 uint64_t eviction_duration;
1300 int retval = 0;
1301
1302 // gfx1103 APU fails to remove the queue usually after 10-50 attempts
1303 if (dqm->dev->adev->flags & AMD_IS_APU)
1304 goto out;
1305 pdd = qpd_to_pdd(qpd);
1306
1307 dqm_lock(dqm);
1308 if (WARN_ON_ONCE(!qpd->evicted)) /* already restored, do nothing */
1309 goto out;
1310 if (qpd->evicted > 1) { /* ref count still > 0, decrement & quit */
1311 qpd->evicted--;
1312 goto out;
1313 }
1314
1315 /* The debugger creates processes that temporarily have not acquired
1316 * all VMs for all devices and has no VMs itself.
1317 * Skip queue restore on process restore.
1318 */
1319 if (!pdd->drm_priv)
1320 goto vm_not_acquired;
1321
1322 pr_debug_ratelimited("Restoring PASID 0x%x queues\n",
1323 pdd->process->pasid);
1324
1325 /* Update PD Base in QPD */
1326 qpd->page_table_base = amdgpu_amdkfd_gpuvm_get_process_page_dir(pdd->drm_priv);
1327 pr_debug("Updated PD address to 0x%llx\n", qpd->page_table_base);
1328
1329 /* activate all active queues on the qpd */
1330 list_for_each_entry(q, &qpd->queues_list, list) {
1331 q->properties.is_evicted = false;
1332 if (!QUEUE_IS_ACTIVE(q->properties))
1333 continue;
1334
1335 q->properties.is_active = true;
1336 increment_queue_count(dqm, &pdd->qpd, q);
1337
1338 if (dqm->dev->kfd->shared_resources.enable_mes) {
1339 retval = add_queue_mes(dqm, q, qpd);
1340 if (retval) {
1341 dev_err(dev, "Failed to restore queue %d\n",
1342 q->properties.queue_id);
1343 goto out;
1344 }
1345 }
1346 }
1347 if (!dqm->dev->kfd->shared_resources.enable_mes)
1348 retval = execute_queues_cpsch(dqm,
1349 KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES, 0, USE_DEFAULT_GRACE_PERIOD);
1350 eviction_duration = get_jiffies_64() - pdd->last_evict_timestamp;
1351 atomic64_add(eviction_duration, &pdd->evict_duration_counter);
1352 vm_not_acquired:
1353 qpd->evicted = 0;
> 1354 out_unlock:
1355 dqm_unlock(dqm);
1356 out:
1357 return retval;
1358 }
1359
--
0-DAY CI Kernel Test Service
https://github.com/intel/lkp-tests/wiki
More information about the dri-devel
mailing list