[PATCH 01/23] drm/amdgpu: add flag to indicate nps mode
Xie, Patrick
Gangliang.Xie at amd.com
Wed Nov 13 09:03:51 UTC 2024
[AMD Official Use Only - AMD Internal Distribution Only]
-----Original Message-----
From: amd-gfx <amd-gfx-bounces at lists.freedesktop.org> On Behalf Of amd-gfx-request at lists.freedesktop.org
Sent: Friday, November 8, 2024 7:15 PM
To: amd-gfx at lists.freedesktop.org
Subject: amd-gfx Digest, Vol 102, Issue 101
Send amd-gfx mailing list submissions to
amd-gfx at lists.freedesktop.org
To subscribe or unsubscribe via the World Wide Web, visit
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
or, via email, send a message with subject or body 'help' to
amd-gfx-request at lists.freedesktop.org
You can reach the person managing the list at
amd-gfx-owner at lists.freedesktop.org
When replying, please edit your Subject line so it is more specific than "Re: Contents of amd-gfx digest..."
Today's Topics:
1. [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode
switch (Tao Zhou)
2. [PATCH 03/23] drm/amdgpu: simplify RAS page retirement in one
memory row (Tao Zhou)
3. [PATCH 05/23] drm/amdgpu: store PA with column bits cleared
for RAS bad page (Tao Zhou)
----------------------------------------------------------------------
Message: 1
Date: Fri, 8 Nov 2024 19:14:01 +0800
From: Tao Zhou <tao.zhou1 at amd.com>
To: <amd-gfx at lists.freedesktop.org>
Cc: Tao Zhou <tao.zhou1 at amd.com>
Subject: [PATCH 01/23] drm/amdgpu: add flag to indicate nps mode
switch
Message-ID: <20241108111423.60169-1-tao.zhou1 at amd.com>
Content-Type: text/plain
There are two types of GPU reset: NPS mode switch and normal GPU reset. Add a flag to distinguish them.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 9 +++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c | 13 ++++++++++++- drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h | 3 ++-
drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c | 2 +-
5 files changed, 23 insertions(+), 5 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index 9365b43c0055..ba9b0d322b33 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1681,6 +1681,7 @@ static inline bool amdgpu_is_tmz(struct amdgpu_device *adev) }
int amdgpu_in_reset(struct amdgpu_device *adev);
+int amdgpu_in_nps_switch(struct amdgpu_device *adev);
extern const struct attribute_group amdgpu_vram_mgr_attr_group; extern const struct attribute_group amdgpu_gtt_mgr_attr_group; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 286f0fdfcb50..d69fcbb28b0e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -5862,7 +5862,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* We need to lock reset domain only once both for XGMI and single device */
tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
reset_list);
- amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+ amdgpu_device_lock_reset_domain(tmp_adev);
/* block all schedulers and reset given job's ring */
list_for_each_entry(tmp_adev, device_list_handle, reset_list) { @@ -6343,7 +6343,7 @@ pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_sta
* Locking adev->reset_domain->sem will prevent any external access
* to GPU during PCI error recovery
*/
- amdgpu_device_lock_reset_domain(adev->reset_domain);
+ amdgpu_device_lock_reset_domain(adev);
amdgpu_device_set_mp1_state(adev);
/*
@@ -6579,6 +6579,11 @@ int amdgpu_in_reset(struct amdgpu_device *adev)
return atomic_read(&adev->reset_domain->in_gpu_reset);
}
+int amdgpu_in_nps_switch(struct amdgpu_device *adev) {
+ return atomic_read(&adev->reset_domain->in_nps_switch);
+}
+
/**
* amdgpu_device_halt() - bring hardware to some kind of halt state
*
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
index f4c08fa83756..1becf8fbbc71 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.c
@@ -301,15 +301,25 @@ struct amdgpu_reset_domain *amdgpu_reset_create_reset_domain(enum amdgpu_reset_d
INIT_WORK(&reset_domain->clear, amdgpu_reset_domain_cancel_all_work);
atomic_set(&reset_domain->in_gpu_reset, 0);
+ atomic_set(&reset_domain->in_nps_switch, 0);
atomic_set(&reset_domain->reset_res, 0);
init_rwsem(&reset_domain->sem);
return reset_domain;
}
-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain)
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev)
{
+ struct amdgpu_reset_domain *reset_domain = adev->reset_domain;
+
atomic_set(&reset_domain->in_gpu_reset, 1);
+ /* The life time of in_nps_switch is longer than
+ * amdgpu_device_nps_switch_needed
+ */
+ if (adev->nbio.funcs && adev->nbio.funcs->is_nps_switch_requested &&
+ adev->nbio.funcs->is_nps_switch_requested(adev))
+ atomic_set(&reset_domain->in_nps_switch, 1);
+
down_write(&reset_domain->sem);
}
@@ -317,6 +327,7 @@ void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain) void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain) {
atomic_set(&reset_domain->in_gpu_reset, 0);
+ atomic_set(&reset_domain->in_nps_switch, 0);
up_write(&reset_domain->sem);
}
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
index 977b2dd2205a..c74a1f88f0ef 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_reset.h
@@ -97,6 +97,7 @@ struct amdgpu_reset_domain {
enum amdgpu_reset_domain_type type;
struct rw_semaphore sem;
atomic_t in_gpu_reset;
+ atomic_t in_nps_switch;
atomic_t reset_res;
struct work_struct clear;
bool drain;
@@ -158,7 +159,7 @@ static inline bool amdgpu_reset_pending(struct amdgpu_reset_domain *domain)
return rwsem_is_contended(&domain->sem); }
-void amdgpu_device_lock_reset_domain(struct amdgpu_reset_domain *reset_domain);
+void amdgpu_device_lock_reset_domain(struct amdgpu_device *adev);
void amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain);
[Patrick] Why not change "amdgpu_device_unlock_reset_domain(struct amdgpu_reset_domain *reset_domain)" to "amdgpu_device_unlock_reset_domain(struct amdgpu_device *adev)" as well, to keep the lock and unlock interfaces in the same style?
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
index daa69dfb4dca..8387e075c385 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
@@ -1540,7 +1540,7 @@ static void amdgpu_xgmi_reset_on_init_work(struct work_struct *work)
tmp_adev = list_first_entry(&device_list, struct amdgpu_device,
reset_list);
- amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
+ amdgpu_device_lock_reset_domain(tmp_adev);
reset_context.method = AMD_RESET_METHOD_ON_INIT;
reset_context.reset_req_dev = tmp_adev;
--
2.34.1
------------------------------
Message: 2
Date: Fri, 8 Nov 2024 19:14:03 +0800
From: Tao Zhou <tao.zhou1 at amd.com>
To: <amd-gfx at lists.freedesktop.org>
Cc: Tao Zhou <tao.zhou1 at amd.com>
Subject: [PATCH 03/23] drm/amdgpu: simplify RAS page retirement in one
memory row
Message-ID: <20241108111423.60169-3-tao.zhou1 at amd.com>
Content-Type: text/plain
Take R13 and column bits as a whole for UMC v12.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 57 +++++++++++---------------
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 1 +
2 files changed, 24 insertions(+), 34 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index 1a8ea834efa6..8939b4f1fb49 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -177,7 +177,7 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
struct ras_err_data *err_data,
struct ta_ras_query_address_input *addr_in)
{
- uint32_t col, row, row_xor, bank, channel_index;
+ uint32_t col, row, bank, channel_index;
uint64_t soc_pa, retired_page, column, err_addr;
struct ta_ras_query_address_output addr_out;
@@ -195,31 +195,27 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
channel_index = addr_out.pa.channel_idx;
col = (err_addr >> 1) & 0x1fULL;
- row = (err_addr >> 10) & 0x3fffULL;
- row_xor = row ^ (0x1ULL << 13);
/* clear [C3 C2] in soc physical address */
soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
/* clear [C4] in soc physical address */
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+ /* clear [R13] in soc physical address */
+ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);
- /* loop for all possibilities of [C4 C3 C2] */
- for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+ /* loop for all possibilities of [R13 C4 C3 C2] */
+ for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+ retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT);
+
/* include column bit 0 and 1 */
col &= 0x3;
col |= (column << 2);
- dev_info(adev->dev,
- "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
- retired_page, row, col, bank, channel_index);
- amdgpu_umc_fill_error_record(err_data, err_addr,
- retired_page, channel_index, addr_in->ma.umc_inst);
+ row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL;
- /* shift R13 bit */
- retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
dev_info(adev->dev,
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
- retired_page, row_xor, col, bank, channel_index);
+ retired_page, row, col, bank, channel_index);
amdgpu_umc_fill_error_record(err_data, err_addr,
retired_page, channel_index, addr_in->ma.umc_inst);
}
@@ -229,7 +225,7 @@ static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev,
struct ta_ras_query_address_output *addr_out,
uint64_t err_addr)
{
- uint32_t col, row, row_xor, bank, channel_index;
+ uint32_t col, row, bank, channel_index;
uint64_t soc_pa, retired_page, column;
soc_pa = addr_out->pa.pa;
@@ -237,29 +233,27 @@ static void umc_v12_0_dump_addr_info(struct amdgpu_device *adev,
channel_index = addr_out->pa.channel_idx;
col = (err_addr >> 1) & 0x1fULL;
- row = (err_addr >> 10) & 0x3fffULL;
- row_xor = row ^ (0x1ULL << 13);
/* clear [C3 C2] in soc physical address */
soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
/* clear [C4] in soc physical address */
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+ /* clear [R13] in soc physical address */
+ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);
- /* loop for all possibilities of [C4 C3 C2] */
- for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+ /* loop for all possibilities of [R13 C4 C3 C2] */
+ for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+ retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT);
+
/* include column bit 0 and 1 */
col &= 0x3;
- col |= (column << 2);
- dev_info(adev->dev,
- "Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
- retired_page, row, col, bank, channel_index);
+ col |= ((column & 0x7) << 2);
+ row = (retired_page >> UMC_V12_0_PA_R0_BIT) & 0x3fffULL;
- /* shift R13 bit */
- retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
dev_info(adev->dev,
"Error Address(PA):0x%-10llx Row:0x%-4x Col:0x%-2x Bank:0x%x Channel:0x%x\n",
- retired_page, row_xor, col, bank, channel_index);
+ retired_page, row, col, bank, channel_index);
}
}
@@ -274,23 +268,18 @@ static int umc_v12_0_lookup_bad_pages_in_a_row(struct amdgpu_device *adev,
soc_pa &= ~(0x3ULL << UMC_V12_0_PA_C2_BIT);
/* clear [C4] in soc physical address */
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_C4_BIT);
+ /* clear [R13] in soc physical address */
+ soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);
/* loop for all possibilities of [C4 C3 C2] */
- for (column = 0; column < UMC_V12_0_NA_MAP_PA_NUM; column++) {
+ for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
retired_page |= (((column & 0x4) >> 2) << UMC_V12_0_PA_C4_BIT);
+ retired_page |= (((column & 0x8) >> 3) << UMC_V12_0_PA_R13_BIT);
if (pos >= len)
return 0;
pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-
- /* shift R13 bit */
- retired_page ^= (0x1ULL << UMC_V12_0_PA_R13_BIT);
-
- if (pos >= len)
- return 0;
- pfns[pos++] = retired_page >> AMDGPU_GPU_PAGE_SHIFT;
-
}
return pos;
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index be5598d76c1d..dea42810fc53 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -60,6 +60,7 @@
#define UMC_V12_0_PA_C2_BIT 15
#define UMC_V12_0_PA_C4_BIT 21
/* row bits in SOC physical address */
+#define UMC_V12_0_PA_R0_BIT 22
#define UMC_V12_0_PA_R13_BIT 35
#define MCA_UMC_HWID_V12_0 0x96
--
2.34.1
------------------------------
Message: 3
Date: Fri, 8 Nov 2024 19:14:05 +0800
From: Tao Zhou <tao.zhou1 at amd.com>
To: <amd-gfx at lists.freedesktop.org>
Cc: Tao Zhou <tao.zhou1 at amd.com>
Subject: [PATCH 05/23] drm/amdgpu: store PA with column bits cleared
for RAS bad page
Message-ID: <20241108111423.60169-5-tao.zhou1 at amd.com>
Content-Type: text/plain
This simplifies the code, and there is no longer any need to expose the
details of the PA format outside of address conversion.
Signed-off-by: Tao Zhou <tao.zhou1 at amd.com>
---
drivers/gpu/drm/amd/amdgpu/umc_v12_0.c | 4 +++-
drivers/gpu/drm/amd/amdgpu/umc_v12_0.h | 5 -----
2 files changed, 3 insertions(+), 6 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
index a030fed16c5a..65336ae12585 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.c
@@ -219,6 +219,8 @@ static void umc_v12_0_convert_error_address(struct amdgpu_device *adev,
/* clear [R13] in soc physical address */
soc_pa &= ~(0x1ULL << UMC_V12_0_PA_R13_BIT);
+ paddr_out->pa.pa = soc_pa;
+
/* loop for all possibilities of [R13 C4 C3 C2] */
for (column = 0; column < UMC_V12_0_BAD_PAGE_NUM_PER_CHANNEL; column++) {
retired_page = soc_pa | ((column & 0x3) << UMC_V12_0_PA_C2_BIT);
@@ -537,7 +539,7 @@ static int umc_v12_0_update_ecc_status(struct amdgpu_device *adev,
ecc_err->status = status;
ecc_err->ipid = ipid;
ecc_err->addr = addr;
- ecc_err->pa_pfn = UMC_V12_ADDR_MASK_BAD_COLS(pa_addr) >> AMDGPU_GPU_PAGE_SHIFT;
+ ecc_err->pa_pfn = pa_addr >> AMDGPU_GPU_PAGE_SHIFT;
/* If converted pa_pfn is 0, use pa C4 pfn. */
if (!ecc_err->pa_pfn)
diff --git a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
index dea42810fc53..f0074abb5381 100644
--- a/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
+++ b/drivers/gpu/drm/amd/amdgpu/umc_v12_0.h
@@ -82,11 +82,6 @@
(((REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdLo) & 0x1) << 2) | \
(REG_GET_FIELD(ipid, MCMP1_IPIDT0, InstanceIdHi) & 0x03))
-#define UMC_V12_ADDR_MASK_BAD_COLS(addr) \
- ((addr) & ~((0x3ULL << UMC_V12_0_PA_C2_BIT) | \
- (0x1ULL << UMC_V12_0_PA_C4_BIT) | \
- (0x1ULL << UMC_V12_0_PA_R13_BIT)))
-
bool umc_v12_0_is_deferred_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
bool umc_v12_0_is_uncorrectable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
bool umc_v12_0_is_correctable_error(struct amdgpu_device *adev, uint64_t mc_umc_status);
--
2.34.1
------------------------------
Subject: Digest Footer
_______________________________________________
amd-gfx mailing list
amd-gfx at lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/amd-gfx
------------------------------
End of amd-gfx Digest, Vol 102, Issue 101
*****************************************
More information about the amd-gfx
mailing list