Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r--  drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c  230
1 file changed, 128 insertions(+), 102 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 1adc81a55734..b3d11703df04 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -1299,7 +1299,7 @@ ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *a
return -EINVAL;
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n%s: %lu\n", "ue", info.ue_count,
- "ce", info.ce_count, "de", info.ue_count);
+ "ce", info.ce_count, "de", info.de_count);
}
static int amdgpu_ras_query_error_status_helper(struct amdgpu_device *adev,
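The hunk above fixes a copy-paste bug: the "de" (deferred error) row was printing info.ue_count instead of info.de_count. A minimal userspace sketch of the corrected formatting, with snprintf() standing in for the kernel-only sysfs_emit() and an illustrative struct in place of the driver's real query info:

/* Userspace sketch; struct and values are illustrative. */
#include <stdio.h>

struct err_info {
	unsigned long ue_count;	/* uncorrectable errors */
	unsigned long ce_count;	/* correctable errors */
	unsigned long de_count;	/* deferred errors */
};

int main(void)
{
	struct err_info info = { .ue_count = 1, .ce_count = 2, .de_count = 3 };
	char buf[128];

	/* Before the fix, the "de" row repeated info.ue_count. */
	snprintf(buf, sizeof(buf), "%s: %lu\n%s: %lu\n%s: %lu\n",
		 "ue", info.ue_count, "ce", info.ce_count, "de", info.de_count);
	fputs(buf, stdout);
	return 0;
}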
@@ -1759,6 +1759,9 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+ if (amdgpu_aca_is_enabled(adev))
+ return 0;
+
if (!obj || obj->attr_inuse)
return -EINVAL;
@@ -1793,6 +1796,9 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
{
struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
+ if (amdgpu_aca_is_enabled(adev))
+ return 0;
+
if (!obj || !obj->attr_inuse)
return -EINVAL;
@@ -2172,12 +2178,15 @@ static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
struct ras_dispatch_if *info)
{
- struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
- struct ras_ih_data *data = &obj->ih_data;
+ struct ras_manager *obj;
+ struct ras_ih_data *data;
+ obj = amdgpu_ras_find_obj(adev, &info->head);
if (!obj)
return -EINVAL;
+ data = &obj->ih_data;
+
if (data->inuse == 0)
return 0;
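This hunk fixes a potential NULL dereference: &obj->ih_data was computed at declaration time, before the result of amdgpu_ras_find_obj() had been checked. A standalone C sketch of the corrected ordering (all names are stand-ins, not the driver's real types):

#include <stdio.h>
#include <stddef.h>

struct ih_data { int inuse; };
struct manager { struct ih_data ih_data; };

/* find() may return NULL, like amdgpu_ras_find_obj(). */
static struct manager *find(int id)
{
	static struct manager m = { .ih_data = { .inuse = 1 } };
	return id == 0 ? &m : NULL;
}

static int dispatch(int id)
{
	struct manager *obj;
	struct ih_data *data;

	obj = find(id);
	if (!obj)
		return -1;

	/* Only take the member address once obj is known valid;
	 * computing &obj->ih_data from a NULL obj is undefined. */
	data = &obj->ih_data;
	if (data->inuse == 0)
		return 0;

	printf("dispatching for id %d\n", id);
	return 0;
}

int main(void)
{
	dispatch(0);
	dispatch(1);	/* hits the NULL path safely */
	return 0;
}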
@@ -2804,8 +2813,8 @@ static void amdgpu_ras_do_page_retirement(struct work_struct *work)
mutex_unlock(&con->umc_ecc_log.lock);
}
-static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
- enum amdgpu_ras_block ras_block, uint32_t timeout_ms)
+static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
+ uint32_t timeout_ms)
{
int ret = 0;
struct ras_ecc_log_info *ecc_log;
@@ -2814,7 +2823,7 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
memset(&info, 0, sizeof(info));
- info.head.block = ras_block;
+ info.head.block = AMDGPU_RAS_BLOCK__UMC;
ecc_log = &ras->umc_ecc_log;
ecc_log->de_updated = false;
@@ -2822,7 +2831,7 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
ret = amdgpu_ras_query_error_status(adev, &info);
if (ret) {
dev_err(adev->dev, "Failed to query ras error! ret:%d\n", ret);
- return ret;
+ return;
}
if (timeout && !ecc_log->de_updated) {
@@ -2833,21 +2842,11 @@ static int amdgpu_ras_query_ecc_status(struct amdgpu_device *adev,
if (timeout_ms && !timeout) {
dev_warn(adev->dev, "Can't find deferred error\n");
- return -ETIMEDOUT;
+ return;
}
- return 0;
-}
-
-static void amdgpu_ras_poison_creation_handler(struct amdgpu_device *adev,
- uint32_t timeout)
-{
- struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
- int ret;
-
- ret = amdgpu_ras_query_ecc_status(adev, AMDGPU_RAS_BLOCK__UMC, timeout);
if (!ret)
- schedule_delayed_work(&con->page_retirement_dwork, 0);
+ schedule_delayed_work(&ras->page_retirement_dwork, 0);
}
static int amdgpu_ras_poison_consumption_handler(struct amdgpu_device *adev,
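The refactor above folds amdgpu_ras_query_ecc_status() into amdgpu_ras_poison_creation_handler(): the handler now hardcodes the UMC block, drops the error return, and schedules page retirement only when the bounded wait for the deferred-error log succeeds. A userspace sketch of that bounded wait, with usleep() standing in for msleep() and a plain flag in place of ecc_log->de_updated:

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

/* Poll a flag once per millisecond, giving up after timeout_ms. */
static bool wait_for_flag(const volatile bool *flag, unsigned int timeout_ms)
{
	unsigned int waited;

	for (waited = 0; waited < timeout_ms; waited++) {
		if (*flag)
			return true;
		usleep(1000);	/* 1 ms, userspace stand-in for msleep(1) */
	}
	return *flag;
}

int main(void)
{
	volatile bool de_updated = false;

	if (!wait_for_flag(&de_updated, 10))
		fprintf(stderr, "Can't find deferred error\n");
	return 0;
}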
@@ -2896,7 +2895,7 @@ static int amdgpu_ras_page_retirement_thread(void *param)
ras_block = poison_msg.block;
- dev_info(adev->dev, "Start processing ras block %s(%d)\n",
+ dev_dbg(adev->dev, "Start processing ras block %s(%d)\n",
ras_block_str(ras_block), ras_block);
if (ras_block == AMDGPU_RAS_BLOCK__UMC) {
@@ -2927,7 +2926,6 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data **data;
u32 max_eeprom_records_count = 0;
- bool exc_err_limit = false;
int ret;
if (!con || amdgpu_sriov_vf(adev))
@@ -2964,12 +2962,12 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
*/
if (adev->gmc.xgmi.pending_reset)
return 0;
- ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+ ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
/*
- * This calling fails when exc_err_limit is true or
+ * This call fails when is_rma is true or
* ret != 0.
*/
- if (exc_err_limit || ret)
+ if (con->is_rma || ret)
goto free;
if (con->eeprom_control.ras_num_recs) {
@@ -3017,7 +3015,7 @@ out:
* Except error threshold exceeding case, other failure cases in this
* function would not fail amdgpu driver init.
*/
- if (!exc_err_limit)
+ if (!con->is_rma)
ret = 0;
else
ret = -EINVAL;
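These two hunks replace the exc_err_limit out-parameter with a flag kept in the RAS context (con->is_rma), so amdgpu_ras_eeprom_init() loses a parameter and every caller reads the same state. A small sketch of the pattern under assumed names (the threshold value and function bodies are illustrative):

#include <stdbool.h>
#include <stdio.h>

struct ras_context { bool is_rma; };

static int eeprom_init(struct ras_context *con, int bad_record_count)
{
	/* The EEPROM layer records the reached-threshold state in the
	 * shared context instead of reporting it via an out-parameter. */
	con->is_rma = bad_record_count > 8;	/* illustrative threshold */
	return 0;
}

int main(void)
{
	struct ras_context con = { 0 };
	int ret = eeprom_init(&con, 10);

	if (con.is_rma || ret)
		fprintf(stderr, "bailing out: threshold exceeded\n");
	return 0;
}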
@@ -3063,6 +3061,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
case IP_VERSION(13, 0, 2):
case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
return true;
default:
return false;
@@ -3074,6 +3073,7 @@ static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev)
case IP_VERSION(13, 0, 0):
case IP_VERSION(13, 0, 6):
case IP_VERSION(13, 0, 10):
+ case IP_VERSION(13, 0, 14):
return true;
default:
return false;
@@ -3297,6 +3297,24 @@ static void amdgpu_ras_event_mgr_init(struct amdgpu_device *adev)
amdgpu_put_xgmi_hive(hive);
}
+static void amdgpu_ras_init_reserved_vram_size(struct amdgpu_device *adev)
+{
+ struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+
+ if (!con || (adev->flags & AMD_IS_APU))
+ return;
+
+ switch (amdgpu_ip_version(adev, MP0_HWIP, 0)) {
+ case IP_VERSION(13, 0, 2):
+ case IP_VERSION(13, 0, 6):
+ case IP_VERSION(13, 0, 14):
+ con->reserved_pages_in_bytes = AMDGPU_RAS_RESERVED_VRAM_SIZE;
+ break;
+ default:
+ break;
+ }
+}
+
int amdgpu_ras_init(struct amdgpu_device *adev)
{
struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
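The new helper above reserves VRAM for RAS only on supported dGPU MP0 versions. A compilable sketch of the same gating, assuming the kernel's IP_VERSION bit packing and a placeholder reservation size (the real value is AMDGPU_RAS_RESERVED_VRAM_SIZE):

#include <stdint.h>
#include <stdio.h>

/* Follows the kernel's IP_VERSION packing convention. */
#define IP_VERSION(maj, min, rev) (((maj) << 16) | ((min) << 8) | (rev))
#define RESERVED_SIZE (16ULL << 20)	/* placeholder, not the real constant */

static uint64_t reserved_vram_bytes(uint32_t mp0_version, int is_apu)
{
	if (is_apu)
		return 0;

	switch (mp0_version) {
	case IP_VERSION(13, 0, 2):
	case IP_VERSION(13, 0, 6):
	case IP_VERSION(13, 0, 14):
		return RESERVED_SIZE;
	default:
		return 0;
	}
}

int main(void)
{
	printf("13.0.6  -> %llu bytes\n",
	       (unsigned long long)reserved_vram_bytes(IP_VERSION(13, 0, 6), 0));
	printf("13.0.10 -> %llu bytes\n",
	       (unsigned long long)reserved_vram_bytes(IP_VERSION(13, 0, 10), 0));
	return 0;
}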
@@ -3402,6 +3420,8 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
/* Get RAS schema for particular SOC */
con->schema = amdgpu_get_ras_schema(adev);
+ amdgpu_ras_init_reserved_vram_size(adev);
+
if (amdgpu_ras_fs_init(adev)) {
r = -EINVAL;
goto release_con;
@@ -3613,25 +3633,33 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
struct amdgpu_ras_block_object *obj;
int r;
- /* Guest side doesn't need init ras feature */
- if (amdgpu_sriov_vf(adev))
- return 0;
-
amdgpu_ras_event_mgr_init(adev);
if (amdgpu_aca_is_enabled(adev)) {
- if (amdgpu_in_reset(adev))
- r = amdgpu_aca_reset(adev);
- else
+ if (!amdgpu_in_reset(adev)) {
r = amdgpu_aca_init(adev);
+ if (r)
+ return r;
+ }
+
+ if (!amdgpu_sriov_vf(adev))
+ amdgpu_ras_set_aca_debug_mode(adev, false);
+ } else {
+ if (amdgpu_in_reset(adev))
+ r = amdgpu_mca_reset(adev);
+ else
+ r = amdgpu_mca_init(adev);
if (r)
return r;
- amdgpu_ras_set_aca_debug_mode(adev, false);
- } else {
- amdgpu_ras_set_mca_debug_mode(adev, false);
+ if (!amdgpu_sriov_vf(adev))
+ amdgpu_ras_set_mca_debug_mode(adev, false);
}
+ /* Guest side doesn't need init ras feature */
+ if (amdgpu_sriov_vf(adev))
+ return 0;
+
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
obj = node->ras_obj;
if (!obj) {
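This reordering brings the ACA/MCA bank up before the SR-IOV early return, so guests still initialize the error-counter backend while skipping per-block feature setup and debug-mode writes. A standalone sketch of the resulting control flow with stub helpers (all names are stand-ins for the driver's functions):

#include <stdbool.h>
#include <stdio.h>

static bool aca_enabled = true;
static bool is_vf = true;
static bool in_reset = false;

static int aca_init(void) { puts("aca_init"); return 0; }
static void set_debug_mode(bool on) { printf("debug mode: %d\n", on); }
static void init_block_features(void) { puts("block features"); }

static int late_init(void)
{
	int r;

	/* Bank init now runs before the SR-IOV check, so guests
	 * still bring up the counter backend. */
	if (aca_enabled) {
		if (!in_reset) {
			r = aca_init();
			if (r)
				return r;
		}
		if (!is_vf)
			set_debug_mode(false);
	}

	/* Guests skip only the per-block feature setup. */
	if (is_vf)
		return 0;

	init_block_features();
	return 0;
}

int main(void) { return late_init(); }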
@@ -3701,6 +3729,8 @@ int amdgpu_ras_fini(struct amdgpu_device *adev)
if (amdgpu_aca_is_enabled(adev))
amdgpu_aca_fini(adev);
+ else
+ amdgpu_mca_fini(adev);
WARN(AMDGPU_RAS_GET_FEATURES(con->features), "Feature mask is not cleared");
@@ -4284,21 +4314,8 @@ static struct ras_err_info *amdgpu_ras_error_get_info(struct ras_err_data *err_d
void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *err_addr)
{
- struct ras_err_addr *mca_err_addr;
-
/* This function will be retired. */
return;
- mca_err_addr = kzalloc(sizeof(*mca_err_addr), GFP_KERNEL);
- if (!mca_err_addr)
- return;
-
- INIT_LIST_HEAD(&mca_err_addr->node);
-
- mca_err_addr->err_status = err_addr->err_status;
- mca_err_addr->err_ipid = err_addr->err_ipid;
- mca_err_addr->err_addr = err_addr->err_addr;
-
- list_add_tail(&mca_err_addr->node, &err_info->err_addr_list);
}
void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, struct ras_err_addr *mca_err_addr)
@@ -4382,64 +4399,74 @@ int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data,
#define mmMP0_SMN_C2PMSG_92 0x1609C
#define mmMP0_SMN_C2PMSG_126 0x160BE
static void amdgpu_ras_boot_time_error_reporting(struct amdgpu_device *adev,
- u32 instance, u32 boot_error)
+ u32 instance)
{
u32 socket_id, aid_id, hbm_id;
- u32 reg_data;
+ u32 fw_status;
+ u32 boot_error;
u64 reg_addr;
- socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
- aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
- hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
-
/* The pattern for smn addressing in other SOC could be different from
* the one for aqua_vanjaram. We should revisit the code if the pattern
* is changed. In such case, replace the aqua_vanjaram implementation
* with more common helper */
reg_addr = (mmMP0_SMN_C2PMSG_92 << 2) +
aqua_vanjaram_encode_ext_smn_addressing(instance);
+ fw_status = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
+
+ reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
+ aqua_vanjaram_encode_ext_smn_addressing(instance);
+ boot_error = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- dev_err(adev->dev, "socket: %d, aid: %d, firmware boot failed, fw status is 0x%x\n",
- socket_id, aid_id, reg_data);
+ socket_id = AMDGPU_RAS_GPU_ERR_SOCKET_ID(boot_error);
+ aid_id = AMDGPU_RAS_GPU_ERR_AID_ID(boot_error);
+ hbm_id = AMDGPU_RAS_GPU_ERR_HBM_ID(boot_error);
if (AMDGPU_RAS_GPU_ERR_MEM_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, memory training failed\n",
- socket_id, aid_id, hbm_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, memory training failed\n",
+ socket_id, aid_id, hbm_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_FW_LOAD(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, firmware load failed at boot time\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, firmware load failed at boot time\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, wafl link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, wafl link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, xgmi link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, xgmi link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, usr cp link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, usr cp link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, usr dp link training failed\n",
- socket_id, aid_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, fw_status: 0x%x, usr dp link training failed\n",
+ socket_id, aid_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm memory test failed\n",
- socket_id, aid_id, hbm_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm memory test failed\n",
+ socket_id, aid_id, hbm_id, fw_status);
if (AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(boot_error))
- dev_info(adev->dev, "socket: %d, aid: %d, hbm: %d, hbm bist test failed\n",
- socket_id, aid_id, hbm_id);
+ dev_info(adev->dev,
+ "socket: %d, aid: %d, hbm: %d, fw_status: 0x%x, hbm bist test failed\n",
+ socket_id, aid_id, hbm_id, fw_status);
}
-static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
- u32 instance, u32 *boot_error)
+static bool amdgpu_ras_boot_error_detected(struct amdgpu_device *adev,
+ u32 instance)
{
- u32 reg_addr;
+ u64 reg_addr;
u32 reg_data;
int retry_loop;
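The reporting path now reads both the firmware status (C2PMSG_92) and the boot-error word (C2PMSG_126) itself, then decodes the packed fields and includes fw_status in every message. A userspace sketch of that decode-and-report step; the bit positions below are invented for illustration, the real layout lives in the AMDGPU_RAS_GPU_ERR_* macros in amdgpu_ras.h:

#include <stdint.h>
#include <stdio.h>

/* Illustrative field layout, not the driver's real one. */
#define ERR_MEM_TRAINING(x)	((x) & 0x1)
#define ERR_FW_LOAD(x)		((x) & 0x2)
#define ERR_SOCKET_ID(x)	(((x) >> 8) & 0xf)
#define ERR_AID_ID(x)		(((x) >> 12) & 0xf)
#define ERR_HBM_ID(x)		(((x) >> 16) & 0xf)

static void report(uint32_t boot_error, uint32_t fw_status)
{
	uint32_t socket_id = ERR_SOCKET_ID(boot_error);
	uint32_t aid_id = ERR_AID_ID(boot_error);
	uint32_t hbm_id = ERR_HBM_ID(boot_error);

	if (ERR_MEM_TRAINING(boot_error))
		printf("socket: %u, aid: %u, hbm: %u, fw_status: 0x%x, memory training failed\n",
		       socket_id, aid_id, hbm_id, fw_status);
	if (ERR_FW_LOAD(boot_error))
		printf("socket: %u, aid: %u, fw_status: 0x%x, firmware load failed at boot time\n",
		       socket_id, aid_id, fw_status);
}

int main(void)
{
	report(0x00011103u, 0xdeadbeefu);
	return 0;
}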
@@ -4448,41 +4475,22 @@ static int amdgpu_ras_wait_for_boot_complete(struct amdgpu_device *adev,
for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS) {
- *boot_error = AMDGPU_RAS_BOOT_SUCEESS;
- return 0;
- }
- msleep(1);
- }
-
- /* The pattern for smn addressing in other SOC could be different from
- * the one for aqua_vanjaram. We should revisit the code if the pattern
- * is changed. In such case, replace the aqua_vanjaram implementation
- * with more common helper */
- reg_addr = (mmMP0_SMN_C2PMSG_126 << 2) +
- aqua_vanjaram_encode_ext_smn_addressing(instance);
-
- for (retry_loop = 0; retry_loop < AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT; retry_loop++) {
- reg_data = amdgpu_device_indirect_rreg_ext(adev, reg_addr);
- if (AMDGPU_RAS_GPU_ERR_BOOT_STATUS(reg_data)) {
- *boot_error = reg_data;
- return 0;
- }
- msleep(1);
+ if ((reg_data & AMDGPU_RAS_BOOT_STATUS_MASK) == AMDGPU_RAS_BOOT_STEADY_STATUS)
+ return false;
+ else
+ msleep(1);
}
- *boot_error = reg_data;
- return -ETIME;
+ return true;
}
void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances)
{
- u32 boot_error = 0;
u32 i;
for (i = 0; i < num_instances; i++) {
- if (amdgpu_ras_wait_for_boot_complete(adev, i, &boot_error))
- amdgpu_ras_boot_time_error_reporting(adev, i, boot_error);
+ if (amdgpu_ras_boot_error_detected(adev, i))
+ amdgpu_ras_boot_time_error_reporting(adev, i);
}
}
@@ -4501,3 +4509,21 @@ int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn)
return ret;
}
+
+void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id,
+ const char *fmt, ...)
+{
+ struct va_format vaf;
+ va_list args;
+
+ va_start(args, fmt);
+ vaf.fmt = fmt;
+ vaf.va = &args;
+
+ if (amdgpu_ras_event_id_is_valid(adev, event_id))
+ dev_printk(KERN_INFO, adev->dev, "{%llu}%pV", event_id, &vaf);
+ else
+ dev_printk(KERN_INFO, adev->dev, "%pV", &vaf);
+
+ va_end(args);
+}
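The new helper prefixes log lines with a RAS event id by forwarding its va_list through the kernel's %pV and struct va_format, which lets the caller's arguments be expanded exactly once inside dev_printk(). A userspace analogue, with vprintf() playing the role of %pV and a simple sentinel faking event-id validity:

#include <stdarg.h>
#include <stdio.h>

#define INVALID_EVENT_ID 0ULL	/* illustrative sentinel */

static void event_log_print(unsigned long long event_id, const char *fmt, ...)
{
	va_list args;

	va_start(args, fmt);
	/* Emit the {event_id} prefix only when the id is valid,
	 * mirroring the amdgpu_ras_event_id_is_valid() branch. */
	if (event_id != INVALID_EVENT_ID)
		printf("{%llu}", event_id);
	vprintf(fmt, args);
	va_end(args);
}

int main(void)
{
	event_log_print(42, "ras event on block %s\n", "umc");
	event_log_print(INVALID_EVENT_ID, "ras event on block %s\n", "gfx");
	return 0;
}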