diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c | 381 |
1 file changed, 238 insertions, 143 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c index 0734490347db..9d3a3c778504 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mca.c @@ -153,7 +153,7 @@ int amdgpu_mca_mpio_ras_sw_init(struct amdgpu_device *adev) return 0; } -void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set) +static void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set) { if (!mca_set) return; @@ -162,7 +162,7 @@ void amdgpu_mca_bank_set_init(struct mca_bank_set *mca_set) INIT_LIST_HEAD(&mca_set->list); } -int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry) +static int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_entry *entry) { struct mca_bank_node *node; @@ -183,14 +183,36 @@ int amdgpu_mca_bank_set_add_entry(struct mca_bank_set *mca_set, struct mca_bank_ return 0; } -void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set) +static int amdgpu_mca_bank_set_merge(struct mca_bank_set *mca_set, struct mca_bank_set *new) +{ + struct mca_bank_node *node; + + list_for_each_entry(node, &new->list, node) + amdgpu_mca_bank_set_add_entry(mca_set, &node->entry); + + return 0; +} + +static void amdgpu_mca_bank_set_remove_node(struct mca_bank_set *mca_set, struct mca_bank_node *node) +{ + if (!node) + return; + + list_del(&node->node); + kvfree(node); + + mca_set->nr_entries--; +} + +static void amdgpu_mca_bank_set_release(struct mca_bank_set *mca_set) { struct mca_bank_node *node, *tmp; - list_for_each_entry_safe(node, tmp, &mca_set->list, node) { - list_del(&node->node); - kvfree(node); - } + if (list_empty(&mca_set->list)) + return; + + list_for_each_entry_safe(node, tmp, &mca_set->list, node) + amdgpu_mca_bank_set_remove_node(mca_set, node); } void amdgpu_mca_smu_init_funcs(struct amdgpu_device *adev, const struct amdgpu_mca_smu_funcs *mca_funcs) @@ -200,6 +222,45 @@ void amdgpu_mca_smu_init_funcs(struct amdgpu_device 
*adev, const struct amdgpu_m mca->mca_funcs = mca_funcs; } +int amdgpu_mca_init(struct amdgpu_device *adev) +{ + struct amdgpu_mca *mca = &adev->mca; + struct mca_bank_cache *mca_cache; + int i; + + atomic_set(&mca->ue_update_flag, 0); + + for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) { + mca_cache = &mca->mca_caches[i]; + mutex_init(&mca_cache->lock); + amdgpu_mca_bank_set_init(&mca_cache->mca_set); + } + + return 0; +} + +void amdgpu_mca_fini(struct amdgpu_device *adev) +{ + struct amdgpu_mca *mca = &adev->mca; + struct mca_bank_cache *mca_cache; + int i; + + atomic_set(&mca->ue_update_flag, 0); + + for (i = 0; i < ARRAY_SIZE(mca->mca_caches); i++) { + mca_cache = &mca->mca_caches[i]; + amdgpu_mca_bank_set_release(&mca_cache->mca_set); + mutex_destroy(&mca_cache->lock); + } +} + +int amdgpu_mca_reset(struct amdgpu_device *adev) +{ + amdgpu_mca_fini(adev); + + return amdgpu_mca_init(adev); +} + int amdgpu_mca_smu_set_debug_mode(struct amdgpu_device *adev, bool enable) { const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; @@ -228,175 +289,213 @@ static void amdgpu_mca_smu_mca_bank_dump(struct amdgpu_device *adev, int idx, st idx, entry->regs[MCA_REG_IDX_SYND]); } -int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, - struct ras_err_data *err_data, struct ras_query_context *qctx) +static int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count) { - struct amdgpu_smuio_mcm_config_info mcm_info; - struct ras_err_addr err_addr = {0}; - struct mca_bank_set mca_set; - struct mca_bank_node *node; - struct mca_bank_entry *entry; - uint32_t count; - int ret, i = 0; - - amdgpu_mca_bank_set_init(&mca_set); - - ret = amdgpu_mca_smu_get_mca_set(adev, blk, type, &mca_set); - if (ret) - goto out_mca_release; - - list_for_each_entry(node, &mca_set.list, node) { - entry = &node->entry; + const struct amdgpu_mca_smu_funcs *mca_funcs = 
adev->mca.mca_funcs; - amdgpu_mca_smu_mca_bank_dump(adev, i++, entry, qctx); + if (!count) + return -EINVAL; - count = 0; - ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); - if (ret) - goto out_mca_release; + if (mca_funcs && mca_funcs->mca_get_valid_mca_count) + return mca_funcs->mca_get_valid_mca_count(adev, type, count); - if (!count) - continue; + return -EOPNOTSUPP; +} - mcm_info.socket_id = entry->info.socket_id; - mcm_info.die_id = entry->info.aid; +static int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, + int idx, struct mca_bank_entry *entry) +{ + const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + int count; - if (blk == AMDGPU_RAS_BLOCK__UMC) { - err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS]; - err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID]; - err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR]; - } + if (!mca_funcs || !mca_funcs->mca_get_mca_entry) + return -EOPNOTSUPP; - if (type == AMDGPU_MCA_ERROR_TYPE_UE) - amdgpu_ras_error_statistic_ue_count(err_data, - &mcm_info, &err_addr, (uint64_t)count); - else { - if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS])) - amdgpu_ras_error_statistic_de_count(err_data, - &mcm_info, &err_addr, (uint64_t)count); - else - amdgpu_ras_error_statistic_ce_count(err_data, - &mcm_info, &err_addr, (uint64_t)count); - } + switch (type) { + case AMDGPU_MCA_ERROR_TYPE_UE: + count = mca_funcs->max_ue_count; + break; + case AMDGPU_MCA_ERROR_TYPE_CE: + count = mca_funcs->max_ce_count; + break; + default: + return -EINVAL; } -out_mca_release: - amdgpu_mca_bank_set_release(&mca_set); + if (idx >= count) + return -EINVAL; - return ret; + return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); } - -int amdgpu_mca_smu_get_valid_mca_count(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, uint32_t *count) +static bool amdgpu_mca_bank_should_update(struct amdgpu_device *adev, enum amdgpu_mca_error_type 
type) { - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - - if (!count) - return -EINVAL; - - if (mca_funcs && mca_funcs->mca_get_valid_mca_count) - return mca_funcs->mca_get_valid_mca_count(adev, type, count); + struct amdgpu_mca *mca = &adev->mca; + bool ret = true; + + /* + * Because the UE Valid MCA count will only be cleared after reset, + * in order to avoid repeated counting of the error count, + * the aca bank is only updated once during the gpu recovery stage. + */ + if (type == AMDGPU_MCA_ERROR_TYPE_UE) { + if (amdgpu_ras_intr_triggered()) + ret = atomic_cmpxchg(&mca->ue_update_flag, 0, 1) == 0; + else + atomic_set(&mca->ue_update_flag, 0); + } - return -EOPNOTSUPP; + return ret; } -int amdgpu_mca_smu_get_mca_set_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, uint32_t *total) +static int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set, + struct ras_query_context *qctx) { - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - struct mca_bank_set mca_set; - struct mca_bank_node *node; - struct mca_bank_entry *entry; - uint32_t count; + struct mca_bank_entry entry; + uint32_t count = 0, i; int ret; - if (!total) + if (!mca_set) return -EINVAL; - if (!mca_funcs) - return -EOPNOTSUPP; - - if (!mca_funcs->mca_get_ras_mca_set || !mca_funcs->mca_get_valid_mca_count) - return -EOPNOTSUPP; - - amdgpu_mca_bank_set_init(&mca_set); + if (!amdgpu_mca_bank_should_update(adev, type)) + return 0; - ret = mca_funcs->mca_get_ras_mca_set(adev, blk, type, &mca_set); + ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count); if (ret) - goto err_mca_set_release; - - *total = 0; - list_for_each_entry(node, &mca_set.list, node) { - entry = &node->entry; + return ret; - count = 0; - ret = mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, &count); + for (i = 0; i < count; i++) { + memset(&entry, 0, 
sizeof(entry)); + ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, &entry); if (ret) - goto err_mca_set_release; + return ret; - *total += count; - } + amdgpu_mca_bank_set_add_entry(mca_set, &entry); -err_mca_set_release: - amdgpu_mca_bank_set_release(&mca_set); + amdgpu_mca_smu_mca_bank_dump(adev, i, &entry, qctx); + } - return ret; + return 0; } -int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) +static int amdgpu_mca_smu_parse_mca_error_count(struct amdgpu_device *adev, enum amdgpu_ras_block blk, + enum amdgpu_mca_error_type type, struct mca_bank_entry *entry, uint32_t *count) { const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + if (!count || !entry) return -EINVAL; if (!mca_funcs || !mca_funcs->mca_parse_mca_error_count) return -EOPNOTSUPP; - return mca_funcs->mca_parse_mca_error_count(adev, blk, type, entry, count); } -int amdgpu_mca_smu_get_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, - enum amdgpu_mca_error_type type, struct mca_bank_set *mca_set) +static int amdgpu_mca_dispatch_mca_set(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct mca_bank_set *mca_set, struct ras_err_data *err_data) { - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; + struct ras_err_addr err_addr; + struct amdgpu_smuio_mcm_config_info mcm_info; + struct mca_bank_node *node, *tmp; + struct mca_bank_entry *entry; + uint32_t count; + int ret; if (!mca_set) return -EINVAL; - if (!mca_funcs || !mca_funcs->mca_get_ras_mca_set) - return -EOPNOTSUPP; + if (!mca_set->nr_entries) + return 0; - WARN_ON(!list_empty(&mca_set->list)); + list_for_each_entry_safe(node, tmp, &mca_set->list, node) { + entry = &node->entry; + + count = 0; + ret = amdgpu_mca_smu_parse_mca_error_count(adev, blk, type, entry, &count); + if (ret && ret != -EOPNOTSUPP) + return ret; + + if 
(!count) + continue; + + memset(&mcm_info, 0, sizeof(mcm_info)); + memset(&err_addr, 0, sizeof(err_addr)); + + mcm_info.socket_id = entry->info.socket_id; + mcm_info.die_id = entry->info.aid; + + if (blk == AMDGPU_RAS_BLOCK__UMC) { + err_addr.err_status = entry->regs[MCA_REG_IDX_STATUS]; + err_addr.err_ipid = entry->regs[MCA_REG_IDX_IPID]; + err_addr.err_addr = entry->regs[MCA_REG_IDX_ADDR]; + } + + if (type == AMDGPU_MCA_ERROR_TYPE_UE) { + amdgpu_ras_error_statistic_ue_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + } else { + if (amdgpu_mca_is_deferred_error(adev, entry->regs[MCA_REG_IDX_STATUS])) + amdgpu_ras_error_statistic_de_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + else + amdgpu_ras_error_statistic_ce_count(err_data, + &mcm_info, &err_addr, (uint64_t)count); + } - return mca_funcs->mca_get_ras_mca_set(adev, blk, type, mca_set); + amdgpu_mca_bank_set_remove_node(mca_set, node); + } + + return 0; } -int amdgpu_mca_smu_get_mca_entry(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, - int idx, struct mca_bank_entry *entry) +static int amdgpu_mca_add_mca_set_to_cache(struct amdgpu_device *adev, enum amdgpu_mca_error_type type, struct mca_bank_set *new) { - const struct amdgpu_mca_smu_funcs *mca_funcs = adev->mca.mca_funcs; - int count; + struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type]; + int ret; - if (!mca_funcs || !mca_funcs->mca_get_mca_entry) - return -EOPNOTSUPP; + mutex_lock(&mca_cache->lock); + ret = amdgpu_mca_bank_set_merge(&mca_cache->mca_set, new); + mutex_unlock(&mca_cache->lock); - switch (type) { - case AMDGPU_MCA_ERROR_TYPE_UE: - count = mca_funcs->max_ue_count; - break; - case AMDGPU_MCA_ERROR_TYPE_CE: - count = mca_funcs->max_ce_count; - break; - default: - return -EINVAL; + return ret; +} + +int amdgpu_mca_smu_log_ras_error(struct amdgpu_device *adev, enum amdgpu_ras_block blk, enum amdgpu_mca_error_type type, + struct ras_err_data *err_data, struct ras_query_context *qctx) +{ + struct 
mca_bank_set mca_set; + struct mca_bank_cache *mca_cache = &adev->mca.mca_caches[type]; + int ret; + + amdgpu_mca_bank_set_init(&mca_set); + + ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, qctx); + if (ret) + goto out_mca_release; + + ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_set, err_data); + if (ret) + goto out_mca_release; + + /* add remain mca bank to mca cache */ + if (mca_set.nr_entries) { + ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set); + if (ret) + goto out_mca_release; } - if (idx >= count) - return -EINVAL; + /* dispatch mca set again if mca cache has valid data */ + mutex_lock(&mca_cache->lock); + if (mca_cache->mca_set.nr_entries) + ret = amdgpu_mca_dispatch_mca_set(adev, blk, type, &mca_cache->mca_set, err_data); + mutex_unlock(&mca_cache->lock); - return mca_funcs->mca_get_mca_entry(adev, type, idx, entry); +out_mca_release: + amdgpu_mca_bank_set_release(&mca_set); + + return ret; } #if defined(CONFIG_DEBUG_FS) @@ -437,36 +536,32 @@ static void mca_dump_entry(struct seq_file *m, struct mca_bank_entry *entry) static int mca_dump_show(struct seq_file *m, enum amdgpu_mca_error_type type) { struct amdgpu_device *adev = (struct amdgpu_device *)m->private; - struct mca_bank_entry *entry; - uint32_t count = 0; - int i, ret; + struct mca_bank_node *node; + struct mca_bank_set mca_set; + struct ras_query_context qctx; + int ret; - ret = amdgpu_mca_smu_get_valid_mca_count(adev, type, &count); + amdgpu_mca_bank_set_init(&mca_set); + + qctx.event_id = 0ULL; + ret = amdgpu_mca_smu_get_mca_set(adev, type, &mca_set, &qctx); if (ret) - return ret; + goto err_free_mca_set; seq_printf(m, "amdgpu smu %s valid mca count: %d\n", - type == AMDGPU_MCA_ERROR_TYPE_UE ? "UE" : "CE", count); - - if (!count) - return 0; + type == AMDGPU_MCA_ERROR_TYPE_UE ? 
"UE" : "CE", mca_set.nr_entries); - entry = kmalloc(sizeof(*entry), GFP_KERNEL); - if (!entry) - return -ENOMEM; + if (!mca_set.nr_entries) + goto err_free_mca_set; - for (i = 0; i < count; i++) { - memset(entry, 0, sizeof(*entry)); + list_for_each_entry(node, &mca_set.list, node) + mca_dump_entry(m, &node->entry); - ret = amdgpu_mca_smu_get_mca_entry(adev, type, i, entry); - if (ret) - goto err_free_entry; + /* add mca bank to mca bank cache */ + ret = amdgpu_mca_add_mca_set_to_cache(adev, type, &mca_set); - mca_dump_entry(m, entry); - } - -err_free_entry: - kfree(entry); +err_free_mca_set: + amdgpu_mca_bank_set_release(&mca_set); return ret; } @@ -513,7 +608,7 @@ DEFINE_DEBUGFS_ATTRIBUTE(mca_debug_mode_fops, NULL, amdgpu_mca_smu_debug_mode_se void amdgpu_mca_smu_debugfs_init(struct amdgpu_device *adev, struct dentry *root) { #if defined(CONFIG_DEBUG_FS) - if (!root || amdgpu_ip_version(adev, MP1_HWIP, 0) != IP_VERSION(13, 0, 6)) + if (!root) return; debugfs_create_file("mca_debug_mode", 0200, root, adev, &mca_debug_mode_fops); |