Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 120
1 file changed, 77 insertions(+), 43 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index 2de9309a4193..ccebd8e2a2d8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -35,6 +35,8 @@
#include "amdgpu_xgmi.h"
#include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
#include "atom.h"
+#include "amdgpu_reset.h"
+
#ifdef CONFIG_X86_MCE_AMD
#include <asm/mce.h>
@@ -197,6 +199,13 @@ static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
+ /* Hardware counter will be reset automatically after the query on Vega20 and Arcturus */
+ if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
+ dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
+ }
+
s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
"ue", info.ue_count,
"ce", info.ce_count);
@@ -550,9 +559,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
if (amdgpu_ras_query_error_status(obj->adev, &info))
return -EINVAL;
- if (obj->adev->asic_type == CHIP_ALDEBARAN) {
+ if (obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ obj->adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
if (amdgpu_ras_reset_error_status(obj->adev, info.head.block))
- DRM_WARN("Failed to reset error counter and error status");
+ dev_warn(obj->adev->dev, "Failed to reset error counter and error status");
}
return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count,
@@ -707,27 +717,30 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
if (!con)
return -EINVAL;
- info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
- if (!info)
- return -ENOMEM;
+ if (head->block == AMDGPU_RAS_BLOCK__GFX) {
+ info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
+ if (!info)
+ return -ENOMEM;
- if (!enable) {
- info->disable_features = (struct ta_ras_disable_features_input) {
- .block_id = amdgpu_ras_block_to_ta(head->block),
- .error_type = amdgpu_ras_error_to_ta(head->type),
- };
- } else {
- info->enable_features = (struct ta_ras_enable_features_input) {
- .block_id = amdgpu_ras_block_to_ta(head->block),
- .error_type = amdgpu_ras_error_to_ta(head->type),
- };
+ if (!enable) {
+ info->disable_features = (struct ta_ras_disable_features_input) {
+ .block_id = amdgpu_ras_block_to_ta(head->block),
+ .error_type = amdgpu_ras_error_to_ta(head->type),
+ };
+ } else {
+ info->enable_features = (struct ta_ras_enable_features_input) {
+ .block_id = amdgpu_ras_block_to_ta(head->block),
+ .error_type = amdgpu_ras_error_to_ta(head->type),
+ };
+ }
}
/* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
/* Only enable ras feature operation handle on host side */
- if (!amdgpu_sriov_vf(adev) &&
+ if (head->block == AMDGPU_RAS_BLOCK__GFX &&
+ !amdgpu_sriov_vf(adev) &&
!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) {
@@ -743,7 +756,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
__amdgpu_ras_feature_enable(adev, head, enable);
ret = 0;
out:
- kfree(info);
+ if (head->block == AMDGPU_RAS_BLOCK__GFX)
+ kfree(info);
return ret;
}
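
The net effect of this hunk: the ta_ras_cmd_input buffer is now allocated, handed to psp_ras_enable_features(), and freed only when head->block is AMDGPU_RAS_BLOCK__GFX; every other block skips the PSP TA round-trip and falls through to __amdgpu_ras_feature_enable() alone. A condensed sketch of the resulting flow (not the verbatim function body; error handling trimmed):

if (head->block == AMDGPU_RAS_BLOCK__GFX) {
	info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
	if (!info)
		return -ENOMEM;
	/* fill info->enable_features or info->disable_features here */
}

if (head->block == AMDGPU_RAS_BLOCK__GFX &&
    !amdgpu_sriov_vf(adev) && !amdgpu_ras_intr_triggered())
	ret = psp_ras_enable_features(&adev->psp, info, enable); /* GFX only */

__amdgpu_ras_feature_enable(adev, head, enable); /* all blocks */

if (head->block == AMDGPU_RAS_BLOCK__GFX)
	kfree(info);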
@@ -1027,9 +1041,6 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
}
}
- if (!amdgpu_persistent_edc_harvesting_supported(adev))
- amdgpu_ras_reset_error_status(adev, info->head.block);
-
return 0;
}
@@ -1149,6 +1160,12 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
if (res)
return res;
+ if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4)) {
+ if (amdgpu_ras_reset_error_status(adev, info.head.block))
+ dev_warn(adev->dev, "Failed to reset error counter and error status");
+ }
+
ce += info.ce_count;
ue += info.ue_count;
}
@@ -1792,6 +1809,13 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
continue;
amdgpu_ras_query_error_status(adev, &info);
+
+ if (adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 2) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(11, 0, 4) &&
+ adev->ip_versions[MP0_HWIP][0] != IP_VERSION(13, 0, 0)) {
+ if (amdgpu_ras_reset_error_status(adev, info.head.block))
+ dev_warn(adev->dev, "Failed to reset error counter and error status");
+ }
}
}
@@ -1919,8 +1943,17 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
amdgpu_put_xgmi_hive(hive);
}
- if (amdgpu_device_should_recover_gpu(ras->adev))
- amdgpu_device_gpu_recover(ras->adev, NULL);
+ if (amdgpu_device_should_recover_gpu(ras->adev)) {
+ struct amdgpu_reset_context reset_context;
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ clear_bit(AMDGPU_SKIP_MODE2_RESET, &reset_context.flags);
+
+ amdgpu_device_gpu_recover(ras->adev, NULL, &reset_context);
+ }
atomic_set(&ras->in_recovery, 0);
}
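
amdgpu_device_gpu_recover() now takes an explicit struct amdgpu_reset_context rather than building one internally. The two clear_bit() calls are strictly redundant after memset() zeroes reset_context.flags, but they spell out the intent: no forced reset method, and mode-2 reset left available. For contrast, a hypothetical caller that wanted to force a full adapter reset would flip the corresponding flag instead, roughly:

struct amdgpu_reset_context reset_context;

memset(&reset_context, 0, sizeof(reset_context));
reset_context.method = AMD_RESET_METHOD_NONE;	/* let the core choose */
reset_context.reset_req_dev = adev;		/* device requesting reset */
set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);	/* force full reset */

amdgpu_device_gpu_recover(adev, NULL /* no hung job */, &reset_context);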
@@ -2131,7 +2164,7 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
bool exc_err_limit = false;
int ret;
- if (!con)
+ if (!con || amdgpu_sriov_vf(adev))
return 0;
/* Allow access to RAS EEPROM via debugfs, when the ASIC
@@ -2278,8 +2311,9 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
!amdgpu_ras_asic_supported(adev))
return;
- if (!(amdgpu_sriov_vf(adev) &&
- (adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
+ /* If the driver runs on the SR-IOV guest side, only enable RAS for Aldebaran */
+ if (amdgpu_sriov_vf(adev) &&
+ adev->ip_versions[MP1_HWIP][0] != IP_VERSION(13, 0, 2))
return;
if (!adev->gmc.xgmi.connected_to_cpu) {
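
This is more than a style inversion of the old condition; it changes behavior on bare metal. The old guard returned early unless the device was an SR-IOV VF on MP1 13.0.2 (Aldebaran), which also bailed out on every bare-metal device. The rewritten guard only bails out for non-Aldebaran VFs. In boolean terms:

/* Old: if (!(sriov && aldebaran)) return;
 *      bare metal: sriov == false -> return (RAS never marked supported)
 * New: if (sriov && !aldebaran) return;
 *      bare metal: sriov == false -> fall through, RAS support detected
 *      VF on Aldebaran            -> fall through
 *      VF on anything else        -> return
 */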
@@ -2686,7 +2720,8 @@ int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
/* Need to disable RAS on all IPs here before IP [hw/sw]fini */
- amdgpu_ras_disable_all_features(adev, 0);
+ if (con->features)
+ amdgpu_ras_disable_all_features(adev, 0);
amdgpu_ras_recovery_fini(adev);
return 0;
}
@@ -2799,11 +2834,8 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
struct mce *m = (struct mce *)data;
struct amdgpu_device *adev = NULL;
uint32_t gpu_id = 0;
- uint32_t umc_inst = 0;
- uint32_t ch_inst, channel_index = 0;
+ uint32_t umc_inst = 0, ch_inst = 0;
struct ras_err_data err_data = {0, 0, 0, NULL};
- struct eeprom_table_record err_rec;
- uint64_t retired_page;
/*
* If the error was generated in UMC_V2, which belongs to GPU UMCs,
@@ -2842,21 +2874,22 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
dev_info(adev->dev, "Uncorrectable error detected in UMC inst: %d, chan_idx: %d",
umc_inst, ch_inst);
+ err_data.err_addr =
+ kcalloc(adev->umc.max_ras_err_cnt_per_query,
+ sizeof(struct eeprom_table_record), GFP_KERNEL);
+ if (!err_data.err_addr) {
+ dev_warn(adev->dev, "Failed to alloc memory for umc error address record in mca notifier!\n");
+ return NOTIFY_DONE;
+ }
+
/*
* Translate UMC channel address to Physical address
*/
- channel_index =
- adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num
- + ch_inst];
-
- retired_page = ADDR_OF_8KB_BLOCK(m->addr) |
- ADDR_OF_256B_BLOCK(channel_index) |
- OFFSET_IN_256B_BLOCK(m->addr);
-
- memset(&err_rec, 0x0, sizeof(struct eeprom_table_record));
- err_data.err_addr = &err_rec;
- amdgpu_umc_fill_error_record(&err_data, m->addr,
- retired_page, channel_index, umc_inst);
+ if (adev->umc.ras &&
+ adev->umc.ras->convert_ras_error_address)
+ adev->umc.ras->convert_ras_error_address(adev,
+ &err_data, 0, ch_inst, umc_inst, m->addr);
if (amdgpu_bad_page_threshold != 0) {
amdgpu_ras_add_bad_pages(adev, err_data.err_addr,
@@ -2864,6 +2897,7 @@ static int amdgpu_bad_page_notifier(struct notifier_block *nb,
amdgpu_ras_save_bad_pages(adev);
}
+ kfree(err_data.err_addr);
return NOTIFY_OK;
}
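
The channel-index lookup and the ADDR_OF_8KB_BLOCK arithmetic are no longer open-coded in the notifier; they move behind the per-IP convert_ras_error_address() callback, and the single on-stack eeprom_table_record is replaced by a heap buffer sized for max_ras_err_cnt_per_query. As a sketch, a callback implementation that preserved the removed translation could look like this (illustrative only; the real implementations live in the per-IP umc files and may differ):

static void umc_convert_ras_error_address(struct amdgpu_device *adev,
					  struct ras_err_data *err_data,
					  uint32_t umc_reg_offset,
					  uint32_t ch_inst, uint32_t umc_inst,
					  uint64_t err_addr)
{
	uint32_t channel_index =
		adev->umc.channel_idx_tbl[umc_inst * adev->umc.channel_inst_num +
					  ch_inst];
	/* Same translation the notifier used to perform inline */
	uint64_t retired_page = ADDR_OF_8KB_BLOCK(err_addr) |
				ADDR_OF_256B_BLOCK(channel_index) |
				OFFSET_IN_256B_BLOCK(err_addr);

	amdgpu_umc_fill_error_record(err_data, err_addr, retired_page,
				     channel_index, umc_inst);
}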
@@ -2928,7 +2962,7 @@ int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
- schedule_work(&ras->recovery_work);
+ amdgpu_reset_domain_schedule(ras->adev->reset_domain, &ras->recovery_work);
return 0;
}
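
Queuing the recovery work through the reset domain rather than the system workqueue serializes RAS recovery with any other reset targeting the same domain (for XGMI, the whole hive shares one). At the time of this change, amdgpu_reset_domain_schedule() is, as far as I can tell, a thin wrapper along these lines:

/* Sketch of the wrapper (see amdgpu_reset.h); the point is that the
 * work lands on the domain's own workqueue instead of the global
 * system one, so concurrent resets on the domain cannot race.
 */
static inline bool amdgpu_reset_domain_schedule(struct amdgpu_reset_domain *domain,
						struct work_struct *work)
{
	return queue_work(domain->wq, work);
}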