Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 462
1 file changed, 269 insertions(+), 193 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
index bcce4c0be462..b96267068a72 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
@@ -34,6 +34,8 @@
 #include "amdgpu_xgmi.h"
 #include "ivsrcid/nbio/irqsrcs_nbif_7_4.h"
 
+static const char *RAS_FS_NAME = "ras";
+
 const char *ras_error_string[] = {
 	"none",
 	"parity",
@@ -62,13 +64,14 @@ const char *ras_block_string[] = {
 #define ras_err_str(i) (ras_error_string[ffs(i)])
 #define ras_block_str(i) (ras_block_string[i])
 
-#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		1
-#define AMDGPU_RAS_FLAG_INIT_NEED_RESET		2
 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
 
 /* inject address is 52 bits */
 #define RAS_UMC_INJECT_ADDR_LIMIT	(0x1ULL << 52)
 
+/* typical ECC bad page rate (1 bad page per 100MB VRAM) */
+#define RAS_BAD_PAGE_RATE		(100 * 1024 * 1024ULL)
+
 enum amdgpu_ras_retire_page_reservation {
 	AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 	AMDGPU_RAS_RETIRE_PAGE_PENDING,
@@ -77,6 +80,8 @@ enum amdgpu_ras_retire_page_reservation {
 
 atomic_t amdgpu_ras_in_intr = ATOMIC_INIT(0);
 
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
+				uint64_t addr);
 static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
 				uint64_t addr);
 
@@ -367,12 +372,19 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
 static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f,
 					const char __user *buf,
 					size_t size, loff_t *pos)
 {
-	struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
+	struct amdgpu_device *adev =
+		(struct amdgpu_device *)file_inode(f)->i_private;
 	int ret;
 
-	ret = amdgpu_ras_eeprom_reset_table(&adev->psp.ras.ras->eeprom_control);
+	ret = amdgpu_ras_eeprom_reset_table(
+			&(amdgpu_ras_get_context(adev)->eeprom_control));
 
-	return ret == 1 ? size : -EIO;
+	if (ret == 1) {
+		amdgpu_ras_get_context(adev)->flags = RAS_DEFAULT_FLAGS;
+		return size;
+	} else {
+		return -EIO;
+	}
 }
 
 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
@@ -506,9 +518,9 @@ struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 /* obj end */
 
 static void amdgpu_ras_parse_status_code(struct amdgpu_device *adev,
-				  const char* invoke_type,
-				  const char* block_name,
-				  enum ta_ras_status ret)
+					 const char* invoke_type,
+					 const char* block_name,
+					 enum ta_ras_status ret)
 {
 	switch (ret) {
 	case TA_RAS_STATUS__SUCCESS:
@@ -597,7 +609,7 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 	if (!con)
 		return -EINVAL;
 
-        info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
+	info = kzalloc(sizeof(union ta_ras_cmd_input), GFP_KERNEL);
 	if (!info)
 		return -ENOMEM;
 
@@ -943,7 +955,7 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 	case AMDGPU_RAS_RETIRE_PAGE_FAULT:
 	default:
 		return "F";
-	};
+	}
 }
 
 /**
@@ -1017,45 +1029,13 @@ static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
 	return scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 }
 
-static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
+static void amdgpu_ras_sysfs_remove_bad_page_node(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct attribute *attrs[] = {
-		&con->features_attr.attr,
-		NULL
-	};
-	struct bin_attribute *bin_attrs[] = {
-		&con->badpages_attr,
-		NULL
-	};
-	struct attribute_group group = {
-		.name = "ras",
-		.attrs = attrs,
-		.bin_attrs = bin_attrs,
-	};
 
-	con->features_attr = (struct device_attribute) {
-		.attr = {
-			.name = "features",
-			.mode = S_IRUGO,
-		},
-		.show = amdgpu_ras_sysfs_features_read,
-	};
-
-	con->badpages_attr = (struct bin_attribute) {
-		.attr = {
-			.name = "gpu_vram_bad_pages",
-			.mode = S_IRUGO,
-		},
-		.size = 0,
-		.private = NULL,
-		.read = amdgpu_ras_sysfs_badpages_read,
-	};
-
-	sysfs_attr_init(attrs[0]);
-	sysfs_bin_attr_init(bin_attrs[0]);
-
-	return sysfs_create_group(&adev->dev->kobj, &group);
+	sysfs_remove_file_from_group(&adev->dev->kobj,
+				&con->badpages_attr.attr,
+				RAS_FS_NAME);
 }
 
 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
@@ -1065,14 +1045,9 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 		&con->features_attr.attr,
 		NULL
 	};
-	struct bin_attribute *bin_attrs[] = {
-		&con->badpages_attr,
-		NULL
-	};
 	struct attribute_group group = {
-		.name = "ras",
+		.name = RAS_FS_NAME,
 		.attrs = attrs,
-		.bin_attrs = bin_attrs,
 	};
 
 	sysfs_remove_group(&adev->dev->kobj, &group);
@@ -1105,7 +1080,7 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
 
 	if (sysfs_add_file_to_group(&adev->dev->kobj,
 				&obj->sysfs_attr.attr,
-				"ras")) {
+				RAS_FS_NAME)) {
 		put_obj(obj);
 		return -EINVAL;
 	}
@@ -1125,7 +1100,7 @@ int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 
 	sysfs_remove_file_from_group(&adev->dev->kobj,
 				&obj->sysfs_attr.attr,
-				"ras");
+				RAS_FS_NAME);
 	obj->attr_inuse = 0;
 	put_obj(obj);
 
@@ -1141,6 +1116,9 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 		amdgpu_ras_sysfs_remove(adev, &obj->head);
 	}
 
+	if (amdgpu_bad_page_threshold != 0)
+		amdgpu_ras_sysfs_remove_bad_page_node(adev);
+
 	amdgpu_ras_sysfs_remove_feature_node(adev);
 
 	return 0;
@@ -1169,9 +1147,9 @@ static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct drm_minor *minor = adev->ddev->primary;
+	struct drm_minor *minor = adev_to_drm(adev)->primary;
 
-	con->dir = debugfs_create_dir("ras", minor->debugfs_root);
+	con->dir = debugfs_create_dir(RAS_FS_NAME, minor->debugfs_root);
 	debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
 				adev, &amdgpu_ras_debugfs_ctrl_ops);
 	debugfs_create_file("ras_eeprom_reset", S_IWUGO | S_IRUGO, con->dir,
@@ -1187,6 +1165,13 @@ static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 	 */
 	debugfs_create_bool("auto_reboot", S_IWUGO | S_IRUGO, con->dir,
 				&con->reboot);
+
+	/*
+	 * User could set this not to clean up hardware's error count register
+	 * of RAS IPs during ras recovery.
+	 */
+	debugfs_create_bool("disable_ras_err_cnt_harvest", 0644,
+			con->dir, &con->disable_ras_err_cnt_harvest);
 }
 
 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
@@ -1211,6 +1196,7 @@ void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 
 void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 {
+#if defined(CONFIG_DEBUG_FS)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj;
 	struct ras_fs_if fs_info;
@@ -1233,6 +1219,7 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev)
 			amdgpu_ras_debugfs_create(adev, &fs_info);
 		}
 	}
+#endif
 }
 
 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
@@ -1243,13 +1230,13 @@ void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 	if (!obj || !obj->ent)
 		return;
 
-	debugfs_remove(obj->ent);
 	obj->ent = NULL;
 	put_obj(obj);
 }
 
 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 {
+#if defined(CONFIG_DEBUG_FS)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_manager *obj, *tmp;
 
@@ -1257,16 +1244,49 @@ static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 		amdgpu_ras_debugfs_remove(adev, &obj->head);
 	}
 
-	debugfs_remove_recursive(con->dir);
 	con->dir = NULL;
+#endif
 }
 /* debugfs end */
 
 /* ras fs */
-
+static BIN_ATTR(gpu_vram_bad_pages, S_IRUGO,
+		amdgpu_ras_sysfs_badpages_read, NULL, 0);
+static DEVICE_ATTR(features, S_IRUGO,
+		amdgpu_ras_sysfs_features_read, NULL);
 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 {
-	amdgpu_ras_sysfs_create_feature_node(adev);
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct attribute_group group = {
+		.name = RAS_FS_NAME,
+	};
+	struct attribute *attrs[] = {
+		&con->features_attr.attr,
+		NULL
+	};
+	struct bin_attribute *bin_attrs[] = {
+		NULL,
+		NULL,
+	};
+	int r;
+
+	/* add features entry */
+	con->features_attr = dev_attr_features;
+	group.attrs = attrs;
+	sysfs_attr_init(attrs[0]);
+
+	if (amdgpu_bad_page_threshold != 0) {
+		/* add bad_page_features entry */
+		bin_attr_gpu_vram_bad_pages.private = NULL;
+		con->badpages_attr = bin_attr_gpu_vram_bad_pages;
+		bin_attrs[0] = &con->badpages_attr;
+		group.bin_attrs = bin_attrs;
+		sysfs_bin_attr_init(bin_attrs[0]);
+	}
+
+	r = sysfs_create_group(&adev->dev->kobj, &group);
+	if (r)
+		dev_err(adev->dev, "Failed to create RAS sysfs group!");
 
 	return 0;
 }
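The hunk above replaces the old hand-rolled attribute construction with static BIN_ATTR()/DEVICE_ATTR() templates copied into the per-device amdgpu_ras struct, and the gpu_vram_bad_pages node is now only published when bad-page retirement is enabled (amdgpu_bad_page_threshold != 0). A minimal userspace sketch of reading the resulting nodes follows; the sysfs path is an assumption (the card index varies per system), not something fixed by this patch:

/* read_ras_features.c - userspace sketch, build with any C compiler */
#include <stdio.h>

int main(void)
{
	char line[256];
	/* assumed path: /sys/class/drm/cardN/device is the PCI device dir;
	 * "gpu_vram_bad_pages" only exists when the threshold is non-zero */
	FILE *f = fopen("/sys/class/drm/card0/device/ras/features", "r");

	if (!f) {
		perror("ras/features");
		return 1;
	}
	while (fgets(line, sizeof(line), f))
		fputs(line, stdout); /* e.g. "feature mask: 0x3" */
	fclose(f);
	return 0;
}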
@@ -1458,6 +1478,45 @@ static void amdgpu_ras_log_on_err_counter(struct amdgpu_device *adev)
 	}
 }
 
+/* Parse RdRspStatus and WrRspStatus */
+void amdgpu_ras_error_status_query(struct amdgpu_device *adev,
+			  struct ras_query_if *info)
+{
+	/*
+	 * Only two block need to query read/write
+	 * RspStatus at current state
+	 */
+	switch (info->head.block) {
+	case AMDGPU_RAS_BLOCK__GFX:
+		if (adev->gfx.funcs->query_ras_error_status)
+			adev->gfx.funcs->query_ras_error_status(adev);
+		break;
+	case AMDGPU_RAS_BLOCK__MMHUB:
+		if (adev->mmhub.funcs->query_ras_error_status)
+			adev->mmhub.funcs->query_ras_error_status(adev);
+		break;
+	default:
+		break;
+	}
+}
+
+static void amdgpu_ras_query_err_status(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	struct ras_manager *obj;
+
+	if (!con)
+		return;
+
+	list_for_each_entry(obj, &con->head, node) {
+		struct ras_query_if info = {
+			.head = obj->head,
+		};
+
+		amdgpu_ras_error_status_query(adev, &info);
+	}
+}
+
 /* recovery begin */
 
 /* return 0 on success.
@@ -1494,10 +1553,12 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
 			.size = AMDGPU_GPU_PAGE_SIZE,
 			.flags = AMDGPU_RAS_RETIRE_PAGE_RESERVED,
 		};
-
-		if (data->last_reserved <= i)
+		ret = amdgpu_vram_mgr_query_page_status(
+				ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+				data->bps[i].retired_page);
+		if (ret == -EBUSY)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_PENDING;
-		else if (data->bps_bo[i] == NULL)
+		else if (ret == -ENOENT)
 			(*bps)[i].flags = AMDGPU_RAS_RETIRE_PAGE_FAULT;
 	}
 
@@ -1514,23 +1575,30 @@ static void amdgpu_ras_do_recovery(struct work_struct *work)
 	struct amdgpu_device *remote_adev = NULL;
 	struct amdgpu_device *adev = ras->adev;
 	struct list_head device_list, *device_list_handle = NULL;
-	struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, false);
-
-	/* Build list of devices to query RAS related errors */
-	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
-		device_list_handle = &hive->device_list;
-	else {
-		INIT_LIST_HEAD(&device_list);
-		list_add_tail(&adev->gmc.xgmi.head, &device_list);
-		device_list_handle = &device_list;
-	}
 
-	list_for_each_entry(remote_adev, device_list_handle, gmc.xgmi.head) {
-		amdgpu_ras_log_on_err_counter(remote_adev);
+	if (!ras->disable_ras_err_cnt_harvest) {
+		struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
+
+		/* Build list of devices to query RAS related errors */
+		if (hive && adev->gmc.xgmi.num_physical_nodes > 1) {
+			device_list_handle = &hive->device_list;
+		} else {
+			INIT_LIST_HEAD(&device_list);
+			list_add_tail(&adev->gmc.xgmi.head, &device_list);
+			device_list_handle = &device_list;
+		}
+
+		list_for_each_entry(remote_adev,
+				device_list_handle, gmc.xgmi.head) {
+			amdgpu_ras_query_err_status(remote_adev);
+			amdgpu_ras_log_on_err_counter(remote_adev);
+		}
+
+		amdgpu_put_xgmi_hive(hive);
 	}
 
 	if (amdgpu_device_should_recover_gpu(ras->adev))
-		amdgpu_device_gpu_recover(ras->adev, 0);
+		amdgpu_device_gpu_recover(ras->adev, NULL);
 	atomic_set(&ras->in_recovery, 0);
 }
 
@@ -1542,12 +1610,9 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 	unsigned int new_space = old_space + pages;
 	unsigned int align_space = ALIGN(new_space, 512);
 	void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
-	struct amdgpu_bo **bps_bo =
-		kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
 
-	if (!bps || !bps_bo) {
+	if (!bps) {
 		kfree(bps);
-		kfree(bps_bo);
 		return -ENOMEM;
 	}
 
@@ -1556,14 +1621,8 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
 			data->count * sizeof(*data->bps));
 		kfree(data->bps);
 	}
-	if (data->bps_bo) {
-		memcpy(bps_bo, data->bps_bo,
-				data->count * sizeof(*data->bps_bo));
-		kfree(data->bps_bo);
-	}
 
 	data->bps = bps;
-	data->bps_bo = bps_bo;
 	data->space_left += align_space - old_space;
 	return 0;
 }
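With the VRAM manager now tracking reservations, amdgpu_ras_badpages_read() derives each page's state from the return value of amdgpu_vram_mgr_query_page_status() instead of inspecting the removed bps_bo array. A standalone restatement of that mapping (illustrative helper, not driver code; the flag letters match amdgpu_ras_badpage_flags_str()):

#include <errno.h>

enum retire_flag { PAGE_RESERVED = 'R', PAGE_PENDING = 'P', PAGE_FAULT = 'F' };

static enum retire_flag retire_flag_from_status(int status)
{
	switch (status) {
	case 0:
		return PAGE_RESERVED;	/* range already reserved, page is retired */
	case -EBUSY:
		return PAGE_PENDING;	/* reservation recorded but not applied yet */
	case -ENOENT:
	default:
		return PAGE_FAULT;	/* page could not be reserved */
	}
}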
@@ -1575,6 +1634,7 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
 	int ret = 0;
+	uint32_t i;
 
 	if (!con || !con->eh_data || !bps || pages <= 0)
 		return 0;
@@ -1584,16 +1644,26 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
 	if (!data)
 		goto out;
 
-	if (data->space_left <= pages)
-		if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
+	for (i = 0; i < pages; i++) {
+		if (amdgpu_ras_check_bad_page_unlock(con,
+			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT))
+			continue;
+
+		if (!data->space_left &&
+			amdgpu_ras_realloc_eh_data_space(adev, data, 256)) {
 			ret = -ENOMEM;
 			goto out;
 		}
 
-	memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
-	data->count += pages;
-	data->space_left -= pages;
+		amdgpu_vram_mgr_reserve_range(
+			ttm_manager_type(&adev->mman.bdev, TTM_PL_VRAM),
+			bps[i].retired_page << AMDGPU_GPU_PAGE_SHIFT,
+			AMDGPU_GPU_PAGE_SIZE);
+
+		memcpy(&data->bps[data->count], &bps[i], sizeof(*data->bps));
+		data->count++;
+		data->space_left--;
+	}
 
 out:
 	mutex_unlock(&con->recovery_lock);
@@ -1604,7 +1674,7 @@ out:
  * write error record array to eeprom, the function should be
  * protected by recovery_lock
  */
-static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data;
@@ -1645,7 +1715,7 @@ static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
 	int ret = 0;
 
 	/* no bad page record, skip eeprom access */
-	if (!control->num_recs)
+	if (!control->num_recs || (amdgpu_bad_page_threshold == 0))
 		return ret;
 
 	bps = kcalloc(control->num_recs, sizeof(*bps), GFP_KERNEL);
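Note that the new add_bad_pages loop dedupes by converting each record's page frame number back to a byte address (retired_page << AMDGPU_GPU_PAGE_SHIFT) before calling amdgpu_ras_check_bad_page_unlock(), which shifts it down again. A standalone illustration of that round trip, assuming the usual 4KB GPU page (shift of 12):

#include <stdint.h>
#include <stdio.h>

#define GPU_PAGE_SHIFT 12 /* assumption: AMDGPU_GPU_PAGE_SIZE == 4096 */

int main(void)
{
	uint64_t retired_page = 0x2a0;			/* PFN from a bad-page record */
	uint64_t addr = retired_page << GPU_PAGE_SHIFT;	/* byte offset handed to the check */
	uint64_t pfn = addr >> GPU_PAGE_SHIFT;		/* PFN again, as the dedupe compare uses */

	printf("pfn 0x%llx -> addr 0x%llx -> pfn 0x%llx\n",
	       (unsigned long long)retired_page,
	       (unsigned long long)addr,
	       (unsigned long long)pfn);
	return 0;
}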
@@ -1666,116 +1736,87 @@ out:
 	return ret;
 }
 
-/*
- * check if an address belongs to bad page
- *
- * Note: this check is only for umc block
- */
-static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+static bool amdgpu_ras_check_bad_page_unlock(struct amdgpu_ras *con,
 				uint64_t addr)
 {
-	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
+	struct ras_err_handler_data *data = con->eh_data;
 	int i;
-	bool ret = false;
-
-	if (!con || !con->eh_data)
-		return ret;
-
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
 
 	addr >>= AMDGPU_GPU_PAGE_SHIFT;
 	for (i = 0; i < data->count; i++)
-		if (addr == data->bps[i].retired_page) {
-			ret = true;
-			goto out;
-		}
+		if (addr == data->bps[i].retired_page)
+			return true;
 
-out:
-	mutex_unlock(&con->recovery_lock);
-	return ret;
+	return false;
 }
 
-/* called in gpu recovery/init */
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
+/*
+ * check if an address belongs to bad page
+ *
+ * Note: this check is only for umc block
+ */
+static bool amdgpu_ras_check_bad_page(struct amdgpu_device *adev,
+				uint64_t addr)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	uint64_t bp;
-	struct amdgpu_bo *bo = NULL;
-	int i, ret = 0;
+	bool ret = false;
 
 	if (!con || !con->eh_data)
-		return 0;
+		return ret;
 
 	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
-	/* reserve vram at driver post stage. */
-	for (i = data->last_reserved; i < data->count; i++) {
-		bp = data->bps[i].retired_page;
-
-		/* There are two cases of reserve error should be ignored:
-		 * 1) a ras bad page has been allocated (used by someone);
-		 * 2) a ras bad page has been reserved (duplicate error injection
-		 *    for one page);
-		 */
-		if (amdgpu_bo_create_kernel_at(adev, bp << AMDGPU_GPU_PAGE_SHIFT,
-					       AMDGPU_GPU_PAGE_SIZE,
-					       AMDGPU_GEM_DOMAIN_VRAM,
-					       &bo, NULL))
-			dev_warn(adev->dev, "RAS WARN: reserve vram for "
-					"retired page %llx fail\n", bp);
-
-		data->bps_bo[i] = bo;
-		data->last_reserved = i + 1;
-		bo = NULL;
-	}
-
-	/* continue to save bad pages to eeprom even reesrve_vram fails */
-	ret = amdgpu_ras_save_bad_pages(adev);
-out:
+	ret = amdgpu_ras_check_bad_page_unlock(con, addr);
 	mutex_unlock(&con->recovery_lock);
 	return ret;
 }
 
-/* called when driver unload */
-static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
+static void amdgpu_ras_validate_threshold(struct amdgpu_device *adev,
+					uint32_t max_length)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
-	struct ras_err_handler_data *data;
-	struct amdgpu_bo *bo;
-	int i;
+	int tmp_threshold = amdgpu_bad_page_threshold;
+	u64 val;
 
-	if (!con || !con->eh_data)
-		return 0;
-
-	mutex_lock(&con->recovery_lock);
-	data = con->eh_data;
-	if (!data)
-		goto out;
+	/*
+	 * Justification of value bad_page_cnt_threshold in ras structure
+	 *
	 * Generally, -1 <= amdgpu_bad_page_threshold <= max record length
+	 * in eeprom, and introduce two scenarios accordingly.
+	 *
+	 * Bad page retirement enablement:
+	 *  - If amdgpu_bad_page_threshold = -1,
+	 *    bad_page_cnt_threshold = typical value by formula.
+	 *
+	 *  - When the value from user is 0 < amdgpu_bad_page_threshold <
+	 *    max record length in eeprom, use it directly.
+	 *
+	 * Bad page retirement disablement:
+	 *  - If amdgpu_bad_page_threshold = 0, bad page retirement
+	 *    functionality is disabled, and bad_page_cnt_threshold will
+	 *    take no effect.
+	 */
 
-	for (i = data->last_reserved - 1; i >= 0; i--) {
-		bo = data->bps_bo[i];
+	if (tmp_threshold < -1)
+		tmp_threshold = -1;
+	else if (tmp_threshold > max_length)
+		tmp_threshold = max_length;
 
-		amdgpu_bo_free_kernel(&bo, NULL, NULL);
+	if (tmp_threshold == -1) {
+		val = adev->gmc.mc_vram_size;
+		do_div(val, RAS_BAD_PAGE_RATE);
+		con->bad_page_cnt_threshold = min(lower_32_bits(val),
+						max_length);
+	} else {
+		con->bad_page_cnt_threshold = tmp_threshold;
-		data->bps_bo[i] = bo;
-		data->last_reserved = i;
 	}
-
-out:
-	mutex_unlock(&con->recovery_lock);
-	return 0;
 }
 
 int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data;
+	uint32_t max_eeprom_records_len = 0;
+	bool exc_err_limit = false;
 	int ret;
 
 	if (con)
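amdgpu_ras_validate_threshold() above takes the module parameter, clamps it to [-1, max_length], and substitutes the "typical" formula when it is -1. A worked example of that default path with assumed inputs (16GB of VRAM and an assumed EEPROM record capacity; neither value is queried from real hardware here):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* example inputs, not read from a device */
	uint64_t mc_vram_size = 16ULL * 1024 * 1024 * 1024; /* assume 16GB VRAM */
	uint64_t bad_page_rate = 100 * 1024 * 1024;	    /* RAS_BAD_PAGE_RATE */
	uint32_t max_eeprom_records = 1440;		    /* assumed table capacity */

	/* one tolerated bad page per 100MB: 16GB / 100MB = 163 */
	uint32_t threshold = (uint32_t)(mc_vram_size / bad_page_rate);

	if (threshold > max_eeprom_records)
		threshold = max_eeprom_records;

	printf("bad_page_cnt_threshold = %u\n", threshold); /* prints 163 */
	return 0;
}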
@@ -1794,31 +1835,41 @@ int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	atomic_set(&con->in_recovery, 0);
 	con->adev = adev;
 
-	ret = amdgpu_ras_eeprom_init(&con->eeprom_control);
-	if (ret)
+	max_eeprom_records_len = amdgpu_ras_eeprom_get_record_max_length();
+	amdgpu_ras_validate_threshold(adev, max_eeprom_records_len);
+
+	ret = amdgpu_ras_eeprom_init(&con->eeprom_control, &exc_err_limit);
+	/*
+	 * This calling fails when exc_err_limit is true or
+	 * ret != 0.
+	 */
+	if (exc_err_limit || ret)
 		goto free;
 
 	if (con->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
 			goto free;
-		ret = amdgpu_ras_reserve_bad_pages(adev);
-		if (ret)
-			goto release;
 	}
 
 	return 0;
 
-release:
-	amdgpu_ras_release_bad_pages(adev);
 free:
 	kfree((*data)->bps);
-	kfree((*data)->bps_bo);
 	kfree(*data);
 	con->eh_data = NULL;
 out:
 	dev_warn(adev->dev, "Failed to initialize ras recovery!\n");
 
+	/*
+	 * Except error threshold exceeding case, other failure cases in this
+	 * function would not fail amdgpu driver init.
+	 */
+	if (!exc_err_limit)
+		ret = 0;
+	else
+		ret = -EINVAL;
+
 	return ret;
 }
 
@@ -1832,12 +1883,10 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 		return 0;
 
 	cancel_work_sync(&con->recovery_work);
-	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
-	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
 
@@ -1858,6 +1907,17 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 	return 0;
 }
 
+static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev)
+{
+	if (adev->asic_type != CHIP_VEGA10 &&
+	    adev->asic_type != CHIP_VEGA20 &&
+	    adev->asic_type != CHIP_ARCTURUS &&
+	    adev->asic_type != CHIP_SIENNA_CICHLID)
+		return 1;
+	else
+		return 0;
+}
+
 /*
  * check hardware's ras ability which will be saved in hw_supported.
  * if hardware does not support ras, we can skip some ras initializtion and
@@ -1874,8 +1934,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
 	*supported = 0;
 
 	if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw ||
-	    (adev->asic_type != CHIP_VEGA20 &&
-	     adev->asic_type != CHIP_ARCTURUS))
+	    amdgpu_ras_check_asic_type(adev))
 		return;
 
 	if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
@@ -1897,6 +1956,7 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
 
 	*supported = amdgpu_ras_enable == 0 ?
 			0 : *hw_supported & amdgpu_ras_mask;
+	adev->ras_features = *supported;
 }
 
 int amdgpu_ras_init(struct amdgpu_device *adev)
@@ -1919,9 +1979,9 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 
 	amdgpu_ras_check_supported(adev, &con->hw_supported,
 			&con->supported);
-	if (!con->hw_supported) {
+	if (!con->hw_supported || (adev->asic_type == CHIP_VEGA10)) {
 		r = 0;
-		goto err_out;
+		goto release_con;
 	}
 
 	con->features = 0;
@@ -1932,25 +1992,25 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 	if (adev->nbio.funcs->init_ras_controller_interrupt) {
 		r = adev->nbio.funcs->init_ras_controller_interrupt(adev);
 		if (r)
-			goto err_out;
+			goto release_con;
 	}
 
 	if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) {
 		r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev);
 		if (r)
-			goto err_out;
+			goto release_con;
 	}
 
 	if (amdgpu_ras_fs_init(adev)) {
 		r = -EINVAL;
-		goto err_out;
+		goto release_con;
 	}
 
 	dev_info(adev->dev, "RAS INFO: ras initialized successfully, "
 			"hardware ability[%x] ras_mask[%x]\n",
 			con->hw_supported, con->supported);
 	return 0;
-err_out:
+release_con:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
 
@@ -1978,7 +2038,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 			amdgpu_ras_request_reset_on_boot(adev,
 					ras_block->block);
 			return 0;
-		} else if (adev->in_suspend || adev->in_gpu_reset) {
+		} else if (adev->in_suspend || amdgpu_in_reset(adev)) {
 			/* in resume phase, if fail to enable ras,
 			 * clean up all ras fs nodes, and disable ras */
 			goto cleanup;
@@ -1987,7 +2047,7 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev,
 	}
 
 	/* in resume phase, no need to create ras fs node */
-	if (adev->in_suspend || adev->in_gpu_reset)
+	if (adev->in_suspend || amdgpu_in_reset(adev))
 		return 0;
 
 	if (ih_info->cb) {
@@ -2021,7 +2081,7 @@ void amdgpu_ras_late_fini(struct amdgpu_device *adev,
 	amdgpu_ras_sysfs_remove(adev, ras_block);
 	if (ih_info->cb)
-                amdgpu_ras_interrupt_remove_handler(adev, ih_info);
+		amdgpu_ras_interrupt_remove_handler(adev, ih_info);
 	amdgpu_ras_feature_enable(adev, ras_block, 0);
 }
 
@@ -2145,3 +2205,19 @@ bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev)
 
 	return false;
 }
+
+bool amdgpu_ras_check_err_threshold(struct amdgpu_device *adev)
+{
+	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
+	bool exc_err_limit = false;
+
+	if (con && (amdgpu_bad_page_threshold != 0))
+		amdgpu_ras_eeprom_check_err_threshold(&con->eeprom_control,
+						&exc_err_limit);
+
+	/*
+	 * We are only interested in variable exc_err_limit,
+	 * as it says if GPU is in bad state or not.
+	 */
+	return exc_err_limit;
+}
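The new amdgpu_ras_check_err_threshold() export only reports whether the EEPROM records say the retired-page count has passed bad_page_cnt_threshold. A hypothetical caller sketch of how early init code might consume it to refuse a worn-out board; the function below is an illustration assuming the amdgpu.h/amdgpu_ras.h context, not the actual amdgpu probe path:

/* sketch only: a hypothetical admission check during device bring-up */
static int example_gpu_admission_check(struct amdgpu_device *adev)
{
	/* returns true when the EEPROM table exceeds bad_page_cnt_threshold */
	if (amdgpu_ras_check_err_threshold(adev)) {
		dev_err(adev->dev,
			"RAS: bad page threshold exceeded, treating GPU as bad\n");
		return -ENODEV; /* fail init for this device */
	}

	return 0;
}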