From 7af23ebe93fe52a30b2338450c8cd40a4f5210d4 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Wed, 8 May 2019 16:13:03 +0800 Subject: drm/amdgpu: Issue ras TA disable/enable cmd forcely on boot Check ras TA error code and return EAGAIN. Issue ras enable/disable cmd without checking current state. Looks like ras TA will handle current state == target state case. Now driver might need to do a reset to satisfy ras TA. Signed-off-by: xinhui pan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 34 +++++++++++++++++++++++++-------- 1 file changed, 26 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 22bd21efe6b1..5f8e1163a75d 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -521,6 +521,8 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, enable ? "enable":"disable", ras_block_str(head->block), ret); + if (ret == TA_RAS_STATUS__RESET_NEEDED) + return -EAGAIN; return -EINVAL; } @@ -541,16 +543,32 @@ int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev, return -EINVAL; if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) { - /* If ras is enabled by vbios, we set up ras object first in - * both case. For enable, that is all what we need do. For - * disable, we need perform a ras TA disable cmd after that. - */ - ret = __amdgpu_ras_feature_enable(adev, head, 1); - if (ret) - return ret; + if (enable) { + /* There is no harm to issue a ras TA cmd regardless of + * the currecnt ras state. + * If current state == target state, it will do nothing + * But sometimes it requests driver to reset and repost + * with error code -EAGAIN. + */ + ret = amdgpu_ras_feature_enable(adev, head, 1); + /* With old ras TA, we might fail to enable ras. + * Log it and just setup the object. + * TODO need remove this WA in the future. 
+ */ + if (ret == -EINVAL) { + ret = __amdgpu_ras_feature_enable(adev, head, 1); + if (!ret) + DRM_INFO("RAS INFO: %s setup object\n", + ras_block_str(head->block)); + } + } else { + /* setup the object then issue a ras TA disable cmd.*/ + ret = __amdgpu_ras_feature_enable(adev, head, 1); + if (ret) + return ret; - if (!enable) ret = amdgpu_ras_feature_enable(adev, head, 0); + } } else ret = amdgpu_ras_feature_enable(adev, head, enable); -- cgit From a564808e7f5b19b3621a1dc4ff2a3042171ae167 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Wed, 8 May 2019 19:12:24 +0800 Subject: drm/amdgpu: handle ras reset add another flag to allow IP do a gpu reset after device init. Signed-off-by: xinhui pan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 36 +++++++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 3 +++ 2 files changed, 37 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 5f8e1163a75d..37cb3de08494 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -118,7 +118,8 @@ const char *ras_block_string[] = { #define ras_err_str(i) (ras_error_string[ffs(i)]) #define ras_block_str(i) (ras_block_string[i]) -#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 +#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS 1 +#define AMDGPU_RAS_FLAG_INIT_NEED_RESET 2 #define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS) static void amdgpu_ras_self_test(struct amdgpu_device *adev) @@ -1358,6 +1359,19 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev) } /* recovery end */ +/* return 0 if ras will reset gpu and repost.*/ +int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, + unsigned int block) +{ + struct amdgpu_ras *ras = amdgpu_ras_get_context(adev); + + if (!ras) + return -EINVAL; + + ras->flags |= 
AMDGPU_RAS_FLAG_INIT_NEED_RESET; + return 0; +} + /* * check hardware's ras ability which will be saved in hw_supported. * if hardware does not support ras, we can skip some ras initializtion and @@ -1433,7 +1447,12 @@ recovery_out: return -EINVAL; } -/* do some init work after IP late init as dependence */ +/* do some init work after IP late init as dependence. + * TODO + * gpu reset will re-enable ras, need fint out one way to run it again. + * for now, if a gpu reset happened, unless IP enable its ras, the ras state + * will be showed as disabled. + */ void amdgpu_ras_post_init(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); @@ -1462,6 +1481,19 @@ void amdgpu_ras_post_init(struct amdgpu_device *adev) } } } + + if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) { + con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET; + /* setup ras obj state as disabled. + * for init_by_vbios case. + * if we want to enable ras, just enable it in a normal way. + * If we want do disable it, need setup ras obj as enabled, + * then issue another TA disable cmd. + * See feature_enable_on_boot + */ + amdgpu_ras_disable_all_features(adev, 1); + amdgpu_ras_reset_gpu(adev, 0); + } } /* do some fini work before IP fini as dependence */ diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e60a554656ca..06ef325b61b8 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -175,6 +175,9 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, return ras && (ras->supported & (1 << block)); } +int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, + unsigned int block); + int amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce); -- cgit From 466b179346094e01deccd051a215fe782b59ca68 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Tue, 7 May 2019 11:53:31 +0800 Subject: drm/amdgpu: add badpages sysfs interafce add badpages node. 
it will output badpages list in format gpu pfn : gpu page size : flags example 0x00000000 : 0x00001000 : R 0x00000001 : 0x00001000 : R 0x00000002 : 0x00001000 : R 0x00000003 : 0x00001000 : R 0x00000004 : 0x00001000 : R 0x00000005 : 0x00001000 : R 0x00000006 : 0x00001000 : R 0x00000007 : 0x00001000 : P 0x00000008 : 0x00001000 : P 0x00000009 : 0x00001000 : P flags can be one of below characters R: reserved. P: pending for reserve. F: failed to reserve for some reasons. Signed-off-by: xinhui pan Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 146 ++++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 1 + 2 files changed, 147 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 37cb3de08494..49c71cfc7fc6 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -90,6 +90,12 @@ struct ras_manager { struct ras_err_data err_data; }; +struct ras_badpage { + unsigned int bp; + unsigned int size; + unsigned int flags; +}; + const char *ras_error_string[] = { "none", "parity", @@ -710,6 +716,77 @@ int amdgpu_ras_query_error_count(struct amdgpu_device *adev, /* sysfs begin */ +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, + struct ras_badpage **bps, unsigned int *count); + +static char *amdgpu_ras_badpage_flags_str(unsigned int flags) +{ + switch (flags) { + case 0: + return "R"; + case 1: + return "P"; + case 2: + default: + return "F"; + }; +} + +/* + * DOC: ras sysfs gpu_vram_bad_pages interface + * + * It allows user to read the bad pages of vram on the gpu through + * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages + * + * It outputs multiple lines, and each line stands for one gpu page. 
+ * + * The format of one line is below, + * gpu pfn : gpu page size : flags + * + * gpu pfn and gpu page size are printed in hex format. + * flags can be one of below character, + * R: reserved, this gpu page is reserved and not able to use. + * P: pending for reserve, this gpu page is marked as bad, will be reserved + * in next window of page_reserve. + * F: unable to reserve. this gpu page can't be reserved due to some reasons. + * + * examples: + * 0x00000001 : 0x00001000 : R + * 0x00000002 : 0x00001000 : P + */ + +static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, + struct kobject *kobj, struct bin_attribute *attr, + char *buf, loff_t ppos, size_t count) +{ + struct amdgpu_ras *con = + container_of(attr, struct amdgpu_ras, badpages_attr); + struct amdgpu_device *adev = con->adev; + const unsigned int element_size = + sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; + unsigned int start = (ppos + element_size - 1) / element_size; + unsigned int end = (ppos + count - 1) / element_size; + ssize_t s = 0; + struct ras_badpage *bps = NULL; + unsigned int bps_count = 0; + + memset(buf, 0, count); + + if (amdgpu_ras_badpages_read(adev, &bps, &bps_count)) + return 0; + + for (; start < end && start < bps_count; start++) + s += scnprintf(&buf[s], element_size + 1, + "0x%08x : 0x%08x : %1s\n", + bps[start].bp, + bps[start].size, + amdgpu_ras_badpage_flags_str(bps[start].flags)); + + kfree(bps); + + return s; +} + static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev, struct device_attribute *attr, char *buf) { @@ -750,9 +827,14 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) &con->features_attr.attr, NULL }; + struct bin_attribute *bin_attrs[] = { + &con->badpages_attr, + NULL + }; struct attribute_group group = { .name = "ras", .attrs = attrs, + .bin_attrs = bin_attrs, }; con->features_attr = (struct device_attribute) { @@ -762,7 +844,19 @@ static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev) }, 
.show = amdgpu_ras_sysfs_features_read, }; + + con->badpages_attr = (struct bin_attribute) { + .attr = { + .name = "gpu_vram_bad_pages", + .mode = S_IRUGO, + }, + .size = 0, + .private = NULL, + .read = amdgpu_ras_sysfs_badpages_read, + }; + sysfs_attr_init(attrs[0]); + sysfs_bin_attr_init(bin_attrs[0]); return sysfs_create_group(&adev->dev->kobj, &group); } @@ -774,9 +868,14 @@ static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev) &con->features_attr.attr, NULL }; + struct bin_attribute *bin_attrs[] = { + &con->badpages_attr, + NULL + }; struct attribute_group group = { .name = "ras", .attrs = attrs, + .bin_attrs = bin_attrs, }; sysfs_remove_group(&adev->dev->kobj, &group); @@ -1108,6 +1207,53 @@ static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev) /* ih end */ /* recovery begin */ + +/* return 0 on success. + * caller need free bps. + */ +static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, + struct ras_badpage **bps, unsigned int *count) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + struct ras_err_handler_data *data; + int i = 0; + int ret = 0; + + if (!con || !con->eh_data || !bps || !count) + return -EINVAL; + + mutex_lock(&con->recovery_lock); + data = con->eh_data; + if (!data || data->count == 0) { + *bps = NULL; + goto out; + } + + *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL); + if (!*bps) { + ret = -ENOMEM; + goto out; + } + + for (; i < data->count; i++) { + (*bps)[i] = (struct ras_badpage){ + .bp = data->bps[i].bp, + .size = AMDGPU_GPU_PAGE_SIZE, + .flags = 0, + }; + + if (data->last_reserved <= i) + (*bps)[i].flags = 1; + else if (data->bps[i].bo == NULL) + (*bps)[i].flags = 2; + } + + *count = data->count; +out: + mutex_unlock(&con->recovery_lock); + return ret; +} + static void amdgpu_ras_do_recovery(struct work_struct *work) { struct amdgpu_ras *ras = diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 
06ef325b61b8..59994ee00855 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -93,6 +93,7 @@ struct amdgpu_ras { struct dentry *ent; /* sysfs */ struct device_attribute features_attr; + struct bin_attribute badpages_attr; /* block array */ struct ras_manager *objs; -- cgit From 511fdbc33aaa4758f7c445183ff840e251c0b427 Mon Sep 17 00:00:00 2001 From: xinhui pan Date: Thu, 9 May 2019 08:26:27 +0800 Subject: drm/amdgpu: ras support suspend/resume add ras suspend function. rename ras_post_init to amdgpu_ras_resume. Signed-off-by: xinhui pan Reviewed-by: Alex Deucher Reviewed-by: James Zhu Tested-by: James Zhu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 20 +++++++++++++++----- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 4 +++- 3 files changed, 20 insertions(+), 8 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 309461d0c275..da120fe330be 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2745,7 +2745,7 @@ fence_driver_init: } /* must succeed. */ - amdgpu_ras_post_init(adev); + amdgpu_ras_resume(adev); r = device_create_file(adev->dev, &dev_attr_pcie_replay_count); if (r) { @@ -3503,7 +3503,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive, goto out; /* must succeed. */ - amdgpu_ras_post_init(tmp_adev); + amdgpu_ras_resume(tmp_adev); /* Update PSP FW topology after reset */ if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 49c71cfc7fc6..da1dc40b9b14 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -1594,12 +1594,9 @@ recovery_out: } /* do some init work after IP late init as dependence. 
- * TODO - * gpu reset will re-enable ras, need fint out one way to run it again. - * for now, if a gpu reset happened, unless IP enable its ras, the ras state - * will be showed as disabled. + * and it runs in resume/gpu reset/booting up cases. */ -void amdgpu_ras_post_init(struct amdgpu_device *adev) +void amdgpu_ras_resume(struct amdgpu_device *adev) { struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct ras_manager *obj, *tmp; @@ -1642,6 +1639,19 @@ void amdgpu_ras_post_init(struct amdgpu_device *adev) } } +void amdgpu_ras_suspend(struct amdgpu_device *adev) +{ + struct amdgpu_ras *con = amdgpu_ras_get_context(adev); + + if (!con) + return; + + amdgpu_ras_disable_all_features(adev, 0); + /* Make sure all ras objects are disabled. */ + if (con->features) + amdgpu_ras_disable_all_features(adev, 1); +} + /* do some fini work before IP fini as dependence */ int amdgpu_ras_pre_fini(struct amdgpu_device *adev) { diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 59994ee00855..c6b34fbd695f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -179,6 +179,9 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev, int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, unsigned int block); +void amdgpu_ras_resume(struct amdgpu_device *adev); +void amdgpu_ras_suspend(struct amdgpu_device *adev); + int amdgpu_ras_query_error_count(struct amdgpu_device *adev, bool is_ce); @@ -256,7 +259,6 @@ amdgpu_ras_error_to_ta(enum amdgpu_ras_error_type error) { /* called in ip_init and ip_fini */ int amdgpu_ras_init(struct amdgpu_device *adev); -void amdgpu_ras_post_init(struct amdgpu_device *adev); int amdgpu_ras_fini(struct amdgpu_device *adev); int amdgpu_ras_pre_fini(struct amdgpu_device *adev); -- cgit From d6ee400e793f0ae6c9f5926bea9fbb362a950d96 Mon Sep 17 00:00:00 2001 From: Slava Abramov Date: Thu, 16 May 2019 16:17:53 -0400 Subject: drm/amdgpu: 
use div64_ul for 32-bit compatibility v1 v1: replace casting to unsigned long with div64_ul Acked-by: Alex Deucher Signed-off-by: Slava Abramov Tested-by: Slava Abramov Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index da1dc40b9b14..d5719b0fb82c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -764,8 +764,8 @@ static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, struct amdgpu_device *adev = con->adev; const unsigned int element_size = sizeof("0xabcdabcd : 0x12345678 : R\n") - 1; - unsigned int start = (ppos + element_size - 1) / element_size; - unsigned int end = (ppos + count - 1) / element_size; + unsigned int start = div64_ul(ppos + element_size - 1, element_size); + unsigned int end = div64_ul(ppos + count - 1, element_size); ssize_t s = 0; struct ras_badpage *bps = NULL; unsigned int bps_count = 0; -- cgit From 74abc2210e105f0fffe59c35d2329201f1b4310e Mon Sep 17 00:00:00 2001 From: Tom St Denis Date: Fri, 24 May 2019 09:21:54 -0400 Subject: drm/amd/doc: Add RAS documentation to guide Acked-by: Slava Abramov Signed-off-by: Tom St Denis Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- Documentation/gpu/amdgpu.rst | 11 +++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 4 ++-- 2 files changed, 13 insertions(+), 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c') diff --git a/Documentation/gpu/amdgpu.rst b/Documentation/gpu/amdgpu.rst index cacfcfad2356..86138798128f 100644 --- a/Documentation/gpu/amdgpu.rst +++ b/Documentation/gpu/amdgpu.rst @@ -79,6 +79,17 @@ AMDGPU XGMI Support .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c :internal: +AMDGPU RAS debugfs control interface +==================================== + +.. 
kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :doc: AMDGPU RAS debugfs control interface + + +.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c + :internal: + + GPU Power/Thermal Controls and Monitoring ========================================= diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index d5719b0fb82c..7c8a4aedf07c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -244,8 +244,8 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, return 0; } -/* - * DOC: ras debugfs control interface +/** + * DOC: AMDGPU RAS debugfs control interface * * It accepts struct ras_debug_if who has two members. * -- cgit