aboutsummaryrefslogtreecommitdiff
path: root/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r--drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h101
1 files changed, 66 insertions, 35 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
index f80fd3428c98..e36f4de9fa55 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h
@@ -31,6 +31,8 @@
#include "ta_ras_if.h"
#include "amdgpu_ras_eeprom.h"
+#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0)
+
enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__UMC = 0,
AMDGPU_RAS_BLOCK__SDMA,
@@ -46,11 +48,22 @@ enum amdgpu_ras_block {
AMDGPU_RAS_BLOCK__MP0,
AMDGPU_RAS_BLOCK__MP1,
AMDGPU_RAS_BLOCK__FUSE,
+ AMDGPU_RAS_BLOCK__MCA,
AMDGPU_RAS_BLOCK__LAST
};
+enum amdgpu_ras_mca_block {
+ AMDGPU_RAS_MCA_BLOCK__MP0 = 0,
+ AMDGPU_RAS_MCA_BLOCK__MP1,
+ AMDGPU_RAS_MCA_BLOCK__MPIO,
+ AMDGPU_RAS_MCA_BLOCK__IOHC,
+
+ AMDGPU_RAS_MCA_BLOCK__LAST
+};
+
#define AMDGPU_RAS_BLOCK_COUNT AMDGPU_RAS_BLOCK__LAST
+#define AMDGPU_RAS_MCA_BLOCK_COUNT AMDGPU_RAS_MCA_BLOCK__LAST
#define AMDGPU_RAS_BLOCK_MASK ((1ULL << AMDGPU_RAS_BLOCK_COUNT) - 1)
enum amdgpu_ras_gfx_subblock {
@@ -303,23 +316,18 @@ struct ras_common_if {
enum amdgpu_ras_block block;
enum amdgpu_ras_error_type type;
uint32_t sub_block_index;
- /* block name */
char name[32];
};
struct amdgpu_ras {
/* ras infrastructure */
/* for ras itself. */
- uint32_t hw_supported;
- /* for IP to check its ras ability. */
- uint32_t supported;
uint32_t features;
struct list_head head;
- /* debugfs */
- struct dentry *dir;
/* sysfs */
struct device_attribute features_attr;
struct bin_attribute badpages_attr;
+ struct dentry *de_ras_eeprom_table;
/* block array */
struct ras_manager *objs;
@@ -334,6 +342,22 @@ struct amdgpu_ras {
uint32_t flags;
bool reboot;
struct amdgpu_ras_eeprom_control eeprom_control;
+
+ bool error_query_ready;
+
+ /* bad page count threshold */
+ uint32_t bad_page_cnt_threshold;
+
+ /* disable ras error count harvest in recovery */
+ bool disable_ras_err_cnt_harvest;
+
+ /* is poison mode supported */
+ bool poison_supported;
+
+ /* RAS count errors delayed work */
+ struct delayed_work ras_counte_delay_work;
+ atomic_t ras_ue_count;
+ atomic_t ras_ce_count;
};
struct ras_fs_data {
@@ -351,14 +375,10 @@ struct ras_err_data {
struct ras_err_handler_data {
/* point to bad page records array */
struct eeprom_table_record *bps;
- /* point to reserved bo array */
- struct amdgpu_bo **bps_bo;
/* the count of entries */
int count;
/* the space can place new entries */
int space_left;
- /* last reserved entry's index + 1 */
- int last_reserved;
};
typedef int (*ras_ih_cb)(struct amdgpu_device *adev,
@@ -388,8 +408,6 @@ struct ras_manager {
struct list_head node;
/* the device */
struct amdgpu_device *adev;
- /* debugfs */
- struct dentry *ent;
/* sysfs */
struct device_attribute sysfs_attr;
int attr_inuse;
@@ -412,7 +430,7 @@ struct ras_badpage {
/* interfaces for IP */
struct ras_fs_if {
struct ras_common_if head;
- char sysfs_name[32];
+ const char* sysfs_name;
char debugfs_name[32];
};
@@ -464,8 +482,8 @@ struct ras_debug_if {
* 8: feature disable
*/
-#define amdgpu_ras_get_context(adev) ((adev)->psp.ras.ras)
-#define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras.ras = (ras_con))
+#define amdgpu_ras_get_context(adev) ((adev)->psp.ras_context.ras)
+#define amdgpu_ras_set_context(adev, ras_con) ((adev)->psp.ras_context.ras = (ras_con))
/* check if ras is supported on block, say, sdma, gfx */
static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
@@ -475,36 +493,28 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
if (block >= AMDGPU_RAS_BLOCK_COUNT)
return 0;
- return ras && (ras->supported & (1 << block));
+ return ras && (adev->ras_enabled & (1 << block));
}
int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
-int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
- unsigned int block);
void amdgpu_ras_resume(struct amdgpu_device *adev);
void amdgpu_ras_suspend(struct amdgpu_device *adev);
-unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
- bool is_ce);
+int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
+ unsigned long *ce_count,
+ unsigned long *ue_count);
/* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
struct eeprom_table_record *bps, int pages);
-int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
+int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev);
-static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
- bool is_baco)
+static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev)
{
struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
- /* save bad page to eeprom before gpu reset,
- * i2c may be unstable in gpu reset
- */
- if (in_task())
- amdgpu_ras_reserve_bad_pages(adev);
-
if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
schedule_work(&ras->recovery_work);
return 0;
@@ -541,6 +551,8 @@ amdgpu_ras_block_to_ta(enum amdgpu_ras_block block) {
return TA_RAS_BLOCK__MP1;
case AMDGPU_RAS_BLOCK__FUSE:
return TA_RAS_BLOCK__FUSE;
+ case AMDGPU_RAS_BLOCK__MCA:
+ return TA_RAS_BLOCK__MCA;
default:
WARN_ONCE(1, "RAS ERROR: unexpected block id %d\n", block);
return TA_RAS_BLOCK__UMC;
@@ -590,15 +602,14 @@ int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
struct ras_common_if *head);
-void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
- struct ras_fs_if *head);
-
-void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
- struct ras_common_if *head);
+void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev);
-int amdgpu_ras_error_query(struct amdgpu_device *adev,
+int amdgpu_ras_query_error_status(struct amdgpu_device *adev,
struct ras_query_if *info);
+int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,
+ enum amdgpu_ras_block block);
+
int amdgpu_ras_error_inject(struct amdgpu_device *adev,
struct ras_inject_if *info);
@@ -611,6 +622,9 @@ int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
struct ras_dispatch_if *info);
+struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
+ struct ras_common_if *head);
+
extern atomic_t amdgpu_ras_in_intr;
static inline bool amdgpu_ras_intr_triggered(void)
@@ -618,6 +632,23 @@ static inline bool amdgpu_ras_intr_triggered(void)
return !!atomic_read(&amdgpu_ras_in_intr);
}
+static inline void amdgpu_ras_intr_cleared(void)
+{
+ atomic_set(&amdgpu_ras_in_intr, 0);
+}
+
void amdgpu_ras_global_ras_isr(struct amdgpu_device *adev);
+void amdgpu_ras_set_error_query_ready(struct amdgpu_device *adev, bool ready);
+
+bool amdgpu_ras_need_emergency_restart(struct amdgpu_device *adev);
+
+void amdgpu_release_ras_context(struct amdgpu_device *adev);
+
+int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev);
+
+const char *get_ras_block_str(struct ras_common_if *ras_block);
+
+bool amdgpu_ras_is_poison_mode_supported(struct amdgpu_device *adev);
+
#endif