diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 111 |
1 files changed, 107 insertions, 4 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index e0f8ce9d8440..dcf1f3dbb5c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -26,6 +26,9 @@ #include <linux/debugfs.h> #include <linux/list.h> +#include <linux/kfifo.h> +#include <linux/radix-tree.h> +#include <linux/siphash.h> #include "ta_ras_if.h" #include "amdgpu_ras_eeprom.h" #include "amdgpu_smuio.h" @@ -43,13 +46,11 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x) AMDGPU_GET_REG_FIELD(x, 7, 7) #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x) AMDGPU_GET_REG_FIELD(x, 10, 8) #define AMDGPU_RAS_GPU_ERR_AID_ID(x) AMDGPU_GET_REG_FIELD(x, 12, 11) -#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 13, 13) -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x) AMDGPU_GET_REG_FIELD(x, 31, 31) +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x) AMDGPU_GET_REG_FIELD(x, 14, 13) -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 1000 +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT 100 #define AMDGPU_RAS_BOOT_STEADY_STATUS 0xBA #define AMDGPU_RAS_BOOT_STATUS_MASK 0xFF -#define AMDGPU_RAS_BOOT_SUCEESS 0x80000000 #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS (0x1 << 0) /* position of instance value in sub_block_index of @@ -61,9 +62,21 @@ struct amdgpu_iv_entry; #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE (16ULL << 20) /* The high three bits indicates socketid */ #define AMDGPU_RAS_GET_FEATURES(val) ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) +#define RAS_EVENT_INVALID_ID (BIT_ULL(63)) +#define RAS_EVENT_ID_IS_VALID(x) (!((x) & BIT_ULL(63))) + +#define RAS_EVENT_LOG(adev, id, fmt, ...) \ + amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__) + +#define amdgpu_ras_mark_ras_event(adev, type) \ + (amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0))) + enum amdgpu_ras_block { AMDGPU_RAS_BLOCK__UMC = 0, AMDGPU_RAS_BLOCK__SDMA, @@ -419,6 +432,65 @@ struct umc_ecc_info { int record_ce_addr_supported; }; +enum ras_event_type { + RAS_EVENT_TYPE_INVALID = 0, + RAS_EVENT_TYPE_FATAL, + RAS_EVENT_TYPE_POISON_CREATION, + RAS_EVENT_TYPE_POISON_CONSUMPTION, + RAS_EVENT_TYPE_COUNT, +}; + +struct ras_event_state { + u64 last_seqno; + atomic64_t count; +}; + +struct ras_event_manager { + atomic64_t seqno; + struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT]; +}; + +struct ras_event_id { + enum ras_event_type type; + u64 event_id; +}; + +struct ras_query_context { + struct ras_event_id evid; +}; + +typedef int (*pasid_notify)(struct amdgpu_device *adev, + uint16_t pasid, void *data); + +struct ras_poison_msg { + enum amdgpu_ras_block block; + uint16_t pasid; + uint32_t reset; + pasid_notify pasid_fn; + void *data; +}; + +struct ras_err_pages { + uint32_t count; + uint64_t *pfn; +}; + +struct ras_ecc_err { + u64 hash_index; + uint64_t status; + uint64_t ipid; + uint64_t addr; + struct ras_err_pages err_pages; +}; + +struct ras_ecc_log_info { + struct mutex lock; + siphash_key_t ecc_key; + struct radix_tree_root de_page_tree; + uint64_t de_queried_count; + uint64_t prev_de_queried_count; +}; + struct amdgpu_ras { /* ras infrastructure */ /* for ras itself. */ @@ -429,6 +501,7 @@ struct amdgpu_ras { struct device_attribute features_attr; struct device_attribute version_attr; struct device_attribute schema_attr; + struct device_attribute event_state_attr; struct bin_attribute badpages_attr; struct dentry *de_ras_eeprom_table; /* block array */ @@ -469,6 +542,7 @@ struct amdgpu_ras { bool update_channel_flag; /* Record status of smu mca debug mode */ bool is_aca_debug_mode; + bool is_rma; /* Record special requirements of gpu reset caller */ uint32_t gpu_reset_flags; @@ -477,8 +551,20 @@ struct amdgpu_ras { wait_queue_head_t page_retirement_wq; struct mutex page_retirement_lock; atomic_t page_retirement_req_cnt; + atomic_t poison_creation_count; + struct mutex page_rsv_lock; + DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128); + struct ras_ecc_log_info umc_ecc_log; + struct delayed_work page_retirement_dwork; + /* Fatal error detected flag */ atomic_t fed; + + /* RAS event manager */ + struct ras_event_manager __event_mgr; + struct ras_event_manager *event_mgr; + + uint64_t reserved_pages_in_bytes; }; struct ras_fs_data { @@ -512,6 +598,7 @@ struct ras_err_data { unsigned long de_count; unsigned long err_addr_cnt; struct eeprom_table_record *err_addr; + unsigned long err_addr_len; u32 err_list_count; struct list_head err_node_list; }; @@ -879,4 +966,20 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); +u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); +int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, + const void *caller); + +int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); + +int amdgpu_ras_put_poison_req(struct amdgpu_device *adev, + enum amdgpu_ras_block block, uint16_t pasid, + pasid_notify pasid_fn, void *data, uint32_t reset); + +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); + +__printf(3, 4) +void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, + const char *fmt, ...); + #endif |