diff options
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 58 | 
1 files changed, 42 insertions, 16 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index 7021c4a66fb5..dcf1f3dbb5c4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -47,12 +47,10 @@ struct amdgpu_iv_entry;  #define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)			AMDGPU_GET_REG_FIELD(x, 10, 8)  #define AMDGPU_RAS_GPU_ERR_AID_ID(x)			AMDGPU_GET_REG_FIELD(x, 12, 11)  #define AMDGPU_RAS_GPU_ERR_HBM_ID(x)			AMDGPU_GET_REG_FIELD(x, 14, 13) -#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)		AMDGPU_GET_REG_FIELD(x, 31, 31) -#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT	1000 +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT	100  #define AMDGPU_RAS_BOOT_STEADY_STATUS		0xBA  #define AMDGPU_RAS_BOOT_STATUS_MASK		0xFF -#define AMDGPU_RAS_BOOT_SUCEESS			0x80000000  #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		(0x1 << 0)  /* position of instance value in sub_block_index of @@ -64,16 +62,20 @@ struct amdgpu_iv_entry;  #define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29  #define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 +/* Reserve 8 physical dram row for possible retirement. + * In worst cases, it will lose 8 * 2MB memory in vram domain */ +#define AMDGPU_RAS_RESERVED_VRAM_SIZE	(16ULL << 20)  /* The high three bits indicates socketid */  #define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) -#define RAS_EVENT_LOG(_adev, _id, _fmt, ...)				\ -do {									\ -	if (amdgpu_ras_event_id_is_valid((_adev), (_id)))			\ -	    dev_info((_adev)->dev, "{%llu}" _fmt, (_id), ##__VA_ARGS__);	\ -	else								\ -	    dev_info((_adev)->dev, _fmt, ##__VA_ARGS__);			\ -} while (0) +#define RAS_EVENT_INVALID_ID		(BIT_ULL(63)) +#define RAS_EVENT_ID_IS_VALID(x)	(!((x) & BIT_ULL(63))) + +#define RAS_EVENT_LOG(adev, id, fmt, ...)	\ +	amdgpu_ras_event_log_print((adev), (id), (fmt), ##__VA_ARGS__) + +#define amdgpu_ras_mark_ras_event(adev, type)	\ +	(amdgpu_ras_mark_ras_event_caller((adev), (type), __builtin_return_address(0)))  enum amdgpu_ras_block {  	AMDGPU_RAS_BLOCK__UMC = 0, @@ -431,20 +433,32 @@ struct umc_ecc_info {  };  enum ras_event_type { -	RAS_EVENT_TYPE_INVALID = -1, -	RAS_EVENT_TYPE_ISR = 0, +	RAS_EVENT_TYPE_INVALID = 0, +	RAS_EVENT_TYPE_FATAL, +	RAS_EVENT_TYPE_POISON_CREATION, +	RAS_EVENT_TYPE_POISON_CONSUMPTION,  	RAS_EVENT_TYPE_COUNT,  }; +struct ras_event_state { +	u64 last_seqno; +	atomic64_t count; +}; +  struct ras_event_manager { -	atomic64_t seqnos[RAS_EVENT_TYPE_COUNT]; +	atomic64_t seqno; +	struct ras_event_state event_state[RAS_EVENT_TYPE_COUNT];  }; -struct ras_query_context { +struct ras_event_id {  	enum ras_event_type type;  	u64 event_id;  }; +struct ras_query_context { +	struct ras_event_id evid; +}; +  typedef int (*pasid_notify)(struct amdgpu_device *adev,  		uint16_t pasid, void *data); @@ -473,7 +487,8 @@ struct ras_ecc_log_info {  	struct mutex lock;  	siphash_key_t ecc_key;  	struct radix_tree_root de_page_tree; -	bool	de_updated; +	uint64_t	de_queried_count; +	uint64_t	prev_de_queried_count;  };  struct amdgpu_ras { @@ -486,6 +501,7 @@ struct amdgpu_ras {  	struct device_attribute features_attr;  	struct device_attribute version_attr;  	struct device_attribute schema_attr; +	struct device_attribute event_state_attr;  	struct bin_attribute badpages_attr;  	struct dentry *de_ras_eeprom_table;  	/* block array */ @@ -526,6 +542,7 @@ struct amdgpu_ras {  	bool update_channel_flag;  	/* Record status of smu mca debug mode */  	bool is_aca_debug_mode; +	bool is_rma;  	/* Record special requirements of gpu reset caller */  	uint32_t  gpu_reset_flags; @@ -534,6 +551,7 @@ struct amdgpu_ras {  	wait_queue_head_t page_retirement_wq;  	struct mutex page_retirement_lock;  	atomic_t page_retirement_req_cnt; +	atomic_t poison_creation_count;  	struct mutex page_rsv_lock;  	DECLARE_KFIFO(poison_fifo, struct ras_poison_msg, 128);  	struct ras_ecc_log_info  umc_ecc_log; @@ -546,6 +564,7 @@ struct amdgpu_ras {  	struct ras_event_manager __event_mgr;  	struct ras_event_manager *event_mgr; +	uint64_t reserved_pages_in_bytes;  };  struct ras_fs_data { @@ -947,8 +966,9 @@ void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info,  void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status);  bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); -bool amdgpu_ras_event_id_is_valid(struct amdgpu_device *adev, u64 id);  u64 amdgpu_ras_acquire_event_id(struct amdgpu_device *adev, enum ras_event_type type); +int amdgpu_ras_mark_ras_event_caller(struct amdgpu_device *adev, enum ras_event_type type, +				     const void *caller);  int amdgpu_ras_reserve_page(struct amdgpu_device *adev, uint64_t pfn); @@ -956,4 +976,10 @@ int amdgpu_ras_put_poison_req(struct amdgpu_device *adev,  		enum amdgpu_ras_block block, uint16_t pasid,  		pasid_notify pasid_fn, void *data, uint32_t reset); +bool amdgpu_ras_in_recovery(struct amdgpu_device *adev); + +__printf(3, 4) +void amdgpu_ras_event_log_print(struct amdgpu_device *adev, u64 event_id, +				const char *fmt, ...); +  #endif |