diff options
| author | Dmitry Torokhov <[email protected]> | 2024-07-15 14:03:44 -0700 | 
|---|---|---|
| committer | Dmitry Torokhov <[email protected]> | 2024-07-15 14:03:44 -0700 | 
| commit | a23e1966932464e1c5226cb9ac4ce1d5fc10ba22 (patch) | |
| tree | bf5f1b57faa01ca31656bfc48c7d6b6f0bc39189 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | |
| parent | 7c7b1be19b228b450c2945ec379d7fc6bfef9852 (diff) | |
| parent | f3efefb6fdcce604413135bd8d4c5568e53a1f13 (diff) | |
Merge branch 'next' into for-linus
Prepare input updates for 6.11 merge window.
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h')
| -rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h | 114 | 
1 files changed, 113 insertions, 1 deletions
| diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h index ffb49b2d533a..e0f8ce9d8440 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.h @@ -28,9 +28,29 @@  #include <linux/list.h>  #include "ta_ras_if.h"  #include "amdgpu_ras_eeprom.h" +#include "amdgpu_smuio.h" +#include "amdgpu_aca.h"  struct amdgpu_iv_entry; +#define AMDGPU_RAS_GPU_ERR_MEM_TRAINING(x)		AMDGPU_GET_REG_FIELD(x, 0, 0) +#define AMDGPU_RAS_GPU_ERR_FW_LOAD(x)			AMDGPU_GET_REG_FIELD(x, 1, 1) +#define AMDGPU_RAS_GPU_ERR_WAFL_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 2, 2) +#define AMDGPU_RAS_GPU_ERR_XGMI_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 3, 3) +#define AMDGPU_RAS_GPU_ERR_USR_CP_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 4, 4) +#define AMDGPU_RAS_GPU_ERR_USR_DP_LINK_TRAINING(x)	AMDGPU_GET_REG_FIELD(x, 5, 5) +#define AMDGPU_RAS_GPU_ERR_HBM_MEM_TEST(x)		AMDGPU_GET_REG_FIELD(x, 6, 6) +#define AMDGPU_RAS_GPU_ERR_HBM_BIST_TEST(x)		AMDGPU_GET_REG_FIELD(x, 7, 7) +#define AMDGPU_RAS_GPU_ERR_SOCKET_ID(x)			AMDGPU_GET_REG_FIELD(x, 10, 8) +#define AMDGPU_RAS_GPU_ERR_AID_ID(x)			AMDGPU_GET_REG_FIELD(x, 12, 11) +#define AMDGPU_RAS_GPU_ERR_HBM_ID(x)			AMDGPU_GET_REG_FIELD(x, 13, 13) +#define AMDGPU_RAS_GPU_ERR_BOOT_STATUS(x)		AMDGPU_GET_REG_FIELD(x, 31, 31) + +#define AMDGPU_RAS_BOOT_STATUS_POLLING_LIMIT	1000 +#define AMDGPU_RAS_BOOT_STEADY_STATUS		0xBA +#define AMDGPU_RAS_BOOT_STATUS_MASK		0xFF +#define AMDGPU_RAS_BOOT_SUCEESS			0x80000000 +  #define AMDGPU_RAS_FLAG_INIT_BY_VBIOS		(0x1 << 0)  /* position of instance value in sub_block_index of   * ta_ras_trigger_error_input, the sub block uses lower 12 bits @@ -38,6 +58,12 @@ struct amdgpu_iv_entry;  #define AMDGPU_RAS_INST_MASK 0xfffff000  #define AMDGPU_RAS_INST_SHIFT 0xc +#define AMDGPU_RAS_FEATURES_SOCKETID_SHIFT 29 +#define AMDGPU_RAS_FEATURES_SOCKETID_MASK 0xe0000000 + +/* The high three bits indicates socketid */ +#define AMDGPU_RAS_GET_FEATURES(val)  ((val) & ~AMDGPU_RAS_FEATURES_SOCKETID_MASK) +  enum amdgpu_ras_block {  	AMDGPU_RAS_BLOCK__UMC = 0,  	AMDGPU_RAS_BLOCK__SDMA, @@ -56,6 +82,8 @@ enum amdgpu_ras_block {  	AMDGPU_RAS_BLOCK__MCA,  	AMDGPU_RAS_BLOCK__VCN,  	AMDGPU_RAS_BLOCK__JPEG, +	AMDGPU_RAS_BLOCK__IH, +	AMDGPU_RAS_BLOCK__MPIO,  	AMDGPU_RAS_BLOCK__LAST  }; @@ -319,6 +347,12 @@ enum amdgpu_ras_ret {  	AMDGPU_RAS_PT,  }; +enum amdgpu_ras_error_query_mode { +	AMDGPU_RAS_INVALID_ERROR_QUERY		= 0, +	AMDGPU_RAS_DIRECT_ERROR_QUERY		= 1, +	AMDGPU_RAS_FIRMWARE_ERROR_QUERY		= 2, +}; +  /* ras error status reisger fields */  #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG__SHIFT	0x0  #define ERR_STATUS_LO__ERR_STATUS_VALID_FLAG_MASK	0x00000001L @@ -389,9 +423,12 @@ struct amdgpu_ras {  	/* ras infrastructure */  	/* for ras itself. */  	uint32_t features; +	uint32_t schema;  	struct list_head head;  	/* sysfs */  	struct device_attribute features_attr; +	struct device_attribute version_attr; +	struct device_attribute schema_attr;  	struct bin_attribute badpages_attr;  	struct dentry *de_ras_eeprom_table;  	/* block array */ @@ -430,23 +467,58 @@ struct amdgpu_ras {  	/* Indicates smu whether need update bad channel info */  	bool update_channel_flag; +	/* Record status of smu mca debug mode */ +	bool is_aca_debug_mode;  	/* Record special requirements of gpu reset caller */  	uint32_t  gpu_reset_flags; + +	struct task_struct *page_retirement_thread; +	wait_queue_head_t page_retirement_wq; +	struct mutex page_retirement_lock; +	atomic_t page_retirement_req_cnt; +	/* Fatal error detected flag */ +	atomic_t fed;  };  struct ras_fs_data { -	char sysfs_name[32]; +	char sysfs_name[48];  	char debugfs_name[32];  }; +struct ras_err_addr { +	struct list_head node; +	uint64_t err_status; +	uint64_t err_ipid; +	uint64_t err_addr; +}; + +struct ras_err_info { +	struct amdgpu_smuio_mcm_config_info mcm_info; +	u64 ce_count; +	u64 ue_count; +	u64 de_count; +	struct list_head err_addr_list; +}; + +struct ras_err_node { +	struct list_head node; +	struct ras_err_info err_info; +}; +  struct ras_err_data {  	unsigned long ue_count;  	unsigned long ce_count; +	unsigned long de_count;  	unsigned long err_addr_cnt;  	struct eeprom_table_record *err_addr; +	u32 err_list_count; +	struct list_head err_node_list;  }; +#define for_each_ras_error(err_node, err_data) \ +	list_for_each_entry(err_node, &(err_data)->err_node_list, node) +  struct ras_err_handler_data {  	/* point to bad page records array */  	struct eeprom_table_record *bps; @@ -494,6 +566,8 @@ struct ras_manager {  	struct ras_ih_data ih_data;  	struct ras_err_data err_data; + +	struct aca_handle aca_handle;  };  struct ras_badpage { @@ -513,6 +587,7 @@ struct ras_query_if {  	struct ras_common_if head;  	unsigned long ue_count;  	unsigned long ce_count; +	unsigned long de_count;  };  struct ras_inject_if { @@ -691,6 +766,8 @@ void amdgpu_ras_debugfs_create_all(struct amdgpu_device *adev);  int amdgpu_ras_query_error_status(struct amdgpu_device *adev,  		struct ras_query_if *info); +int amdgpu_ras_reset_error_count(struct amdgpu_device *adev, +		enum amdgpu_ras_block block);  int amdgpu_ras_reset_error_status(struct amdgpu_device *adev,  		enum amdgpu_ras_block block); @@ -743,6 +820,12 @@ struct amdgpu_ras* amdgpu_ras_get_context(struct amdgpu_device *adev);  int amdgpu_ras_set_context(struct amdgpu_device *adev, struct amdgpu_ras *ras_con); +int amdgpu_ras_set_mca_debug_mode(struct amdgpu_device *adev, bool enable); +int amdgpu_ras_set_aca_debug_mode(struct amdgpu_device *adev, bool enable); +bool amdgpu_ras_get_aca_debug_mode(struct amdgpu_device *adev); +bool amdgpu_ras_get_error_query_mode(struct amdgpu_device *adev, +				     unsigned int *mode); +  int amdgpu_ras_register_ras_block(struct amdgpu_device *adev,  				struct amdgpu_ras_block_object *ras_block_obj);  void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev); @@ -767,4 +850,33 @@ void amdgpu_ras_inst_reset_ras_error_count(struct amdgpu_device *adev,  					   const struct amdgpu_ras_err_status_reg_entry *reg_list,  					   uint32_t reg_list_size,  					   uint32_t instance); + +int amdgpu_ras_error_data_init(struct ras_err_data *err_data); +void amdgpu_ras_error_data_fini(struct ras_err_data *err_data); +int amdgpu_ras_error_statistic_ce_count(struct ras_err_data *err_data, +		struct amdgpu_smuio_mcm_config_info *mcm_info, +		struct ras_err_addr *err_addr, u64 count); +int amdgpu_ras_error_statistic_ue_count(struct ras_err_data *err_data, +		struct amdgpu_smuio_mcm_config_info *mcm_info, +		struct ras_err_addr *err_addr, u64 count); +int amdgpu_ras_error_statistic_de_count(struct ras_err_data *err_data, +		struct amdgpu_smuio_mcm_config_info *mcm_info, +		struct ras_err_addr *err_addr, u64 count); +void amdgpu_ras_query_boot_status(struct amdgpu_device *adev, u32 num_instances); +int amdgpu_ras_bind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk, +			       const struct aca_info *aca_info, void *data); +int amdgpu_ras_unbind_aca(struct amdgpu_device *adev, enum amdgpu_ras_block blk); + +ssize_t amdgpu_ras_aca_sysfs_read(struct device *dev, struct device_attribute *attr, +				  struct aca_handle *handle, char *buf, void *data); + +void amdgpu_ras_add_mca_err_addr(struct ras_err_info *err_info, +			struct ras_err_addr *err_addr); + +void amdgpu_ras_del_mca_err_addr(struct ras_err_info *err_info, +		struct ras_err_addr *mca_err_addr); + +void amdgpu_ras_set_fed(struct amdgpu_device *adev, bool status); +bool amdgpu_ras_get_fed_status(struct amdgpu_device *adev); +  #endif |