From e23b74aab5dc48d3e508a2bc171ccd152fb03803 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Thu, 28 Sep 2017 09:47:32 -0400 Subject: drm/amdgpu: fix vf error handling The error handling for virtual functions assumed a single vf per VM and didn't properly account for bare metal. Make the error arrays per device and add locking. Reviewed-by: Gavin Wan Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c | 54 +++++++++++++--------------- 1 file changed, 25 insertions(+), 29 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c index 45ac91861965..746b81339835 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c @@ -25,30 +25,21 @@ #include "amdgpu_vf_error.h" #include "mxgpu_ai.h" -#define AMDGPU_VF_ERROR_ENTRY_SIZE 16 - -/* struct error_entry - amdgpu VF error information. */ -struct amdgpu_vf_error_buffer { - int read_count; - int write_count; - uint16_t code[AMDGPU_VF_ERROR_ENTRY_SIZE]; - uint16_t flags[AMDGPU_VF_ERROR_ENTRY_SIZE]; - uint64_t data[AMDGPU_VF_ERROR_ENTRY_SIZE]; -}; - -struct amdgpu_vf_error_buffer admgpu_vf_errors; - - -void amdgpu_vf_error_put(uint16_t sub_error_code, uint16_t error_flags, uint64_t error_data) +void amdgpu_vf_error_put(struct amdgpu_device *adev, + uint16_t sub_error_code, + uint16_t error_flags, + uint64_t error_data) { int index; uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); - index = admgpu_vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE; - admgpu_vf_errors.code [index] = error_code; - admgpu_vf_errors.flags [index] = error_flags; - admgpu_vf_errors.data [index] = error_data; - admgpu_vf_errors.write_count ++; + mutex_lock(&adev->virt.vf_errors.lock); + index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE; + adev->virt.vf_errors.code [index] = error_code; + adev->virt.vf_errors.flags [index] = error_flags; + adev->virt.vf_errors.data [index] = error_data; + adev->virt.vf_errors.write_count ++; + mutex_unlock(&adev->virt.vf_errors.lock); } @@ -58,7 +49,8 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev) u32 data1, data2, data3; int index; - if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) { + if ((NULL == adev) || (!amdgpu_sriov_vf(adev)) || + (!adev->virt.ops) || (!adev->virt.ops->trans_msg)) { return; } /* @@ -68,18 +60,22 @@ void amdgpu_vf_error_trans_all(struct amdgpu_device *adev) return; } */ + + mutex_lock(&adev->virt.vf_errors.lock); /* The errors are overlay of array, correct read_count as full. */ - if (admgpu_vf_errors.write_count - admgpu_vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) { - admgpu_vf_errors.read_count = admgpu_vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE; + if (adev->virt.vf_errors.write_count - adev->virt.vf_errors.read_count > AMDGPU_VF_ERROR_ENTRY_SIZE) { + adev->virt.vf_errors.read_count = adev->virt.vf_errors.write_count - AMDGPU_VF_ERROR_ENTRY_SIZE; } - while (admgpu_vf_errors.read_count < admgpu_vf_errors.write_count) { - index =admgpu_vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE; - data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX (admgpu_vf_errors.code[index], admgpu_vf_errors.flags[index]); - data2 = admgpu_vf_errors.data[index] & 0xFFFFFFFF; - data3 = (admgpu_vf_errors.data[index] >> 32) & 0xFFFFFFFF; + while (adev->virt.vf_errors.read_count < adev->virt.vf_errors.write_count) { + index =adev->virt.vf_errors.read_count % AMDGPU_VF_ERROR_ENTRY_SIZE; + data1 = AMDGIM_ERROR_CODE_FLAGS_TO_MAILBOX(adev->virt.vf_errors.code[index], + adev->virt.vf_errors.flags[index]); + data2 = adev->virt.vf_errors.data[index] & 0xFFFFFFFF; + data3 = (adev->virt.vf_errors.data[index] >> 32) & 0xFFFFFFFF; adev->virt.ops->trans_msg(adev, IDH_LOG_VF_ERROR, data1, data2, data3); - admgpu_vf_errors.read_count ++; + adev->virt.vf_errors.read_count ++; } + mutex_unlock(&adev->virt.vf_errors.lock); } -- cgit From 6867e1b5fbd1a9deaf95a1bd23ea930063c8d216 Mon Sep 17 00:00:00 2001 From: Monk Liu Date: Mon, 16 Oct 2017 19:50:44 +0800 Subject: drm/amdgpu:fix vf_error_put 1,it should not work on non-SR-IOV case 2,the NO_VBIOS error is incorrect, should handle it under detect_sriov_bios. 3,wrap the whole detect_sriov_bios with sriov check Signed-off-by: Monk Liu Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 21 +++++++++++---------- drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c | 7 ++++++- 2 files changed, 17 insertions(+), 11 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 07726afd9307..e29731c4ada7 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2032,12 +2032,17 @@ static int amdgpu_resume(struct amdgpu_device *adev) static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev) { - if (adev->is_atom_fw) { - if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) - adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; - } else { - if (amdgpu_atombios_has_gpu_virtualization_table(adev)) - adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; + if (amdgpu_sriov_vf(adev)) { + if (adev->is_atom_fw) { + if (amdgpu_atomfirmware_gpu_supports_virtualization(adev)) + adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; + } else { + if (amdgpu_atombios_has_gpu_virtualization_table(adev)) + adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS; + } + + if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS)) + amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); } } @@ -2207,7 +2212,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, if (amdgpu_need_post(adev)) { if (!adev->bios) { dev_err(adev->dev, "no vBIOS found\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0); r = -EINVAL; goto failed; } @@ -2215,7 +2219,6 @@ int amdgpu_device_init(struct amdgpu_device *adev, r = amdgpu_atom_asic_init(adev->mode_info.atom_context); if (r) { dev_err(adev->dev, "gpu post error!\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_POST_ERROR, 0, 0); goto failed; } } else { @@ -3019,7 +3022,6 @@ out: } } else { dev_err(adev->dev, "asic resume failed (%d).\n", r); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ASIC_RESUME_FAIL, 0, r); for (i = 0; i < AMDGPU_MAX_RINGS; ++i) { if (adev->rings[i] && adev->rings[i]->sched.thread) { kthread_unpark(adev->rings[i]->sched.thread); @@ -3033,7 +3035,6 @@ out: if (r) { /* bad news, how to tell it to userspace ? */ dev_info(adev->dev, "GPU reset failed\n"); - amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r); } else { dev_info(adev->dev, "GPU reset successed!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c index 746b81339835..7f7097931c6f 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vf_error.c @@ -31,7 +31,12 @@ void amdgpu_vf_error_put(struct amdgpu_device *adev, uint64_t error_data) { int index; - uint16_t error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); + uint16_t error_code; + + if (!amdgpu_sriov_vf(adev)) + return; + + error_code = AMDGIM_ERROR_CODE(AMDGIM_ERROR_CATEGORY_VF, sub_error_code); mutex_lock(&adev->virt.vf_errors.lock); index = adev->virt.vf_errors.write_count % AMDGPU_VF_ERROR_ENTRY_SIZE; -- cgit