diff options
author | Daniel Vetter <daniel.vetter@ffwll.ch> | 2021-04-13 12:25:16 +0200 |
---|---|---|
committer | Daniel Vetter <daniel.vetter@ffwll.ch> | 2021-04-13 12:25:17 +0200 |
commit | cd951b3971cdc1f8c76b075f2c97ff357bf141e2 (patch) | |
tree | 0af9a1eeba0011d2dffc360087808c4c0b7fee54 /drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | |
parent | c103b850721e4a79ff9578f131888129c37a4679 (diff) | |
parent | cbb8f989d5a07cb3e39e9c149a6f89d6c83432aa (diff) |
Merge tag 'amd-drm-next-5.13-2021-04-12' of https://gitlab.freedesktop.org/agd5f/linux into drm-next
amd-drm-next-5.13-2021-04-12:
amdgpu:
- Re-enable GPU reset on VanGogh
- Enable DPM flags for SMART_SUSPEND and MAY_SKIP_RESUME
- Disentangle HG from vga_switcheroo
- S0ix fixes
- W=1 fixes
- Resource iterator fixes
- DMCUB updates
- UBSAN fixes
- More PM API cleanup
- Aldebaran updates
- Modifier fixes
- Enable VCN load balancing with asymmetric engines
- Rework BO structs
- Aldebaran reset support
- Initial LTTPR display work
- Display MALL fixes
- Fall back to YCbCr420 when YCbCr444 fails
- SR-IOV fixes
- RAS updates
- Misc cleanups and fixes
radeon:
- Typo fixes
- Fix error handling for firmware on r6xx
- Fix a missing check in DP MST handling
Signed-off-by: Daniel Vetter <daniel.vetter@ffwll.ch>
From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20210412220732.3845-1-alexander.deucher@amd.com
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c | 256 |
1 files changed, 195 insertions, 61 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c index 0e16683876aa..0541196ae1ed 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c @@ -99,6 +99,49 @@ static bool amdgpu_ras_get_error_query_ready(struct amdgpu_device *adev) return false; } +static int amdgpu_reserve_page_direct(struct amdgpu_device *adev, uint64_t address) +{ + struct ras_err_data err_data = {0, 0, 0, NULL}; + struct eeprom_table_record err_rec; + + if ((address >= adev->gmc.mc_vram_size) || + (address >= RAS_UMC_INJECT_ADDR_LIMIT)) { + dev_warn(adev->dev, + "RAS WARN: input address 0x%llx is invalid.\n", + address); + return -EINVAL; + } + + if (amdgpu_ras_check_bad_page(adev, address)) { + dev_warn(adev->dev, + "RAS WARN: 0x%llx has been marked as bad page!\n", + address); + return 0; + } + + memset(&err_rec, 0x0, sizeof(struct eeprom_table_record)); + + err_rec.address = address; + err_rec.retired_page = address >> AMDGPU_GPU_PAGE_SHIFT; + err_rec.ts = (uint64_t)ktime_get_real_seconds(); + err_rec.err_type = AMDGPU_RAS_EEPROM_ERR_NON_RECOVERABLE; + + err_data.err_addr = &err_rec; + err_data.err_addr_cnt = 1; + + if (amdgpu_bad_page_threshold != 0) { + amdgpu_ras_add_bad_pages(adev, err_data.err_addr, + err_data.err_addr_cnt); + amdgpu_ras_save_bad_pages(adev); + } + + dev_warn(adev->dev, "WARNING: THIS IS ONLY FOR TEST PURPOSES AND WILL CORRUPT RAS EEPROM\n"); + dev_warn(adev->dev, "Clear EEPROM:\n"); + dev_warn(adev->dev, " echo 1 > /sys/kernel/debug/dri/0/ras/ras_eeprom_reset\n"); + + return 0; +} + static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf, size_t size, loff_t *pos) { @@ -178,11 +221,25 @@ static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f, op = 1; else if (sscanf(str, "inject %32s %8s", block_name, err) == 2) op = 2; + else if (sscanf(str, "retire_page") == 0) + op = 3; else if (str[0] && str[1] && str[2] && str[3]) /* ascii string, but commands are not matched. */ return -EINVAL; if (op != -1) { + + if (op == 3) { + if (sscanf(str, "%*s %llu", &address) != 1) + if (sscanf(str, "%*s 0x%llx", &address) != 1) + return -EINVAL; + + data->op = op; + data->inject.address = address; + + return 0; + } + if (amdgpu_ras_find_block_id_by_name(block_name, &block_id)) return -EINVAL; @@ -310,6 +367,16 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * if (ret) return -EINVAL; + if (data.op == 3) + { + ret = amdgpu_reserve_page_direct(adev, data.inject.address); + + if (ret) + return size; + else + return ret; + } + if (!amdgpu_ras_is_supported(adev, data.head.block)) return -EINVAL; @@ -431,15 +498,13 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, }; if (!amdgpu_ras_get_error_query_ready(obj->adev)) - return snprintf(buf, PAGE_SIZE, - "Query currently inaccessible\n"); + return sysfs_emit(buf, "Query currently inaccessible\n"); if (amdgpu_ras_query_error_status(obj->adev, &info)) return -EINVAL; - return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n", - "ue", info.ue_count, - "ce", info.ce_count); + return sysfs_emit(buf, "%s: %lu\n%s: %lu\n", "ue", info.ue_count, + "ce", info.ce_count); } /* obj begin */ @@ -449,11 +514,10 @@ static ssize_t amdgpu_ras_sysfs_read(struct device *dev, static inline void put_obj(struct ras_manager *obj) { - if (obj && --obj->use == 0) + if (obj && (--obj->use == 0)) list_del(&obj->node); - if (obj && obj->use < 0) { - DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); - } + if (obj && (obj->use < 0)) + DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name); } /* make one obj and return it. */ @@ -777,13 +841,15 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, switch (info->head.block) { case AMDGPU_RAS_BLOCK__UMC: - if (adev->umc.funcs->query_ras_error_count) - adev->umc.funcs->query_ras_error_count(adev, &err_data); + if (adev->umc.ras_funcs && + adev->umc.ras_funcs->query_ras_error_count) + adev->umc.ras_funcs->query_ras_error_count(adev, &err_data); /* umc query_ras_error_address is also responsible for clearing * error status */ - if (adev->umc.funcs->query_ras_error_address) - adev->umc.funcs->query_ras_error_address(adev, &err_data); + if (adev->umc.ras_funcs && + adev->umc.ras_funcs->query_ras_error_address) + adev->umc.ras_funcs->query_ras_error_address(adev, &err_data); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->query_ras_error_count) { @@ -793,25 +859,32 @@ int amdgpu_ras_query_error_status(struct amdgpu_device *adev, } break; case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->query_ras_error_count) - adev->gfx.funcs->query_ras_error_count(adev, &err_data); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_count) + adev->gfx.ras_funcs->query_ras_error_count(adev, &err_data); - if (adev->gfx.funcs->query_ras_error_status) - adev->gfx.funcs->query_ras_error_status(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_status) + adev->gfx.ras_funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.funcs->query_ras_error_count) - adev->mmhub.funcs->query_ras_error_count(adev, &err_data); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->query_ras_error_count) + adev->mmhub.ras_funcs->query_ras_error_count(adev, &err_data); - if (adev->mmhub.funcs->query_ras_error_status) - adev->mmhub.funcs->query_ras_error_status(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->query_ras_error_status) + adev->mmhub.ras_funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__PCIE_BIF: - if (adev->nbio.funcs->query_ras_error_count) - adev->nbio.funcs->query_ras_error_count(adev, &err_data); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->query_ras_error_count) + adev->nbio.ras_funcs->query_ras_error_count(adev, &err_data); break; case AMDGPU_RAS_BLOCK__XGMI_WAFL: - amdgpu_xgmi_query_ras_error_count(adev, &err_data); + if (adev->gmc.xgmi.ras_funcs && + adev->gmc.xgmi.ras_funcs->query_ras_error_count) + adev->gmc.xgmi.ras_funcs->query_ras_error_count(adev, &err_data); break; default: break; @@ -848,15 +921,18 @@ int amdgpu_ras_reset_error_status(struct amdgpu_device *adev, switch (block) { case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->reset_ras_error_count) - adev->gfx.funcs->reset_ras_error_count(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->reset_ras_error_count) + adev->gfx.ras_funcs->reset_ras_error_count(adev); - if (adev->gfx.funcs->reset_ras_error_status) - adev->gfx.funcs->reset_ras_error_status(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->reset_ras_error_status) + adev->gfx.ras_funcs->reset_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.funcs->reset_ras_error_count) - adev->mmhub.funcs->reset_ras_error_count(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->reset_ras_error_count) + adev->mmhub.ras_funcs->reset_ras_error_count(adev); break; case AMDGPU_RAS_BLOCK__SDMA: if (adev->sdma.funcs->reset_ras_error_count) @@ -921,12 +997,14 @@ int amdgpu_ras_error_inject(struct amdgpu_device *adev, switch (info->head.block) { case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->ras_error_inject) - ret = adev->gfx.funcs->ras_error_inject(adev, info); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->ras_error_inject) + ret = adev->gfx.ras_funcs->ras_error_inject(adev, info); else ret = -EINVAL; break; case AMDGPU_RAS_BLOCK__UMC: + case AMDGPU_RAS_BLOCK__SDMA: case AMDGPU_RAS_BLOCK__MMHUB: case AMDGPU_RAS_BLOCK__PCIE_BIF: ret = psp_ras_trigger_error(&adev->psp, &block_info); @@ -1508,12 +1586,14 @@ static void amdgpu_ras_error_status_query(struct amdgpu_device *adev, */ switch (info->head.block) { case AMDGPU_RAS_BLOCK__GFX: - if (adev->gfx.funcs->query_ras_error_status) - adev->gfx.funcs->query_ras_error_status(adev); + if (adev->gfx.ras_funcs && + adev->gfx.ras_funcs->query_ras_error_status) + adev->gfx.ras_funcs->query_ras_error_status(adev); break; case AMDGPU_RAS_BLOCK__MMHUB: - if (adev->mmhub.funcs->query_ras_error_status) - adev->mmhub.funcs->query_ras_error_status(adev); + if (adev->mmhub.ras_funcs && + adev->mmhub.ras_funcs->query_ras_error_status) + adev->mmhub.ras_funcs->query_ras_error_status(adev); break; default: break; @@ -1933,15 +2013,13 @@ int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev, return 0; } -static int amdgpu_ras_check_asic_type(struct amdgpu_device *adev) +static bool amdgpu_ras_asic_supported(struct amdgpu_device *adev) { - if (adev->asic_type != CHIP_VEGA10 && - adev->asic_type != CHIP_VEGA20 && - adev->asic_type != CHIP_ARCTURUS && - adev->asic_type != CHIP_SIENNA_CICHLID) - return 1; - else - return 0; + return adev->asic_type == CHIP_VEGA10 || + adev->asic_type == CHIP_VEGA20 || + adev->asic_type == CHIP_ARCTURUS || + adev->asic_type == CHIP_ALDEBARAN || + adev->asic_type == CHIP_SIENNA_CICHLID; } /* @@ -1960,22 +2038,32 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev, *supported = 0; if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || - amdgpu_ras_check_asic_type(adev)) + !amdgpu_ras_asic_supported(adev)) return; - if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { - dev_info(adev->dev, "MEM ECC is active.\n"); - *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); - } else - dev_info(adev->dev, "MEM ECC is not presented.\n"); + if (!adev->gmc.xgmi.connected_to_cpu) { + if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { + dev_info(adev->dev, "MEM ECC is active.\n"); + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "MEM ECC is not presented.\n"); + } - if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { - dev_info(adev->dev, "SRAM ECC is active.\n"); - *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | - 1 << AMDGPU_RAS_BLOCK__DF); - } else - dev_info(adev->dev, "SRAM ECC is not presented.\n"); + if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { + dev_info(adev->dev, "SRAM ECC is active.\n"); + *hw_supported |= ~(1 << AMDGPU_RAS_BLOCK__UMC | + 1 << AMDGPU_RAS_BLOCK__DF); + } else { + dev_info(adev->dev, "SRAM ECC is not presented.\n"); + } + } else { + /* driver only manages a few IP blocks RAS feature + * when GPU is connected cpu through XGMI */ + *hw_supported |= (1 << AMDGPU_RAS_BLOCK__GFX | + 1 << AMDGPU_RAS_BLOCK__SDMA | + 1 << AMDGPU_RAS_BLOCK__MMHUB); + } /* hw_supported needs to be aligned with RAS block mask. */ *hw_supported &= AMDGPU_RAS_BLOCK_MASK; @@ -2024,14 +2112,31 @@ int amdgpu_ras_init(struct amdgpu_device *adev) /* Might need get this flag from vbios. */ con->flags = RAS_DEFAULT_FLAGS; - if (adev->nbio.funcs->init_ras_controller_interrupt) { - r = adev->nbio.funcs->init_ras_controller_interrupt(adev); + /* initialize nbio ras function ahead of any other + * ras functions so hardware fatal error interrupt + * can be enabled as early as possible */ + switch (adev->asic_type) { + case CHIP_VEGA20: + case CHIP_ARCTURUS: + case CHIP_ALDEBARAN: + if (!adev->gmc.xgmi.connected_to_cpu) + adev->nbio.ras_funcs = &nbio_v7_4_ras_funcs; + break; + default: + /* nbio ras is not available */ + break; + } + + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_controller_interrupt) { + r = adev->nbio.ras_funcs->init_ras_controller_interrupt(adev); if (r) goto release_con; } - if (adev->nbio.funcs->init_ras_err_event_athub_interrupt) { - r = adev->nbio.funcs->init_ras_err_event_athub_interrupt(adev); + if (adev->nbio.ras_funcs && + adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt) { + r = adev->nbio.ras_funcs->init_ras_err_event_athub_interrupt(adev); if (r) goto release_con; } @@ -2052,6 +2157,32 @@ release_con: return r; } +static int amdgpu_persistent_edc_harvesting_supported(struct amdgpu_device *adev) +{ + if (adev->gmc.xgmi.connected_to_cpu) + return 1; + return 0; +} + +static int amdgpu_persistent_edc_harvesting(struct amdgpu_device *adev, + struct ras_common_if *ras_block) +{ + struct ras_query_if info = { + .head = *ras_block, + }; + + if (!amdgpu_persistent_edc_harvesting_supported(adev)) + return 0; + + if (amdgpu_ras_query_error_status(adev, &info) != 0) + DRM_WARN("RAS init harvest failure"); + + if (amdgpu_ras_reset_error_status(adev, ras_block->block) != 0) + DRM_WARN("RAS init harvest reset failure"); + + return 0; +} + /* helper function to handle common stuff in ip late init phase */ int amdgpu_ras_late_init(struct amdgpu_device *adev, struct ras_common_if *ras_block, @@ -2081,6 +2212,9 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev, return r; } + /* check for errors on warm reset edc persisant supported ASIC */ + amdgpu_persistent_edc_harvesting(adev, ras_block); + /* in resume phase, no need to create ras fs node */ if (adev->in_suspend || amdgpu_in_reset(adev)) return 0; |