Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 237
1 file changed, 136 insertions, 101 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index d2a5f48c5767..690cf77b950e 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -87,6 +87,8 @@ static const char *amdgpu_asic_name[] = {
 	"LAST",
 };
 
+static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
+
 bool amdgpu_device_is_px(struct drm_device *dev)
 {
 	struct amdgpu_device *adev = dev->dev_private;
@@ -121,6 +123,32 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 	return ret;
 }
 
+/*
+ * MMIO register read with bytes helper functions
+ * @offset:bytes offset from MMIO start
+ *
+*/
+
+uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
+	if (offset < adev->rmmio_size)
+		return (readb(adev->rmmio + offset));
+	BUG();
+}
+
+/*
+ * MMIO register write with bytes helper functions
+ * @offset:bytes offset from MMIO start
+ * @value: the value want to be written to the register
+ *
+*/
+void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
+	if (offset < adev->rmmio_size)
+		writeb(value, adev->rmmio + offset);
+	else
+		BUG();
+}
+
+
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 		    uint32_t acc_flags)
 {
@@ -492,7 +520,7 @@ static int amdgpu_device_wb_init(struct amdgpu_device *adev)
 		memset(&adev->wb.used, 0, sizeof(adev->wb.used));
 
 		/* clear wb memory */
-		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t));
+		memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
 	}
 
 	return 0;
@@ -530,8 +558,9 @@ int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
  */
 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 {
+	wb >>= 3;
 	if (wb < adev->wb.num_wb)
-		__clear_bit(wb >> 3, adev->wb.used);
+		__clear_bit(wb, adev->wb.used);
 }
 
 /**
@@ -829,6 +858,8 @@ static void amdgpu_device_check_arguments(struct amdgpu_device *adev)
 		dev_warn(adev->dev, "lockup_timeout msut be > 0, adjusting to 10000\n");
 		amdgpu_lockup_timeout = 10000;
 	}
+
+	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
 }
 
 /**
@@ -1386,7 +1417,8 @@ static int amdgpu_device_ip_late_set_cg_state(struct amdgpu_device *adev)
 			continue;
 		/* skip CG for VCE/UVD, it's handled specially */
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
-		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE) {
+		    adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
+		    adev->ip_blocks[i].version->funcs->set_clockgating_state) {
 			/* enable clockgating to save power */
 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
 										     AMD_CG_STATE_GATE);
@@ -1435,7 +1467,8 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 	for (i = 0; i < adev->num_ip_blocks; i++) {
 		if (!adev->ip_blocks[i].status.hw)
 			continue;
-		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC &&
+			adev->ip_blocks[i].version->funcs->set_clockgating_state) {
 			/* ungate blocks before hw fini so that we can shutdown the blocks safely */
 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
 										     AMD_CG_STATE_UNGATE);
@@ -1458,11 +1491,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 		if (!adev->ip_blocks[i].status.hw)
 			continue;
-		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
-			amdgpu_free_static_csa(adev);
-			amdgpu_device_wb_fini(adev);
-			amdgpu_device_vram_scratch_fini(adev);
-		}
 
 		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
 			adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE) {
@@ -1492,6 +1520,13 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 		if (!adev->ip_blocks[i].status.sw)
 			continue;
+
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
+			amdgpu_free_static_csa(adev);
+			amdgpu_device_wb_fini(adev);
+			amdgpu_device_vram_scratch_fini(adev);
+		}
+
 		r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
 		/* XXX handle errors */
 		if (r) {
@@ -1542,7 +1577,8 @@ int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
 		if (!adev->ip_blocks[i].status.valid)
 			continue;
 		/* ungate blocks so that suspend can properly shut them down */
-		if (i != AMD_IP_BLOCK_TYPE_SMC) {
+		if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_SMC &&
+			adev->ip_blocks[i].version->funcs->set_clockgating_state) {
 			r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
 									     AMD_CG_STATE_UNGATE);
 			if (r) {
@@ -1588,6 +1624,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
 
 			r = block->version->funcs->hw_init(adev);
 			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
+			if (r)
+				return r;
 		}
 	}
 
@@ -1621,6 +1659,8 @@ static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
 
 			r = block->version->funcs->hw_init(adev);
 			DRM_INFO("RE-INIT: %s %s\n", block->version->funcs->name, r?"failed":"successed");
+			if (r)
+				return r;
 		}
 	}
 
@@ -1871,6 +1911,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (adev->rio_mem == NULL)
 		DRM_INFO("PCI I/O BAR is not found.\n");
 
+	amdgpu_device_get_pcie_info(adev);
+
 	/* early init functions */
 	r = amdgpu_device_ip_early_init(adev);
 	if (r)
@@ -2079,6 +2121,7 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 
 	amdgpu_ib_pool_fini(adev);
 	amdgpu_fence_driver_fini(adev);
+	amdgpu_pm_sysfs_fini(adev);
 	amdgpu_fbdev_fini(adev);
 	r = amdgpu_device_ip_fini(adev);
 	if (adev->firmware.gpu_info_fw) {
@@ -2107,7 +2150,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
 	iounmap(adev->rmmio);
 	adev->rmmio = NULL;
 	amdgpu_device_doorbell_fini(adev);
-	amdgpu_pm_sysfs_fini(adev);
 	amdgpu_debugfs_regs_cleanup(adev);
 }
 
@@ -2467,17 +2509,71 @@ err:
 	return r;
 }
 
+static int amdgpu_device_handle_vram_lost(struct amdgpu_device *adev)
+{
+	struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
+	struct amdgpu_bo *bo, *tmp;
+	struct dma_fence *fence = NULL, *next = NULL;
+	long r = 1;
+	int i = 0;
+	long tmo;
+
+	if (amdgpu_sriov_runtime(adev))
+		tmo = msecs_to_jiffies(amdgpu_lockup_timeout);
+	else
+		tmo = msecs_to_jiffies(100);
+
+	DRM_INFO("recover vram bo from shadow start\n");
+	mutex_lock(&adev->shadow_list_lock);
+	list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
+		next = NULL;
+		amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
+		if (fence) {
+			r = dma_fence_wait_timeout(fence, false, tmo);
+			if (r == 0)
+				pr_err("wait fence %p[%d] timeout\n", fence, i);
+			else if (r < 0)
+				pr_err("wait fence %p[%d] interrupted\n", fence, i);
+			if (r < 1) {
+				dma_fence_put(fence);
+				fence = next;
+				break;
+			}
+			i++;
+		}
+
+		dma_fence_put(fence);
+		fence = next;
+	}
+	mutex_unlock(&adev->shadow_list_lock);
+
+	if (fence) {
+		r = dma_fence_wait_timeout(fence, false, tmo);
+		if (r == 0)
+			pr_err("wait fence %p[%d] timeout\n", fence, i);
+		else if (r < 0)
+			pr_err("wait fence %p[%d] interrupted\n", fence, i);
+
+	}
+	dma_fence_put(fence);
+
+	if (r > 0)
+		DRM_INFO("recover vram bo from shadow done\n");
+	else
+		DRM_ERROR("recover vram bo from shadow failed\n");
+
+	return (r > 0?0:1);
+}
+
 /*
  * amdgpu_device_reset - reset ASIC/GPU for bare-metal or passthrough
  *
  * @adev: amdgpu device pointer
- * @reset_flags: output param tells caller the reset result
  *
  * attempt to do soft-reset or full-reset and reinitialize Asic
  * return 0 means successed otherwise failed
  */
-static int amdgpu_device_reset(struct amdgpu_device *adev,
-			       uint64_t* reset_flags)
+static int amdgpu_device_reset(struct amdgpu_device *adev)
 {
 	bool need_full_reset, vram_lost = 0;
 	int r;
@@ -2492,7 +2588,6 @@ static int amdgpu_device_reset(struct amdgpu_device *adev,
 			DRM_INFO("soft reset failed, will fallback to full reset!\n");
 			need_full_reset = true;
 		}
-
 	}
 
 	if (need_full_reset) {
@@ -2541,13 +2636,8 @@ out:
 		}
 	}
 
-	if (reset_flags) {
-		if (vram_lost)
-			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
-
-		if (need_full_reset)
-			(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
-	}
+	if (!r && ((need_full_reset && !(adev->flags & AMD_IS_APU)) || vram_lost))
+		r = amdgpu_device_handle_vram_lost(adev);
 
 	return r;
 }
@@ -2556,14 +2646,11 @@ out:
  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
  *
  * @adev: amdgpu device pointer
- * @reset_flags: output param tells caller the reset result
 *
 * do VF FLR and reinitialize Asic
 * return 0 means successed otherwise failed
 */
-static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
-				     uint64_t *reset_flags,
-				     bool from_hypervisor)
+static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, bool from_hypervisor)
 {
 	int r;
 
@@ -2584,28 +2671,20 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 
 	/* now we are okay to resume SMC/CP/SDMA */
 	r = amdgpu_device_ip_reinit_late_sriov(adev);
+	amdgpu_virt_release_full_gpu(adev, true);
 	if (r)
 		goto error;
 
 	amdgpu_irq_gpu_reset_resume_helper(adev);
 	r = amdgpu_ib_ring_tests(adev);
-	if (r)
-		dev_err(adev->dev, "[GPU_RESET] ib ring test failed (%d).\n", r);
 
-error:
-	/* release full control of GPU after ib test */
-	amdgpu_virt_release_full_gpu(adev, true);
-
-	if (reset_flags) {
-		if (adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
-			(*reset_flags) |= AMDGPU_RESET_INFO_VRAM_LOST;
-			atomic_inc(&adev->vram_lost_counter);
-		}
-
-		/* VF FLR or hotlink reset is always full-reset */
-		(*reset_flags) |= AMDGPU_RESET_INFO_FULLRESET;
+	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
+		atomic_inc(&adev->vram_lost_counter);
+		r = amdgpu_device_handle_vram_lost(adev);
 	}
 
+error:
+
 	return r;
 }
 
@@ -2623,7 +2702,6 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 			      struct amdgpu_job *job, bool force)
 {
 	struct drm_atomic_state *state = NULL;
-	uint64_t reset_flags = 0;
 	int i, r, resched;
 
 	if (!force && !amdgpu_device_ip_check_soft_reset(adev)) {
@@ -2645,22 +2723,23 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
 	/* block TTM */
 	resched = ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
+
 	/* store modesetting */
 	if (amdgpu_device_has_dc_support(adev))
 		state = drm_atomic_helper_suspend(adev->ddev);
 
-	/* block scheduler */
+	/* block all schedulers and reset given job's ring */
 	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
 		struct amdgpu_ring *ring = adev->rings[i];
 
 		if (!ring || !ring->sched.thread)
 			continue;
 
-		/* only focus on the ring hit timeout if &job not NULL */
+		kthread_park(ring->sched.thread);
+
 		if (job && job->ring->idx != i)
 			continue;
 
-		kthread_park(ring->sched.thread);
 		drm_sched_hw_job_reset(&ring->sched, &job->base);
 
 		/* after all hw jobs are reset, hw fence is meaningless, so force_completion */
@@ -2668,68 +2747,24 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	}
 
 	if (amdgpu_sriov_vf(adev))
-		r = amdgpu_device_reset_sriov(adev, &reset_flags, job ? false : true);
+		r = amdgpu_device_reset_sriov(adev, job ? false : true);
 	else
-		r = amdgpu_device_reset(adev, &reset_flags);
-
-	if (!r) {
-		if (((reset_flags & AMDGPU_RESET_INFO_FULLRESET) && !(adev->flags & AMD_IS_APU)) ||
-			(reset_flags & AMDGPU_RESET_INFO_VRAM_LOST)) {
-			struct amdgpu_ring *ring = adev->mman.buffer_funcs_ring;
-			struct amdgpu_bo *bo, *tmp;
-			struct dma_fence *fence = NULL, *next = NULL;
-
-			DRM_INFO("recover vram bo from shadow\n");
-			mutex_lock(&adev->shadow_list_lock);
-			list_for_each_entry_safe(bo, tmp, &adev->shadow_list, shadow_list) {
-				next = NULL;
-				amdgpu_device_recover_vram_from_shadow(adev, ring, bo, &next);
-				if (fence) {
-					r = dma_fence_wait(fence, false);
-					if (r) {
-						WARN(r, "recovery from shadow isn't completed\n");
-						break;
-					}
-				}
-
-				dma_fence_put(fence);
-				fence = next;
-			}
-			mutex_unlock(&adev->shadow_list_lock);
-			if (fence) {
-				r = dma_fence_wait(fence, false);
-				if (r)
-					WARN(r, "recovery from shadow isn't completed\n");
-			}
-			dma_fence_put(fence);
-		}
-
-		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-			struct amdgpu_ring *ring = adev->rings[i];
+		r = amdgpu_device_reset(adev);
 
-			if (!ring || !ring->sched.thread)
-				continue;
+	for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
+		struct amdgpu_ring *ring = adev->rings[i];
 
-			/* only focus on the ring hit timeout if &job not NULL */
-			if (job && job->ring->idx != i)
-				continue;
+		if (!ring || !ring->sched.thread)
+			continue;
 
+		/* only need recovery sched of the given job's ring
+		 * or all rings (in the case @job is NULL)
+		 * after above amdgpu_reset accomplished
+		 */
+		if ((!job || job->ring->idx == i) && !r)
 			drm_sched_job_recovery(&ring->sched);
-			kthread_unpark(ring->sched.thread);
-		}
-	} else {
-		for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
-			struct amdgpu_ring *ring = adev->rings[i];
 
-			if (!ring || !ring->sched.thread)
-				continue;
-
-			/* only focus on the ring hit timeout if &job not NULL */
-			if (job && job->ring->idx != i)
-				continue;
-
-			kthread_unpark(adev->rings[i]->sched.thread);
-		}
+		kthread_unpark(ring->sched.thread);
 	}
 
 	if (amdgpu_device_has_dc_support(adev)) {
@@ -2755,7 +2790,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	return r;
 }
 
-void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
+static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
 {
 	u32 mask;
 	int ret;
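The patch adds byte-granular MMIO accessors (amdgpu_mm_rreg8/amdgpu_mm_wreg8) alongside the existing dword helpers. Purely as an illustration and not part of the patch, a minimal sketch of how driver code could use them for a read-modify-write of a single byte; the offset and flag bit below are hypothetical placeholders, not real registers:

static void example_toggle_byte_flag(struct amdgpu_device *adev)
{
	const uint32_t hyp_offset = 0x100;	/* hypothetical byte offset from MMIO start */
	uint8_t val;

	/* out-of-range offsets hit BUG() inside the new helpers */
	val = amdgpu_mm_rreg8(adev, hyp_offset);
	val |= 0x01;				/* set an example flag bit */
	amdgpu_mm_wreg8(adev, hyp_offset, val);
}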