diff options
author | Maxime Ripard <maxime@cerno.tech> | 2022-10-18 15:00:03 +0200 |
---|---|---|
committer | Maxime Ripard <maxime@cerno.tech> | 2022-10-18 15:00:03 +0200 |
commit | a140a6a2d5ec0329ad05cd3532a91ad0ce58dceb (patch) | |
tree | b2d44a1da423c53bd6c3ab3facd45ff5f2087ffd /drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | |
parent | 28743e25fa1c867675bd8ff976eb92d4251f13a1 (diff) | |
parent | 9abf2313adc1ca1b6180c508c25f22f9395cc780 (diff) |
Merge drm/drm-next into drm-misc-next
Let's kick-off this release cycle.
Signed-off-by: Maxime Ripard <maxime@cerno.tech>
Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 139 |
1 files changed, 121 insertions, 18 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 62b26f0e37b0..ab8f970b2849 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -2365,8 +2365,16 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) } adev->ip_blocks[i].status.sw = true; - /* need to do gmc hw init early so we can allocate gpu mem */ - if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { + if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) { + /* need to do common hw init early so everything is set up for gmc */ + r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev); + if (r) { + DRM_ERROR("hw_init %d failed %d\n", i, r); + goto init_failed; + } + adev->ip_blocks[i].status.hw = true; + } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) { + /* need to do gmc hw init early so we can allocate gpu mem */ /* Try to reserve bad pages early */ if (amdgpu_sriov_vf(adev)) amdgpu_virt_exchange_data(adev); @@ -2451,19 +2459,21 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev) */ if (adev->gmc.xgmi.num_physical_nodes > 1) { if (amdgpu_xgmi_add_device(adev) == 0) { - struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); + if (!amdgpu_sriov_vf(adev)) { + struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev); - if (!hive->reset_domain || - !amdgpu_reset_get_reset_domain(hive->reset_domain)) { - r = -ENOENT; + if (!hive->reset_domain || + !amdgpu_reset_get_reset_domain(hive->reset_domain)) { + r = -ENOENT; + amdgpu_put_xgmi_hive(hive); + goto init_failed; + } + + /* Drop the early temporary reset domain we created for device */ + amdgpu_reset_put_reset_domain(adev->reset_domain); + adev->reset_domain = hive->reset_domain; amdgpu_put_xgmi_hive(hive); - goto init_failed; } - - /* Drop the early temporary reset domain we created for device */ - amdgpu_reset_put_reset_domain(adev->reset_domain); - adev->reset_domain = hive->reset_domain; - amdgpu_put_xgmi_hive(hive); } } @@ -3052,8 +3062,8 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev) int i, r; static enum amd_ip_block_type ip_order[] = { - AMD_IP_BLOCK_TYPE_GMC, AMD_IP_BLOCK_TYPE_COMMON, + AMD_IP_BLOCK_TYPE_GMC, AMD_IP_BLOCK_TYPE_PSP, AMD_IP_BLOCK_TYPE_IH, }; @@ -3144,7 +3154,8 @@ static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev) continue; if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON || adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC || - adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) { + adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH || + (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) { r = adev->ip_blocks[i].version->funcs->resume(adev); if (r) { @@ -3501,6 +3512,7 @@ int amdgpu_device_init(struct amdgpu_device *adev, adev->gmc.gart_size = 512 * 1024 * 1024; adev->accel_working = false; adev->num_rings = 0; + RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub()); adev->mman.buffer_funcs = NULL; adev->mman.buffer_funcs_ring = NULL; adev->vm_manager.vm_pte_funcs = NULL; @@ -3982,6 +3994,7 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev) release_firmware(adev->firmware.gpu_info_fw); adev->firmware.gpu_info_fw = NULL; adev->accel_working = false; + dma_fence_put(rcu_dereference_protected(adev->gang_submit, true)); amdgpu_reset_fini(adev); @@ -4057,12 +4070,20 @@ static void amdgpu_device_evict_resources(struct amdgpu_device *adev) int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) { struct amdgpu_device *adev = drm_to_adev(dev); + int r = 0; if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; adev->in_suspend = true; + if (amdgpu_sriov_vf(adev)) { + amdgpu_virt_fini_data_exchange(adev); + r = amdgpu_virt_request_full_gpu(adev, false); + if (r) + return r; + } + if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3)) DRM_WARN("smart shift update failed\n"); @@ -4086,6 +4107,9 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon) amdgpu_device_ip_suspend_phase2(adev); + if (amdgpu_sriov_vf(adev)) + amdgpu_virt_release_full_gpu(adev, false); + return 0; } @@ -4104,6 +4128,12 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) struct amdgpu_device *adev = drm_to_adev(dev); int r = 0; + if (amdgpu_sriov_vf(adev)) { + r = amdgpu_virt_request_full_gpu(adev, true); + if (r) + return r; + } + if (dev->switch_power_state == DRM_SWITCH_POWER_OFF) return 0; @@ -4118,6 +4148,13 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon) } r = amdgpu_device_ip_resume(adev); + + /* no matter what r is, always need to properly release full GPU */ + if (amdgpu_sriov_vf(adev)) { + amdgpu_virt_init_data_exchange(adev); + amdgpu_virt_release_full_gpu(adev, true); + } + if (r) { dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r); return r; @@ -4739,6 +4776,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, struct amdgpu_device *tmp_adev = NULL; bool need_full_reset, skip_hw_reset, vram_lost = false; int r = 0; + bool gpu_reset_for_dev_remove = 0; /* Try reset handler method first */ tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, @@ -4758,6 +4796,10 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags); + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); + /* * ASIC reset has to be done on all XGMI hive nodes ASAP * to allow proper links negotiation in FW (within 1 sec) @@ -4802,6 +4844,18 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle, amdgpu_ras_intr_cleared(); } + /* Since the mode1 reset affects base ip blocks, the + * phase1 ip blocks need to be resumed. Otherwise there + * will be a BIOS signature error and the psp bootloader + * can't load kdb on the next amdgpu install. + */ + if (gpu_reset_for_dev_remove) { + list_for_each_entry(tmp_adev, device_list_handle, reset_list) + amdgpu_device_ip_resume_phase1(tmp_adev); + + goto end; + } + list_for_each_entry(tmp_adev, device_list_handle, reset_list) { if (need_full_reset) { /* post card */ @@ -5124,6 +5178,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, bool need_emergency_restart = false; bool audio_suspended = false; int tmp_vram_lost_counter; + bool gpu_reset_for_dev_remove = false; + + gpu_reset_for_dev_remove = + test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) && + test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags); /* * Special case: RAS triggered and full reset isn't supported @@ -5159,8 +5218,11 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, */ INIT_LIST_HEAD(&device_list); if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) { - list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) + list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) { list_add_tail(&tmp_adev->reset_list, &device_list); + if (gpu_reset_for_dev_remove && adev->shutdown) + tmp_adev->shutdown = true; + } if (!list_is_first(&adev->reset_list, &device_list)) list_rotate_to_front(&adev->reset_list, &device_list); device_list_handle = &device_list; @@ -5243,6 +5305,10 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, retry: /* Rest of adevs pre asic reset from XGMI hive. */ list_for_each_entry(tmp_adev, device_list_handle, reset_list) { + if (gpu_reset_for_dev_remove) { + /* Workaroud for ASICs need to disable SMC first */ + amdgpu_device_smu_fini_early(tmp_adev); + } r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context); /*TODO Should we stop ?*/ if (r) { @@ -5276,6 +5342,9 @@ retry: /* Rest of adevs pre asic reset from XGMI hive. */ adev->asic_reset_res = 0; goto retry; } + + if (!r && gpu_reset_for_dev_remove) + goto recover_end; } skip_hw_reset: @@ -5349,6 +5418,7 @@ skip_sched_resume: amdgpu_device_unset_mp1_state(tmp_adev); } +recover_end: tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device, reset_list); amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain); @@ -5531,9 +5601,9 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev, ~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1); resource_size_t aper_limit = adev->gmc.aper_base + adev->gmc.aper_size - 1; - bool p2p_access = !adev->gmc.xgmi.connected_to_cpu && - !(pci_p2pdma_distance_many(adev->pdev, - &peer_adev->dev, 1, true) < 0); + bool p2p_access = + !adev->gmc.xgmi.connected_to_cpu && + !(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0); return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size && adev->gmc.real_vram_size == adev->gmc.visible_vram_size && @@ -5917,3 +5987,36 @@ void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev, (void)RREG32(data); spin_unlock_irqrestore(&adev->pcie_idx_lock, flags); } + +/** + * amdgpu_device_switch_gang - switch to a new gang + * @adev: amdgpu_device pointer + * @gang: the gang to switch to + * + * Try to switch to a new gang. + * Returns: NULL if we switched to the new gang or a reference to the current + * gang leader. + */ +struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, + struct dma_fence *gang) +{ + struct dma_fence *old = NULL; + + do { + dma_fence_put(old); + rcu_read_lock(); + old = dma_fence_get_rcu_safe(&adev->gang_submit); + rcu_read_unlock(); + + if (old == gang) + break; + + if (!dma_fence_is_signaled(old)) + return old; + + } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit, + old, gang) != old); + + dma_fence_put(old); + return NULL; +} |