Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r-- | drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 144
1 file changed, 92 insertions(+), 52 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index bcacf2e35eba..f4628412dac4 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -1916,6 +1916,8 @@ static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
  */
 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 {
+	int i;
+
 	if (amdgpu_sched_jobs < 4) {
 		dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
 			 amdgpu_sched_jobs);
@@ -1970,6 +1972,9 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 
 	adev->firmware.load_type = amdgpu_ucode_get_load_type(adev,
 						   amdgpu_fw_load_type);
 
+	for (i = 0; i < MAX_XCP; i++)
+		adev->enforce_isolation[i] = !!enforce_isolation;
+
 	return 0;
 }
 
@@ -2471,6 +2476,7 @@ out:
  */
 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 {
+	struct amdgpu_ip_block *ip_block;
 	struct pci_dev *parent;
 	int i, r;
 	bool total;
@@ -2608,7 +2614,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 	if (!total)
 		return -ENODEV;
 
-	amdgpu_amdkfd_device_probe(adev);
+	ip_block = amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GFX);
+	if (ip_block->status.valid != false)
+		amdgpu_amdkfd_device_probe(adev);
+
 	adev->cg_flags &= amdgpu_cg_mask;
 	adev->pg_flags &= amdgpu_pg_mask;
 
@@ -3948,6 +3957,27 @@ static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
 		adev->ram_is_direct_mapped = true;
 }
 
+#if defined(CONFIG_HSA_AMD_P2P)
+/**
+ * amdgpu_device_check_iommu_remap - Check if DMA remapping is enabled.
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * return if IOMMU remapping bar address
+ */
+static bool amdgpu_device_check_iommu_remap(struct amdgpu_device *adev)
+{
+	struct iommu_domain *domain;
+
+	domain = iommu_get_domain_for_dev(adev->dev);
+	if (domain && (domain->type == IOMMU_DOMAIN_DMA ||
+		domain->type == IOMMU_DOMAIN_DMA_FQ))
+		return true;
+
+	return false;
+}
+#endif
+
 static const struct attribute *amdgpu_dev_attributes[] = {
 	&dev_attr_pcie_replay_count.attr,
 	NULL
@@ -4055,6 +4085,10 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->notifier_lock);
 	mutex_init(&adev->pm.stable_pstate_ctx_lock);
 	mutex_init(&adev->benchmark_mutex);
+	mutex_init(&adev->gfx.reset_sem_mutex);
+	/* Initialize the mutex for cleaner shader isolation between GFX and compute processes */
+	mutex_init(&adev->enforce_isolation_mutex);
+	mutex_init(&adev->gfx.kfd_sch_mutex);
 
 	amdgpu_device_init_apu_flags(adev);
 
@@ -4086,6 +4120,21 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 			  amdgpu_device_delayed_init_work_handler);
 	INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
 			  amdgpu_device_delay_enable_gfx_off);
+	/*
+	 * Initialize the enforce_isolation work structures for each XCP
+	 * partition. This work handler is responsible for enforcing shader
+	 * isolation on AMD GPUs. It counts the number of emitted fences for
+	 * each GFX and compute ring. If there are any fences, it schedules
+	 * the `enforce_isolation_work` to be run after a delay. If there are
+	 * no fences, it signals the Kernel Fusion Driver (KFD) to resume the
+	 * runqueue.
+	 */
+	for (i = 0; i < MAX_XCP; i++) {
+		INIT_DELAYED_WORK(&adev->gfx.enforce_isolation[i].work,
+				  amdgpu_gfx_enforce_isolation_handler);
+		adev->gfx.enforce_isolation[i].adev = adev;
+		adev->gfx.enforce_isolation[i].xcp_id = i;
+	}
 
 	INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
 
@@ -4482,6 +4531,9 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 {
 	dev_info(adev->dev, "amdgpu: finishing device.\n");
 	flush_delayed_work(&adev->delayed_init_work);
+
+	if (adev->mman.initialized)
+		drain_workqueue(adev->mman.bdev.wq);
 	adev->shutdown = true;
 
 	/* make sure IB test finished before entering exclusive mode
@@ -4502,9 +4554,6 @@
 	}
 	amdgpu_fence_driver_hw_fini(adev);
 
-	if (adev->mman.initialized)
-		drain_workqueue(adev->mman.bdev.wq);
-
 	if (adev->pm.sysfs_initialized)
 		amdgpu_pm_sysfs_fini(adev);
 	if (adev->ucode_sysfs_en)
@@ -5278,16 +5327,15 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 {
 	int i, r = 0;
 	struct amdgpu_job *job = NULL;
+	struct amdgpu_device *tmp_adev = reset_context->reset_req_dev;
 	bool need_full_reset =
 		test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
 
 	if (reset_context->reset_req_dev == adev)
 		job = reset_context->job;
 
-	if (amdgpu_sriov_vf(adev)) {
-		/* stop the data exchange thread */
-		amdgpu_virt_fini_data_exchange(adev);
-	}
+	if (amdgpu_sriov_vf(adev))
+		amdgpu_virt_pre_reset(adev);
 
 	amdgpu_fence_driver_isr_toggle(adev, true);
 
@@ -5336,6 +5384,16 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 		}
 	}
 
+	if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
+		dev_info(tmp_adev->dev, "Dumping IP State\n");
+		/* Trigger ip dump before we reset the asic */
+		for (i = 0; i < tmp_adev->num_ip_blocks; i++)
+			if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
+				tmp_adev->ip_blocks[i].version->funcs
+					->dump_ip_state((void *)tmp_adev);
+		dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
+	}
+
 	if (need_full_reset)
 		r = amdgpu_device_ip_suspend(adev);
 	if (need_full_reset)
@@ -5348,47 +5406,17 @@ int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
 	return r;
 }
 
-static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
-{
-	int i;
-
-	lockdep_assert_held(&adev->reset_domain->sem);
-
-	for (i = 0; i < adev->reset_info.num_regs; i++) {
-		adev->reset_info.reset_dump_reg_value[i] =
-			RREG32(adev->reset_info.reset_dump_reg_list[i]);
-
-		trace_amdgpu_reset_reg_dumps(adev->reset_info.reset_dump_reg_list[i],
-					     adev->reset_info.reset_dump_reg_value[i]);
-	}
-
-	return 0;
-}
-
 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 			 struct amdgpu_reset_context *reset_context)
 {
 	struct amdgpu_device *tmp_adev = NULL;
 	bool need_full_reset, skip_hw_reset, vram_lost = false;
 	int r = 0;
-	uint32_t i;
 
 	/* Try reset handler method first */
 	tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
 				    reset_list);
 
-	if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags)) {
-		amdgpu_reset_reg_dumps(tmp_adev);
-
-		dev_info(tmp_adev->dev, "Dumping IP State\n");
-		/* Trigger ip dump before we reset the asic */
-		for (i = 0; i < tmp_adev->num_ip_blocks; i++)
-			if (tmp_adev->ip_blocks[i].version->funcs->dump_ip_state)
-				tmp_adev->ip_blocks[i].version->funcs
-					->dump_ip_state((void *)tmp_adev);
-		dev_info(tmp_adev->dev, "Dumping IP State Completed\n");
-	}
-
 	reset_context->reset_device_list = device_list_handle;
 	r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
 	/* If reset handler not implemented, continue; otherwise return */
@@ -5461,7 +5489,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
 
 				if (!test_bit(AMDGPU_SKIP_COREDUMP, &reset_context->flags))
-					amdgpu_coredump(tmp_adev, vram_lost, reset_context);
+					amdgpu_coredump(tmp_adev, false, vram_lost, reset_context->job);
 
 				if (vram_lost) {
 					DRM_INFO("VRAM is lost due to GPU reset!\n");
@@ -5513,7 +5541,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 					 * bad_page_threshold value to fix this once
 					 * probing driver again.
 					 */
-					if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
+					if (!amdgpu_ras_is_rma(tmp_adev)) {
 						/* must succeed. */
 						amdgpu_ras_resume(tmp_adev);
 					} else {
@@ -5879,7 +5907,7 @@ skip_hw_reset:
 			if (!amdgpu_ring_sched_ready(ring))
 				continue;
 
-			drm_sched_start(&ring->sched, true);
+			drm_sched_start(&ring->sched);
 		}
 
 		if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled)
@@ -5891,8 +5919,14 @@ skip_hw_reset:
 		tmp_adev->asic_reset_res = 0;
 
 		if (r) {
-			/* bad news, how to tell it to userspace ? */
-			dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
+			/* bad news, how to tell it to userspace ?
+			 * for ras error, we should report GPU bad status instead of
+			 * reset failure
+			 */
+			if (reset_context->src != AMDGPU_RESET_SRC_RAS ||
+			    !amdgpu_ras_eeprom_check_err_threshold(tmp_adev))
+				dev_info(tmp_adev->dev, "GPU reset(%d) failed\n",
+					 atomic_read(&tmp_adev->gpu_reset_counter));
 			amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
 		} else {
 			dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
@@ -6138,18 +6172,24 @@ bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
 				      struct amdgpu_device *peer_adev)
 {
 #ifdef CONFIG_HSA_AMD_P2P
-	uint64_t address_mask = peer_adev->dev->dma_mask ?
-		~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
-	resource_size_t aper_limit =
-		adev->gmc.aper_base + adev->gmc.aper_size - 1;
 	bool p2p_access =
 		!adev->gmc.xgmi.connected_to_cpu &&
 		!(pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) < 0);
 
-	return pcie_p2p && p2p_access && (adev->gmc.visible_vram_size &&
-		adev->gmc.real_vram_size == adev->gmc.visible_vram_size &&
-		!(adev->gmc.aper_base & address_mask ||
-		  aper_limit & address_mask));
+	bool is_large_bar = adev->gmc.visible_vram_size &&
+		adev->gmc.real_vram_size == adev->gmc.visible_vram_size;
+	bool p2p_addressable = amdgpu_device_check_iommu_remap(peer_adev);
+
+	if (!p2p_addressable) {
+		uint64_t address_mask = peer_adev->dev->dma_mask ?
+			~*peer_adev->dev->dma_mask : ~((1ULL << 32) - 1);
+		resource_size_t aper_limit =
+			adev->gmc.aper_base + adev->gmc.aper_size - 1;
+
+		p2p_addressable = !(adev->gmc.aper_base & address_mask ||
+				    aper_limit & address_mask);
+	}
+	return is_large_bar && p2p_access && p2p_addressable;
 #else
 	return false;
 #endif
@@ -6374,7 +6414,7 @@ void amdgpu_pci_resume(struct pci_dev *pdev)
 		if (!amdgpu_ring_sched_ready(ring))
 			continue;
 
-		drm_sched_start(&ring->sched, true);
+		drm_sched_start(&ring->sched);
 	}
 
 	amdgpu_device_unset_mp1_state(adev);