Diffstat (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_device.c')
-rw-r--r--	drivers/gpu/drm/amd/amdgpu/amdgpu_device.c	221
1 file changed, 160 insertions, 61 deletions
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index 694c3726e0f4..cf7fad88c138 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -30,6 +30,7 @@
 #include <linux/module.h>
 #include <linux/console.h>
 #include <linux/slab.h>
+#include <linux/iommu.h>
 
 #include <drm/drm_atomic_helper.h>
 #include <drm/drm_probe_helper.h>
@@ -331,7 +332,7 @@ void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
 }
 
 /**
- * amdgpu_device_vram_access - access vram by vram aperature
+ * amdgpu_device_aper_access - access vram by vram aperture
  *
  * @adev: amdgpu_device pointer
  * @pos: offset of the buffer in vram
@@ -550,11 +551,11 @@ void amdgpu_device_wreg(struct amdgpu_device *adev,
 	trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 }
 
-/*
- * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
+/**
+ * amdgpu_mm_wreg_mmio_rlc - write register either with direct/indirect mmio or with RLC path if in range
  *
  * this function is invoked only the debugfs register access
- * */
+ */
 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 			     uint32_t reg, uint32_t v)
 {
@@ -566,6 +567,8 @@ void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 	    adev->gfx.rlc.funcs->is_rlcg_access_range) {
 		if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 			return adev->gfx.rlc.funcs->sriov_wreg(adev, reg, v, 0, 0);
+	} else if ((reg * 4) >= adev->rmmio_size) {
+		adev->pcie_wreg(adev, reg * 4, v);
 	} else {
 		writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 	}
@@ -1100,7 +1103,7 @@ static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 }
 
 /**
- * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
+ * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
  *
  * @adev: amdgpu_device pointer
  *
@@ -1447,7 +1450,7 @@ static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
 			adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
 		break;
 	default:
-		return -EINVAL;
+		break;
 	}
 
 	return 0;
@@ -2316,6 +2319,10 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 
 		/* need to do gmc hw init early so we can allocate gpu mem */
 		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
+			/* Try to reserve bad pages early */
+			if (amdgpu_sriov_vf(adev))
+				amdgpu_virt_exchange_data(adev);
+
 			r = amdgpu_device_vram_scratch_init(adev);
 			if (r) {
 				DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
@@ -2347,7 +2354,7 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
 	}
 
 	if (amdgpu_sriov_vf(adev))
-		amdgpu_virt_init_data_exchange(adev);
+		amdgpu_virt_exchange_data(adev);
 
 	r = amdgpu_ib_pool_init(adev);
 	if (r) {
@@ -2614,11 +2621,10 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 	if (r)
 		DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
 
-	/* For XGMI + passthrough configuration on arcturus, enable light SBR */
-	if (adev->asic_type == CHIP_ARCTURUS &&
-	    amdgpu_passthrough(adev) &&
-	    adev->gmc.xgmi.num_physical_nodes > 1)
-		smu_set_light_sbr(&adev->smu, true);
+	/* For passthrough configuration on arcturus and aldebaran, enable special SBR handling */
+	if (amdgpu_passthrough(adev) && ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1)||
+			       adev->asic_type == CHIP_ALDEBARAN ))
+		smu_handle_passthrough_sbr(&adev->smu, true);
 
 	if (adev->gmc.xgmi.num_physical_nodes > 1) {
 		mutex_lock(&mgpu_info.mutex);
@@ -2657,6 +2663,36 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
 	return 0;
 }
 
+/**
+ * amdgpu_device_smu_fini_early - smu hw_fini wrapper
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * For ASICs that need to disable SMC first
+ */
+static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
+{
+	int i, r;
+
+	if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
+		return;
+
+	for (i = 0; i < adev->num_ip_blocks; i++) {
+		if (!adev->ip_blocks[i].status.hw)
+			continue;
+		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
+			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
+			/* XXX handle errors */
+			if (r) {
+				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
+					  adev->ip_blocks[i].version->funcs->name, r);
+			}
+			adev->ip_blocks[i].status.hw = false;
+			break;
+		}
+	}
+}
+
 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
 {
 	int i, r;
@@ -2677,21 +2713,8 @@ static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
 	amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
 	amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
 
-	/* need to disable SMC first */
-	for (i = 0; i < adev->num_ip_blocks; i++) {
-		if (!adev->ip_blocks[i].status.hw)
-			continue;
-		if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
-			r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
-			/* XXX handle errors */
-			if (r) {
-				DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
-					  adev->ip_blocks[i].version->funcs->name, r);
-			}
-			adev->ip_blocks[i].status.hw = false;
-			break;
-		}
-	}
+	/* Workaround for ASICs that need to disable SMC first */
+	amdgpu_device_smu_fini_early(adev);
 
 	for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
 		if (!adev->ip_blocks[i].status.hw)
@@ -2733,8 +2756,6 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 	if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
 		amdgpu_virt_release_ras_err_handler_data(adev);
 
-	amdgpu_ras_pre_fini(adev);
-
 	if (adev->gmc.xgmi.num_physical_nodes > 1)
 		amdgpu_xgmi_remove_device(adev);
 
@@ -3373,6 +3394,22 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
 	return ret;
 }
 
+/**
+ * amdgpu_device_check_iommu_direct_map - check if RAM is direct mapped to the GPU
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * RAM is direct mapped to the GPU if the IOMMU is not enabled or is in pass-through mode
+ */
+static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
+{
+	struct iommu_domain *domain;
+
+	domain = iommu_get_domain_for_dev(adev->dev);
+	if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
+		adev->ram_is_direct_mapped = true;
+}
+
 static const struct attribute *amdgpu_dev_attributes[] = {
 	&dev_attr_product_name.attr,
 	&dev_attr_product_number.attr,
@@ -3461,9 +3498,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	mutex_init(&adev->psp.mutex);
 	mutex_init(&adev->notifier_lock);
 
-	r = amdgpu_device_init_apu_flags(adev);
-	if (r)
-		return r;
+	amdgpu_device_init_apu_flags(adev);
 
 	r = amdgpu_device_check_arguments(adev);
 	if (r)
@@ -3547,6 +3582,13 @@ int amdgpu_device_init(struct amdgpu_device *adev,
 	if (r)
 		return r;
 
+	/* Need to get xgmi info early to decide the reset behavior */
+	if (adev->gmc.xgmi.supported) {
+		r = adev->gfxhub.funcs->get_xgmi_info(adev);
+		if (r)
+			return r;
+	}
+
 	/* enable PCIE atomic ops */
 	if (amdgpu_sriov_vf(adev))
 		adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
@@ -3693,8 +3735,6 @@ fence_driver_init:
 	/* Get a log2 for easy divisions. */
 	adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
 
-	amdgpu_fbdev_init(adev);
-
 	r = amdgpu_pm_sysfs_init(adev);
 	if (r) {
 		adev->pm_sysfs_en = false;
@@ -3778,6 +3818,8 @@ fence_driver_init:
 		queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
 				   msecs_to_jiffies(AMDGPU_RESUME_MS));
 
+	amdgpu_device_check_iommu_direct_map(adev);
+
 	return 0;
 
 release_ras_con:
@@ -3791,6 +3833,7 @@ failed:
 
 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
 {
+
 	/* Clear all CPU mappings pointing to this device */
 	unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
 
@@ -3811,7 +3854,7 @@ static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
 }
 
 /**
- * amdgpu_device_fini - tear down the driver
+ * amdgpu_device_fini_hw - tear down the driver
  *
  * @adev: amdgpu_device pointer
  *
@@ -3852,21 +3895,27 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev)
 		amdgpu_ucode_sysfs_fini(adev);
 	sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
 
-	amdgpu_fbdev_fini(adev);
+	/* disable ras feature must be done before hw fini */
+	amdgpu_ras_pre_fini(adev);
 
 	amdgpu_device_ip_fini_early(adev);
 
	amdgpu_irq_fini_hw(adev);
 
-	ttm_device_clear_dma_mappings(&adev->mman.bdev);
+	if (adev->mman.initialized)
+		ttm_device_clear_dma_mappings(&adev->mman.bdev);
 
 	amdgpu_gart_dummy_page_fini(adev);
 
-	amdgpu_device_unmap_mmio(adev);
+	if (drm_dev_is_unplugged(adev_to_drm(adev)))
+		amdgpu_device_unmap_mmio(adev);
+
 }
 
 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 {
+	int idx;
+
 	amdgpu_fence_driver_sw_fini(adev);
 	amdgpu_device_ip_fini(adev);
 	release_firmware(adev->firmware.gpu_info_fw);
@@ -3891,6 +3940,14 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
 	if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
 		vga_client_unregister(adev->pdev);
 
+	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
+
+		iounmap(adev->rmmio);
+		adev->rmmio = NULL;
+		amdgpu_device_doorbell_fini(adev);
+		drm_dev_exit(idx);
+	}
+
 	if (IS_ENABLED(CONFIG_PERF_EVENTS))
 		amdgpu_pmu_fini(adev);
 	if (adev->mman.discovery_bin)
@@ -3911,8 +3968,8 @@ void amdgpu_device_fini_sw(struct amdgpu_device *adev)
  */
 static void amdgpu_device_evict_resources(struct amdgpu_device *adev)
 {
-	/* No need to evict vram on APUs for suspend to ram */
-	if (adev->in_s3 && (adev->flags & AMD_IS_APU))
+	/* No need to evict vram on APUs for suspend to ram or s2idle */
+	if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
 		return;
 
 	if (amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM))
@@ -3948,7 +4005,7 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 	drm_kms_helper_poll_disable(dev);
 
 	if (fbcon)
-		amdgpu_fbdev_set_suspend(adev, 1);
+		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
 
 	cancel_delayed_work_sync(&adev->delayed_init_work);
 
@@ -3959,16 +4016,11 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
 	if (!adev->in_s0ix)
 		amdgpu_amdkfd_suspend(adev, adev->in_runpm);
 
-	/* First evict vram memory */
 	amdgpu_device_evict_resources(adev);
 
 	amdgpu_fence_driver_hw_fini(adev);
 
 	amdgpu_device_ip_suspend_phase2(adev);
-	/* This second call to evict device resources is to evict
-	 * the gart page table using the CPU.
-	 */
-	amdgpu_device_evict_resources(adev);
 
 	return 0;
 }
@@ -4025,7 +4077,7 @@ int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
 	flush_delayed_work(&adev->delayed_init_work);
 
 	if (fbcon)
-		amdgpu_fbdev_set_suspend(adev, 0);
+		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);
 
 	drm_kms_helper_poll_enable(dev);
 
@@ -4294,6 +4346,9 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 				     bool from_hypervisor)
 {
 	int r;
+	struct amdgpu_hive_info *hive = NULL;
+
+	amdgpu_amdkfd_pre_reset(adev);
 
 	amdgpu_amdkfd_pre_reset(adev);
 
@@ -4310,8 +4365,6 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 		goto error;
 
 	amdgpu_virt_init_data_exchange(adev);
-	/* we need recover gart prior to run SMC/CP/SDMA resume */
-	amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
 
 	r = amdgpu_device_fw_loading(adev);
 	if (r)
@@ -4322,9 +4375,19 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 	if (r)
 		goto error;
 
-	amdgpu_irq_gpu_reset_resume_helper(adev);
-	r = amdgpu_ib_ring_tests(adev);
-	amdgpu_amdkfd_post_reset(adev);
+	hive = amdgpu_get_xgmi_hive(adev);
+	/* Update PSP FW topology after reset */
+	if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
+		r = amdgpu_xgmi_update_topology(hive, adev);
+
+	if (hive)
+		amdgpu_put_xgmi_hive(hive);
+
+	if (!r) {
+		amdgpu_irq_gpu_reset_resume_helper(adev);
+		r = amdgpu_ib_ring_tests(adev);
+		amdgpu_amdkfd_post_reset(adev);
+	}
 
 error:
 	if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
@@ -4621,10 +4684,6 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 					amdgpu_inc_vram_lost(tmp_adev);
 				}
 
-				r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
-				if (r)
-					goto out;
-
 				r = amdgpu_device_fw_loading(tmp_adev);
 				if (r)
 					return r;
@@ -4650,7 +4709,7 @@ int amdgpu_do_asic_reset(struct list_head *device_list_handle,
 				if (r)
 					goto out;
 
-				amdgpu_fbdev_set_suspend(tmp_adev, 0);
+				drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
 
 				/*
 				 * The GPU enters bad state once faulty pages
@@ -4749,7 +4808,7 @@ static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgp
 {
 	struct amdgpu_device *tmp_adev = NULL;
 
-	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
 		if (!hive) {
 			dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
 			return -ENODEV;
@@ -4961,7 +5020,8 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * We always reset all schedulers for device and all devices for XGMI
 	 * hive so that should take care of them too.
 	 */
-	hive = amdgpu_get_xgmi_hive(adev);
+	if (!amdgpu_sriov_vf(adev))
+		hive = amdgpu_get_xgmi_hive(adev);
 	if (hive) {
 		if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
 			DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
@@ -5002,7 +5062,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 	 * to put adev in the 1st position.
 	 */
 	INIT_LIST_HEAD(&device_list);
-	if (adev->gmc.xgmi.num_physical_nodes > 1) {
+	if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
 		list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
 			list_add_tail(&tmp_adev->reset_list, &device_list);
 		if (!list_is_first(&adev->reset_list, &device_list))
@@ -5041,7 +5101,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 		 */
 		amdgpu_unregister_gpu_instance(tmp_adev);
 
-		amdgpu_fbdev_set_suspend(tmp_adev, 1);
+		drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
 
 		/* disable ras on ALL IPs */
 		if (!need_emergency_restart &&
@@ -5636,3 +5696,42 @@ void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
 
 	amdgpu_asic_invalidate_hdp(adev, ring);
 }
+
+/**
+ * amdgpu_device_halt() - bring hardware to some kind of halt state
+ *
+ * @adev: amdgpu_device pointer
+ *
+ * Bring hardware to some kind of halt state so that no one can touch it
+ * any more. It helps to maintain the error context when an error occurs.
+ * Compared to a simple hang, the system stays stable at least for SSH
+ * access. Then it should be trivial to inspect the hardware state and
+ * see what's going on. Implemented as follows:
+ *
+ * 1. drm_dev_unplug() makes the device inaccessible to user space (IOCTLs, etc),
+ *    clears all CPU mappings to the device, disallows remappings through page faults
+ * 2. amdgpu_irq_disable_all() disables all interrupts
+ * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
+ * 4. set adev->no_hw_access to avoid potential crashes after step 5
+ * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
+ * 6. pci_disable_device() and pci_wait_for_pending_transaction()
+ *    flush any in-flight DMA operations
+ */
+void amdgpu_device_halt(struct amdgpu_device *adev)
+{
+	struct pci_dev *pdev = adev->pdev;
+	struct drm_device *ddev = adev_to_drm(adev);
+
+	drm_dev_unplug(ddev);
+
+	amdgpu_irq_disable_all(adev);
+
+	amdgpu_fence_driver_hw_fini(adev);
+
+	adev->no_hw_access = true;
+
+	amdgpu_device_unmap_mmio(adev);
+
+	pci_disable_device(pdev);
+	pci_wait_for_pending_transaction(pdev);
+}
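Notes on selected hunks

The hunk at @@ -566,6 +567,8 @@ teaches amdgpu_mm_wreg_mmio_rlc() to reach registers that live beyond the mapped MMIO BAR: any byte offset at or past adev->rmmio_size is routed through the indirect PCIe accessor instead of a direct writel(). A minimal sketch of that dispatch (the helper name is made up here, and the RLC/SR-IOV branch is omitted):

	/* Illustrative only: write a register either through the directly
	 * mapped BAR or through the indirect index/data PCIe accessor,
	 * depending on whether the byte offset fits in the mapping. */
	static void wreg_direct_or_indirect(struct amdgpu_device *adev,
					    uint32_t reg, uint32_t v)
	{
		if ((reg * 4) >= adev->rmmio_size)
			adev->pcie_wreg(adev, reg * 4, v);
		else
			writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
	}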
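The new amdgpu_device_check_iommu_direct_map() relies only on the generic IOMMU API, so the same test applies to any struct device: RAM counts as direct mapped when the device either has no IOMMU domain at all or sits in an identity (pass-through) domain, i.e. DMA addresses equal physical addresses. A generic form of the check (the helper name is hypothetical):

	#include <linux/iommu.h>

	/* True when no IOMMU translation is in effect for this device. */
	static bool dev_ram_is_direct_mapped(struct device *dev)
	{
		struct iommu_domain *domain = iommu_get_domain_for_dev(dev);

		return !domain || domain->type == IOMMU_DOMAIN_IDENTITY;
	}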
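The MMIO teardown moved into amdgpu_device_fini_sw() is guarded by drm_dev_enter()/drm_dev_exit(): drm_dev_enter() returns false once drm_dev_unplug() has run (as it does in amdgpu_device_halt() and on hot unplug), so the iounmap() is skipped when the mapping was already cleared by the unplug path. The pattern, reduced to its core with identifiers taken from the diff:

	int idx;

	/* drm_dev_enter() fails after drm_dev_unplug(), i.e. exactly when
	 * the MMIO mapping has already been torn down. */
	if (drm_dev_enter(adev_to_drm(adev), &idx)) {
		iounmap(adev->rmmio);
		adev->rmmio = NULL;
		drm_dev_exit(idx);
	}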
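amdgpu_device_smu_fini_early() gates the early-SMC-disable workaround on the GC hardware IP version. IP_VERSION() packs major/minor/revision into a single integer so that discovered versions compare numerically; the definition below is an assumption based on amdgpu headers of this era, not text quoted from this diff:

	/* Assumed macro, as in amdgpu.h: IP_VERSION(9, 0, 0) packs to
	 * 0x090000, so any newer GC version compares greater and makes
	 * amdgpu_device_smu_fini_early() return without doing anything. */
	#define IP_VERSION(mj, mn, rv) (((mj) << 16) | ((mn) << 8) | (rv))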