From 9d7a8bdb909e9b34bced6f56d4f2d320b0021697 Mon Sep 17 00:00:00 2001 From: "Dr. David Alan Gilbert" Date: Mon, 23 Sep 2024 02:24:45 +0100 Subject: drm/amdgpu: Remove unused amdgpu_gfx_bit_to_me_queue amdgpu_gfx_bit_to_me_queue has been unused since it was added in commit 7470bfcf2014 ("drm/amdgpu: add helper function for gfx queue/bitmap transition") Remove it. Signed-off-by: Dr. David Alan Gilbert Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 5644e10a86a9..f710178a21bc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -540,8 +540,6 @@ bool amdgpu_gfx_is_high_priority_graphics_queue(struct amdgpu_device *adev, struct amdgpu_ring *ring); int amdgpu_gfx_me_queue_to_bit(struct amdgpu_device *adev, int me, int pipe, int queue); -void amdgpu_gfx_bit_to_me_queue(struct amdgpu_device *adev, int bit, - int *me, int *pipe, int *queue); bool amdgpu_gfx_is_me_queue_enabled(struct amdgpu_device *adev, int me, int pipe, int queue); void amdgpu_gfx_off_ctrl(struct amdgpu_device *adev, bool enable); -- cgit From efe6a8774375ddcbdd46fb920be55cc2d0120836 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 14 Oct 2024 11:58:34 -0400 Subject: drm/amdgpu: fix fairness in enforce isolation handling Make sure KFD gets a turn when serializing access to the GC IP. Currently non-KFD jobs can starve KFD if they submit often enough. This patch prevents that by stalling non-KFD if its time period has elapsed. v2: fix units v3: check enablement properly Acked-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 2 +- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 53 +++++++++++++++++++++++++++++++-- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 ++ 3 files changed, 54 insertions(+), 3 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 607998d8b1d5..7645e498faa4 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -118,7 +118,7 @@ #define MAX_GPU_INSTANCE 64 -#define GFX_SLICE_PERIOD msecs_to_jiffies(250) +#define GFX_SLICE_PERIOD_MS 250 struct amdgpu_gpu_instance { struct amdgpu_device *adev; diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index e96984c53e72..b8cc4b146bdc 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1752,7 +1752,7 @@ static void amdgpu_gfx_kfd_sch_ctrl(struct amdgpu_device *adev, u32 idx, if (adev->gfx.kfd_sch_req_count[idx] == 0 && adev->gfx.kfd_sch_inactive[idx]) { schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, - GFX_SLICE_PERIOD); + msecs_to_jiffies(adev->gfx.enforce_isolation_time[idx])); } } else { if (adev->gfx.kfd_sch_req_count[idx] == 0) { @@ -1807,8 +1807,9 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) fences += amdgpu_fence_count_emitted(&adev->gfx.compute_ring[i]); } if (fences) { + /* we've already had our timeslice, so let's wrap this up */ schedule_delayed_work(&adev->gfx.enforce_isolation[idx].work, - GFX_SLICE_PERIOD); + msecs_to_jiffies(1)); } else { /* Tell KFD to resume the runqueue */ if (adev->kfd.init_complete) { @@ -1821,6 +1822,51 @@ void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work) mutex_unlock(&adev->enforce_isolation_mutex); } +static void +amdgpu_gfx_enforce_isolation_wait_for_kfd(struct amdgpu_device *adev, + u32 idx) +{ + unsigned long cjiffies; + bool wait = false; + + mutex_lock(&adev->enforce_isolation_mutex); + if (adev->enforce_isolation[idx]) { + /* set the initial values if nothing is set */ + if (!adev->gfx.enforce_isolation_jiffies[idx]) { + adev->gfx.enforce_isolation_jiffies[idx] = jiffies; + adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS; + } + /* Make sure KFD gets a chance to run */ + if (amdgpu_amdkfd_compute_active(adev, idx)) { + cjiffies = jiffies; + if (time_after(cjiffies, adev->gfx.enforce_isolation_jiffies[idx])) { + cjiffies -= adev->gfx.enforce_isolation_jiffies[idx]; + if ((jiffies_to_msecs(cjiffies) >= GFX_SLICE_PERIOD_MS)) { + /* if our time is up, let KGD work drain before scheduling more */ + wait = true; + /* reset the timer period */ + adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS; + } else { + /* set the timer period to what's left in our time slice */ + adev->gfx.enforce_isolation_time[idx] = + GFX_SLICE_PERIOD_MS - jiffies_to_msecs(cjiffies); + } + } else { + /* if jiffies wrap around we will just wait a little longer */ + adev->gfx.enforce_isolation_jiffies[idx] = jiffies; + } + } else { + /* if there is no KFD work, then set the full slice period */ + adev->gfx.enforce_isolation_jiffies[idx] = jiffies; + adev->gfx.enforce_isolation_time[idx] = GFX_SLICE_PERIOD_MS; + } + } + mutex_unlock(&adev->enforce_isolation_mutex); + + if (wait) + msleep(GFX_SLICE_PERIOD_MS); +} + void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) { struct amdgpu_device *adev = ring->adev; @@ -1837,6 +1883,9 @@ void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring) if (idx >= MAX_XCP) return; + /* Don't submit more work until KFD has had some time */ + amdgpu_gfx_enforce_isolation_wait_for_kfd(adev, idx); + mutex_lock(&adev->enforce_isolation_mutex); if (adev->enforce_isolation[idx]) { if (adev->kfd.init_complete) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index f710178a21bc..af9dbd760fee 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -472,6 +472,8 @@ struct amdgpu_gfx { struct mutex kfd_sch_mutex; u64 kfd_sch_req_count[MAX_XCP]; bool kfd_sch_inactive[MAX_XCP]; + unsigned long enforce_isolation_jiffies[MAX_XCP]; + unsigned long enforce_isolation_time[MAX_XCP]; }; struct amdgpu_gfx_ras_reg_entry { -- cgit From c5c63d9cb5d3bbb2fc5973757616b17629795829 Mon Sep 17 00:00:00 2001 From: Jesse Zhang Date: Tue, 29 Oct 2024 10:11:05 +0800 Subject: drm/amdgpu: add amdgpu_gfx_sched_mask and amdgpu_compute_sched_mask debugfs compute/gfx may have multiple rings on some hardware. In some cases, userspace wants to run jobs on a specific ring for validation purposes. This debugfs entry helps to disable or enable submitting jobs to a specific ring. This entry is populated only if there are at least two or more cores in the gfx/compute ip. Signed-off-by: Jesse Zhang Suggested-by: Alex Deucher Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c | 2 + drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 141 ++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 + 3 files changed, 145 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c index 37d8657f0776..6e3f657cab9c 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_debugfs.c @@ -2096,6 +2096,8 @@ int amdgpu_debugfs_init(struct amdgpu_device *adev) amdgpu_debugfs_umsch_fwlog_init(adev, &adev->umsch_mm); amdgpu_debugfs_jpeg_sched_mask_init(adev); + amdgpu_debugfs_gfx_sched_mask_init(adev); + amdgpu_debugfs_compute_sched_mask_init(adev); amdgpu_ras_debugfs_create_all(adev); amdgpu_rap_debugfs_init(adev); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index b8cc4b146bdc..63f5fa736f32 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1917,3 +1917,144 @@ void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring) } mutex_unlock(&adev->enforce_isolation_mutex); } + +/* + * debugfs for to enable/disable gfx job submission to specific core. + */ +#if defined(CONFIG_DEBUG_FS) +static int amdgpu_debugfs_gfx_sched_mask_set(void *data, u64 val) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)data; + u32 i; + u64 mask = 0; + struct amdgpu_ring *ring; + + if (!adev) + return -ENODEV; + + mask = (1 << adev->gfx.num_gfx_rings) - 1; + if ((val & mask) == 0) + return -EINVAL; + + for (i = 0; i < adev->gfx.num_gfx_rings; ++i) { + ring = &adev->gfx.gfx_ring[i]; + if (val & (1 << i)) + ring->sched.ready = true; + else + ring->sched.ready = false; + } + /* publish sched.ready flag update effective immediately across smp */ + smp_rmb(); + return 0; +} + +static int amdgpu_debugfs_gfx_sched_mask_get(void *data, u64 *val) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)data; + u32 i; + u64 mask = 0; + struct amdgpu_ring *ring; + + if (!adev) + return -ENODEV; + for (i = 0; i < adev->gfx.num_gfx_rings; ++i) { + ring = &adev->gfx.gfx_ring[i]; + if (ring->sched.ready) + mask |= 1 << i; + } + + *val = mask; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_gfx_sched_mask_fops, + amdgpu_debugfs_gfx_sched_mask_get, + amdgpu_debugfs_gfx_sched_mask_set, "%llx\n"); + +#endif + +void amdgpu_debugfs_gfx_sched_mask_init(struct amdgpu_device *adev) +{ +#if defined(CONFIG_DEBUG_FS) + struct drm_minor *minor = adev_to_drm(adev)->primary; + struct dentry *root = minor->debugfs_root; + char name[32]; + + if (!(adev->gfx.num_gfx_rings > 1)) + return; + sprintf(name, "amdgpu_gfx_sched_mask"); + debugfs_create_file(name, 0600, root, adev, + &amdgpu_debugfs_gfx_sched_mask_fops); +#endif +} + +/* + * debugfs for to enable/disable compute job submission to specific core. + */ +#if defined(CONFIG_DEBUG_FS) +static int amdgpu_debugfs_compute_sched_mask_set(void *data, u64 val) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)data; + u32 i; + u64 mask = 0; + struct amdgpu_ring *ring; + + if (!adev) + return -ENODEV; + + mask = (1 << adev->gfx.num_compute_rings) - 1; + if ((val & mask) == 0) + return -EINVAL; + + for (i = 0; i < adev->gfx.num_compute_rings; ++i) { + ring = &adev->gfx.compute_ring[i]; + if (val & (1 << i)) + ring->sched.ready = true; + else + ring->sched.ready = false; + } + + /* publish sched.ready flag update effective immediately across smp */ + smp_rmb(); + return 0; +} + +static int amdgpu_debugfs_compute_sched_mask_get(void *data, u64 *val) +{ + struct amdgpu_device *adev = (struct amdgpu_device *)data; + u32 i; + u64 mask = 0; + struct amdgpu_ring *ring; + + if (!adev) + return -ENODEV; + for (i = 0; i < adev->gfx.num_compute_rings; ++i) { + ring = &adev->gfx.compute_ring[i]; + if (ring->sched.ready) + mask |= 1 << i; + } + + *val = mask; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(amdgpu_debugfs_compute_sched_mask_fops, + amdgpu_debugfs_compute_sched_mask_get, + amdgpu_debugfs_compute_sched_mask_set, "%llx\n"); + +#endif + +void amdgpu_debugfs_compute_sched_mask_init(struct amdgpu_device *adev) +{ +#if defined(CONFIG_DEBUG_FS) + struct drm_minor *minor = adev_to_drm(adev)->primary; + struct dentry *root = minor->debugfs_root; + char name[32]; + + if (!(adev->gfx.num_compute_rings > 1)) + return; + sprintf(name, "amdgpu_compute_sched_mask"); + debugfs_create_file(name, 0600, root, adev, + &amdgpu_debugfs_compute_sched_mask_fops); +#endif +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index af9dbd760fee..7183831df0c3 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -584,6 +584,8 @@ void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev); void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring); void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring); +void amdgpu_debugfs_gfx_sched_mask_init(struct amdgpu_device *adev); +void amdgpu_debugfs_compute_sched_mask_init(struct amdgpu_device *adev); static inline const char *amdgpu_gfx_compute_mode_desc(int mode) { -- cgit From 047767ddc93666704026c79c01554597375beb50 Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 29 Oct 2024 10:47:26 +0530 Subject: drm/amdgpu: Group gfx sysfs functions Make amdgpu_gfx_sysfs_init/fini functions as common entry points for all gfx related sysfs nodes. Signed-off-by: Lijo Lazar Reviewed-by: Alex Deucher Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 40 +++++++++++++++++++++++++-------- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 -- drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 +++-- drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 4 ++-- drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 5 ----- 7 files changed, 40 insertions(+), 24 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 63f5fa736f32..2f3f09dfb1fd 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1602,7 +1602,7 @@ static DEVICE_ATTR(current_compute_partition, 0644, static DEVICE_ATTR(available_compute_partition, 0444, amdgpu_gfx_get_available_compute_partition, NULL); -int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) +static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev) { struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; bool xcp_switch_supported; @@ -1629,7 +1629,7 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) return r; } -void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) +static void amdgpu_gfx_sysfs_xcp_fini(struct amdgpu_device *adev) { struct amdgpu_xcp_mgr *xcp_mgr = adev->xcp_mgr; bool xcp_switch_supported; @@ -1646,25 +1646,47 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) &dev_attr_available_compute_partition); } -int amdgpu_gfx_sysfs_isolation_shader_init(struct amdgpu_device *adev) +static int amdgpu_gfx_sysfs_isolation_shader_init(struct amdgpu_device *adev) { int r; r = device_create_file(adev->dev, &dev_attr_enforce_isolation); if (r) return r; + if (adev->gfx.enable_cleaner_shader) + r = device_create_file(adev->dev, &dev_attr_run_cleaner_shader); - r = device_create_file(adev->dev, &dev_attr_run_cleaner_shader); - if (r) + return r; +} + +static void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev) +{ + device_remove_file(adev->dev, &dev_attr_enforce_isolation); + if (adev->gfx.enable_cleaner_shader) + device_remove_file(adev->dev, &dev_attr_run_cleaner_shader); +} + +int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) +{ + int r; + + r = amdgpu_gfx_sysfs_xcp_init(adev); + if (r) { + dev_err(adev->dev, "failed to create xcp sysfs files"); return r; + } - return 0; + r = amdgpu_gfx_sysfs_isolation_shader_init(adev); + if (r) + dev_err(adev->dev, "failed to create isolation sysfs files"); + + return r; } -void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev) +void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) { - device_remove_file(adev->dev, &dev_attr_enforce_isolation); - device_remove_file(adev->dev, &dev_attr_run_cleaner_shader); + amdgpu_gfx_sysfs_xcp_fini(adev); + amdgpu_gfx_sysfs_isolation_shader_fini(adev); } int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index 7183831df0c3..fd73e527f446 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -579,8 +579,6 @@ void amdgpu_gfx_cleaner_shader_sw_fini(struct amdgpu_device *adev); void amdgpu_gfx_cleaner_shader_init(struct amdgpu_device *adev, unsigned int cleaner_shader_size, const void *cleaner_shader_ptr); -int amdgpu_gfx_sysfs_isolation_shader_init(struct amdgpu_device *adev); -void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev); void amdgpu_gfx_enforce_isolation_handler(struct work_struct *work); void amdgpu_gfx_enforce_isolation_ring_begin_use(struct amdgpu_ring *ring); void amdgpu_gfx_enforce_isolation_ring_end_use(struct amdgpu_ring *ring); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index 9da95b25e158..d1a18ca584dd 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4853,9 +4853,10 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) gfx_v10_0_alloc_ip_dump(adev); - r = amdgpu_gfx_sysfs_isolation_shader_init(adev); + r = amdgpu_gfx_sysfs_init(adev); if (r) return r; + return 0; } @@ -4907,7 +4908,7 @@ static int gfx_v10_0_sw_fini(struct amdgpu_ip_block *ip_block) gfx_v10_0_rlc_backdoor_autoload_buffer_fini(adev); gfx_v10_0_free_microcode(adev); - amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_fini(adev); kfree(adev->gfx.ip_dump_core); kfree(adev->gfx.ip_dump_compute_queues); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 894fc04201c3..cfe17a36b8ee 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1708,7 +1708,7 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) gfx_v11_0_alloc_ip_dump(adev); - r = amdgpu_gfx_sysfs_isolation_shader_init(adev); + r = amdgpu_gfx_sysfs_init(adev); if (r) return r; @@ -1773,7 +1773,7 @@ static int gfx_v11_0_sw_fini(struct amdgpu_ip_block *ip_block) gfx_v11_0_free_microcode(adev); - amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_fini(adev); kfree(adev->gfx.ip_dump_core); kfree(adev->gfx.ip_dump_compute_queues); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 9fec28d8a5fc..1b99f90cd193 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1466,7 +1466,7 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) gfx_v12_0_alloc_ip_dump(adev); - r = amdgpu_gfx_sysfs_isolation_shader_init(adev); + r = amdgpu_gfx_sysfs_init(adev); if (r) return r; @@ -1529,7 +1529,7 @@ static int gfx_v12_0_sw_fini(struct amdgpu_ip_block *ip_block) gfx_v12_0_free_microcode(adev); - amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_fini(adev); kfree(adev->gfx.ip_dump_core); kfree(adev->gfx.ip_dump_compute_queues); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index e9248a855ba7..a880dce16ae2 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2402,7 +2402,7 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) gfx_v9_0_alloc_ip_dump(adev); - r = amdgpu_gfx_sysfs_isolation_shader_init(adev); + r = amdgpu_gfx_sysfs_init(adev); if (r) return r; @@ -2443,7 +2443,7 @@ static int gfx_v9_0_sw_fini(struct amdgpu_ip_block *ip_block) } gfx_v9_0_free_microcode(adev); - amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_fini(adev); kfree(adev->gfx.ip_dump_core); kfree(adev->gfx.ip_dump_compute_queues); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 016290f00592..983088805c3a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -1171,10 +1171,6 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) gfx_v9_4_3_alloc_ip_dump(adev); - r = amdgpu_gfx_sysfs_isolation_shader_init(adev); - if (r) - return r; - return 0; } @@ -1199,7 +1195,6 @@ static int gfx_v9_4_3_sw_fini(struct amdgpu_ip_block *ip_block) amdgpu_bo_unref(&adev->gfx.rlc.clear_state_obj); gfx_v9_4_3_free_microcode(adev); amdgpu_gfx_sysfs_fini(adev); - amdgpu_gfx_sysfs_isolation_shader_fini(adev); kfree(adev->gfx.ip_dump_core); kfree(adev->gfx.ip_dump_compute_queues); -- cgit From 6c8d1f4b042e706ccd7575beb0397a75d545d71b Mon Sep 17 00:00:00 2001 From: "Jesse.zhang@amd.com" Date: Tue, 5 Nov 2024 15:22:56 +0800 Subject: drm/amdgpu: Add sysfs interface for gc reset mask Add two sysfs interfaces for gfx and compute: gfx_reset_mask compute_reset_mask These interfaces are read-only and show the resets supported by the IP. For example, full adapter reset (mode1/mode2/BACO/etc), soft reset, queue reset, and pipe reset. V2: the sysfs node returns a text string instead of some flags (Christian) v3: add a generic helper which takes the ring as parameter and print the strings in the order they are applied (Christian) check amdgpu_gpu_recovery before creating sysfs file itself, and initialize supported_reset_types in IP version files (Lijo) v4: Fixing uninitialized variables (Tim) Signed-off-by: Jesse Zhang Suggested-by: Alex Deucher Reviewed-by: Tim Huang Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu.h | 8 ++++ drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 44 +++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 70 ++++++++++++++++++++++++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 2 + drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 5 +++ drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 18 ++++++++ drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c | 6 +++ drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c | 6 +++ drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c | 13 ++++++ 9 files changed, 172 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h index 7645e498faa4..d8bc6da50016 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h @@ -299,6 +299,12 @@ extern int amdgpu_wbrf; #define AMDGPU_RESET_VCE (1 << 13) #define AMDGPU_RESET_VCE1 (1 << 14) +/* reset mask */ +#define AMDGPU_RESET_TYPE_FULL (1 << 0) /* full adapter reset, mode1/mode2/BACO/etc. */ +#define AMDGPU_RESET_TYPE_SOFT_RESET (1 << 1) /* IP level soft reset */ +#define AMDGPU_RESET_TYPE_PER_QUEUE (1 << 2) /* per queue */ +#define AMDGPU_RESET_TYPE_PER_PIPE (1 << 3) /* per pipe */ + /* max cursor sizes (in pixels) */ #define CIK_CURSOR_WIDTH 128 #define CIK_CURSOR_HEIGHT 128 @@ -1464,6 +1470,8 @@ struct dma_fence *amdgpu_device_get_gang(struct amdgpu_device *adev); struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev, struct dma_fence *gang); bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev); +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring); +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset); /* atpx handler */ #if defined(CONFIG_VGA_SWITCHEROO) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c index 0450eab6ade7..38a17571da75 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c @@ -6715,3 +6715,47 @@ uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev, } return ret; } + +ssize_t amdgpu_get_soft_full_reset_mask(struct amdgpu_ring *ring) +{ + ssize_t size = 0; + + if (!ring || !ring->adev) + return size; + + if (amdgpu_device_should_recover_gpu(ring->adev)) + size |= AMDGPU_RESET_TYPE_FULL; + + if (unlikely(!ring->adev->debug_disable_soft_recovery) && + !amdgpu_sriov_vf(ring->adev) && ring->funcs->soft_recovery) + size |= AMDGPU_RESET_TYPE_SOFT_RESET; + + return size; +} + +ssize_t amdgpu_show_reset_mask(char *buf, uint32_t supported_reset) +{ + ssize_t size = 0; + + if (supported_reset == 0) { + size += sysfs_emit_at(buf, size, "unsupported"); + size += sysfs_emit_at(buf, size, "\n"); + return size; + + } + + if (supported_reset & AMDGPU_RESET_TYPE_SOFT_RESET) + size += sysfs_emit_at(buf, size, "soft "); + + if (supported_reset & AMDGPU_RESET_TYPE_PER_QUEUE) + size += sysfs_emit_at(buf, size, "queue "); + + if (supported_reset & AMDGPU_RESET_TYPE_PER_PIPE) + size += sysfs_emit_at(buf, size, "pipe "); + + if (supported_reset & AMDGPU_RESET_TYPE_FULL) + size += sysfs_emit_at(buf, size, "full "); + + size += sysfs_emit_at(buf, size, "\n"); + return size; +} diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 2f3f09dfb1fd..6cc6484bde06 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -1588,6 +1588,32 @@ static ssize_t amdgpu_gfx_set_enforce_isolation(struct device *dev, return count; } +static ssize_t amdgpu_gfx_get_gfx_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->gfx.gfx_supported_reset); +} + +static ssize_t amdgpu_gfx_get_compute_reset_mask(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct drm_device *ddev = dev_get_drvdata(dev); + struct amdgpu_device *adev = drm_to_adev(ddev); + + if (!adev) + return -ENODEV; + + return amdgpu_show_reset_mask(buf, adev->gfx.compute_supported_reset); +} + static DEVICE_ATTR(run_cleaner_shader, 0200, NULL, amdgpu_gfx_set_run_cleaner_shader); @@ -1601,6 +1627,11 @@ static DEVICE_ATTR(current_compute_partition, 0644, static DEVICE_ATTR(available_compute_partition, 0444, amdgpu_gfx_get_available_compute_partition, NULL); +static DEVICE_ATTR(gfx_reset_mask, 0444, + amdgpu_gfx_get_gfx_reset_mask, NULL); + +static DEVICE_ATTR(compute_reset_mask, 0444, + amdgpu_gfx_get_compute_reset_mask, NULL); static int amdgpu_gfx_sysfs_xcp_init(struct amdgpu_device *adev) { @@ -1666,6 +1697,40 @@ static void amdgpu_gfx_sysfs_isolation_shader_fini(struct amdgpu_device *adev) device_remove_file(adev->dev, &dev_attr_run_cleaner_shader); } +static int amdgpu_gfx_sysfs_reset_mask_init(struct amdgpu_device *adev) +{ + int r = 0; + + if (!amdgpu_gpu_recovery) + return r; + + if (adev->gfx.num_gfx_rings) { + r = device_create_file(adev->dev, &dev_attr_gfx_reset_mask); + if (r) + return r; + } + + if (adev->gfx.num_compute_rings) { + r = device_create_file(adev->dev, &dev_attr_compute_reset_mask); + if (r) + return r; + } + + return r; +} + +static void amdgpu_gfx_sysfs_reset_mask_fini(struct amdgpu_device *adev) +{ + if (!amdgpu_gpu_recovery) + return; + + if (adev->gfx.num_gfx_rings) + device_remove_file(adev->dev, &dev_attr_gfx_reset_mask); + + if (adev->gfx.num_compute_rings) + device_remove_file(adev->dev, &dev_attr_compute_reset_mask); +} + int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) { int r; @@ -1680,6 +1745,10 @@ int amdgpu_gfx_sysfs_init(struct amdgpu_device *adev) if (r) dev_err(adev->dev, "failed to create isolation sysfs files"); + r = amdgpu_gfx_sysfs_reset_mask_init(adev); + if (r) + dev_err(adev->dev, "failed to create reset mask sysfs files"); + return r; } @@ -1687,6 +1756,7 @@ void amdgpu_gfx_sysfs_fini(struct amdgpu_device *adev) { amdgpu_gfx_sysfs_xcp_fini(adev); amdgpu_gfx_sysfs_isolation_shader_fini(adev); + amdgpu_gfx_sysfs_reset_mask_fini(adev); } int amdgpu_gfx_cleaner_shader_sw_init(struct amdgpu_device *adev, diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h index fd73e527f446..8b5bd63b5773 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h @@ -424,6 +424,8 @@ struct amdgpu_gfx { /* reset mask */ uint32_t grbm_soft_reset; uint32_t srbm_soft_reset; + uint32_t gfx_supported_reset; + uint32_t compute_supported_reset; /* gfx off */ bool gfx_off_state; /* true: enabled, false: disabled */ diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c index d1a18ca584dd..24dce803a829 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c @@ -4825,6 +4825,11 @@ static int gfx_v10_0_sw_init(struct amdgpu_ip_block *ip_block) } } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); r = amdgpu_gfx_kiq_init(adev, GFX10_MEC_HPD_SIZE, 0); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c index 62e4c446793d..28fe8d23c91f 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c @@ -1691,6 +1691,24 @@ static int gfx_v11_0_sw_init(struct amdgpu_ip_block *ip_block) } } + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(11, 0, 0): + case IP_VERSION(11, 0, 2): + case IP_VERSION(11, 0, 3): + if ((adev->gfx.me_fw_version >= 2280) && + (adev->gfx.mec_fw_version >= 2410)) { + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + adev->gfx.gfx_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + } + break; + default: + break; + } + if (!adev->enable_mes_kiq) { r = amdgpu_gfx_kiq_init(adev, GFX11_MEC_HPD_SIZE, 0); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c index 1b99f90cd193..fe7c48f2fb2a 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v12_0.c @@ -1437,6 +1437,12 @@ static int gfx_v12_0_sw_init(struct amdgpu_ip_block *ip_block) } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + if (!adev->enable_mes_kiq) { r = amdgpu_gfx_kiq_init(adev, GFX12_MEC_HPD_SIZE, 0); if (r) { diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c index a880dce16ae2..0b6f09f2cc9b 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c @@ -2374,6 +2374,12 @@ static int gfx_v9_0_sw_init(struct amdgpu_ip_block *ip_block) } } + /* TODO: Add queue reset mask when FW fully supports it */ + adev->gfx.gfx_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.gfx_ring[0]); + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + r = amdgpu_gfx_kiq_init(adev, GFX9_MEC_HPD_SIZE, 0); if (r) { DRM_ERROR("Failed to init KIQ BOs!\n"); diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c index 983088805c3a..e2b3dda57030 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_4_3.c @@ -1157,6 +1157,19 @@ static int gfx_v9_4_3_sw_init(struct amdgpu_ip_block *ip_block) return r; } + adev->gfx.compute_supported_reset = + amdgpu_get_soft_full_reset_mask(&adev->gfx.compute_ring[0]); + switch (amdgpu_ip_version(adev, GC_HWIP, 0)) { + case IP_VERSION(9, 4, 3): + case IP_VERSION(9, 4, 4): + if (adev->gfx.mec_fw_version >= 155) { + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_QUEUE; + adev->gfx.compute_supported_reset |= AMDGPU_RESET_TYPE_PER_PIPE; + } + break; + default: + break; + } r = gfx_v9_4_3_gpu_early_init(adev); if (r) return r; -- cgit