From 3eebfd5e9cef738f683a6237dd9ff628e113f008 Mon Sep 17 00:00:00 2001 From: Feifei Xu Date: Thu, 12 Sep 2024 18:09:11 +0800 Subject: drm/amdkfd:Add kfd function to config sq perfmon Expose the interface for kfd to config sq perfmon. Signed-off-by: Feifei Xu Reviewed-by: Hawking Zhang Reviewed-by: Lijo Lazar Reviewed-by: James Zhu Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 4f08b153cb66..b545940e512b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -889,3 +889,18 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id) return kgd2kfd_start_sched(adev->kfd.dev, node_id); } + +/* Config CGTT_SQ_CLK_CTRL */ +int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id, + bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable) +{ + int r; + + if (!adev->kfd.init_complete) + return 0; + + r = psp_config_sq_perfmon(&adev->psp, xcp_id, core_override_enable, + reg_override_enable, perfmon_override_enable); + + return r; +} -- cgit From 8fe7cf58ff0e46769b86b3890d657c8996b86bc6 Mon Sep 17 00:00:00 2001 From: Alex Deucher Date: Mon, 14 Oct 2024 10:51:10 -0400 Subject: drm/amdkfd: add an interface to query whether is KFD is active Add an interface to query whether KFD has any active queues. v2: fix build issues Acked-by: Srinivasan Shanmugam Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 9 +++++++++ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h | 7 +++++++ drivers/gpu/drm/amd/amdkfd/kfd_device.c | 25 +++++++++++++++++++++++++ 3 files changed, 41 insertions(+) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index b545940e512b..24343c312480 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -890,6 +890,15 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id) return kgd2kfd_start_sched(adev->kfd.dev, node_id); } +/* check if there are KFD queues active */ +bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id) +{ + if (!adev->kfd.init_complete) + return false; + + return kgd2kfd_compute_active(adev->kfd.dev, node_id); +} + /* Config CGTT_SQ_CLK_CTRL */ int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id, bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable) diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h index 7e0a22072536..4b80ad860639 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h @@ -268,6 +268,7 @@ int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id); int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id); int amdgpu_amdkfd_config_sq_perfmon(struct amdgpu_device *adev, uint32_t xcp_id, bool core_override_enable, bool reg_override_enable, bool perfmon_override_enable); +bool amdgpu_amdkfd_compute_active(struct amdgpu_device *adev, uint32_t node_id); /* Read user wptr from a specified user address space with page fault @@ -431,6 +432,7 @@ int kgd2kfd_check_and_lock_kfd(void); void kgd2kfd_unlock_kfd(void); int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id); int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id); +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id); #else static inline int kgd2kfd_init(void) { @@ -511,5 +513,10 @@ static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) { return 0; } + +static inline bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) +{ + return false; +} #endif #endif /* AMDGPU_AMDKFD_H_INCLUDED */ diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_device.c b/drivers/gpu/drm/amd/amdkfd/kfd_device.c index fad1c8f2bc83..348925254bff 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_device.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_device.c @@ -1392,6 +1392,13 @@ void kfd_dec_compute_active(struct kfd_node *node) WARN_ONCE(count < 0, "Compute profile ref. count error"); } +static bool kfd_compute_active(struct kfd_node *node) +{ + if (atomic_read(&node->kfd->compute_profile)) + return true; + return false; +} + void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask) { /* @@ -1485,6 +1492,24 @@ int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id) return node->dqm->ops.halt(node->dqm); } +bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id) +{ + struct kfd_node *node; + + if (!kfd->init_complete) + return false; + + if (node_id >= kfd->num_nodes) { + dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n", + node_id, kfd->num_nodes - 1); + return false; + } + + node = kfd->nodes[node_id]; + + return kfd_compute_active(node); +} + #if defined(CONFIG_DEBUG_FS) /* This function will send a package to HIQ to hang the HWS -- cgit From fa31798582882740f2b13d19e1bd43b4ef918e2f Mon Sep 17 00:00:00 2001 From: Lijo Lazar Date: Tue, 5 Nov 2024 10:30:20 +0530 Subject: drm/amdgpu: Fix map/unmap queue logic In current logic, it calls ring_alloc followed by a ring_test. ring_test in turn will call another ring_alloc. This is illegal usage as a ring_alloc is expected to be closed properly with a ring_commit. Change to commit the map/unmap queue packet first followed by a ring_test. Add a comment about the usage of ring_test. Also, reorder the current pre-condition checks of job hang or kiq ring scheduler not ready. Without them being met, it is not useful to attempt ring or memory allocations. Fixes tag refers to the original patch which introduced this issue which then got carried over into newer code. Signed-off-by: Lijo Lazar Reviewed-by: Le Ma Signed-off-by: Alex Deucher Fixes: 6c10b5cc4eaa ("drm/amdgpu: Remove duplicate code in gfx_v8_0.c") --- drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c | 13 +++++- drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c | 63 +++++++++++++++++++++--------- drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c | 7 ++++ 3 files changed, 63 insertions(+), 20 deletions(-) (limited to 'drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c') diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c index 24343c312480..3afcd1e8aa54 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c @@ -834,6 +834,9 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; + if (!kiq_ring->sched.ready || adev->job_hang) + return 0; + ring_funcs = kzalloc(sizeof(*ring_funcs), GFP_KERNEL); if (!ring_funcs) return -ENOMEM; @@ -858,8 +861,14 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off, kiq->pmf->kiq_unmap_queues(kiq_ring, ring, RESET_QUEUES, 0, 0); - if (kiq_ring->sched.ready && !adev->job_hang) - r = amdgpu_ring_test_helper(kiq_ring); + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that unmap queues that is submitted before got + * processed successfully before returning. + */ + r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c index 6cc6484bde06..5136cb3bdd4b 100644 --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.c @@ -515,6 +515,17 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; + if (!kiq_ring->sched.ready || adev->job_hang) + return 0; + /** + * This is workaround: only skip kiq_ring test + * during ras recovery in suspend stage for gfx9.4.3 + */ + if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || + amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && + amdgpu_ras_in_recovery(adev)) + return 0; + spin_lock(&kiq->ring_lock); if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * adev->gfx.num_compute_rings)) { @@ -528,20 +539,15 @@ int amdgpu_gfx_disable_kcq(struct amdgpu_device *adev, int xcc_id) &adev->gfx.compute_ring[j], RESET_QUEUES, 0, 0); } - - /** - * This is workaround: only skip kiq_ring test - * during ras recovery in suspend stage for gfx9.4.3 + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that unmap queues that is submitted before got + * processed successfully before returning. */ - if ((amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 3) || - amdgpu_ip_version(adev, GC_HWIP, 0) == IP_VERSION(9, 4, 4)) && - amdgpu_ras_in_recovery(adev)) { - spin_unlock(&kiq->ring_lock); - return 0; - } + r = amdgpu_ring_test_helper(kiq_ring); - if (kiq_ring->sched.ready && !adev->job_hang) - r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); return r; @@ -569,8 +575,11 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) if (!kiq->pmf || !kiq->pmf->kiq_unmap_queues) return -EINVAL; - spin_lock(&kiq->ring_lock); + if (!adev->gfx.kiq[0].ring.sched.ready || adev->job_hang) + return 0; + if (amdgpu_gfx_is_master_xcc(adev, xcc_id)) { + spin_lock(&kiq->ring_lock); if (amdgpu_ring_alloc(kiq_ring, kiq->pmf->unmap_queues_size * adev->gfx.num_gfx_rings)) { spin_unlock(&kiq->ring_lock); @@ -583,11 +592,17 @@ int amdgpu_gfx_disable_kgq(struct amdgpu_device *adev, int xcc_id) &adev->gfx.gfx_ring[j], PREEMPT_QUEUES, 0, 0); } - } + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); - if (adev->gfx.kiq[0].ring.sched.ready && !adev->job_hang) + /* + * Ring test will do a basic scratch register change check. + * Just run this to ensure that unmap queues that is submitted + * before got processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); - spin_unlock(&kiq->ring_lock); + spin_unlock(&kiq->ring_lock); + } return r; } @@ -692,7 +707,13 @@ int amdgpu_gfx_enable_kcq(struct amdgpu_device *adev, int xcc_id) kiq->pmf->kiq_map_queues(kiq_ring, &adev->gfx.compute_ring[j]); } - + /* Submit map queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that map queues that is submitted before got + * processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); if (r) @@ -743,7 +764,13 @@ int amdgpu_gfx_enable_kgq(struct amdgpu_device *adev, int xcc_id) &adev->gfx.gfx_ring[j]); } } - + /* Submit map queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that map queues that is submitted before got + * processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); spin_unlock(&kiq->ring_lock); if (r) diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c index 480c41ee947e..b7006c41e270 100644 --- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c @@ -4823,6 +4823,13 @@ static int gfx_v8_0_kcq_disable(struct amdgpu_device *adev) amdgpu_ring_write(kiq_ring, 0); amdgpu_ring_write(kiq_ring, 0); } + /* Submit unmap queue packet */ + amdgpu_ring_commit(kiq_ring); + /* + * Ring test will do a basic scratch register change check. Just run + * this to ensure that unmap queues that is submitted before got + * processed successfully before returning. + */ r = amdgpu_ring_test_helper(kiq_ring); if (r) DRM_ERROR("KCQ disable failed\n"); -- cgit