drm/amdkfd: allow compute partition mode switch with cgroup exclusions
The KFD currently bars a compute partition mode switch while any KFD process exists. Because cgroup-excluded devices remain excluded for the lifetime of a KFD process, and user space is able to mode switch individual devices, allow users to mode switch a device while processes are running, provided that every running process has been cgroup-excluded from that device. Signed-off-by: Jonathan Kim <jonathan.kim@amd.com> Reviewed-by: Harish Kasiviswanathan <harish.kasiviswanathan@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
This commit is contained in:
committed by
Alex Deucher
parent
dc8ffb2879
commit
96f75f9594
@@ -749,12 +749,12 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
|
||||
|
||||
int amdgpu_amdkfd_check_and_lock_kfd(struct amdgpu_device *adev)
|
||||
{
|
||||
return kgd2kfd_check_and_lock_kfd();
|
||||
return kgd2kfd_check_and_lock_kfd(adev->kfd.dev);
|
||||
}
|
||||
|
||||
void amdgpu_amdkfd_unlock_kfd(struct amdgpu_device *adev)
|
||||
{
|
||||
kgd2kfd_unlock_kfd();
|
||||
kgd2kfd_unlock_kfd(adev->kfd.dev);
|
||||
}
|
||||
|
||||
|
||||
|
||||
@@ -419,8 +419,8 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd);
|
||||
void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry);
|
||||
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
|
||||
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
|
||||
int kgd2kfd_check_and_lock_kfd(void);
|
||||
void kgd2kfd_unlock_kfd(void);
|
||||
int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd);
|
||||
void kgd2kfd_unlock_kfd(struct kfd_dev *kfd);
|
||||
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
|
||||
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
|
||||
bool kgd2kfd_compute_active(struct kfd_dev *kfd, uint32_t node_id);
|
||||
@@ -489,12 +489,12 @@ void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask)
|
||||
{
|
||||
}
|
||||
|
||||
static inline int kgd2kfd_check_and_lock_kfd(void)
|
||||
static inline int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
||||
static inline void kgd2kfd_unlock_kfd(void)
|
||||
static inline void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
|
||||
{
|
||||
}
|
||||
|
||||
|
||||
@@ -1013,10 +1013,30 @@ int kgd2kfd_post_reset(struct kfd_dev *kfd)
|
||||
return 0;
|
||||
}
|
||||
|
||||
bool kfd_is_locked(void)
|
||||
bool kfd_is_locked(struct kfd_dev *kfd)
|
||||
{
|
||||
uint8_t id = 0;
|
||||
struct kfd_node *dev;
|
||||
|
||||
lockdep_assert_held(&kfd_processes_mutex);
|
||||
return (kfd_locked > 0);
|
||||
|
||||
/* check reset/suspend lock */
|
||||
if (kfd_locked > 0)
|
||||
return true;
|
||||
|
||||
if (kfd)
|
||||
return kfd->kfd_dev_lock > 0;
|
||||
|
||||
/* check lock on all cgroup accessible devices */
|
||||
while (kfd_topology_enum_kfd_devices(id++, &dev) == 0) {
|
||||
if (!dev || kfd_devcgroup_check_permission(dev))
|
||||
continue;
|
||||
|
||||
if (dev->kfd->kfd_dev_lock > 0)
|
||||
return true;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
void kgd2kfd_suspend(struct kfd_dev *kfd, bool run_pm)
|
||||
@@ -1442,24 +1462,53 @@ unsigned int kfd_get_num_xgmi_sdma_engines(struct kfd_node *node)
|
||||
kfd_get_num_sdma_engines(node);
|
||||
}
|
||||
|
||||
int kgd2kfd_check_and_lock_kfd(void)
|
||||
int kgd2kfd_check_and_lock_kfd(struct kfd_dev *kfd)
|
||||
{
|
||||
struct kfd_process *p;
|
||||
int r = 0, temp, idx;
|
||||
|
||||
mutex_lock(&kfd_processes_mutex);
|
||||
if (!hash_empty(kfd_processes_table) || kfd_is_locked()) {
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
return -EBUSY;
|
||||
|
||||
if (hash_empty(kfd_processes_table) && !kfd_is_locked(kfd))
|
||||
goto out;
|
||||
|
||||
/* fail under system reset/resume or kfd device is partition switching. */
|
||||
if (kfd_is_locked(kfd)) {
|
||||
r = -EBUSY;
|
||||
goto out;
|
||||
}
|
||||
|
||||
++kfd_locked;
|
||||
/*
|
||||
* ensure all running processes are cgroup excluded from device before mode switch.
|
||||
* i.e. no pdd was created on the process socket.
|
||||
*/
|
||||
idx = srcu_read_lock(&kfd_processes_srcu);
|
||||
hash_for_each_rcu(kfd_processes_table, temp, p, kfd_processes) {
|
||||
int i;
|
||||
|
||||
for (i = 0; i < p->n_pdds; i++) {
|
||||
if (p->pdds[i]->dev->kfd != kfd)
|
||||
continue;
|
||||
|
||||
r = -EBUSY;
|
||||
goto proc_check_unlock;
|
||||
}
|
||||
}
|
||||
|
||||
proc_check_unlock:
|
||||
srcu_read_unlock(&kfd_processes_srcu, idx);
|
||||
out:
|
||||
if (!r)
|
||||
++kfd->kfd_dev_lock;
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
|
||||
return 0;
|
||||
return r;
|
||||
}
|
||||
|
||||
void kgd2kfd_unlock_kfd(void)
|
||||
void kgd2kfd_unlock_kfd(struct kfd_dev *kfd)
|
||||
{
|
||||
mutex_lock(&kfd_processes_mutex);
|
||||
--kfd_locked;
|
||||
--kfd->kfd_dev_lock;
|
||||
mutex_unlock(&kfd_processes_mutex);
|
||||
}
|
||||
|
||||
|
||||
@@ -372,6 +372,9 @@ struct kfd_dev {
|
||||
|
||||
/* bitmap for dynamic doorbell allocation from doorbell object */
|
||||
unsigned long *doorbell_bitmap;
|
||||
|
||||
/* for dynamic partitioning */
|
||||
int kfd_dev_lock;
|
||||
};
|
||||
|
||||
enum kfd_mempool {
|
||||
@@ -1536,7 +1539,7 @@ static inline bool kfd_flush_tlb_after_unmap(struct kfd_dev *dev)
|
||||
int kfd_send_exception_to_runtime(struct kfd_process *p,
|
||||
unsigned int queue_id,
|
||||
uint64_t error_reason);
|
||||
bool kfd_is_locked(void);
|
||||
bool kfd_is_locked(struct kfd_dev *kfd);
|
||||
|
||||
/* Compute profile */
|
||||
void kfd_inc_compute_active(struct kfd_node *dev);
|
||||
|
||||
@@ -854,7 +854,7 @@ struct kfd_process *kfd_create_process(struct task_struct *thread)
|
||||
*/
|
||||
mutex_lock(&kfd_processes_mutex);
|
||||
|
||||
if (kfd_is_locked()) {
|
||||
if (kfd_is_locked(NULL)) {
|
||||
pr_debug("KFD is locked! Cannot create process");
|
||||
process = ERR_PTR(-EINVAL);
|
||||
goto out;
|
||||
|
||||
Reference in New Issue
Block a user