Merge tag 'amd-drm-next-6.12-2024-09-13' of https://gitlab.freedesktop.org/agd5f/linux into drm-next

amd-drm-next-6.12-2024-09-13:

amdgpu:
- GPUVM sync fixes
- kdoc fixes
- Misc spelling mistakes
- Add some raven GFXOFF quirks
- Use clamp helper
- DC fixes
- JPEG fixes
- Process isolation fix
- Queue reset fix
- W=1 cleanup
- SMU14 fixes
- JPEG fixes

amdkfd:
- Fetch cacheline info from IP discovery
- Queue reset fix
- RAS fix
- Document SVM events
- CRIU fixes
- Race fix in dma-buf handling

drm:
- dma-buf fd race fixes

Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Alex Deucher <alexander.deucher@amd.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240913134139.2861073-1-alexander.deucher@amd.com
This commit is contained in:
Dave Airlie
2024-09-17 01:05:31 +10:00
50 changed files with 679 additions and 367 deletions
+87 -13
View File
@@ -540,26 +540,29 @@ enum kfd_smi_event {
KFD_SMI_EVENT_ALL_PROCESS = 64
};
/* The reason that triggered a page migration event */
enum KFD_MIGRATE_TRIGGERS {
KFD_MIGRATE_TRIGGER_PREFETCH,
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU,
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU,
KFD_MIGRATE_TRIGGER_TTM_EVICTION
KFD_MIGRATE_TRIGGER_PREFETCH, /* Prefetch to GPU VRAM or system memory */
KFD_MIGRATE_TRIGGER_PAGEFAULT_GPU, /* GPU page fault recover */
KFD_MIGRATE_TRIGGER_PAGEFAULT_CPU, /* CPU page fault recover */
KFD_MIGRATE_TRIGGER_TTM_EVICTION /* TTM eviction */
};
/* The reason that triggered a user queue eviction event */
enum KFD_QUEUE_EVICTION_TRIGGERS {
KFD_QUEUE_EVICTION_TRIGGER_SVM,
KFD_QUEUE_EVICTION_TRIGGER_USERPTR,
KFD_QUEUE_EVICTION_TRIGGER_TTM,
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND,
KFD_QUEUE_EVICTION_CRIU_CHECKPOINT,
KFD_QUEUE_EVICTION_CRIU_RESTORE
KFD_QUEUE_EVICTION_TRIGGER_SVM, /* SVM buffer migration */
KFD_QUEUE_EVICTION_TRIGGER_USERPTR, /* userptr movement */
KFD_QUEUE_EVICTION_TRIGGER_TTM, /* TTM move buffer */
KFD_QUEUE_EVICTION_TRIGGER_SUSPEND, /* GPU suspend */
KFD_QUEUE_EVICTION_CRIU_CHECKPOINT, /* CRIU checkpoint */
KFD_QUEUE_EVICTION_CRIU_RESTORE /* CRIU restore */
};
/* The reason that triggered an unmap-buffer-from-GPU event */
enum KFD_SVM_UNMAP_TRIGGERS {
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY,
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,
KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY, /* MMU notifier CPU buffer movement */
KFD_SVM_UNMAP_TRIGGER_MMU_NOTIFY_MIGRATE,/* MMU notifier page migration */
KFD_SVM_UNMAP_TRIGGER_UNMAP_FROM_CPU /* Unmap to free the buffer */
};
#define KFD_SMI_EVENT_MASK_FROM_INDEX(i) (1ULL << ((i) - 1))
@@ -570,6 +573,77 @@ struct kfd_ioctl_smi_events_args {
__u32 anon_fd; /* from KFD */
};
/*
* SVM event tracing via SMI system management interface
*
* Open event file descriptor
* Use ioctl AMDKFD_IOC_SMI_EVENTS, passing in the gpuid; it returns an anonymous
* file descriptor for receiving SMI events.
* If called with sudo permission, the file descriptor can be used to receive
* SVM events from all processes; otherwise it only receives SVM events of the
* same process.
*
* To enable the SVM event
* Write the KFD_SMI_EVENT_MASK_FROM_INDEX(event) bitmap mask to the event file
* descriptor to start recording the event to the kfifo; combine bitmap masks
* for multiple events. A new event mask overwrites the previous event mask.
* The KFD_SMI_EVENT_MASK_FROM_INDEX(KFD_SMI_EVENT_ALL_PROCESS) bit requires sudo
* permission to receive SVM events from all processes.
*
* To receive the event
* Application can poll file descriptor to wait for the events, then read event
* from the file into a buffer. Each event is one line string message, starting
* with the event id, then the event specific information.
*
* To decode event information
* The following event format string macro can be used with sscanf to decode
* the specific event information.
* event triggers: the reason to generate the event, defined as enum for unmap,
* eviction and migrate events.
* node, from, to, prefetch_loc, preferred_loc: GPU ID, or 0 for system memory.
* addr: user mode address, in pages
* size: in pages
* pid: the process ID to generate the event
* ns: timestamp in nanosecond-resolution, starts at system boot time but
* stops during suspend
* migrate_update: GPU page fault is recovered by 'M' for migrate, 'U' for update
* rw: 'W' for write page fault, 'R' for read page fault
* rescheduled: 'R' if the queue restore failed and rescheduled to try again
*/
#define KFD_EVENT_FMT_UPDATE_GPU_RESET(reset_seq_num, reset_cause)\
"%x %s\n", (reset_seq_num), (reset_cause)
#define KFD_EVENT_FMT_THERMAL_THROTTLING(bitmask, counter)\
"%llx:%llx\n", (bitmask), (counter)
#define KFD_EVENT_FMT_VMFAULT(pid, task_name)\
"%x:%s\n", (pid), (task_name)
#define KFD_EVENT_FMT_PAGEFAULT_START(ns, pid, addr, node, rw)\
"%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (rw)
#define KFD_EVENT_FMT_PAGEFAULT_END(ns, pid, addr, node, migrate_update)\
"%lld -%d @%lx(%x) %c\n", (ns), (pid), (addr), (node), (migrate_update)
#define KFD_EVENT_FMT_MIGRATE_START(ns, pid, start, size, from, to, prefetch_loc,\
preferred_loc, migrate_trigger)\
"%lld -%d @%lx(%lx) %x->%x %x:%x %d\n", (ns), (pid), (start), (size),\
(from), (to), (prefetch_loc), (preferred_loc), (migrate_trigger)
#define KFD_EVENT_FMT_MIGRATE_END(ns, pid, start, size, from, to, migrate_trigger)\
"%lld -%d @%lx(%lx) %x->%x %d\n", (ns), (pid), (start), (size),\
(from), (to), (migrate_trigger)
#define KFD_EVENT_FMT_QUEUE_EVICTION(ns, pid, node, evict_trigger)\
"%lld -%d %x %d\n", (ns), (pid), (node), (evict_trigger)
#define KFD_EVENT_FMT_QUEUE_RESTORE(ns, pid, node, rescheduled)\
"%lld -%d %x %c\n", (ns), (pid), (node), (rescheduled)
#define KFD_EVENT_FMT_UNMAP_FROM_GPU(ns, pid, addr, size, node, unmap_trigger)\
"%lld -%d @%lx(%lx) %x %d\n", (ns), (pid), (addr), (size),\
(node), (unmap_trigger)
/**************************************************************************************************
* CRIU IOCTLs (Checkpoint Restore In Userspace)
*