drm/amdkfd: Change kfd/svm page fault drain handling
author     Xiaogang Chen <xiaogang.chen@amd.com>
           Fri, 23 Aug 2024 07:04:09 +0000 (02:04 -0500)
committer  Alex Deucher <alexander.deucher@amd.com>
           Fri, 23 Aug 2024 14:55:13 +0000 (10:55 -0400)
When an app unmaps vm ranges (munmap), kfd/svm starts draining pending page
faults and does not handle any incoming page faults of this process until a
deferred work item gets executed by the default system wq. The period during
which page faults are not handled can be long and unpredictable, which is
adverse to kfd page fault recovery performance.

This patch uses the time stamp of an incoming page fault to decide whether to
drop or recover it. When an app unmaps vm ranges, kfd records the current time
stamp of each gpu device's ih ring. These time stamps are then consulted in
the kfd page fault recovery routine.
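
In outline, the unmap path now takes a per-GPU checkpoint time stamp. The
sketch below is a minimal user-space model of that bookkeeping, not the
kernel code: the struct and function names are illustrative, and the real
patch stores the checkpoints in struct svm_range_list and reads them from
the ih1/ih_soft rings with amdgpu_ih_get_wptr()/amdgpu_ih_decode_iv_ts(),
as the kfd_svm.c hunk below shows.

    #include <stdint.h>

    #define MAX_GPU_INSTANCE 64           /* illustrative placeholder value */

    /* illustrative stand-in for the checkpoint_ts[] array this patch adds
     * to struct svm_range_list
     */
    struct unmap_checkpoints {
            uint64_t checkpoint_ts[MAX_GPU_INSTANCE];
    };

    /* munmap time: snapshot each GPU's current interrupt-ring time stamp;
     * a slot stays 0 when that GPU's ring is already empty, meaning there
     * is nothing to drop for it later
     */
    static void record_unmap_checkpoints(struct unmap_checkpoints *cp,
                                         const uint64_t *cur_ring_ts,
                                         unsigned int n_gpus)
    {
            unsigned int i;

            for (i = 0; i < n_gpus && i < MAX_GPU_INSTANCE; i++)
                    cp->checkpoint_ts[i] = cur_ring_ts[i];
    }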

Any page fault that occurs on unmapped ranges after the unmap event is an
application bug (accessing a vm range after unmapping it); it is not the
driver's job to cover that.

By using the page fault time stamp there is no need to drain page faults in
the deferred work. So the time period during which kfd does not handle page
faults is reduced and can be controlled.
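
The recovery-time decision then reduces to a time stamp comparison. Below is
a simplified sketch of that check (again user-space C, not the driver code;
the real svm_range_restore_pages() hunk uses amdgpu_ih_ts_after() on the IV
time stamps, but the shape of the logic is the same).

    #include <stdbool.h>
    #include <stdint.h>

    /* wraparound-tolerant "a is after b" check for ring time stamps that
     * carry fewer than 64 significant bits; the driver has its own helper
     * (amdgpu_ih_ts_after()) for this
     */
    static bool ts_after(uint64_t a, uint64_t b)
    {
            return (int64_t)((a - b) << 16) > 0;
    }

    /* fault time: drop the fault if it was raised before the unmap
     * checkpoint, otherwise clear the checkpoint and let recovery proceed
     */
    static bool should_drop_fault(uint64_t *checkpoint_ts, uint64_t fault_ts)
    {
            if (*checkpoint_ts == 0)
                    return false;   /* no unmap checkpoint pending */

            if (ts_after(*checkpoint_ts, fault_ts))
                    return true;    /* stale fault from before the munmap */

            /* fault is newer than the checkpoint: clear it so a later
             * time stamp wraparound cannot mislead the comparison
             */
            *checkpoint_ts = 0;
            return false;
    }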

Signed-off-by: Xiaogang.Chen <Xiaogang.Chen@amd.com>
Reviewed-by: Philip Yang <Philip.Yang@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
drivers/gpu/drm/amd/amdgpu/amdgpu_vm.h
drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
drivers/gpu/drm/amd/amdkfd/kfd_priv.h
drivers/gpu/drm/amd/amdkfd/kfd_svm.c
drivers/gpu/drm/amd/amdkfd/kfd_svm.h

index 1468222ea0cdf44861435972a1b9a024ff52f1a7..ad2e469548c94ced84159486baf88184d2218ed3 100644 (file)
@@ -2776,7 +2776,7 @@ int amdgpu_vm_ioctl(struct drm_device *dev, void *data, struct drm_file *filp)
  * shouldn't be reported any more.
  */
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-                           u32 vmid, u32 node_id, uint64_t addr,
+                           u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
                            bool write_fault)
 {
        bool is_compute_context = false;
@@ -2802,7 +2802,7 @@ bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
        addr /= AMDGPU_GPU_PAGE_SIZE;
 
        if (is_compute_context && !svm_range_restore_pages(adev, pasid, vmid,
-           node_id, addr, write_fault)) {
+           node_id, addr, ts, write_fault)) {
                amdgpu_bo_unref(&root);
                return true;
        }
index 046949c4b6959f849fc328363f64b782348a1137..d12d66dca8e953d339faa6524f4eaaa783633da2 100644 (file)
@@ -558,7 +558,7 @@ amdgpu_vm_get_task_info_vm(struct amdgpu_vm *vm);
 void amdgpu_vm_put_task_info(struct amdgpu_task_info *task_info);
 
 bool amdgpu_vm_handle_fault(struct amdgpu_device *adev, u32 pasid,
-                           u32 vmid, u32 node_id, uint64_t addr,
+                           u32 vmid, u32 node_id, uint64_t addr, uint64_t ts,
                            bool write_fault);
 
 void amdgpu_vm_set_task_info(struct amdgpu_vm *vm);
index f0ceab3ce5bfadf5a656e8a5f8d08b264e8c9e95..9784a28921853ffe4aaa13615ef9c8dd19b1fa63 100644 (file)
@@ -132,7 +132,8 @@ static int gmc_v10_0_process_interrupt(struct amdgpu_device *adev,
                /* Try to handle the recoverable page faults by filling page
                 * tables
                 */
-               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr, write_fault))
+               if (amdgpu_vm_handle_fault(adev, entry->pasid, 0, 0, addr,
+                                          entry->timestamp, write_fault))
                        return 1;
        }
 
index b73136d390cc03ba7efd371fbb5db30515caf0ce..c76ac0dfe572d0e8ec688a41f06a55ac9e8c011f 100644 (file)
@@ -595,7 +595,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
                        cam_index = entry->src_data[2] & 0x3ff;
 
                        ret = amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-                                                    addr, write_fault);
+                                                    addr, entry->timestamp, write_fault);
                        WDOORBELL32(adev->irq.retry_cam_doorbell_index, cam_index);
                        if (ret)
                                return 1;
@@ -618,7 +618,7 @@ static int gmc_v9_0_process_interrupt(struct amdgpu_device *adev,
                         * tables
                         */
                        if (amdgpu_vm_handle_fault(adev, entry->pasid, entry->vmid, node_id,
-                                                  addr, write_fault))
+                                                  addr, entry->timestamp, write_fault))
                                return 1;
                }
        }
index 7bba6bed2f4867d902561e53a6c7914a99714c26..9ae9abc6eb433466018df0762e62241dc09f01fb 100644 (file)
@@ -866,6 +866,8 @@ struct svm_range_list {
        struct delayed_work             restore_work;
        DECLARE_BITMAP(bitmap_supported, MAX_GPU_INSTANCE);
        struct task_struct              *faulting_task;
+       /* checkpoint ts decides if a page fault recovery needs to be dropped */
+       uint64_t                        checkpoint_ts[MAX_GPU_INSTANCE];
 };
 
 /* Process data */
index 2339bbdf452fbfbe6f65208a620124f926fde4a5..ce2a5d9f90d37ba6c85fc71b9c281486d8a2c9e9 100644 (file)
@@ -2262,16 +2262,10 @@ static void svm_range_drain_retry_fault(struct svm_range_list *svms)
 {
        struct kfd_process_device *pdd;
        struct kfd_process *p;
-       int drain;
        uint32_t i;
 
        p = container_of(svms, struct kfd_process, svms);
 
-restart:
-       drain = atomic_read(&svms->drain_pagefaults);
-       if (!drain)
-               return;
-
        for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
                pdd = p->pdds[i];
                if (!pdd)
@@ -2291,8 +2285,6 @@ restart:
 
                pr_debug("drain retry fault gpu %d svms 0x%p done\n", i, svms);
        }
-       if (atomic_cmpxchg(&svms->drain_pagefaults, drain, 0) != drain)
-               goto restart;
 }
 
 static void svm_range_deferred_list_work(struct work_struct *work)
@@ -2314,17 +2306,8 @@ static void svm_range_deferred_list_work(struct work_struct *work)
                         prange->start, prange->last, prange->work_item.op);
 
                mm = prange->work_item.mm;
-retry:
-               mmap_write_lock(mm);
 
-               /* Checking for the need to drain retry faults must be inside
-                * mmap write lock to serialize with munmap notifiers.
-                */
-               if (unlikely(atomic_read(&svms->drain_pagefaults))) {
-                       mmap_write_unlock(mm);
-                       svm_range_drain_retry_fault(svms);
-                       goto retry;
-               }
+               mmap_write_lock(mm);
 
                /* Remove from deferred_list must be inside mmap write lock, for
                 * two race cases:
@@ -2445,6 +2428,7 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
        struct kfd_process *p;
        unsigned long s, l;
        bool unmap_parent;
+       uint32_t i;
 
        if (atomic_read(&prange->queue_refcount)) {
                int r;
@@ -2464,11 +2448,35 @@ svm_range_unmap_from_cpu(struct mm_struct *mm, struct svm_range *prange,
        pr_debug("svms 0x%p prange 0x%p [0x%lx 0x%lx] [0x%lx 0x%lx]\n", svms,
                 prange, prange->start, prange->last, start, last);
 
-       /* Make sure pending page faults are drained in the deferred worker
-        * before the range is freed to avoid straggler interrupts on
-        * unmapped memory causing "phantom faults".
+       /* calculate time stamps that are used to decide which page faults need to
+        * be dropped or handled before unmapping pages from gpu vm
         */
-       atomic_inc(&svms->drain_pagefaults);
+       for_each_set_bit(i, svms->bitmap_supported, p->n_pdds) {
+               struct kfd_process_device *pdd;
+               struct amdgpu_device *adev;
+               struct amdgpu_ih_ring *ih;
+               uint32_t checkpoint_wptr;
+
+               pdd = p->pdds[i];
+               if (!pdd)
+                       continue;
+
+               adev = pdd->dev->adev;
+
+               /* Check and drain ih1 ring if cam not available */
+               ih = &adev->irq.ih1;
+               checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
+               if (ih->rptr != checkpoint_wptr) {
+                       svms->checkpoint_ts[i] = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
+                       continue;
+               }
+
+               /* check if dev->irq.ih_soft is not empty */
+               ih = &adev->irq.ih_soft;
+               checkpoint_wptr = amdgpu_ih_get_wptr(adev, ih);
+               if (ih->rptr != checkpoint_wptr)
+                       svms->checkpoint_ts[i] = amdgpu_ih_decode_iv_ts(adev, ih, checkpoint_wptr, -1);
+       }
 
        unmap_parent = start <= prange->start && last >= prange->last;
 
@@ -2909,7 +2917,7 @@ svm_fault_allowed(struct vm_area_struct *vma, bool write_fault)
 int
 svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                        uint32_t vmid, uint32_t node_id,
-                       uint64_t addr, bool write_fault)
+                       uint64_t addr, uint64_t ts, bool write_fault)
 {
        unsigned long start, last, size;
        struct mm_struct *mm = NULL;
@@ -2919,7 +2927,7 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
        ktime_t timestamp = ktime_get_boottime();
        struct kfd_node *node;
        int32_t best_loc;
-       int32_t gpuidx = MAX_GPU_INSTANCE;
+       int32_t gpuid, gpuidx = MAX_GPU_INSTANCE;
        bool write_locked = false;
        struct vm_area_struct *vma;
        bool migration = false;
@@ -2940,11 +2948,38 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
        pr_debug("restoring svms 0x%p fault address 0x%llx\n", svms, addr);
 
        if (atomic_read(&svms->drain_pagefaults)) {
-               pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
+               pr_debug("page fault handling disabled, drop fault 0x%llx\n", addr);
                r = 0;
                goto out;
        }
 
+       node = kfd_node_by_irq_ids(adev, node_id, vmid);
+       if (!node) {
+               pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
+                        vmid);
+               r = -EFAULT;
+               goto out;
+       }
+
+       if (kfd_process_gpuid_from_node(p, node, &gpuid, &gpuidx)) {
+               pr_debug("failed to get gpuid/gpuidex for node_id: %d\n", node_id);
+               r = -EFAULT;
+               goto out;
+       }
+
+       /* check if this page fault time stamp is before svms->checkpoint_ts */
+       if (svms->checkpoint_ts[gpuidx] != 0) {
+               if (amdgpu_ih_ts_after(ts,  svms->checkpoint_ts[gpuidx])) {
+                       pr_debug("draining retry fault, drop fault 0x%llx\n", addr);
+                       r = 0;
+                       goto out;
+               } else
+                       /* ts is after svms->checkpoint_ts now, reset svms->checkpoint_ts
+                        * to zero so a later ts wraparound does not give a wrong comparison
+                        */
+                       svms->checkpoint_ts[gpuidx] = 0;
+       }
+
        if (!p->xnack_enabled) {
                pr_debug("XNACK not enabled for pasid 0x%x\n", pasid);
                r = -EFAULT;
@@ -2961,13 +2996,6 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
                goto out;
        }
 
-       node = kfd_node_by_irq_ids(adev, node_id, vmid);
-       if (!node) {
-               pr_debug("kfd node does not exist node_id: %d, vmid: %d\n", node_id,
-                        vmid);
-               r = -EFAULT;
-               goto out;
-       }
        mmap_read_lock(mm);
 retry_write_locked:
        mutex_lock(&svms->lock);
@@ -3182,8 +3210,9 @@ void svm_range_list_fini(struct kfd_process *p)
        /*
         * Ensure no retry fault comes in afterwards, as page fault handler will
         * not find kfd process and take mm lock to recover fault.
+        * stop kfd page fault handling, then wait until pending page faults are drained
         */
-       atomic_inc(&p->svms.drain_pagefaults);
+       atomic_set(&p->svms.drain_pagefaults, 1);
        svm_range_drain_retry_fault(&p->svms);
 
        list_for_each_entry_safe(prange, next, &p->svms.list, list) {
index 747325a2ea8962b522ac7d0af7ce86c34b5a54e1..bddd24f04669e87a4d34e639c5d0482b2391afc8 100644 (file)
@@ -174,7 +174,7 @@ int svm_range_vram_node_new(struct kfd_node *node, struct svm_range *prange,
                            bool clear);
 void svm_range_vram_node_free(struct svm_range *prange);
 int svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
-                           uint32_t vmid, uint32_t node_id, uint64_t addr,
+                           uint32_t vmid, uint32_t node_id, uint64_t addr, uint64_t ts,
                            bool write_fault);
 int svm_range_schedule_evict_svm_bo(struct amdgpu_amdkfd_fence *fence);
 void svm_range_add_list_work(struct svm_range_list *svms,
@@ -225,7 +225,7 @@ static inline void svm_range_list_fini(struct kfd_process *p)
 static inline int svm_range_restore_pages(struct amdgpu_device *adev,
                                          unsigned int pasid,
                                          uint32_t client_id, uint32_t node_id,
-                                         uint64_t addr, bool write_fault)
+                                         uint64_t addr, uint64_t ts, bool write_fault)
 {
        return -EFAULT;
 }