drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling [Linux 5.3]

This Linux kernel change "drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling" is included in the Linux 5.3 release. This change was authored by Philip Yang <Philip.Yang [at] amd.com> on Fri Jun 14 14:03:36 2019 -0400. The commit for this change in the Linux stable tree is e82fdb1 (patch).

drm/amdgpu: improve HMM error -ENOMEM and -EBUSY handling

Under memory pressure, hmm_range_fault may return the error code -ENOMEM
or -EBUSY. Change pr_info to pr_debug to remove the unnecessary kernel log
message, because the restore will be retried.

Calling get_user_pages_done after TTM get user pages has failed triggers a
WARN_ONCE kernel call stack dump in the log, so call it only on success.

Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>

There are 38 lines of Linux source code added/deleted in this change. Code changes to the Linux kernel are as follows.

 drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c | 38 ++++--------------------
 1 file changed, 6 insertions(+), 32 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
index 74e8695..10abae3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
@@ -1731,35 +1731,17 @@ static int update_invalid_user_pages(struct amdkfd_process_info *process_info,
        ret = amdgpu_ttm_tt_get_user_pages(bo->tbo.ttm,
                           bo->tbo.ttm->pages);
        if (ret) {
-           bo->tbo.ttm->pages[0] = NULL;
-           pr_info("%s: Failed to get user pages: %d\n",
+           pr_debug("%s: Failed to get user pages: %d\n",
                __func__, ret);
-           /* Pretend it succeeded. It will fail later
-            * with a VM fault if the GPU tries to access
-            * it. Better than hanging indefinitely with
-            * stalled user mode queues.
-            */
-       }
-   }
-
-   return 0;
-}

-/* Remove invalid userptr BOs from hmm track list
- *
- * Stop HMM track the userptr update
- */
-static void untrack_invalid_user_pages(struct amdkfd_process_info *process_info)
-{
-   struct kgd_mem *mem, *tmp_mem;
-   struct amdgpu_bo *bo;
+           /* Return error -EBUSY or -ENOMEM, retry restore */
+           return ret;
+       }

-   list_for_each_entry_safe(mem, tmp_mem,
-                &process_info->userptr_inval_list,
-                validate_list.head) {
-       bo = mem->bo;
        amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
    }
+
+   return 0;
 }

 /* Validate invalid userptr BOs
@@ -1841,13 +1823,6 @@ static int validate_invalid_user_pages(struct amdkfd_process_info *process_info)
        list_move_tail(&mem->validate_list.head,
                   &process_info->userptr_valid_list);

-       /* Stop HMM track the userptr update. We dont check the return
-        * value for concurrent CPU page table update because we will
-        * reschedule the restore worker if process_info->evicted_bos
-        * is updated.
-        */
-       amdgpu_ttm_tt_get_user_pages_done(bo->tbo.ttm);
-
        /* Update mapping. If the BO was not validated
         * (because we couldn't get user pages), this will
         * clear the page table entries, which will result in
@@ -1946,7 +1921,6 @@ static void amdgpu_amdkfd_restore_userptr_worker(struct work_struct *work)
    }

 unlock_out:
-   untrack_invalid_user_pages(process_info);
    mutex_unlock(&process_info->lock);
    mmput(mm);
    put_task_struct(usertask);

Leave a Reply

Your email address will not be published. Required fields are marked *