Skip to content

Commit 45ecaea

Browse files
Andrey Grodzovskyalexdeucher
authored andcommitted
drm/sched: Partial revert of 'drm/sched: Keep s_fence->parent pointer'
Problem: This patch caused negative refcount as described in [1] because for that case parent fence did not signal by the time of drm_sched_stop and hence kept in pending list the assumption was they will not signal and so fence was put to account for the s_fence->parent refcount but for amdgpu which has embedded HW fence (always same parent fence) drm_sched_fence_release_scheduled was always called and would still drop the count for parent fence once more. For jobs that never signaled this imbalance was masked by refcount bug in amdgpu_fence_driver_clear_job_fences that would not drop refcount on the fences that were removed from fence drive fences array (against prevois insertion into the array in get in amdgpu_fence_emit). Fix: Revert this patch and by setting s_job->s_fence->parent to NULL as before prevent the extra refcount drop in amdgpu when drm_sched_fence_release_scheduled is called on job release. Also - align behaviour in drm_sched_resubmit_jobs_ext with that of drm_sched_main when submitting jobs - take a refcount for the new parent fence pointer and drop refcount for original kref_init for new HW fence creation (or fake new HW fence in amdgpu - see next patch). [1] - https://lore.kernel.org/all/731b7ff1-3cc9-e314-df2a-7c51b76d4db0@amd.com/t/#r00c728fcc069b1276642c325bfa9d82bf8fa21a3 Signed-off-by: Andrey Grodzovsky <andrey.grodzovsky@amd.com> Tested-by: Yiqing Yao <yiqing.yao@amd.com> Acked-by: Christian König <christian.koenig@amd.com> Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
1 parent 9e225fb commit 45ecaea

File tree

1 file changed

+10
-3
lines changed

1 file changed

+10
-3
lines changed

drivers/gpu/drm/scheduler/sched_main.c

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -419,6 +419,8 @@ void drm_sched_stop(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
419419
if (s_job->s_fence->parent &&
420420
dma_fence_remove_callback(s_job->s_fence->parent,
421421
&s_job->cb)) {
422+
dma_fence_put(s_job->s_fence->parent);
423+
s_job->s_fence->parent = NULL;
422424
atomic_dec(&sched->hw_rq_count);
423425
} else {
424426
/*
@@ -548,7 +550,6 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
548550
if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
549551
dma_fence_set_error(&s_fence->finished, -ECANCELED);
550552

551-
dma_fence_put(s_job->s_fence->parent);
552553
fence = sched->ops->run_job(s_job);
553554
i++;
554555

@@ -558,7 +559,11 @@ void drm_sched_resubmit_jobs_ext(struct drm_gpu_scheduler *sched, int max)
558559

559560
s_job->s_fence->parent = NULL;
560561
} else {
561-
s_job->s_fence->parent = fence;
562+
563+
s_job->s_fence->parent = dma_fence_get(fence);
564+
565+
/* Drop for orignal kref_init */
566+
dma_fence_put(fence);
562567
}
563568
}
564569
}
@@ -955,14 +960,16 @@ static int drm_sched_main(void *param)
955960

956961
if (!IS_ERR_OR_NULL(fence)) {
957962
s_fence->parent = dma_fence_get(fence);
963+
/* Drop for original kref_init of the fence */
964+
dma_fence_put(fence);
965+
958966
r = dma_fence_add_callback(fence, &sched_job->cb,
959967
drm_sched_job_done_cb);
960968
if (r == -ENOENT)
961969
drm_sched_job_done(sched_job);
962970
else if (r)
963971
DRM_DEV_ERROR(sched->dev, "fence add callback failed (%d)\n",
964972
r);
965-
dma_fence_put(fence);
966973
} else {
967974
if (IS_ERR(fence))
968975
dma_fence_set_error(&s_fence->finished, PTR_ERR(fence));

0 commit comments

Comments
 (0)