@@ -159,6 +159,8 @@ ur_result_t ur_queue_immediate_in_order_t::queueFinish() {
159
159
ZE2UR_CALL (zeCommandListHostSynchronize,
160
160
(commandListLocked->getZeCommandList (), UINT64_MAX));
161
161
162
+ hContext->getAsyncPool ()->cleanupPoolsForQueue (this );
163
+
162
164
// Free deferred kernels
163
165
for (auto &hKernel : submittedKernels) {
164
166
UR_CALL (hKernel->release ());
@@ -706,31 +708,155 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueWriteHostPipe(
706
708
return UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
707
709
}
708
710
711
+ ur_result_t ur_queue_immediate_in_order_t::enqueueUSMAllocHelper (
712
+ ur_usm_pool_handle_t pPool, const size_t size,
713
+ const ur_exp_async_usm_alloc_properties_t *, uint32_t numEventsInWaitList,
714
+ const ur_event_handle_t *phEventWaitList, void **ppMem,
715
+ ur_event_handle_t *phEvent, ur_usm_type_t type) {
716
+ auto commandListLocked = commandListManager.lock ();
717
+
718
+ if (!pPool) {
719
+ pPool = hContext->getAsyncPool ();
720
+ }
721
+
722
+ auto device = (type == UR_USM_TYPE_HOST) ? nullptr : hDevice;
723
+
724
+ ur_event_handle_t originAllocEvent = nullptr ;
725
+ auto asyncAlloc = pPool->allocateEnqueued (hContext, this , true , device,
726
+ nullptr , type, size);
727
+ if (!asyncAlloc) {
728
+ auto Ret = pPool->allocate (hContext, device, nullptr , type, size, ppMem);
729
+ if (Ret) {
730
+ return Ret;
731
+ }
732
+ } else {
733
+ std::tie (*ppMem, originAllocEvent) = *asyncAlloc;
734
+ }
735
+
736
+ auto waitListView = getWaitListView (commandListLocked, phEventWaitList,
737
+ numEventsInWaitList, originAllocEvent);
738
+
739
+ ur_command_t commandType = UR_COMMAND_FORCE_UINT32;
740
+ switch (type) {
741
+ case UR_USM_TYPE_HOST:
742
+ commandType = UR_COMMAND_ENQUEUE_USM_HOST_ALLOC_EXP;
743
+ break ;
744
+ case UR_USM_TYPE_DEVICE:
745
+ commandType = UR_COMMAND_ENQUEUE_USM_DEVICE_ALLOC_EXP;
746
+ break ;
747
+ case UR_USM_TYPE_SHARED:
748
+ commandType = UR_COMMAND_ENQUEUE_USM_SHARED_ALLOC_EXP;
749
+ break ;
750
+ default :
751
+ UR_LOG (ERR, " enqueueUSMAllocHelper: unsupported USM type" );
752
+ throw UR_RESULT_ERROR_INVALID_ARGUMENT;
753
+ }
754
+
755
+ auto zeSignalEvent = getSignalEvent (commandListLocked, phEvent, commandType);
756
+ auto [pWaitEvents, numWaitEvents] = waitListView;
757
+
758
+ if (numWaitEvents > 0 ) {
759
+ ZE2UR_CALL (
760
+ zeCommandListAppendWaitOnEvents,
761
+ (commandListLocked->getZeCommandList (), numWaitEvents, pWaitEvents));
762
+ }
763
+ if (zeSignalEvent) {
764
+ ZE2UR_CALL (zeCommandListAppendSignalEvent,
765
+ (commandListLocked->getZeCommandList (), zeSignalEvent));
766
+ }
767
+ if (originAllocEvent) {
768
+ originAllocEvent->release ();
769
+ }
770
+
771
+ return UR_RESULT_SUCCESS;
772
+ }
773
+
709
774
// Enqueues a USM device allocation on this queue.
//
// Records scope latency and forwards every argument unchanged to
// enqueueUSMAllocHelper with the allocation kind fixed to
// UR_USM_TYPE_DEVICE; the helper's result is returned as-is.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp(
    ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *pProperties,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    void **ppMem, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(
      " ur_queue_immediate_in_order_t::enqueueUSMDeviceAllocExp");

  const ur_result_t helperResult = enqueueUSMAllocHelper(
      pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem,
      phEvent, UR_USM_TYPE_DEVICE);
  return helperResult;
}
715
786
716
787
// Enqueues a USM shared allocation on this queue.
//
// Records scope latency and forwards every argument unchanged to
// enqueueUSMAllocHelper with the allocation kind fixed to
// UR_USM_TYPE_SHARED; the helper's result is returned as-is.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp(
    ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *pProperties,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    void **ppMem, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(
      " ur_queue_immediate_in_order_t::enqueueUSMSharedAllocExp");

  const ur_result_t helperResult = enqueueUSMAllocHelper(
      pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem,
      phEvent, UR_USM_TYPE_SHARED);
  return helperResult;
}
722
799
723
800
// Enqueues a USM host allocation on this queue.
//
// Records scope latency and forwards every argument unchanged to
// enqueueUSMAllocHelper with the allocation kind fixed to
// UR_USM_TYPE_HOST; the helper's result is returned as-is.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp(
    ur_usm_pool_handle_t pPool, const size_t size,
    const ur_exp_async_usm_alloc_properties_t *pProperties,
    uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList,
    void **ppMem, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(" ur_queue_immediate_in_order_t::enqueueUSMHostAllocExp");

  const ur_result_t helperResult = enqueueUSMAllocHelper(
      pPool, size, pProperties, numEventsInWaitList, phEventWaitList, ppMem,
      phEvent, UR_USM_TYPE_HOST);
  return helperResult;
}
729
811
730
812
// Enqueues an asynchronous free of `pMem` on this queue.
//
// The memory is not released immediately: after appending a wait on the
// caller's wait list and a signal of the free event to the command list,
// the pointer is inserted into the owning pool's asyncPool together with
// that event and this queue.
//
// Parameters:
//   (unnamed)           - pool handle; ignored, the owning pool is looked up
//                         from `pMem` via umfPoolByPtr.
//   pMem                - pointer to free.
//   numEventsInWaitList - number of entries in phEventWaitList.
//   phEventWaitList     - events the appended commands wait on.
//   phEvent             - optional output event; when null an internal event
//                         is used instead.
//
// Returns:
//   UR_RESULT_ERROR_INVALID_MEM_OBJECT - no UMF pool owns `pMem`.
//   UR_RESULT_ERROR_UNKNOWN            - the pool carries no UsmPool tag.
//   the result of a failing ZE2UR_CALL, or UR_RESULT_SUCCESS.
ur_result_t ur_queue_immediate_in_order_t::enqueueUSMFreeExp(
    ur_usm_pool_handle_t, void *pMem, uint32_t numEventsInWaitList,
    const ur_event_handle_t *phEventWaitList, ur_event_handle_t *phEvent) {
  TRACK_SCOPE_LATENCY(" ur_queue_immediate_in_order_t::enqueueUSMFreeExp");
  auto commandListLocked = commandListManager.lock();

  // Resolve the owning pool before creating any event, so that the error
  // returns below neither leak an internal event nor hand the caller an
  // event that was never appended to the command list.
  umf_memory_pool_handle_t hPool = umfPoolByPtr(pMem);
  if (!hPool) {
    return UR_RESULT_ERROR_INVALID_MEM_OBJECT;
  }

  UsmPool *usmPool = nullptr;
  auto ret = umfPoolGetTag(hPool, reinterpret_cast<void **>(&usmPool));
  if (ret != UMF_RESULT_SUCCESS || !usmPool) {
    // This should never happen
    UR_LOG(ERR, " enqueueUSMFreeExp: invalid pool tag");
    return UR_RESULT_ERROR_UNKNOWN;
  }

  size_t size = umfPoolMallocUsableSize(hPool, pMem);

  // Without a caller-provided output slot, the event is kept internally.
  ur_event_handle_t internalEvent = nullptr;
  if (phEvent == nullptr) {
    phEvent = &internalEvent;
  }

  // The append sequence lives in a lambda so that an early `return` issued by
  // ZE2UR_CALL (assumed to return from the enclosing function on failure --
  // TODO confirm against the macro definition) can be followed by cleanup.
  auto appendCommands = [&]() -> ur_result_t {
    auto zeSignalEvent = getSignalEvent(commandListLocked, phEvent,
                                        UR_COMMAND_ENQUEUE_USM_FREE_EXP);
    auto waitListView = getWaitListView(commandListLocked, phEventWaitList,
                                        numEventsInWaitList);
    auto [pWaitEvents, numWaitEvents] = waitListView;

    if (numWaitEvents > 0) {
      ZE2UR_CALL(
          zeCommandListAppendWaitOnEvents,
          (commandListLocked->getZeCommandList(), numWaitEvents, pWaitEvents));
    }

    ZE2UR_CALL(zeCommandListAppendSignalEvent,
               (commandListLocked->getZeCommandList(), zeSignalEvent));
    return UR_RESULT_SUCCESS;
  };

  if (auto appendResult = appendCommands();
      appendResult != UR_RESULT_SUCCESS) {
    // Nobody else can observe the internal event, so drop it here.
    // NOTE(review): assumes getSignalEvent hands back one reference that this
    // function owns until the pool insert below -- confirm.
    if (internalEvent) {
      internalEvent->release();
    }
    return appendResult;
  }

  if (internalEvent == nullptr) {
    // When the output event is used instead of an internal event, we need to
    // increment the refcount. Done only once the appends have succeeded so a
    // failed append does not leave an extra reference behind.
    (*phEvent)->RefCount.increment();
  }

  // Insert must be done after the signal event is appended.
  usmPool->asyncPool.insert(pMem, size, *phEvent, this);

  return UR_RESULT_SUCCESS;
}
735
861
736
862
ur_result_t ur_queue_immediate_in_order_t::bindlessImagesImageCopyExp (
@@ -881,9 +1007,9 @@ ur_result_t ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp(
881
1007
" ur_queue_immediate_in_order_t::enqueueGenericCommandListsExp" );
882
1008
883
1009
auto commandListLocked = commandListManager.lock ();
1010
+
884
1011
auto zeSignalEvent =
885
1012
getSignalEvent (commandListLocked, phEvent, callerCommand);
886
-
887
1013
auto [pWaitEvents, numWaitEvents] =
888
1014
getWaitListView (commandListLocked, phEventWaitList, numEventsInWaitList,
889
1015
additionalWaitEvent);
0 commit comments