Skip to content

Commit 583fb91

Browse files
authored
[SYCL] Support USM memcpy/copy in graphs (#186)
- Add support for USM memcpy and copy in graphs - New MemoryManager method for usm copies - New PI extension methods for enqueueing USM copies - Re-enabled and fixed issues in E2E tests for copies - Minor changes to queue_impl to capture copies correctly
1 parent 52ecaf3 commit 583fb91

File tree

13 files changed

+191
-35
lines changed

13 files changed

+191
-35
lines changed

sycl/include/sycl/detail/pi.def

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ _PI_API(piextCommandBufferRetain)
147147
_PI_API(piextCommandBufferRelease)
148148
_PI_API(piextCommandBufferFinalize)
149149
_PI_API(piextCommandBufferNDRangeKernel)
150+
_PI_API(piextCommandBufferMemcpyUSM)
150151
_PI_API(piextEnqueueCommandBuffer)
151152
_PI_API(piPluginGetLastError)
152153

sycl/include/sycl/detail/pi.h

Lines changed: 17 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2157,7 +2157,7 @@ piextCommandBufferFinalize(pi_ext_command_buffer command_buffer);
21572157
/// \param local_work_size Local work size to use when executing kernel.
21582158
/// \param num_sync_points_in_wait_list The number of sync points in the
21592159
/// provided wait list.
2160-
/// \param sync_point_wait_list A list of sync points that this executions must
2160+
/// \param sync_point_wait_list A list of sync points that this command must
21612161
/// wait on.
21622162
/// \param sync_point The sync_point associated with this kernel execution.
21632163
__SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
@@ -2167,6 +2167,22 @@ __SYCL_EXPORT pi_result piextCommandBufferNDRangeKernel(
21672167
const pi_ext_sync_point *sync_point_wait_list,
21682168
pi_ext_sync_point *sync_point);
21692169

2170+
/// API to append a USM memcpy command to the command-buffer.
2171+
/// \param command_buffer The command-buffer to append onto.
2172+
/// \param dst_ptr is the location the data will be copied
2173+
/// \param src_ptr is the data to be copied
2174+
/// \param size is number of bytes to copy
2175+
/// \param num_sync_points_in_wait_list The number of sync points in the
2176+
/// provided wait list.
2177+
/// \param sync_point_wait_list A list of sync points that this command must
2178+
/// wait on.
2179+
/// \param sync_point The sync_point associated with this memory operation.
2180+
__SYCL_EXPORT pi_result piextCommandBufferMemcpyUSM(
2181+
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
2182+
size_t size, pi_uint32 num_sync_points_in_wait_list,
2183+
const pi_ext_sync_point *sync_point_wait_list,
2184+
pi_ext_sync_point *sync_point);
2185+
21702186
/// API to submit the command-buffer to queue for execution, returns an error if
21712187
/// command-buffer not finalized or another instance of same command-buffer
21722188
/// currently executing.

sycl/plugins/cuda/pi_cuda.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5836,6 +5836,15 @@ pi_result cuda_piextCommandBufferNDRangeKernel(
58365836
return {};
58375837
}
58385838

5839+
pi_result cuda_piextCommandBufferMemcpyUSM(
5840+
pi_ext_command_buffer command_buffer, void *dst_ptr, const void *src_ptr,
5841+
size_t size, pi_uint32 num_sync_points_in_wait_list,
5842+
const pi_ext_sync_point *sync_point_wait_list,
5843+
pi_ext_sync_point *sync_point) {
5844+
sycl::detail::pi::die("command-buffer API not implemented in CUDA backend");
5845+
return {};
5846+
}
5847+
58395848
pi_result cuda_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
58405849
pi_queue queue,
58415850
pi_uint32 num_events_in_wait_list,
@@ -6040,6 +6049,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
60406049
_PI_CL(piextCommandBufferRetain, cuda_piextCommandBufferRetain)
60416050
_PI_CL(piextCommandBufferRelease, cuda_piextCommandBufferRelease)
60426051
_PI_CL(piextCommandBufferNDRangeKernel, cuda_piextCommandBufferNDRangeKernel)
6052+
_PI_CL(piextCommandBufferMemcpyUSM, cuda_piextCommandBufferMemcpyUSM)
60436053
_PI_CL(piextEnqueueCommandBuffer, cuda_piextEnqueueCommandBuffer)
60446054

60456055
_PI_CL(piextKernelSetArgMemObj, cuda_piextKernelSetArgMemObj)

sycl/plugins/esimd_emulator/pi_esimd_emulator.cpp

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2127,6 +2127,15 @@ pi_result piextCommandBufferNDRangeKernel(
21272127
DIE_NO_IMPLEMENTATION;
21282128
}
21292129

2130+
pi_result
2131+
piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
2132+
const void *src_ptr, size_t size,
2133+
pi_uint32 num_sync_points_in_wait_list,
2134+
const pi_ext_sync_point *sync_point_wait_list,
2135+
pi_ext_sync_point *sync_point) {
2136+
DIE_NO_IMPLEMENTATION;
2137+
}
2138+
21302139
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
21312140
pi_queue queue,
21322141
pi_uint32 num_events_in_wait_list,

sycl/plugins/hip/pi_hip.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5583,6 +5583,16 @@ pi_result hip_piextCommandBufferNDRangeKernel(
55835583
return {};
55845584
}
55855585

5586+
pi_result
5587+
hip_piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer,
5588+
void *dst_ptr, const void *src_ptr, size_t size,
5589+
pi_uint32 num_sync_points_in_wait_list,
5590+
const pi_ext_sync_point *sync_point_wait_list,
5591+
pi_ext_sync_point *sync_point) {
5592+
sycl::detail::pi::die("command-buffer API not implemented in HIP backend");
5593+
return {};
5594+
}
5595+
55865596
pi_result hip_piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
55875597
pi_queue queue,
55885598
pi_uint32 num_events_in_wait_list,
@@ -5787,6 +5797,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
57875797
_PI_CL(piextCommandBufferRetain, hip_piextCommandBufferRetain)
57885798
_PI_CL(piextCommandBufferRelease, hip_piextCommandBufferRelease)
57895799
_PI_CL(piextCommandBufferNDRangeKernel, hip_piextCommandBufferNDRangeKernel)
5800+
_PI_CL(piextCommandBufferMemcpyUSM, hip_piextCommandBufferMemcpyUSM)
57905801
_PI_CL(piextEnqueueCommandBuffer, hip_piextEnqueueCommandBuffer)
57915802

57925803
_PI_CL(piextKernelSetArgMemObj, hip_piextKernelSetArgMemObj)

sycl/plugins/level_zero/pi_level_zero.cpp

Lines changed: 60 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -8831,6 +8831,23 @@ pi_result _pi_buffer::free() {
88318831

88328832
/// command-buffer Extension
88338833

8834+
/// Helper function to take a list of pi_ext_sync_points and fill the provided
8835+
/// vector with the associated ZeEvents
8836+
static pi_result getEventsFromSyncPoints(
8837+
const std::unordered_map<pi_ext_sync_point, pi_event> &SyncPoints,
8838+
size_t NumSyncPointsInWaitList, const pi_ext_sync_point *SyncPointWaitList,
8839+
std::vector<ze_event_handle_t> &ZeEventList) {
8840+
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
8841+
if (auto EventHandle = SyncPoints.find(SyncPointWaitList[i]);
8842+
EventHandle != SyncPoints.end()) {
8843+
ZeEventList.push_back(EventHandle->second->ZeEvent);
8844+
} else {
8845+
return PI_ERROR_INVALID_VALUE;
8846+
}
8847+
}
8848+
return PI_SUCCESS;
8849+
}
8850+
88348851
pi_result piextCommandBufferCreate(pi_context Context, pi_device Device,
88358852
const pi_ext_command_buffer_desc *Desc,
88368853
pi_ext_command_buffer *RetCommandBuffer) {
@@ -8935,19 +8952,16 @@ pi_result piextCommandBufferNDRangeKernel(
89358952

89368953
ZE_CALL(zeKernelSetGroupSize, (Kernel->ZeKernel, WG[0], WG[1], WG[2]));
89378954

8938-
std::vector<ze_event_handle_t> ZeEventList(NumSyncPointsInWaitList);
8939-
for (size_t i = 0; i < NumSyncPointsInWaitList; i++) {
8940-
if (auto EventHandle = CommandBuffer->SyncPoints.find(SyncPointWaitList[i]);
8941-
EventHandle != CommandBuffer->SyncPoints.end()) {
8942-
ZeEventList[i] = CommandBuffer->SyncPoints[SyncPointWaitList[i]]->ZeEvent;
8943-
} else {
8944-
return PI_ERROR_INVALID_VALUE;
8945-
}
8955+
std::vector<ze_event_handle_t> ZeEventList;
8956+
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
8957+
NumSyncPointsInWaitList,
8958+
SyncPointWaitList, ZeEventList);
8959+
if (Res) {
8960+
return Res;
89468961
}
8947-
89488962
pi_event LaunchEvent;
8949-
auto res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
8950-
if (res)
8963+
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
8964+
if (Res)
89518965
return PI_ERROR_OUT_OF_HOST_MEMORY;
89528966

89538967
LaunchEvent->CommandData = (void *)Kernel;
@@ -8972,6 +8986,41 @@ pi_result piextCommandBufferNDRangeKernel(
89728986
return PI_SUCCESS;
89738987
}
89748988

8989+
pi_result piextCommandBufferMemcpyUSM(
8990+
pi_ext_command_buffer CommandBuffer, void *DstPtr, const void *SrcPtr,
8991+
size_t Size, pi_uint32 NumSyncPointsInWaitList,
8992+
const pi_ext_sync_point *SyncPointWaitList, pi_ext_sync_point *SyncPoint) {
8993+
if (!DstPtr) {
8994+
return PI_ERROR_INVALID_VALUE;
8995+
}
8996+
8997+
std::vector<ze_event_handle_t> ZeEventList;
8998+
pi_result Res = getEventsFromSyncPoints(CommandBuffer->SyncPoints,
8999+
NumSyncPointsInWaitList,
9000+
SyncPointWaitList, ZeEventList);
9001+
if (Res) {
9002+
return Res;
9003+
}
9004+
9005+
pi_event LaunchEvent;
9006+
Res = EventCreate(CommandBuffer->Context, nullptr, true, &LaunchEvent);
9007+
if (Res)
9008+
return PI_ERROR_OUT_OF_HOST_MEMORY;
9009+
9010+
ZE_CALL(zeCommandListAppendMemoryCopy,
9011+
(CommandBuffer->ZeCommandList, DstPtr, SrcPtr, Size,
9012+
LaunchEvent->ZeEvent, ZeEventList.size(), ZeEventList.data()));
9013+
9014+
urPrint("calling zeCommandListAppendMemoryCopy() with"
9015+
" ZeEvent %#lx\n",
9016+
ur_cast<std::uintptr_t>(LaunchEvent->ZeEvent));
9017+
9018+
// Get sync point and register the event with it.
9019+
*SyncPoint = CommandBuffer->GetNextSyncPoint();
9020+
CommandBuffer->RegisterSyncPoint(*SyncPoint, LaunchEvent);
9021+
return PI_SUCCESS;
9022+
}
9023+
89759024
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer CommandBuffer,
89769025
pi_queue Queue,
89779026
pi_uint32 NumEventsInWaitList,

sycl/plugins/opencl/pi_opencl.cpp

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2300,6 +2300,16 @@ pi_result piextCommandBufferNDRangeKernel(
23002300
return {};
23012301
}
23022302

2303+
pi_result
2304+
piextCommandBufferMemcpyUSM(pi_ext_command_buffer command_buffer, void *dst_ptr,
2305+
const void *src_ptr, size_t size,
2306+
pi_uint32 num_sync_points_in_wait_list,
2307+
const pi_ext_sync_point *sync_point_wait_list,
2308+
pi_ext_sync_point *sync_point) {
2309+
// Not implemented
2310+
return {};
2311+
}
2312+
23032313
pi_result piextEnqueueCommandBuffer(pi_ext_command_buffer command_buffer,
23042314
pi_queue queue,
23052315
pi_uint32 num_events_in_wait_list,
@@ -2509,6 +2519,7 @@ pi_result piPluginInit(pi_plugin *PluginInit) {
25092519
_PI_CL(piextCommandBufferRetain, piextCommandBufferRetain)
25102520
_PI_CL(piextCommandBufferRelease, piextCommandBufferRelease)
25112521
_PI_CL(piextCommandBufferNDRangeKernel, piextCommandBufferNDRangeKernel)
2522+
_PI_CL(piextCommandBufferMemcpyUSM, piextCommandBufferMemcpyUSM)
25122523
_PI_CL(piextEnqueueCommandBuffer, piextEnqueueCommandBuffer)
25132524

25142525
_PI_CL(piextKernelSetArgMemObj, piextKernelSetArgMemObj)

sycl/source/detail/memory_manager.cpp

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1223,6 +1223,21 @@ void MemoryManager::copy_from_device_global(
12231223
DepEvents, OutEvent);
12241224
}
12251225

1226+
// Command buffer methods
1227+
void MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
1228+
ContextImplPtr Context, const void *SrcMem,
1229+
RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem,
1230+
std::vector<RT::PiExtSyncPoint> Deps, RT::PiExtSyncPoint *OutSyncPoint) {
1231+
if (!SrcMem || !DstMem)
1232+
throw runtime_error("NULL pointer argument in memory copy operation.",
1233+
PI_ERROR_INVALID_VALUE);
1234+
1235+
const PluginPtr &Plugin = Context->getPlugin();
1236+
Plugin->call<PiApiKind::piextCommandBufferMemcpyUSM>(
1237+
CommandBuffer, DstMem, SrcMem, Len, Deps.size(), Deps.data(),
1238+
OutSyncPoint);
1239+
}
1240+
12261241
} // namespace detail
12271242
} // __SYCL_INLINE_VER_NAMESPACE(_V1)
12281243
} // namespace sycl

sycl/source/detail/memory_manager.hpp

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -173,6 +173,12 @@ class __SYCL_EXPORT MemoryManager {
173173
const void *DeviceGlobalPtr, bool IsDeviceImageScoped, QueueImplPtr Queue,
174174
size_t NumBytes, size_t Offset, void *DstMem, OSModuleHandle M,
175175
const std::vector<RT::PiEvent> &DepEvents, RT::PiEvent *OutEvent);
176+
177+
// Command buffer extension methods
178+
static void ext_oneapi_copy_usm_cmd_buffer(
179+
ContextImplPtr Context, const void *SrcMem,
180+
RT::PiExtCommandBuffer CommandBuffer, size_t Len, void *DstMem,
181+
std::vector<RT::PiExtSyncPoint> Deps, RT::PiExtSyncPoint *OutSyncPoint);
176182
};
177183
} // namespace detail
178184
} // __SYCL_INLINE_VER_NAMESPACE(_V1)

sycl/source/detail/queue_impl.cpp

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -146,6 +146,16 @@ event queue_impl::memcpy(const std::shared_ptr<detail::queue_impl> &Self,
146146
// Emit a begin/end scope for this call
147147
PrepareNotify.scopedNotify((uint16_t)xpti::trace_point_type_t::task_begin);
148148
#endif
149+
// If we have a command graph set we need to capture the copy through normal
150+
// queue submission rather than execute the copy directly.
151+
if (MGraph) {
152+
return submit(
153+
[&](handler &CGH) {
154+
CGH.depends_on(DepEvents);
155+
CGH.memcpy(Dest, Src, Count);
156+
},
157+
Self, {});
158+
}
149159
if (MHasDiscardEventsSupport) {
150160
MemoryManager::copy_usm(Src, Self, Count, Dest,
151161
getOrWaitEvents(DepEvents, MContext), nullptr);

sycl/source/detail/scheduler/commands.cpp

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2487,6 +2487,7 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
24872487
MCommandGroup->MRequirements.size() == 0)
24882488
? nullptr
24892489
: &MEvent->getHandleRef();
2490+
RT::PiExtSyncPoint OutSyncPoint;
24902491
switch (MCommandGroup->getType()) {
24912492
case CG::CGTYPE::Kernel: {
24922493
CGExecKernel *ExecKernel = (CGExecKernel *)MCommandGroup.get();
@@ -2506,13 +2507,20 @@ pi_int32 ExecCGCommand::enqueueImpCommandBuffer() {
25062507
Event = &MEvent->getHandleRef();
25072508
}
25082509
}
2509-
RT::PiExtSyncPoint OutSyncPoint;
25102510
auto result = enqueueImpCommandBufferKernel(
25112511
MQueue->get_context(), MQueue->getDeviceImplPtr(), MCommandBuffer,
25122512
*ExecKernel, MSyncPointDeps, &OutSyncPoint, getMemAllocationFunc);
25132513
MEvent->setSyncPoint(OutSyncPoint);
25142514
return result;
25152515
}
2516+
case CG::CGTYPE::CopyUSM: {
2517+
CGCopyUSM *Copy = (CGCopyUSM *)MCommandGroup.get();
2518+
MemoryManager::ext_oneapi_copy_usm_cmd_buffer(
2519+
MQueue->getContextImplPtr(), Copy->getSrc(), MCommandBuffer,
2520+
Copy->getLength(), Copy->getDst(), MSyncPointDeps, &OutSyncPoint);
2521+
MEvent->setSyncPoint(OutSyncPoint);
2522+
return PI_SUCCESS;
2523+
}
25162524
default:
25172525
throw runtime_error("CG type not implemented for command buffers.",
25182526
PI_ERROR_INVALID_OPERATION);

sycl/test-e2e/Graph/Explicit/usm_copy.cpp

Lines changed: 20 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2,9 +2,6 @@
22
// RUN: %{build} -o %t.out
33
// RUN: %{run} %t.out
44

5-
// Expected fail as memcopy not implemented yet
6-
// XFAIL: *
7-
85
// Tests adding a usm memcpy node using the explicit API and submitting
96
// the graph.
107

@@ -45,7 +42,7 @@ int main() {
4542
Queue.copy(DataC.data(), PtrC, Size);
4643
Queue.wait_and_throw();
4744

48-
// memcpy from B to A
45+
// Copy from B to A
4946
auto NodeA = Graph.add([&](handler &CGH) { CGH.copy(PtrB, PtrA, Size); });
5047

5148
// Read & write A
@@ -58,9 +55,20 @@ int main() {
5855
},
5956
{exp_ext::property::node::depends_on(NodeA)});
6057

61-
// memcpy from B to A
62-
auto NodeC = Graph.add([&](handler &CGH) { CGH.copy(PtrA, PtrB, Size); },
63-
{exp_ext::property::node::depends_on(NodeB)});
58+
// Read & write B
59+
auto NodeModB = Graph.add(
60+
[&](handler &CGH) {
61+
CGH.parallel_for(range<1>(Size), [=](item<1> id) {
62+
auto LinID = id.get_linear_id();
63+
PtrB[LinID] += ModValue;
64+
});
65+
},
66+
{exp_ext::property::node::depends_on(NodeA)});
67+
68+
// memcpy from A to B
69+
auto NodeC =
70+
Graph.add([&](handler &CGH) { CGH.memcpy(PtrB, PtrA, Size * sizeof(T)); },
71+
{exp_ext::property::node::depends_on(NodeB, NodeModB)});
6472

6573
// Read and write B
6674
auto NodeD = Graph.add(
@@ -72,9 +80,9 @@ int main() {
7280
},
7381
{exp_ext::property::node::depends_on(NodeC)});
7482

75-
// memcpy from B to C
83+
// Copy from B to C
7684
Graph.add([&](handler &CGH) { CGH.copy(PtrB, PtrC, Size); },
77-
{exp_ext::property::node::depends_on(NodeB)});
85+
{exp_ext::property::node::depends_on(NodeD)});
7886

7987
auto GraphExec = Graph.finalize();
8088

@@ -86,12 +94,11 @@ int main() {
8694
});
8795
}
8896

97+
Queue.copy(PtrA, DataA.data(), Size, Event);
98+
Queue.copy(PtrB, DataB.data(), Size, Event);
99+
Queue.copy(PtrC, DataC.data(), Size, Event);
89100
Queue.wait_and_throw();
90101

91-
Queue.copy(PtrA, DataA.data(), Size);
92-
Queue.copy(PtrB, DataB.data(), Size);
93-
Queue.copy(PtrC, DataC.data(), Size);
94-
95102
free(PtrA, Queue);
96103
free(PtrB, Queue);
97104
free(PtrC, Queue);

0 commit comments

Comments
 (0)