
Mirror intel/llvm commits #2791


Merged · 9 commits · Jun 19, 2025
2 changes: 1 addition & 1 deletion .github/intel-llvm-mirror-base-commit
@@ -1 +1 @@
2d9e9a52857b6978b6dc4283deedde0a31bdedcd
e84de949d8bad654483634664bf799a23bb4f460
4 changes: 3 additions & 1 deletion source/adapters/cuda/enqueue.cpp
@@ -1384,7 +1384,9 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemUnmap(
BufferImpl.MemAllocMode == BufferMem::AllocMode::AllocHostPtr;

ur_result_t Result = UR_RESULT_SUCCESS;
if (!IsPinned && (Map->getMapFlags() & UR_MAP_FLAG_WRITE)) {
if (!IsPinned &&
(Map->getMapFlags() &
(UR_MAP_FLAG_WRITE | UR_MAP_FLAG_WRITE_INVALIDATE_REGION))) {
// Pinned host memory is only on host so it doesn't need to be written to.
Result = urEnqueueMemBufferWrite(
hQueue, hMem, true, Map->getMapOffset(), Map->getMapSize(), pMappedPtr,
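For context on the CUDA change above: a region mapped for writing must be copied back to the device on unmap whether it was mapped with the plain write flag or with write-invalidate, so the check folds both bits into one mask; only pinned (AllocHostPtr) buffers can skip the copy because the mapping already aliases the buffer's host storage. A minimal standalone sketch of that mask test, with illustrative constants standing in for the real ur_api.h flag values:

#include <cstdint>
#include <cstdio>

// Illustrative stand-ins; the real bit values come from ur_api.h.
constexpr std::uint32_t MAP_FLAG_READ = 1u << 0;
constexpr std::uint32_t MAP_FLAG_WRITE = 1u << 1;
constexpr std::uint32_t MAP_FLAG_WRITE_INVALIDATE_REGION = 1u << 2;

// Returns true when the unmap must write the mapped region back to the device.
bool needsWriteBack(std::uint32_t mapFlags, bool isPinned) {
  if (isPinned)
    return false; // pinned host memory is the buffer itself; nothing to copy
  return (mapFlags & (MAP_FLAG_WRITE | MAP_FLAG_WRITE_INVALIDATE_REGION)) != 0;
}

int main() {
  std::printf("%d\n", needsWriteBack(MAP_FLAG_WRITE_INVALIDATE_REGION, false)); // 1
  std::printf("%d\n", needsWriteBack(MAP_FLAG_READ, false));                    // 0
}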
26 changes: 18 additions & 8 deletions source/adapters/hip/command_buffer.cpp
@@ -273,6 +273,10 @@ urCommandBufferRetainExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
UR_APIEXPORT ur_result_t UR_APICALL
urCommandBufferReleaseExp(ur_exp_command_buffer_handle_t hCommandBuffer) {
if (hCommandBuffer->decrementReferenceCount() == 0) {
if (hCommandBuffer->CurrentExecution) {
UR_CHECK_ERROR(hCommandBuffer->CurrentExecution->wait());
UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution));
}
delete hCommandBuffer;
}
return UR_RESULT_SUCCESS;
@@ -795,22 +799,28 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueCommandBufferExp(
hipStream_t HIPStream = hQueue->getNextComputeStream(
numEventsInWaitList, phEventWaitList, Guard, &StreamToken);

if (hCommandBuffer->CurrentExecution) {
UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, 1,
&hCommandBuffer->CurrentExecution));
UR_CHECK_ERROR(urEventRelease(hCommandBuffer->CurrentExecution));
}

UR_CHECK_ERROR(enqueueEventsWait(hQueue, HIPStream, numEventsInWaitList,
phEventWaitList));

if (phEvent) {
RetImplEvent = std::make_unique<ur_event_handle_t_>(
UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream,
StreamToken);
UR_CHECK_ERROR(RetImplEvent->start());
}
RetImplEvent = std::make_unique<ur_event_handle_t_>(
UR_COMMAND_ENQUEUE_COMMAND_BUFFER_EXP, hQueue, HIPStream, StreamToken);
UR_CHECK_ERROR(RetImplEvent->start());

// Launch graph
UR_CHECK_ERROR(hipGraphLaunch(hCommandBuffer->HIPGraphExec, HIPStream));
UR_CHECK_ERROR(RetImplEvent->record());

hCommandBuffer->CurrentExecution = RetImplEvent.release();

if (phEvent) {
UR_CHECK_ERROR(RetImplEvent->record());
*phEvent = RetImplEvent.release();
UR_CHECK_ERROR(urEventRetain(hCommandBuffer->CurrentExecution));
*phEvent = hCommandBuffer->CurrentExecution;
}
} catch (ur_result_t Err) {
return Err;
4 changes: 4 additions & 0 deletions source/adapters/hip/command_buffer.hpp
@@ -128,6 +128,10 @@ struct ur_exp_command_buffer_handle_t_ : ur::hip::handle_base {
// Atomic variable counting the number of reference to this command_buffer
// using std::atomic prevents data race when incrementing/decrementing.
std::atomic_uint32_t RefCount;
// Track the event of the current graph execution. This extra synchronization
// is needed because HIP (unlike CUDA) does not seem to synchronize with other
// executions of the same graph during hipGraphLaunch and hipExecGraphDestroy.
ur_event_handle_t CurrentExecution = nullptr;

// Ordered map of sync_points to ur_events
std::map<ur_exp_command_buffer_sync_point_t, hipGraphNode_t> SyncPoints;
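The CurrentExecution comment above boils down to a manual ordering requirement: before relaunching the same graph (or tearing it down), wait for the previous launch to finish. A simplified, hypothetical sketch of that pattern in plain HIP, with error checking omitted; it is not the adapter code itself:

#include <hip/hip_runtime.h>

// Hypothetical helper that serializes repeated launches of one graph exec.
struct SerializedGraphLauncher {
  hipGraphExec_t Exec = nullptr;
  hipEvent_t PrevLaunch = nullptr;

  void launch(hipStream_t Stream) {
    if (PrevLaunch) {
      hipStreamWaitEvent(Stream, PrevLaunch, 0); // order after the previous run
      hipEventDestroy(PrevLaunch);
    }
    hipGraphLaunch(Exec, Stream);
    hipEventCreate(&PrevLaunch);
    hipEventRecord(PrevLaunch, Stream); // remember this execution
  }

  ~SerializedGraphLauncher() {
    if (PrevLaunch) {
      hipEventSynchronize(PrevLaunch); // drain before the graph exec is destroyed
      hipEventDestroy(PrevLaunch);
    }
  }
};

The adapter expresses the same steps with UR events (enqueueEventsWait, urEventRelease, record) so the previous-execution event can also be waited on when the buffer is next enqueued on a different queue.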
55 changes: 27 additions & 28 deletions source/adapters/level_zero/command_buffer.cpp
@@ -1488,34 +1488,33 @@ ur_result_t waitForDependencies(ur_exp_command_buffer_handle_t CommandBuffer,
std::scoped_lock<ur_shared_mutex> Guard(CommandBuffer->Mutex);
const bool UseCopyEngine = false;
bool MustSignalWaitEvent = true;
if (NumEventsInWaitList) {
ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

// Update the WaitList of the Wait Event
// Events are appended to the WaitList if the WaitList is not empty
if (CommandBuffer->WaitEvent->WaitList.isEmpty())
CommandBuffer->WaitEvent->WaitList = TmpWaitList;
else
CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList);

if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) {
// Create command-list to execute before `CommandListPtr` and will signal
// when `EventWaitList` dependencies are complete.
ur_command_list_ptr_t WaitCommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList,
EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/));

ZE2UR_CALL(zeCommandListAppendBarrier,
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
CommandBuffer->WaitEvent->WaitList.Length,
CommandBuffer->WaitEvent->WaitList.ZeEventList));
Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/,
false /*OKToBatchCommand*/);
MustSignalWaitEvent = false;
}

ur_ze_event_list_t TmpWaitList;
UR_CALL(TmpWaitList.createAndRetainUrZeEventList(
NumEventsInWaitList, EventWaitList, Queue, UseCopyEngine));

// Update the WaitList of the Wait Event
// Events are appended to the WaitList if the WaitList is not empty
if (CommandBuffer->WaitEvent->WaitList.isEmpty())
CommandBuffer->WaitEvent->WaitList = TmpWaitList;
else
CommandBuffer->WaitEvent->WaitList.insert(TmpWaitList);

if (!CommandBuffer->WaitEvent->WaitList.isEmpty()) {
// Create command-list to execute before `CommandListPtr` and will signal
// when `EventWaitList` dependencies are complete.
ur_command_list_ptr_t WaitCommandList{};
UR_CALL(Queue->Context->getAvailableCommandList(
Queue, WaitCommandList, false /*UseCopyEngine*/, NumEventsInWaitList,
EventWaitList, false /*AllowBatching*/, nullptr /*ForcedCmdQueue*/));

ZE2UR_CALL(zeCommandListAppendBarrier,
(WaitCommandList->first, CommandBuffer->WaitEvent->ZeEvent,
CommandBuffer->WaitEvent->WaitList.Length,
CommandBuffer->WaitEvent->WaitList.ZeEventList));
Queue->executeCommandList(WaitCommandList, false /*IsBlocking*/,
false /*OKToBatchCommand*/);
MustSignalWaitEvent = false;
}
// Given WaitEvent was created without specifying Counting Events, then this
// event can be signalled on the host.
15 changes: 6 additions & 9 deletions source/adapters/level_zero/v2/command_buffer.cpp
@@ -66,17 +66,15 @@ ur_exp_command_buffer_handle_t_::ur_exp_command_buffer_handle_t_(
ur_context_handle_t context, ur_device_handle_t device,
v2::raii::command_list_unique_handle &&commandList,
const ur_exp_command_buffer_desc_t *desc)
: eventPool(context->getEventPoolCache(PoolCacheType::Regular)
.borrow(device->Id.value(),
isInOrder ? v2::EVENT_FLAGS_COUNTER : 0)),
context(context), device(device),
isUpdatable(desc ? desc->isUpdatable : false),
: isUpdatable(desc ? desc->isUpdatable : false),
isInOrder(desc ? desc->isInOrder : false),
commandListManager(
context, device,
std::forward<v2::raii::command_list_unique_handle>(commandList)) {
ur::level_zero::urContextRetain(context);
}
std::forward<v2::raii::command_list_unique_handle>(commandList)),
context(context), device(device),
eventPool(context->getEventPoolCache(PoolCacheType::Regular)
.borrow(device->Id.value(),
isInOrder ? v2::EVENT_FLAGS_COUNTER : 0)) {}

ur_exp_command_buffer_sync_point_t
ur_exp_command_buffer_handle_t_::getSyncPoint(ur_event_handle_t event) {
@@ -175,7 +173,6 @@ ur_exp_command_buffer_handle_t_::~ur_exp_command_buffer_handle_t_() {
for (auto &event : syncPoints) {
event->release();
}
ur::level_zero::urContextRelease(context);
}

ur_result_t ur_exp_command_buffer_handle_t_::applyUpdateCommands(
21 changes: 10 additions & 11 deletions source/adapters/level_zero/v2/command_buffer.hpp
@@ -32,6 +32,15 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object {
ur_result_t
registerExecutionEventUnlocked(ur_event_handle_t nextExecutionEvent);

// Indicates if command-buffer commands can be updated after it is closed.
const bool isUpdatable = false;
const bool isInOrder = true;

// Command-buffer profiling is enabled.
const bool isProfilingEnabled = false;

lockable<ur_command_list_manager> commandListManager;

ur_result_t finalizeCommandBuffer();

ur_result_t
@@ -54,8 +63,6 @@ struct ur_exp_command_buffer_handle_t_ : public ur_object {
createEventIfRequested(ur_exp_command_buffer_sync_point_t *retSyncPoint);

private:
v2::raii::cache_borrowed_event_pool eventPool;

// Stores all sync points that are created by the command buffer.
std::vector<ur_event_handle_t> syncPoints;

@@ -77,13 +84,5 @@

ur_event_handle_t currentExecution = nullptr;

public:
// Indicates if command-buffer commands can be updated after it is closed.
const bool isUpdatable = false;
const bool isInOrder = true;

// Command-buffer profiling is enabled.
const bool isProfilingEnabled = false;

lockable<ur_command_list_manager> commandListManager;
v2::raii::cache_borrowed_event_pool eventPool;
};
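The member reshuffle here (mirrored in queue_immediate_in_order.hpp below) is not cosmetic: C++ initializes non-static data members in declaration order, regardless of the order in the constructor's initializer list. In the old layout eventPool was declared before isInOrder, yet its initializer read isInOrder, so it consumed an uninitialized flag. A minimal standalone illustration of the rule, unrelated to the adapter types:

#include <iostream>

struct Widget {
  // Declared first, so initialized first -- even though the initializer
  // list below mentions `flag` first. Compilers warn about this with -Wreorder.
  int pool;
  bool flag;

  Widget(bool f) : flag(f), pool(flag ? 42 : 0) {} // reads `flag` before it is set
};

int main() {
  Widget w(true);
  std::cout << w.pool << '\n'; // indeterminate; not guaranteed to be 42
}

Declaring the flags first and eventPool last makes the declared order match what the initializer list actually needs.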
10 changes: 9 additions & 1 deletion source/adapters/level_zero/v2/command_list_manager.cpp
@@ -21,7 +21,15 @@ ur_command_list_manager::ur_command_list_manager(
ur_context_handle_t context, ur_device_handle_t device,
v2::raii::command_list_unique_handle &&commandList)
: hContext(context), hDevice(device),
zeCommandList(std::move(commandList)) {}
zeCommandList(std::move(commandList)) {
UR_CALL_THROWS(ur::level_zero::urContextRetain(context));
UR_CALL_THROWS(ur::level_zero::urDeviceRetain(device));
}

ur_command_list_manager::~ur_command_list_manager() {
ur::level_zero::urContextRelease(hContext);
ur::level_zero::urDeviceRelease(hDevice);
}

ur_result_t ur_command_list_manager::appendGenericFillUnlocked(
ur_mem_buffer_t *dst, size_t offset, size_t patternSize,
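Moving the urContextRetain/urDeviceRetain calls into the command-list manager's constructor, with the matching releases in its destructor, means every object that embeds a ur_command_list_manager keeps the context and device alive for exactly the manager's lifetime, instead of each owner doing its own retain/release. A generic sketch of the idiom, with a hypothetical refcounted Handle standing in for the UR handles:

#include <cassert>

// Hypothetical refcounted handle, for illustration only.
struct Handle {
  int refCount = 1;
};
void retain(Handle *h) { ++h->refCount; }
void release(Handle *h) { --h->refCount; }

// The owner pins the handle for exactly its own lifetime.
class Owner {
public:
  explicit Owner(Handle *h) : h(h) { retain(h); }
  ~Owner() { release(h); }
  Owner(const Owner &) = delete;
  Owner &operator=(const Owner &) = delete;

private:
  Handle *h;
};

int main() {
  Handle ctx;
  {
    Owner manager(&ctx);
    assert(ctx.refCount == 2);
  }
  assert(ctx.refCount == 1); // released when the owner was destroyed
}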
2 changes: 1 addition & 1 deletion source/adapters/level_zero/v2/command_list_manager.hpp
@@ -47,7 +47,7 @@ operator=(const ur_command_list_manager &src) = delete;
operator=(const ur_command_list_manager &src) = delete;
ur_command_list_manager &operator=(ur_command_list_manager &&src) = default;

~ur_command_list_manager() = default;
~ur_command_list_manager();

ze_command_list_handle_t getZeCommandList();

17 changes: 6 additions & 11 deletions source/adapters/level_zero/v2/queue_immediate_in_order.cpp
@@ -28,29 +28,25 @@ ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t(
ze_command_queue_priority_t priority, std::optional<int32_t> index,
event_flags_t eventFlags, ur_queue_flags_t flags)
: hContext(hContext), hDevice(hDevice),
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
.borrow(hDevice->Id.value(), eventFlags)),
commandListManager(
hContext, hDevice,
hContext->getCommandListCache().getImmediateCommandList(
hDevice->ZeDevice,
{true, ordinal, true /* always enable copy offload */},
ZE_COMMAND_QUEUE_MODE_ASYNCHRONOUS, priority, index)),
flags(flags) {
ur::level_zero::urContextRetain(hContext);
}
flags(flags),
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
.borrow(hDevice->Id.value(), eventFlags)) {}

ur_queue_immediate_in_order_t::ur_queue_immediate_in_order_t(
ur_context_handle_t hContext, ur_device_handle_t hDevice,
raii::command_list_unique_handle commandListHandle,
event_flags_t eventFlags, ur_queue_flags_t flags)
: hContext(hContext), hDevice(hDevice),
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
.borrow(hDevice->Id.value(), eventFlags)),
commandListManager(hContext, hDevice, std::move(commandListHandle)),
flags(flags) {
ur::level_zero::urContextRetain(hContext);
}
flags(flags),
eventPool(hContext->getEventPoolCache(PoolCacheType::Immediate)
.borrow(hDevice->Id.value(), eventFlags)) {}

ur_result_t
ur_queue_immediate_in_order_t::queueGetInfo(ur_queue_info_t propName,
@@ -126,7 +122,6 @@ ur_result_t ur_queue_immediate_in_order_t::queueFlush() {
ur_queue_immediate_in_order_t::~ur_queue_immediate_in_order_t() {
try {
UR_CALL_THROWS(queueFinish());
ur::level_zero::urContextRelease(hContext);
} catch (...) {
// Ignore errors during destruction
}
@@ -29,11 +29,9 @@ struct ur_queue_immediate_in_order_t : ur_object, ur_queue_t_ {
private:
ur_context_handle_t hContext;
ur_device_handle_t hDevice;

v2::raii::cache_borrowed_event_pool eventPool;

lockable<ur_command_list_manager> commandListManager;
ur_queue_flags_t flags;
v2::raii::cache_borrowed_event_pool eventPool;

// Only create an event when requested by the user.
ur_event_handle_t createEventIfRequested(ur_event_handle_t *phEvent) {
2 changes: 1 addition & 1 deletion source/adapters/native_cpu/kernel.cpp
@@ -93,7 +93,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urKernelGetInfo(ur_kernel_handle_t hKernel,
// case UR_KERNEL_INFO_PROGRAM:
// return ReturnValue(ur_program_handle_t{ Kernel->Program });
case UR_KERNEL_INFO_FUNCTION_NAME:
return ReturnValue(hKernel->_name);
return ReturnValue(hKernel->_name.c_str());
case UR_KERNEL_INFO_REFERENCE_COUNT:
return ReturnValue(uint32_t{hKernel->getReferenceCount()});
case UR_KERNEL_INFO_ATTRIBUTES:
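The one-character native_cpu change matters because the return helper presumably works like the other adapters' helpers: a template that copies sizeof(T) bytes of whatever it is handed, plus a dedicated overload for C strings that copies the characters and the terminator. Passing a std::string would therefore copy the string object's internal representation rather than the kernel name. A simplified, self-contained sketch of that distinction (not the actual UrReturnHelper):

#include <cstddef>
#include <cstring>
#include <iostream>
#include <string>

// Simplified stand-in for an info-query return helper.
void returnBytes(void *dst, const void *src, std::size_t n) { std::memcpy(dst, src, n); }

template <typename T> void returnValue(void *dst, const T &v) {
  returnBytes(dst, &v, sizeof(T)); // copies the object representation of T
}

void returnValue(void *dst, const char *s) {
  returnBytes(dst, s, std::strlen(s) + 1); // copies the characters plus '\0'
}

int main() {
  std::string name = "my_kernel";
  char buf[64] = {};

  returnValue(buf, name.c_str()); // the fix: buf now holds "my_kernel"
  std::cout << buf << '\n';

  returnValue(buf, name); // old behaviour: buf holds sizeof(std::string) raw object bytes
}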
26 changes: 17 additions & 9 deletions source/adapters/offload/enqueue.cpp
@@ -47,18 +47,24 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueKernelLaunch(
}

if (GroupSize[0] > GlobalSize[0] || GroupSize[1] > GlobalSize[1] ||
GroupSize[2] > GlobalSize[2]) {
GroupSize[2] > GlobalSize[2] ||
GroupSize[0] > std::numeric_limits<uint32_t>::max() ||
GroupSize[1] > std::numeric_limits<uint32_t>::max() ||
GroupSize[2] > std::numeric_limits<uint32_t>::max() ||
GlobalSize[0] / GroupSize[0] > std::numeric_limits<uint32_t>::max() ||
GlobalSize[1] / GroupSize[1] > std::numeric_limits<uint32_t>::max() ||
GlobalSize[2] / GroupSize[2] > std::numeric_limits<uint32_t>::max()) {
return UR_RESULT_ERROR_INVALID_WORK_GROUP_SIZE;
}

ol_kernel_launch_size_args_t LaunchArgs;
LaunchArgs.Dimensions = workDim;
LaunchArgs.NumGroupsX = GlobalSize[0] / GroupSize[0];
LaunchArgs.NumGroupsY = GlobalSize[1] / GroupSize[1];
LaunchArgs.NumGroupsZ = GlobalSize[2] / GroupSize[2];
LaunchArgs.GroupSizeX = GroupSize[0];
LaunchArgs.GroupSizeY = GroupSize[1];
LaunchArgs.GroupSizeZ = GroupSize[2];
LaunchArgs.NumGroups.x = GlobalSize[0] / GroupSize[0];
LaunchArgs.NumGroups.y = GlobalSize[1] / GroupSize[1];
LaunchArgs.NumGroups.z = GlobalSize[2] / GroupSize[2];
LaunchArgs.GroupSize.x = GroupSize[0];
LaunchArgs.GroupSize.y = GroupSize[1];
LaunchArgs.GroupSize.z = GroupSize[2];
LaunchArgs.DynSharedMemory = 0;

ol_event_handle_t EventOut;
@@ -103,7 +109,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferRead(

ol_event_handle_t EventOut = nullptr;

void *DevPtr = std::get<BufferMem>(hBuffer->Mem).Ptr;
char *DevPtr =
reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

olMemcpy(hQueue->OffloadQueue, pDst, Adapter.HostDevice, DevPtr + offset,
hQueue->OffloadDevice, size, phEvent ? &EventOut : nullptr);
@@ -133,7 +140,8 @@ UR_APIEXPORT ur_result_t UR_APICALL urEnqueueMemBufferWrite(

ol_event_handle_t EventOut = nullptr;

void *DevPtr = std::get<BufferMem>(hBuffer->Mem).Ptr;
char *DevPtr =
reinterpret_cast<char *>(std::get<BufferMem>(hBuffer->Mem).Ptr);

auto Res =
olMemcpy(hQueue->OffloadQueue, DevPtr + offset, hQueue->OffloadDevice,
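The reinterpret_cast<char *> in the two buffer paths above exists because the byte offset has to be applied before the copy, and pointer arithmetic on void * is not standard C++ (it only compiles as a GNU extension); char * arithmetic advances one byte per step. A small standalone illustration:

#include <cstddef>
#include <cstdint>
#include <iostream>

int main() {
  std::uint32_t data[4] = {10, 20, 30, 40};
  void *base = data;
  std::size_t offsetBytes = 2 * sizeof(std::uint32_t);

  // base + offsetBytes;                                  // ill-formed: arithmetic on void *
  char *p = reinterpret_cast<char *>(base) + offsetBytes; // byte-wise offset
  std::cout << *reinterpret_cast<std::uint32_t *>(p) << '\n'; // prints 30
}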
4 changes: 2 additions & 2 deletions source/loader/layers/sanitizer/asan/asan_buffer.cpp
@@ -113,7 +113,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
UR_LOG_L(
getContext()->logger, ERR,
"Failed to copy {} bytes data from host pointer {} to buffer {}",
Size, HostPtr, this);
Size, (void *)HostPtr, this);
return URes;
}
}
@@ -181,7 +181,7 @@ ur_result_t MemBuffer::free() {
ur_result_t URes = getAsanInterceptor()->releaseMemory(Context, Ptr);
if (URes != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR, "Failed to free buffer handle {}",
Ptr);
(void *)Ptr);
return URes;
}
}
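The added (void *) casts in the asan and msan log calls follow the usual pointer-printing rule: a formatter handed a char * will generally treat it as a NUL-terminated string and walk the bytes it points to, which is exactly what a buffer's HostPtr may not be; casting to void * makes it print the address instead. The same distinction in plain iostream terms:

#include <iostream>

int main() {
  char bytes[4] = {'a', 'b', 'c', 'd'}; // raw data, not NUL-terminated
  char *p = bytes;

  // std::cout << p;                          // interpreted as a C string: reads past the array
  std::cout << static_cast<void *>(p) << '\n'; // prints the pointer value
}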
1 change: 1 addition & 0 deletions source/loader/layers/sanitizer/asan/asan_buffer.hpp
@@ -59,6 +59,7 @@ struct MemBuffer {

size_t Size;

// FIXME: we should use uint8_t* instead of char* for non-string data.
char *HostPtr{};

struct SubBuffer_t {
4 changes: 2 additions & 2 deletions source/loader/layers/sanitizer/msan/msan_buffer.cpp
@@ -153,7 +153,7 @@ ur_result_t MemBuffer::getHandle(ur_device_handle_t Device, char *&Handle) {
UR_LOG_L(
getContext()->logger, ERR,
"Failed to copy {} bytes data from host pointer {} to buffer {}",
Size, HostPtr, this);
Size, (void *)HostPtr, this);
return URes;
}

@@ -227,7 +227,7 @@ ur_result_t MemBuffer::free() {
ur_result_t URes = getContext()->urDdiTable.USM.pfnFree(Context, Ptr);
if (URes != UR_RESULT_SUCCESS) {
UR_LOG_L(getContext()->logger, ERR, "Failed to free buffer handle {}",
Ptr);
(void *)Ptr);
return URes;
}
}