diff --git a/.github/workflows/Test.yml b/.github/workflows/Test.yml index d8d9eb2c..dd7ff2b1 100644 --- a/.github/workflows/Test.yml +++ b/.github/workflows/Test.yml @@ -13,7 +13,7 @@ concurrency: jobs: test: - name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - PoCL ${{ matrix.pocl }} + name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ matrix.memory_backend }} - PoCL ${{ matrix.pocl }} runs-on: ${{ matrix.os }} timeout-minutes: 180 permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created @@ -26,7 +26,9 @@ jobs: os: [ubuntu-24.04, ubuntu-24.04-arm, macOS-13, macOS-15, windows-2025] arch: [x64, arm64] pocl: [jll, local] + memory_backend: [usm, svm, buffer] exclude: + # unsupported combinations - os: ubuntu-24.04 arch: arm64 - os: windows-2025 @@ -125,11 +127,13 @@ jobs: run(```$(cmake()) --build $builddir --parallel $(Sys.CPU_THREADS) --target install```) end' - echo '[pocl_jll]' > test/LocalPreferences.toml + echo '[pocl_jll]' >> test/LocalPreferences.toml echo 'libpocl_path="${{ github.workspace }}/target/lib/libpocl.so"' >> test/LocalPreferences.toml - name: Setup OpenCL.jl run: | + echo '[OpenCL]' >> test/LocalPreferences.toml + echo 'default_memory_backend="${{ matrix.memory_backend }}"' >> test/LocalPreferences.toml julia --project -e ' using Pkg Pkg.develop(path="lib/intrinsics")' diff --git a/.gitignore b/.gitignore index 3819a7de..ba39cc53 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1 @@ Manifest.toml -LocalPreferences.toml diff --git a/LocalPreferences.toml b/LocalPreferences.toml new file mode 100644 index 00000000..fee95c92 --- /dev/null +++ b/LocalPreferences.toml @@ -0,0 +1,7 @@ +[OpenCL] +# Which memory back-end to use for unspecified CLArray allocations. 
This can be: +# - "buffer": plain buffers (using pointers if `cl_ext_buffer_device_address` is available) +# - "usm": Unified Shared Memory (requiring `cl_intel_unified_shared_memory`) +# - "svm": Shared Virtual Memory (requiring coarse-grained SVM support) +# If unspecified, the default will be used based on the platform and device capabilities. +#default_memory_backend="..." diff --git a/Project.toml b/Project.toml index d3fd72d4..b914bd7d 100644 --- a/Project.toml +++ b/Project.toml @@ -10,6 +10,7 @@ KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" OpenCL_jll = "6cb37087-e8b6-5417-8430-1f242f1e46e4" +Preferences = "21216c6a-2e73-6563-6e65-726566657250" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" @@ -26,6 +27,7 @@ KernelAbstractions = "0.9.2" LLVM = "9.1" LinearAlgebra = "1" OpenCL_jll = "=2024.10.24" +Preferences = "1" Printf = "1" Random = "1" Reexport = "1" diff --git a/lib/cl/CL.jl b/lib/cl/CL.jl index 67e3fc49..2788a72f 100644 --- a/lib/cl/CL.jl +++ b/lib/cl/CL.jl @@ -1,6 +1,8 @@ module cl +import ..OpenCL using Printf +using Preferences include("pointer.jl") include("api.jl") @@ -18,8 +20,7 @@ include("device.jl") include("context.jl") include("cmdqueue.jl") include("event.jl") -include("memory/memory.jl") -include("buffer.jl") +include("memory.jl") include("program.jl") include("kernel.jl") diff --git a/lib/cl/device.jl b/lib/cl/device.jl index e7afc1c6..dd1c4a26 100644 --- a/lib/cl/device.jl +++ b/lib/cl/device.jl @@ -190,10 +190,7 @@ function exec_capabilities(d::Device) ) end -function usm_supported(d::Device) - "cl_intel_unified_shared_memory" in d.extensions || return false - return true -end +usm_supported(d::Device) = "cl_intel_unified_shared_memory" in d.extensions function usm_capabilities(d::Device) usm_supported(d) || 
throw(ArgumentError("Unified Shared Memory not supported on this device")) @@ -256,6 +253,8 @@ function svm_capabilities(d::Device) ) end +bda_supported(d::Device) = "cl_ext_buffer_device_address" in d.extensions + function cl_device_type(dtype::Symbol) if dtype == :all cl_dtype = CL_DEVICE_TYPE_ALL diff --git a/lib/cl/kernel.jl b/lib/cl/kernel.jl index 6d78972e..4770d9af 100644 --- a/lib/cl/kernel.jl +++ b/lib/cl/kernel.jl @@ -69,7 +69,7 @@ function set_arg!(k::Kernel, idx::Integer, arg::CLPtr{T}) where {T} end # raw memory -function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory) +function set_arg!(k::Kernel, idx::Integer, arg::AbstractPointerMemory) # XXX: this assumes that the receiving argument is pointer-typed, which is not the case # with Julia's `Ptr` ABI. Instead, one should reinterpret the pointer as a # `Core.LLVMPtr`, which _is_ pointer-valued. We retain this handling for `Ptr` for @@ -79,6 +79,8 @@ function set_arg!(k::Kernel, idx::Integer, arg::AbstractMemory) clSetKernelArgSVMPointer(k, idx - 1, pointer(arg)) elseif arg isa UnifiedMemory clSetKernelArgMemPointerINTEL(k, idx - 1, pointer(arg)) + elseif arg isa Buffer + clSetKernelArgDevicePointerEXT(k, idx - 1, pointer(arg)) else error("Unknown memory type") end @@ -191,6 +193,7 @@ function call( if !isempty(indirect_memory) svm_pointers = CLPtr{Cvoid}[] usm_pointers = CLPtr{Cvoid}[] + bda_pointers = CLPtr{Cvoid}[] device_access = host_access = shared_access = false for memory in indirect_memory ptr = pointer(memory) @@ -200,6 +203,8 @@ function call( if memory isa SharedVirtualMemory push!(svm_pointers, ptr) + elseif memory isa Buffer + push!(bda_pointers, ptr) elseif memory isa UnifiedDeviceMemory device_access = true push!(usm_pointers, ptr) @@ -229,6 +234,9 @@ function call( if !isempty(svm_pointers) clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_SVM_PTRS, sizeof(svm_pointers), svm_pointers) end + if !isempty(bda_pointers) + clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_DEVICE_PTRS_EXT, 
sizeof(bda_pointers), bda_pointers) + end if !isempty(usm_pointers) clSetKernelExecInfo(k, CL_KERNEL_EXEC_INFO_USM_PTRS_INTEL, sizeof(usm_pointers), usm_pointers) end diff --git a/lib/cl/libopencl.jl b/lib/cl/libopencl.jl index 7814f8fa..1886fcd6 100644 --- a/lib/cl/libopencl.jl +++ b/lib/cl/libopencl.jl @@ -8,8 +8,8 @@ end function check(f) res = retry_reclaim(err -> err == CL_OUT_OF_RESOURCES || - err == CL_MEM_OBJECT_ALLOCATION_FAILURE || - err == CL_OUT_OF_HOST_MEMORY) do + err == CL_MEM_OBJECT_ALLOCATION_FAILURE || + err == CL_OUT_OF_HOST_MEMORY) do return f() end @@ -21,7 +21,7 @@ function check(f) end macro CL_MAKE_VERSION(major, minor, patch) - quote + return quote VersionNumber($major, $minor, $patch) end end @@ -1263,424 +1263,6 @@ end const cl_device_partition_property_ext = cl_ulong -const cl_device_command_buffer_capabilities_khr = cl_bitfield - -mutable struct _cl_command_buffer_khr end - -const cl_command_buffer_khr = Ptr{_cl_command_buffer_khr} - -const cl_sync_point_khr = cl_uint - -const cl_command_buffer_info_khr = cl_uint - -const cl_command_buffer_state_khr = cl_uint - -const cl_command_buffer_properties_khr = cl_properties - -const cl_command_buffer_flags_khr = cl_bitfield - -const cl_command_properties_khr = cl_properties - -mutable struct _cl_mutable_command_khr end - -const cl_mutable_command_khr = Ptr{_cl_mutable_command_khr} - -# typedef cl_command_buffer_khr CL_API_CALL clCreateCommandBufferKHR_t ( cl_uint num_queues , const cl_command_queue * queues , const cl_command_buffer_properties_khr * properties , cl_int * errcode_ret ) -const clCreateCommandBufferKHR_t = Cvoid - -const clCreateCommandBufferKHR_fn = Ptr{clCreateCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clFinalizeCommandBufferKHR_t ( cl_command_buffer_khr command_buffer ) -const clFinalizeCommandBufferKHR_t = Cvoid - -const clFinalizeCommandBufferKHR_fn = Ptr{clFinalizeCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clRetainCommandBufferKHR_t ( cl_command_buffer_khr 
command_buffer ) -const clRetainCommandBufferKHR_t = Cvoid - -const clRetainCommandBufferKHR_fn = Ptr{clRetainCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clReleaseCommandBufferKHR_t ( cl_command_buffer_khr command_buffer ) -const clReleaseCommandBufferKHR_t = Cvoid - -const clReleaseCommandBufferKHR_fn = Ptr{clReleaseCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clEnqueueCommandBufferKHR_t ( cl_uint num_queues , cl_command_queue * queues , cl_command_buffer_khr command_buffer , cl_uint num_events_in_wait_list , const cl_event * event_wait_list , cl_event * event ) -const clEnqueueCommandBufferKHR_t = Cvoid - -const clEnqueueCommandBufferKHR_fn = Ptr{clEnqueueCommandBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandBarrierWithWaitListKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandBarrierWithWaitListKHR_t = Cvoid - -const clCommandBarrierWithWaitListKHR_fn = Ptr{clCommandBarrierWithWaitListKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_buffer , cl_mem dst_buffer , size_t src_offset , size_t dst_offset , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyBufferKHR_t = Cvoid - -const clCommandCopyBufferKHR_fn = Ptr{clCommandCopyBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyBufferRectKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_buffer , cl_mem dst_buffer , const size_t * src_origin , const size_t * 
dst_origin , const size_t * region , size_t src_row_pitch , size_t src_slice_pitch , size_t dst_row_pitch , size_t dst_slice_pitch , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyBufferRectKHR_t = Cvoid - -const clCommandCopyBufferRectKHR_fn = Ptr{clCommandCopyBufferRectKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyBufferToImageKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_buffer , cl_mem dst_image , size_t src_offset , const size_t * dst_origin , const size_t * region , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyBufferToImageKHR_t = Cvoid - -const clCommandCopyBufferToImageKHR_fn = Ptr{clCommandCopyBufferToImageKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyImageKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_image , cl_mem dst_image , const size_t * src_origin , const size_t * dst_origin , const size_t * region , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandCopyImageKHR_t = Cvoid - -const clCommandCopyImageKHR_fn = Ptr{clCommandCopyImageKHR_t} - -# typedef cl_int CL_API_CALL clCommandCopyImageToBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem src_image , cl_mem dst_buffer , const size_t * src_origin , const size_t * region , size_t dst_offset , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , 
cl_mutable_command_khr * mutable_handle ) -const clCommandCopyImageToBufferKHR_t = Cvoid - -const clCommandCopyImageToBufferKHR_fn = Ptr{clCommandCopyImageToBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandFillBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem buffer , const void * pattern , size_t pattern_size , size_t offset , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandFillBufferKHR_t = Cvoid - -const clCommandFillBufferKHR_fn = Ptr{clCommandFillBufferKHR_t} - -# typedef cl_int CL_API_CALL clCommandFillImageKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_mem image , const void * fill_color , const size_t * origin , const size_t * region , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandFillImageKHR_t = Cvoid - -const clCommandFillImageKHR_fn = Ptr{clCommandFillImageKHR_t} - -# typedef cl_int CL_API_CALL clCommandNDRangeKernelKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , cl_kernel kernel , cl_uint work_dim , const size_t * global_work_offset , const size_t * global_work_size , const size_t * local_work_size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandNDRangeKernelKHR_t = Cvoid - -const clCommandNDRangeKernelKHR_fn = Ptr{clCommandNDRangeKernelKHR_t} - -# typedef cl_int CL_API_CALL clGetCommandBufferInfoKHR_t ( cl_command_buffer_khr command_buffer , cl_command_buffer_info_khr param_name , size_t 
param_value_size , void * param_value , size_t * param_value_size_ret ) -const clGetCommandBufferInfoKHR_t = Cvoid - -const clGetCommandBufferInfoKHR_fn = Ptr{clGetCommandBufferInfoKHR_t} - -function clCreateCommandBufferKHR(num_queues, queues, properties, errcode_ret) - @ext_ccall libopencl.clCreateCommandBufferKHR(num_queues::cl_uint, - queues::Ptr{cl_command_queue}, - properties::Ptr{cl_command_buffer_properties_khr}, - errcode_ret::Ptr{cl_int})::cl_command_buffer_khr -end - -@checked function clFinalizeCommandBufferKHR(command_buffer) - @ext_ccall libopencl.clFinalizeCommandBufferKHR(command_buffer::cl_command_buffer_khr)::cl_int -end - -@checked function clRetainCommandBufferKHR(command_buffer) - @ext_ccall libopencl.clRetainCommandBufferKHR(command_buffer::cl_command_buffer_khr)::cl_int -end - -@checked function clReleaseCommandBufferKHR(command_buffer) - @ext_ccall libopencl.clReleaseCommandBufferKHR(command_buffer::cl_command_buffer_khr)::cl_int -end - -@checked function clEnqueueCommandBufferKHR(num_queues, queues, command_buffer, - num_events_in_wait_list, event_wait_list, event) - @ext_ccall libopencl.clEnqueueCommandBufferKHR(num_queues::cl_uint, - queues::Ptr{cl_command_queue}, - command_buffer::cl_command_buffer_khr, - num_events_in_wait_list::cl_uint, - event_wait_list::Ptr{cl_event}, - event::Ptr{cl_event})::cl_int -end - -@checked function clCommandBarrierWithWaitListKHR(command_buffer, command_queue, properties, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandBarrierWithWaitListKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyBufferKHR(command_buffer, command_queue, properties, - 
src_buffer, dst_buffer, src_offset, dst_offset, - size, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, mutable_handle) - @ext_ccall libopencl.clCommandCopyBufferKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_buffer::cl_mem, dst_buffer::cl_mem, - src_offset::Csize_t, dst_offset::Csize_t, - size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyBufferRectKHR(command_buffer, command_queue, properties, - src_buffer, dst_buffer, src_origin, dst_origin, - region, src_row_pitch, src_slice_pitch, - dst_row_pitch, dst_slice_pitch, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandCopyBufferRectKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_buffer::cl_mem, dst_buffer::cl_mem, - src_origin::Ptr{Csize_t}, - dst_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - src_row_pitch::Csize_t, - src_slice_pitch::Csize_t, - dst_row_pitch::Csize_t, - dst_slice_pitch::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyBufferToImageKHR(command_buffer, command_queue, properties, - src_buffer, dst_image, src_offset, - dst_origin, region, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandCopyBufferToImageKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_buffer::cl_mem, - dst_image::cl_mem, - src_offset::Csize_t, - dst_origin::Ptr{Csize_t}, - 
region::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyImageKHR(command_buffer, command_queue, properties, - src_image, dst_image, src_origin, dst_origin, - region, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, mutable_handle) - @ext_ccall libopencl.clCommandCopyImageKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_image::cl_mem, dst_image::cl_mem, - src_origin::Ptr{Csize_t}, - dst_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandCopyImageToBufferKHR(command_buffer, command_queue, properties, - src_image, dst_buffer, src_origin, region, - dst_offset, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandCopyImageToBufferKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - src_image::cl_mem, - dst_buffer::cl_mem, - src_origin::Ptr{Csize_t}, - region::Ptr{Csize_t}, - dst_offset::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandFillBufferKHR(command_buffer, command_queue, properties, buffer, - pattern, pattern_size, offset, size, - num_sync_points_in_wait_list, sync_point_wait_list, - sync_point, mutable_handle) - @ext_ccall libopencl.clCommandFillBufferKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - 
properties::Ptr{cl_command_properties_khr}, - buffer::cl_mem, pattern::Ptr{Cvoid}, - pattern_size::Csize_t, offset::Csize_t, - size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandFillImageKHR(command_buffer, command_queue, properties, image, - fill_color, origin, region, - num_sync_points_in_wait_list, sync_point_wait_list, - sync_point, mutable_handle) - @ext_ccall libopencl.clCommandFillImageKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - image::cl_mem, fill_color::Ptr{Cvoid}, - origin::Ptr{Csize_t}, region::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandNDRangeKernelKHR(command_buffer, command_queue, properties, - kernel, work_dim, global_work_offset, - global_work_size, local_work_size, - num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, - mutable_handle) - @ext_ccall libopencl.clCommandNDRangeKernelKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - kernel::cl_kernel, work_dim::cl_uint, - global_work_offset::Ptr{Csize_t}, - global_work_size::Ptr{Csize_t}, - local_work_size::Ptr{Csize_t}, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clGetCommandBufferInfoKHR(command_buffer, param_name, param_value_size, - param_value, param_value_size_ret) - @ext_ccall libopencl.clGetCommandBufferInfoKHR(command_buffer::cl_command_buffer_khr, - param_name::cl_command_buffer_info_khr, - 
param_value_size::Csize_t, - param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int -end - -# typedef cl_int CL_API_CALL clCommandSVMMemcpyKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , void * dst_ptr , const void * src_ptr , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandSVMMemcpyKHR_t = Cvoid - -const clCommandSVMMemcpyKHR_fn = Ptr{clCommandSVMMemcpyKHR_t} - -# typedef cl_int CL_API_CALL clCommandSVMMemFillKHR_t ( cl_command_buffer_khr command_buffer , cl_command_queue command_queue , const cl_command_properties_khr * properties , void * svm_ptr , const void * pattern , size_t pattern_size , size_t size , cl_uint num_sync_points_in_wait_list , const cl_sync_point_khr * sync_point_wait_list , cl_sync_point_khr * sync_point , cl_mutable_command_khr * mutable_handle ) -const clCommandSVMMemFillKHR_t = Cvoid - -const clCommandSVMMemFillKHR_fn = Ptr{clCommandSVMMemFillKHR_t} - -@checked function clCommandSVMMemcpyKHR(command_buffer, command_queue, properties, dst_ptr, - src_ptr, size, num_sync_points_in_wait_list, - sync_point_wait_list, sync_point, mutable_handle) - @ext_ccall libopencl.clCommandSVMMemcpyKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - dst_ptr::Ptr{Cvoid}, src_ptr::Ptr{Cvoid}, - size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -@checked function clCommandSVMMemFillKHR(command_buffer, command_queue, properties, svm_ptr, - pattern, pattern_size, size, - num_sync_points_in_wait_list, sync_point_wait_list, - sync_point, mutable_handle) - @ext_ccall 
libopencl.clCommandSVMMemFillKHR(command_buffer::cl_command_buffer_khr, - command_queue::cl_command_queue, - properties::Ptr{cl_command_properties_khr}, - svm_ptr::Ptr{Cvoid}, pattern::Ptr{Cvoid}, - pattern_size::Csize_t, size::Csize_t, - num_sync_points_in_wait_list::cl_uint, - sync_point_wait_list::Ptr{cl_sync_point_khr}, - sync_point::Ptr{cl_sync_point_khr}, - mutable_handle::Ptr{cl_mutable_command_khr})::cl_int -end - -const cl_platform_command_buffer_capabilities_khr = cl_bitfield - -# typedef cl_command_buffer_khr CL_API_CALL clRemapCommandBufferKHR_t ( cl_command_buffer_khr command_buffer , cl_bool automatic , cl_uint num_queues , const cl_command_queue * queues , cl_uint num_handles , const cl_mutable_command_khr * handles , cl_mutable_command_khr * handles_ret , cl_int * errcode_ret ) -const clRemapCommandBufferKHR_t = Cvoid - -const clRemapCommandBufferKHR_fn = Ptr{clRemapCommandBufferKHR_t} - -function clRemapCommandBufferKHR(command_buffer, automatic, num_queues, queues, num_handles, - handles, handles_ret, errcode_ret) - @ext_ccall libopencl.clRemapCommandBufferKHR(command_buffer::cl_command_buffer_khr, - automatic::cl_bool, num_queues::cl_uint, - queues::Ptr{cl_command_queue}, - num_handles::cl_uint, - handles::Ptr{cl_mutable_command_khr}, - handles_ret::Ptr{cl_mutable_command_khr}, - errcode_ret::Ptr{cl_int})::cl_command_buffer_khr -end - -const cl_command_buffer_update_type_khr = cl_uint - -const cl_mutable_dispatch_fields_khr = cl_bitfield - -const cl_mutable_command_info_khr = cl_uint - -struct _cl_mutable_dispatch_arg_khr - arg_index::cl_uint - arg_size::Csize_t - arg_value::Ptr{Cvoid} -end - -const cl_mutable_dispatch_arg_khr = _cl_mutable_dispatch_arg_khr - -struct _cl_mutable_dispatch_exec_info_khr - param_name::cl_uint - param_value_size::Csize_t - param_value::Ptr{Cvoid} -end - -const cl_mutable_dispatch_exec_info_khr = _cl_mutable_dispatch_exec_info_khr - -struct _cl_mutable_dispatch_config_khr - command::cl_mutable_command_khr - 
num_args::cl_uint - num_svm_args::cl_uint - num_exec_infos::cl_uint - work_dim::cl_uint - arg_list::Ptr{cl_mutable_dispatch_arg_khr} - arg_svm_list::Ptr{cl_mutable_dispatch_arg_khr} - exec_info_list::Ptr{cl_mutable_dispatch_exec_info_khr} - global_work_offset::Ptr{Csize_t} - global_work_size::Ptr{Csize_t} - local_work_size::Ptr{Csize_t} -end - -const cl_mutable_dispatch_config_khr = _cl_mutable_dispatch_config_khr - -const cl_mutable_dispatch_asserts_khr = cl_bitfield - -# typedef cl_int CL_API_CALL clUpdateMutableCommandsKHR_t ( cl_command_buffer_khr command_buffer , cl_uint num_configs , const cl_command_buffer_update_type_khr * config_types , const void * * configs ) -const clUpdateMutableCommandsKHR_t = Cvoid - -const clUpdateMutableCommandsKHR_fn = Ptr{clUpdateMutableCommandsKHR_t} - -# typedef cl_int CL_API_CALL clGetMutableCommandInfoKHR_t ( cl_mutable_command_khr command , cl_mutable_command_info_khr param_name , size_t param_value_size , void * param_value , size_t * param_value_size_ret ) -const clGetMutableCommandInfoKHR_t = Cvoid - -const clGetMutableCommandInfoKHR_fn = Ptr{clGetMutableCommandInfoKHR_t} - -@checked function clUpdateMutableCommandsKHR(command_buffer, num_configs, config_types, - configs) - @ext_ccall libopencl.clUpdateMutableCommandsKHR(command_buffer::cl_command_buffer_khr, - num_configs::cl_uint, - config_types::Ptr{cl_command_buffer_update_type_khr}, - configs::Ptr{Ptr{Cvoid}})::cl_int -end - -@checked function clGetMutableCommandInfoKHR(command, param_name, param_value_size, - param_value, param_value_size_ret) - @ext_ccall libopencl.clGetMutableCommandInfoKHR(command::cl_mutable_command_khr, - param_name::cl_mutable_command_info_khr, - param_value_size::Csize_t, - param_value::Ptr{Cvoid}, - param_value_size_ret::Ptr{Csize_t})::cl_int -end - # typedef cl_int CL_API_CALL clSetMemObjectDestructorAPPLE_t ( cl_mem memobj , void ( CL_CALLBACK * pfn_notify ) ( cl_mem memobj , void * user_data ) , void * user_data ) const 
clSetMemObjectDestructorAPPLE_t = Cvoid @@ -1731,12 +1313,32 @@ const clIcdGetPlatformIDsKHR_t = Cvoid const clIcdGetPlatformIDsKHR_fn = Ptr{clIcdGetPlatformIDsKHR_t} +# typedef void * CL_API_CALL clIcdGetFunctionAddressForPlatformKHR_t ( cl_platform_id platform , const char * func_name ) +const clIcdGetFunctionAddressForPlatformKHR_t = Cvoid + +const clIcdGetFunctionAddressForPlatformKHR_fn = Ptr{clIcdGetFunctionAddressForPlatformKHR_t} + +# typedef cl_int CL_API_CALL clIcdSetPlatformDispatchDataKHR_t ( cl_platform_id platform , void * dispatch_data ) +const clIcdSetPlatformDispatchDataKHR_t = Cvoid + +const clIcdSetPlatformDispatchDataKHR_fn = Ptr{clIcdSetPlatformDispatchDataKHR_t} + @checked function clIcdGetPlatformIDsKHR(num_entries, platforms, num_platforms) @ext_ccall libopencl.clIcdGetPlatformIDsKHR(num_entries::cl_uint, platforms::Ptr{cl_platform_id}, num_platforms::Ptr{cl_uint})::cl_int end +function clIcdGetFunctionAddressForPlatformKHR(platform, func_name) + @ext_ccall libopencl.clIcdGetFunctionAddressForPlatformKHR(platform::cl_platform_id, + func_name::Ptr{Cchar})::Ptr{Cvoid} +end + +@checked function clIcdSetPlatformDispatchDataKHR(platform, dispatch_data) + @ext_ccall libopencl.clIcdSetPlatformDispatchDataKHR(platform::cl_platform_id, + dispatch_data::Ptr{Cvoid})::cl_int +end + # typedef cl_program CL_API_CALL clCreateProgramWithILKHR_t ( cl_context context , const void * il , size_t length , cl_int * errcode_ret ) const clCreateProgramWithILKHR_t = Cvoid @@ -2452,7 +2054,7 @@ end @checked function clGetMemAllocInfoINTEL(context, ptr, param_name, param_value_size, param_value, param_value_size_ret) - @ext_ccall libopencl.clGetMemAllocInfoINTEL(context::cl_context, ptr::CLPtr{Cvoid}, + @ext_ccall libopencl.clGetMemAllocInfoINTEL(context::cl_context, ptr::PtrOrCLPtr{Cvoid}, param_name::cl_mem_info_intel, param_value_size::Csize_t, param_value::Ptr{Cvoid}, @@ -2462,14 +2064,15 @@ end @checked function clSetKernelArgMemPointerINTEL(kernel, arg_index, 
arg_value) @ext_ccall libopencl.clSetKernelArgMemPointerINTEL(kernel::cl_kernel, arg_index::cl_uint, - arg_value::CLPtr{Cvoid})::cl_int + arg_value::PtrOrCLPtr{Cvoid})::cl_int end @checked function clEnqueueMemFillINTEL(command_queue, dst_ptr, pattern, pattern_size, size, num_events_in_wait_list, event_wait_list, event) @ext_ccall libopencl.clEnqueueMemFillINTEL(command_queue::cl_command_queue, - dst_ptr::CLPtr{Cvoid}, pattern::Ptr{Cvoid}, - pattern_size::Csize_t, size::Csize_t, + dst_ptr::PtrOrCLPtr{Cvoid}, + pattern::Ptr{Cvoid}, pattern_size::Csize_t, + size::Csize_t, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, event::Ptr{cl_event})::cl_int @@ -2488,7 +2091,7 @@ end @checked function clEnqueueMemAdviseINTEL(command_queue, ptr, size, advice, num_events_in_wait_list, event_wait_list, event) @ext_ccall libopencl.clEnqueueMemAdviseINTEL(command_queue::cl_command_queue, - ptr::CLPtr{Cvoid}, size::Csize_t, + ptr::PtrOrCLPtr{Cvoid}, size::Csize_t, advice::cl_mem_advice_intel, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, @@ -2503,7 +2106,7 @@ const clEnqueueMigrateMemINTEL_fn = Ptr{clEnqueueMigrateMemINTEL_t} @checked function clEnqueueMigrateMemINTEL(command_queue, ptr, size, flags, num_events_in_wait_list, event_wait_list, event) @ext_ccall libopencl.clEnqueueMigrateMemINTEL(command_queue::cl_command_queue, - ptr::CLPtr{Cvoid}, size::Csize_t, + ptr::PtrOrCLPtr{Cvoid}, size::Csize_t, flags::cl_mem_migration_flags, num_events_in_wait_list::cl_uint, event_wait_list::Ptr{cl_event}, @@ -2639,6 +2242,19 @@ end const cl_device_kernel_clock_capabilities_khr = cl_bitfield +const cl_mem_device_address_ext = cl_ulong + +# typedef cl_int CL_API_CALL clSetKernelArgDevicePointerEXT_t ( cl_kernel kernel , cl_uint arg_index , cl_mem_device_address_ext arg_value ) +const clSetKernelArgDevicePointerEXT_t = Cvoid + +const clSetKernelArgDevicePointerEXT_fn = Ptr{clSetKernelArgDevicePointerEXT_t} + +@checked function 
clSetKernelArgDevicePointerEXT(kernel, arg_index, arg_value) + @ext_ccall libopencl.clSetKernelArgDevicePointerEXT(kernel::cl_kernel, + arg_index::cl_uint, + arg_value::cl_mem_device_address_ext)::cl_int +end + # typedef cl_int CL_API_CALL clCancelCommandsIMG_t ( const cl_event * event_list , size_t num_events_in_list ) const clCancelCommandsIMG_t = Cvoid @@ -2649,6 +2265,18 @@ const clCancelCommandsIMG_fn = Ptr{clCancelCommandsIMG_t} num_events_in_list::Csize_t)::cl_int end +const cl_perf_hint_qcom = cl_uint + +# typedef cl_int CL_API_CALL clSetPerfHintQCOM_t ( cl_context context , cl_perf_hint_qcom perf_hint ) +const clSetPerfHintQCOM_t = Cvoid + +const clSetPerfHintQCOM_fn = Ptr{clSetPerfHintQCOM_t} + +@checked function clSetPerfHintQCOM(context, perf_hint) + @ext_ccall libopencl.clSetPerfHintQCOM(context::cl_context, + perf_hint::cl_perf_hint_qcom)::cl_int +end + const CL_NAME_VERSION_MAX_NAME_SIZE = 64 const CL_SUCCESS = 0 @@ -3647,126 +3275,6 @@ const CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_NAME = "cl_intel_sharing_format const CL_INTEL_SHARING_FORMAT_QUERY_GL_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 0, 0) -const cl_khr_command_buffer = 1 - -const CL_KHR_COMMAND_BUFFER_EXTENSION_NAME = "cl_khr_command_buffer" - -const CL_KHR_COMMAND_BUFFER_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 5) - -const CL_DEVICE_COMMAND_BUFFER_CAPABILITIES_KHR = 0x12a9 - -const CL_DEVICE_COMMAND_BUFFER_REQUIRED_QUEUE_PROPERTIES_KHR = 0x12aa - -const CL_COMMAND_BUFFER_CAPABILITY_KERNEL_PRINTF_KHR = 1 << 0 - -const CL_COMMAND_BUFFER_CAPABILITY_DEVICE_SIDE_ENQUEUE_KHR = 1 << 1 - -const CL_COMMAND_BUFFER_CAPABILITY_SIMULTANEOUS_USE_KHR = 1 << 2 - -const CL_COMMAND_BUFFER_CAPABILITY_OUT_OF_ORDER_KHR = 1 << 3 - -const CL_COMMAND_BUFFER_FLAGS_KHR = 0x1293 - -const CL_COMMAND_BUFFER_SIMULTANEOUS_USE_KHR = 1 << 0 - -const CL_INVALID_COMMAND_BUFFER_KHR = -1138 - -const CL_INVALID_SYNC_POINT_WAIT_LIST_KHR = -1139 - -const CL_INCOMPATIBLE_COMMAND_QUEUE_KHR = -1140 - -const 
CL_COMMAND_BUFFER_QUEUES_KHR = 0x1294 - -const CL_COMMAND_BUFFER_NUM_QUEUES_KHR = 0x1295 - -const CL_COMMAND_BUFFER_REFERENCE_COUNT_KHR = 0x1296 - -const CL_COMMAND_BUFFER_STATE_KHR = 0x1297 - -const CL_COMMAND_BUFFER_PROPERTIES_ARRAY_KHR = 0x1298 - -const CL_COMMAND_BUFFER_CONTEXT_KHR = 0x1299 - -const CL_COMMAND_BUFFER_STATE_RECORDING_KHR = 0 - -const CL_COMMAND_BUFFER_STATE_EXECUTABLE_KHR = 1 - -const CL_COMMAND_BUFFER_STATE_PENDING_KHR = 2 - -const CL_COMMAND_COMMAND_BUFFER_KHR = 0x12a8 - -const cl_khr_command_buffer_multi_device = 1 - -const CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_NAME = "cl_khr_command_buffer_multi_device" - -const CL_KHR_COMMAND_BUFFER_MULTI_DEVICE_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 1) - -const CL_PLATFORM_COMMAND_BUFFER_CAPABILITIES_KHR = 0x0908 - -const CL_COMMAND_BUFFER_PLATFORM_UNIVERSAL_SYNC_KHR = 1 << 0 - -const CL_COMMAND_BUFFER_PLATFORM_REMAP_QUEUES_KHR = 1 << 1 - -const CL_COMMAND_BUFFER_PLATFORM_AUTOMATIC_REMAP_KHR = 1 << 2 - -const CL_DEVICE_COMMAND_BUFFER_NUM_SYNC_DEVICES_KHR = 0x12ab - -const CL_DEVICE_COMMAND_BUFFER_SYNC_DEVICES_KHR = 0x12ac - -const CL_COMMAND_BUFFER_CAPABILITY_MULTIPLE_QUEUE_KHR = 1 << 4 - -const CL_COMMAND_BUFFER_DEVICE_SIDE_SYNC_KHR = 1 << 2 - -const cl_khr_command_buffer_mutable_dispatch = 1 - -const CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_NAME = "cl_khr_command_buffer_mutable_dispatch" - -const CL_KHR_COMMAND_BUFFER_MUTABLE_DISPATCH_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 3) - -const CL_COMMAND_BUFFER_MUTABLE_KHR = 1 << 1 - -const CL_INVALID_MUTABLE_COMMAND_KHR = -1141 - -const CL_DEVICE_MUTABLE_DISPATCH_CAPABILITIES_KHR = 0x12b0 - -const CL_MUTABLE_DISPATCH_UPDATABLE_FIELDS_KHR = 0x12b1 - -const CL_MUTABLE_DISPATCH_GLOBAL_OFFSET_KHR = 1 << 0 - -const CL_MUTABLE_DISPATCH_GLOBAL_SIZE_KHR = 1 << 1 - -const CL_MUTABLE_DISPATCH_LOCAL_SIZE_KHR = 1 << 2 - -const CL_MUTABLE_DISPATCH_ARGUMENTS_KHR = 1 << 3 - -const CL_MUTABLE_DISPATCH_EXEC_INFO_KHR = 1 << 4 - -const 
CL_MUTABLE_COMMAND_COMMAND_QUEUE_KHR = 0x12a0 - -const CL_MUTABLE_COMMAND_COMMAND_BUFFER_KHR = 0x12a1 - -const CL_MUTABLE_COMMAND_COMMAND_TYPE_KHR = 0x12ad - -const CL_MUTABLE_COMMAND_PROPERTIES_ARRAY_KHR = 0x12a2 - -const CL_MUTABLE_DISPATCH_KERNEL_KHR = 0x12a3 - -const CL_MUTABLE_DISPATCH_DIMENSIONS_KHR = 0x12a4 - -const CL_MUTABLE_DISPATCH_GLOBAL_WORK_OFFSET_KHR = 0x12a5 - -const CL_MUTABLE_DISPATCH_GLOBAL_WORK_SIZE_KHR = 0x12a6 - -const CL_MUTABLE_DISPATCH_LOCAL_WORK_SIZE_KHR = 0x12a7 - -const CL_STRUCTURE_TYPE_MUTABLE_DISPATCH_CONFIG_KHR = 0 - -const CL_COMMAND_BUFFER_MUTABLE_DISPATCH_ASSERTS_KHR = 0x12b7 - -const CL_MUTABLE_DISPATCH_ASSERTS_KHR = 0x12b8 - -const CL_MUTABLE_DISPATCH_ASSERT_NO_ADDITIONAL_WORK_GROUPS_KHR = 1 << 0 - const cl_khr_fp64 = 1 const CL_KHR_FP64_EXTENSION_NAME = "cl_khr_fp64" @@ -3797,12 +3305,14 @@ const cl_khr_icd = 1 const CL_KHR_ICD_EXTENSION_NAME = "cl_khr_icd" -const CL_KHR_ICD_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const CL_KHR_ICD_EXTENSION_VERSION = @CL_MAKE_VERSION(2, 0, 0) const CL_PLATFORM_ICD_SUFFIX_KHR = 0x0920 const CL_PLATFORM_NOT_FOUND_KHR = -1001 +const CL_ICD2_TAG_KHR = intptr_t(0x4f50454e434c3331) + const cl_khr_il_program = 1 const CL_KHR_IL_PROGRAM_EXTENSION_NAME = "cl_khr_il_program" @@ -4351,18 +3861,6 @@ const CL_KHR_EXTERNAL_SEMAPHORE_SYNC_FD_EXTENSION_VERSION = @CL_MAKE_VERSION(1, const CL_SEMAPHORE_HANDLE_SYNC_FD_KHR = 0x2058 -const cl_khr_external_semaphore_win32 = 1 - -const CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_NAME = "cl_khr_external_semaphore_win32" - -const CL_KHR_EXTERNAL_SEMAPHORE_WIN32_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 1) - -const CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KHR = 0x2056 - -const CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_KMT_KHR = 0x2057 - -const CL_SEMAPHORE_HANDLE_OPAQUE_WIN32_NAME_KHR = 0x2068 - const cl_khr_semaphore = 1 const CL_KHR_SEMAPHORE_EXTENSION_NAME = "cl_khr_semaphore" @@ -5354,7 +4852,7 @@ const cl_khr_kernel_clock = 1 const CL_KHR_KERNEL_CLOCK_EXTENSION_NAME = 
"cl_khr_kernel_clock" -const CL_KHR_KERNEL_CLOCK_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 9, 0) +const CL_KHR_KERNEL_CLOCK_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) const CL_DEVICE_KERNEL_CLOCK_CAPABILITIES_KHR = 0x1076 @@ -5406,6 +4904,18 @@ const CL_KHR_SPIRV_NO_INTEGER_WRAP_DECORATION_EXTENSION_NAME = "cl_khr_spirv_no_ const CL_KHR_SPIRV_NO_INTEGER_WRAP_DECORATION_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const cl_khr_spirv_queries = 1 + +const CL_KHR_SPIRV_QUERIES_EXTENSION_NAME = "cl_khr_spirv_queries" + +const CL_KHR_SPIRV_QUERIES_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_DEVICE_SPIRV_EXTENDED_INSTRUCTION_SETS_KHR = 0x12b9 + +const CL_DEVICE_SPIRV_EXTENSIONS_KHR = 0x12ba + +const CL_DEVICE_SPIRV_CAPABILITIES_KHR = 0x12bb + const cl_khr_srgb_image_writes = 1 const CL_KHR_SRGB_IMAGE_WRITES_EXTENSION_NAME = "cl_khr_srgb_image_writes" @@ -5466,6 +4976,18 @@ const CL_KHR_WORK_GROUP_UNIFORM_ARITHMETIC_EXTENSION_NAME = "cl_khr_work_group_u const CL_KHR_WORK_GROUP_UNIFORM_ARITHMETIC_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) +const cl_ext_buffer_device_address = 1 + +const CL_EXT_BUFFER_DEVICE_ADDRESS_EXTENSION_NAME = "cl_ext_buffer_device_address" + +const CL_EXT_BUFFER_DEVICE_ADDRESS_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 2) + +const CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT = 0x5000 + +const CL_MEM_DEVICE_ADDRESS_EXT = 0x5001 + +const CL_KERNEL_EXEC_INFO_DEVICE_PTRS_EXT = 0x5002 + const cl_ext_image_unorm_int_2_101010 = 1 const CL_EXT_IMAGE_UNORM_INT_2_101010_EXTENSION_NAME = "cl_ext_image_unorm_int_2_101010" @@ -5474,6 +4996,32 @@ const CL_EXT_IMAGE_UNORM_INT_2_101010_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, const CL_UNORM_INT_2_101010_EXT = 0x10e5 +const cl_ext_image_unsigned_10x6_12x4_14x2 = 1 + +const CL_EXT_IMAGE_UNSIGNED_10X6_12X4_14X2_EXTENSION_NAME = "cl_ext_image_unsigned_10x6_12x4_14x2" + +const CL_EXT_IMAGE_UNSIGNED_10X6_12X4_14X2_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_UNSIGNED_INT10X6_EXT = 0x10e6 + +const 
CL_UNSIGNED_INT12X4_EXT = 0x10e7 + +const CL_UNSIGNED_INT14X2_EXT = 0x10e8 + +const CL_UNORM_INT10X6_EXT = 0x10e1 + +const CL_UNORM_INT12X4_EXT = 0x10e9 + +const CL_UNORM_INT14X2_EXT = 0x10ea + +const cl_ext_immutable_memory_objects = 1 + +const CL_EXT_IMMUTABLE_MEMORY_OBJECTS_EXTENSION_NAME = "cl_ext_immutable_memory_objects" + +const CL_EXT_IMMUTABLE_MEMORY_OBJECTS_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 0) + +const CL_MEM_IMMUTABLE_EXT = 1 << 6 + const cl_img_cancel_command = 1 const CL_IMG_CANCEL_COMMAND_EXTENSION_NAME = "cl_img_cancel_command" @@ -5481,3 +5029,17 @@ const CL_IMG_CANCEL_COMMAND_EXTENSION_NAME = "cl_img_cancel_command" const CL_IMG_CANCEL_COMMAND_EXTENSION_VERSION = @CL_MAKE_VERSION(0, 0, 0) const CL_CANCELLED_IMG = -1126 + +const cl_qcom_perf_hint = 1 + +const CL_QCOM_PERF_HINT_EXTENSION_NAME = "cl_qcom_perf_hint" + +const CL_QCOM_PERF_HINT_EXTENSION_VERSION = @CL_MAKE_VERSION(1, 0, 5) + +const CL_PERF_HINT_HIGH_QCOM = 0x40c3 + +const CL_PERF_HINT_NORMAL_QCOM = 0x40c4 + +const CL_PERF_HINT_LOW_QCOM = 0x40c5 + +const CL_CONTEXT_PERF_HINT_QCOM = 0x40c2 diff --git a/lib/cl/memory.jl b/lib/cl/memory.jl new file mode 100644 index 00000000..6ad90034 --- /dev/null +++ b/lib/cl/memory.jl @@ -0,0 +1,96 @@ +# Raw memory management + +abstract type AbstractMemoryObject <: CLObject end +abstract type AbstractPointerMemory end +const AbstractMemory = Union{AbstractMemoryObject, AbstractPointerMemory} + +# this will be specialized for each memory type +Base.convert(T::Type{<:Union{Ptr, CLPtr}}, mem::AbstractMemory) = + throw(ArgumentError("Illegal conversion of a $(typeof(mem)) to a $T")) + +# ccall integration +# +# taking the pointer of a memory object means returning the underlying pointer, +# and not the pointer of the object itself. 
+Base.unsafe_convert(P::Type{<:Union{Ptr, CLPtr}}, mem::AbstractMemory) = convert(P, mem) + + +## opaque memory objects + +# This should be implemented by all subtypes +#type MemoryType <: AbstractMemoryObject +# id::cl_mem +# ... +#end + +Base.sizeof(mem::AbstractMemoryObject) = mem.size + +release(mem::AbstractMemoryObject) = clReleaseMemObject(mem) + +function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) + if s == :type + result = Ref{cl_mem_object_type}() + clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(cl_mem_object_type), result, C_NULL) + return result[] + elseif s == :flags + result = Ref{cl_mem_flags}() + clGetMemObjectInfo(mem, CL_MEM_FLAGS, sizeof(cl_mem_flags), result, C_NULL) + mf = result[] + flags = Symbol[] + if (mf & CL_MEM_READ_WRITE) != 0 + push!(flags, :rw) + end + if (mf & CL_MEM_WRITE_ONLY) != 0 + push!(flags, :w) + end + if (mf & CL_MEM_READ_ONLY) != 0 + push!(flags, :r) + end + if (mf & CL_MEM_USE_HOST_PTR) != 0 + push!(flags, :use) + end + if (mf & CL_MEM_ALLOC_HOST_PTR) != 0 + push!(flags, :alloc) + end + if (mf & CL_MEM_COPY_HOST_PTR) != 0 + push!(flags, :copy) + end + return tuple(flags...) 
+ elseif s == :size + result = Ref{Csize_t}() + clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(Csize_t), result, C_NULL) + return result[] + elseif s == :reference_count + result = Ref{Cuint}() + clGetMemObjectInfo(mem, CL_MEM_REFERENCE_COUNT, sizeof(Cuint), result, C_NULL) + return Int(result[]) + elseif s == :map_count + result = Ref{Cuint}() + clGetMemObjectInfo(mem, CL_MEM_MAP_COUNT, sizeof(Cuint), result, C_NULL) + return Int(result[]) + elseif s == :device_address + result = Ref{cl_mem_device_address_ext}() + clGetMemObjectInfo(mem, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), result, C_NULL) + return CLPtr{Cvoid}(result[]) + else + return getfield(mem, s) + end +end + +# for passing buffers to OpenCL APIs: use the underlying handle +Base.unsafe_convert(::Type{cl_mem}, mem::AbstractMemoryObject) = mem.id + +# for passing buffers to kernels: pass the private device pointer +Base.convert(::Type{CLPtr{T}}, mem::AbstractMemoryObject) where {T} = + convert(CLPtr{T}, pointer(mem)) + +include("memory/buffer.jl") + +#TODO: enqueue_migrate_mem_objects(queue, mem_objects, flags=0, wait_for=None) +#TODO: enqueue_migrate_mem_objects_ext(queue, mem_objects, flags=0, wait_for=None) + + +## pointer-based memory + +include("memory/usm.jl") +include("memory/svm.jl") diff --git a/lib/cl/buffer.jl b/lib/cl/memory/buffer.jl similarity index 53% rename from lib/cl/buffer.jl rename to lib/cl/memory/buffer.jl index 045d1218..dfd198d8 100644 --- a/lib/cl/buffer.jl +++ b/lib/cl/memory/buffer.jl @@ -1,103 +1,24 @@ -# OpenCL Memory Object - -abstract type AbstractMemoryObject <: CLObject end - -#This should be implemented by all subtypes -# type MemoryType <: AbstractMemoryObject -# id::cl_mem -# ... 
-# end - -# for passing buffers to OpenCL APIs: use the underlying handle -Base.unsafe_convert(::Type{cl_mem}, mem::AbstractMemoryObject) = mem.id - -# for passing buffers to kernels: keep the buffer, it's handled by `cl.set_arg!` -Base.unsafe_convert(::Type{<:Ptr}, mem::AbstractMemoryObject) = mem - -Base.sizeof(mem::AbstractMemoryObject) = mem.size - -context(mem::AbstractMemoryObject) = mem.context - -function Base.getproperty(mem::AbstractMemoryObject, s::Symbol) - if s == :context - param = Ref{cl_context}() - clGetMemObjectInfo(mem, CL_MEM_CONTEXT, sizeof(cl_context), param, C_NULL) - return Context(param[], retain = true) - elseif s == :mem_type - result = Ref{cl_mem_object_type}() - clGetMemObjectInfo(mem, CL_MEM_TYPE, sizeof(cl_mem_object_type), result, C_NULL) - return result[] - elseif s == :mem_flags - result = Ref{cl_mem_flags}() - clGetMemObjectInfo(mem, CL_MEM_FLAGS, sizeof(cl_mem_flags), result, C_NULL) - mf = result[] - flags = Symbol[] - if (mf & CL_MEM_READ_WRITE) != 0 - push!(flags, :rw) - end - if (mf & CL_MEM_WRITE_ONLY) != 0 - push!(flags, :w) - end - if (mf & CL_MEM_READ_ONLY) != 0 - push!(flags, :r) - end - if (mf & CL_MEM_USE_HOST_PTR) != 0 - push!(flags, :use) - end - if (mf & CL_MEM_ALLOC_HOST_PTR) != 0 - push!(flags, :alloc) - end - if (mf & CL_MEM_COPY_HOST_PTR) != 0 - push!(flags, :copy) - end - return tuple(flags...) 
- elseif s == :size - result = Ref{Csize_t}() - clGetMemObjectInfo(mem, CL_MEM_SIZE, sizeof(Csize_t), result, C_NULL) - return result[] - elseif s == :reference_count - result = Ref{Cuint}() - clGetMemObjectInfo(mem, CL_MEM_REFERENCE_COUNT, sizeof(Cuint), result, C_NULL) - return Int(result[]) - elseif s == :map_count - result = Ref{Cuint}() - clGetMemObjectInfo(mem, CL_MEM_MAP_COUNT, sizeof(Cuint), result, C_NULL) - return Int(result[]) - else - return getfield(mem, s) - end -end - -#TODO: enqueue_migrate_mem_objects(queue, mem_objects, flags=0, wait_for=None) -#TODO: enqueue_migrate_mem_objects_ext(queue, mem_objects, flags=0, wait_for=None) - # OpenCL.Buffer -mutable struct Buffer{T} <: AbstractMemoryObject - const id::cl_mem - const len::Int - - function Buffer{T}(mem_id::cl_mem, len::Integer; retain::Bool=false) where {T} - buff = new{T}(mem_id, len) - retain && clRetainMemObject(buff) - finalizer(clReleaseMemObject, buff) - return buff - end +struct Buffer <: AbstractMemoryObject + id::cl_mem + ptr::Union{Nothing,CLPtr{Cvoid}} + bytesize::Int + context::Context end -Base.ndims(b::Buffer) = 1 -Base.eltype(b::Buffer{T}) where {T} = T -Base.length(b::Buffer{T}) where {T} = b.len -Base.sizeof(b::Buffer{T}) where {T} = b.len * sizeof(T) +Buffer() = Buffer(C_NULL, CL_NULL, 0, context()) + +Base.pointer(buf::Buffer) = @something buf.ptr error("Conversion of a buffer to a pointer is not supported by this device") +Base.sizeof(buf::Buffer) = buf.bytesize +context(buf::Buffer) = buf.context ## constructors # for internal use -function Buffer{T}(len::Int, flags::Integer, hostbuf=nothing; - device=:rw, host=:rw) where {T} - sz = len * sizeof(T) - +function Buffer(sz::Int, flags::Integer, hostbuf=nothing; + device=:rw, host=:rw, device_private_address=bda_supported(cl.device())) if device == :rw flags |= CL_MEM_READ_WRITE elseif device == :r @@ -121,23 +42,41 @@ function Buffer{T}(len::Int, flags::Integer, hostbuf=nothing; end err_code = Ref{Cint}() - mem_id = 
clCreateBuffer(context(), flags, sz, something(hostbuf, C_NULL), err_code) + properties = cl_mem_properties[] + if device_private_address + append!(properties, [CL_MEM_DEVICE_PRIVATE_ADDRESS_EXT, CL_TRUE]) + end + mem_id = if isempty(properties) + clCreateBuffer(context(), flags, sz, something(hostbuf, C_NULL), err_code) + else + push!(properties, 0) + clCreateBufferWithProperties(context(), properties, flags, sz, something(hostbuf, C_NULL), err_code) + end if err_code[] != CL_SUCCESS throw(CLError(err_code[])) end - return Buffer{T}(mem_id, len) + + ptr = if device_private_address + ptr_ref = Ref{cl_mem_device_address_ext}() + clGetMemObjectInfo(mem_id, CL_MEM_DEVICE_ADDRESS_EXT, sizeof(cl_mem_device_address_ext), ptr_ref, C_NULL) + CLPtr{Cvoid}(ptr_ref[]) + else + nothing + end + + return Buffer(mem_id, ptr, sz, context()) end # allocated buffer -function Buffer{T}(len::Integer; host_accessible=false, kwargs...) where {T} +function Buffer(sz::Integer; host_accessible=false, kwargs...) flags = host_accessible ? CL_MEM_ALLOC_HOST_PTR : 0 - Buffer{T}(len, flags, nothing; kwargs...) + Buffer(sz, flags, nothing; kwargs...) end # from host memory -function Buffer(hostbuf::Array{T}; copy::Bool=true, kwargs...) where {T} +function Buffer(hostbuf::Array; copy::Bool=true, kwargs...) flags = copy ? CL_MEM_COPY_HOST_PTR : CL_MEM_USE_HOST_PTR - Buffer{T}(length(hostbuf), flags, hostbuf; kwargs...) + Buffer(sizeof(hostbuf), flags, hostbuf; kwargs...) end @@ -190,7 +129,7 @@ enqueue_copy(dst::Buffer, src::Buffer, N; kwargs...) = enqueue_copy(dst, 0, src, 0, N; kwargs...) 
# map a buffer into the host address space, returning a pointer and an event -function enqueue_map(b::Buffer, offset::Integer, nbytes::Int, flags=:rw; +function enqueue_map(buf::Buffer, offset::Integer, nbytes::Int, flags=:rw; blocking::Bool=false, wait_for::Vector{Event}=Event[]) flags = if flags == :rw CL_MAP_READ | CL_MAP_WRITE @@ -207,7 +146,7 @@ function enqueue_map(b::Buffer, offset::Integer, nbytes::Int, flags=:rw; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin status = Ref{Cint}() - ptr = clEnqueueMapBuffer(queue(), b, blocking, flags, offset, nbytes, + ptr = clEnqueueMapBuffer(queue(), buf, blocking, flags, offset, nbytes, n_evts, evt_ids, ret_evt, status) if status[] != CL_SUCCESS throw(CLError(status[])) @@ -216,22 +155,22 @@ function enqueue_map(b::Buffer, offset::Integer, nbytes::Int, flags=:rw; return ptr, Event(ret_evt[]) end end -enqueue_map(b::Buffer, nbytes::Int, flags=:rw; kwargs...) = - enqueue_map(b, 0, nbytes, flags; kwargs...) +enqueue_map(buf::Buffer, nbytes::Int, flags=:rw; kwargs...) = + enqueue_map(buf, 0, nbytes, flags; kwargs...) # unmap a buffer, return an event -function enqueue_unmap(b::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) +function enqueue_unmap(buf::Buffer, ptr::Ptr; wait_for::Vector{Event}=Event[]) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? 
C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve wait_for begin ret_evt = Ref{cl_event}() - clEnqueueUnmapMemObject(queue(), b, ptr, n_evts, evt_ids, ret_evt) + clEnqueueUnmapMemObject(queue(), buf, ptr, n_evts, evt_ids, ret_evt) return Event(ret_evt[]) end end # fill a buffer with a pattern, returning an event -function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; +function enqueue_fill(buf::Buffer, offset::Integer, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) nbytes_pattern = sizeof(T) @@ -240,10 +179,10 @@ function enqueue_fill(b::Buffer, offset::Integer, pattern::T, N::Integer; evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] GC.@preserve begin ret_evt = Ref{cl_event}() - clEnqueueFillBuffer(queue(), b, [pattern], + clEnqueueFillBuffer(queue(), buf, [pattern], nbytes_pattern, offset, nbytes, n_evts, evt_ids, ret_evt) @return_event ret_evt[] end end -enqueue_fill(b::Buffer, pattern, N::Integer) = enqueue_fill(b, 0, pattern, N) +enqueue_fill(buf::Buffer, pattern, N::Integer) = enqueue_fill(buf, 0, pattern, N) diff --git a/lib/cl/memory/memory.jl b/lib/cl/memory/memory.jl deleted file mode 100644 index 2e128c0f..00000000 --- a/lib/cl/memory/memory.jl +++ /dev/null @@ -1,21 +0,0 @@ -# Raw memory management - -export device_alloc, host_alloc, shared_alloc, svm_alloc, free - -# -# untyped buffers -# - -abstract type AbstractMemory end - -Base.convert(T::Type{<:Union{Ptr, CLPtr}}, buf::AbstractMemory) = - throw(ArgumentError("Illegal conversion of a $(typeof(buf)) to a $T")) - -# ccall integration -# -# taking the pointer of a buffer means returning the underlying pointer, -# and not the pointer of the buffer object itself. 
-Base.unsafe_convert(P::Type{<:Union{Ptr, CLPtr}}, buf::AbstractMemory) = convert(P, buf) - -include("usm.jl") -include("svm.jl") diff --git a/lib/cl/memory/svm.jl b/lib/cl/memory/svm.jl index cbe6f487..e33e3cfa 100644 --- a/lib/cl/memory/svm.jl +++ b/lib/cl/memory/svm.jl @@ -1,4 +1,4 @@ -struct SharedVirtualMemory <: AbstractMemory +struct SharedVirtualMemory <: AbstractPointerMemory ptr::CLPtr{Cvoid} bytesize::Int context::Context @@ -9,7 +9,6 @@ SharedVirtualMemory() = SharedVirtualMemory(CL_NULL, 0, context()) function svm_alloc(bytesize::Integer; alignment::Integer = 0, access::Symbol = :rw, fine_grained = false ) - bytesize == 0 && return SharedVirtualMemory() flags = if access == :rw CL_MEM_READ_WRITE @@ -36,25 +35,20 @@ function svm_alloc(bytesize::Integer; return SharedVirtualMemory(ptr, bytesize, context()) end -function svm_free(buf::SharedVirtualMemory) - if sizeof(buf) != 0 - clSVMFree(context(buf), buf) - end - return -end +svm_free(mem::SharedVirtualMemory) = clSVMFree(context(mem), mem) -Base.pointer(buf::SharedVirtualMemory) = buf.ptr -Base.sizeof(buf::SharedVirtualMemory) = buf.bytesize -context(buf::SharedVirtualMemory) = buf.context +Base.pointer(mem::SharedVirtualMemory) = mem.ptr +Base.sizeof(mem::SharedVirtualMemory) = mem.bytesize +context(mem::SharedVirtualMemory) = mem.context -Base.show(io::IO, buf::SharedVirtualMemory) = - @printf(io, "SharedVirtualMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.show(io::IO, mem::SharedVirtualMemory) = + @printf(io, "SharedVirtualMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{Ptr{T}}, buf::SharedVirtualMemory) where {T} = - convert(Ptr{T}, pointer(buf)) +Base.convert(::Type{Ptr{T}}, mem::SharedVirtualMemory) where {T} = + convert(Ptr{T}, pointer(mem)) -Base.convert(::Type{CLPtr{T}}, buf::SharedVirtualMemory) where {T} = - reinterpret(CLPtr{T}, pointer(buf)) +Base.convert(::Type{CLPtr{T}}, mem::SharedVirtualMemory) where {T} = + 
reinterpret(CLPtr{T}, pointer(mem)) ## memory operations @@ -118,7 +112,6 @@ end function enqueue_svm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) - nbytes == 0 && return pattern_size = sizeof(T) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? C_NULL : [pointer(evt) for evt in wait_for] diff --git a/lib/cl/memory/usm.jl b/lib/cl/memory/usm.jl index 86f56455..a12bc52f 100644 --- a/lib/cl/memory/usm.jl +++ b/lib/cl/memory/usm.jl @@ -1,14 +1,11 @@ -abstract type UnifiedMemory <: AbstractMemory end +abstract type UnifiedMemory <: AbstractPointerMemory end -function usm_free(buf::UnifiedMemory; blocking::Bool = false) - if sizeof(buf) != 0 - if blocking - clMemBlockingFreeINTEL(context(buf), buf) - else - clMemFreeINTEL(context(buf), buf) - end +function usm_free(mem::UnifiedMemory; blocking::Bool = false) + if blocking + clMemBlockingFreeINTEL(context(mem), mem) + else + clMemFreeINTEL(context(mem), mem) end - return end @@ -31,7 +28,6 @@ UnifiedDeviceMemory() = UnifiedDeviceMemory(CL_NULL, 0, context()) function device_alloc(bytesize::Integer; alignment::Integer = 0, write_combined::Bool = false ) - bytesize == 0 && return UnifiedDeviceMemory() flags = 0 if write_combined @@ -48,15 +44,15 @@ function device_alloc(bytesize::Integer; return UnifiedDeviceMemory(ptr, bytesize, context()) end -Base.pointer(buf::UnifiedDeviceMemory) = buf.ptr -Base.sizeof(buf::UnifiedDeviceMemory) = buf.bytesize -context(buf::UnifiedDeviceMemory) = buf.context +Base.pointer(mem::UnifiedDeviceMemory) = mem.ptr +Base.sizeof(mem::UnifiedDeviceMemory) = mem.bytesize +context(mem::UnifiedDeviceMemory) = mem.context -Base.show(io::IO, buf::UnifiedDeviceMemory) = - @printf(io, "UnifiedDeviceMemory(%s at %p)", Base.format_bytes(sizeof(buf)), pointer(buf)) +Base.show(io::IO, mem::UnifiedDeviceMemory) = + @printf(io, "UnifiedDeviceMemory(%s at %p)", Base.format_bytes(sizeof(mem)), pointer(mem)) 
-Base.convert(::Type{CLPtr{T}}, buf::UnifiedDeviceMemory) where {T} = - convert(CLPtr{T}, pointer(buf)) +Base.convert(::Type{CLPtr{T}}, mem::UnifiedDeviceMemory) where {T} = + convert(CLPtr{T}, pointer(mem)) ## host buffer @@ -78,7 +74,6 @@ UnifiedHostMemory() = UnifiedHostMemory(C_NULL, 0, context()) function host_alloc(bytesize::Integer; alignment::Integer = 0, write_combined::Bool = false ) - bytesize == 0 && return UnifiedHostMemory() flags = 0 if write_combined @@ -95,15 +90,15 @@ function host_alloc(bytesize::Integer; return UnifiedHostMemory(ptr, bytesize, context()) end -Base.pointer(buf::UnifiedHostMemory) = buf.ptr -Base.sizeof(buf::UnifiedHostMemory) = buf.bytesize -context(buf::UnifiedHostMemory) = buf.context +Base.pointer(mem::UnifiedHostMemory) = mem.ptr +Base.sizeof(mem::UnifiedHostMemory) = mem.bytesize +context(mem::UnifiedHostMemory) = mem.context -Base.show(io::IO, buf::UnifiedHostMemory) = - @printf(io, "UnifiedHostMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.show(io::IO, mem::UnifiedHostMemory) = + @printf(io, "UnifiedHostMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{Ptr{T}}, buf::UnifiedHostMemory) where {T} = - convert(Ptr{T}, pointer(buf)) +Base.convert(::Type{Ptr{T}}, mem::UnifiedHostMemory) where {T} = + convert(Ptr{T}, pointer(mem)) ## shared buffer @@ -124,7 +119,6 @@ UnifiedSharedMemory() = UnifiedSharedMemory(CL_NULL, 0, context()) function shared_alloc(bytesize::Integer; alignment::Integer = 0, write_combined = false, placement = nothing ) - bytesize == 0 && return UnifiedSharedMemory() flags = 0 if write_combined @@ -150,18 +144,18 @@ function shared_alloc(bytesize::Integer; return UnifiedSharedMemory(ptr, bytesize, context()) end -Base.pointer(buf::UnifiedSharedMemory) = buf.ptr -Base.sizeof(buf::UnifiedSharedMemory) = buf.bytesize -context(buf::UnifiedSharedMemory) = buf.context +Base.pointer(mem::UnifiedSharedMemory) = mem.ptr 
+Base.sizeof(mem::UnifiedSharedMemory) = mem.bytesize +context(mem::UnifiedSharedMemory) = mem.context -Base.show(io::IO, buf::UnifiedSharedMemory) = - @printf(io, "UnifiedSharedMemory(%s at %p)", Base.format_bytes(sizeof(buf)), Int(pointer(buf))) +Base.show(io::IO, mem::UnifiedSharedMemory) = + @printf(io, "UnifiedSharedMemory(%s at %p)", Base.format_bytes(sizeof(mem)), Int(pointer(mem))) -Base.convert(::Type{Ptr{T}}, buf::UnifiedSharedMemory) where {T} = - convert(Ptr{T}, reinterpret(Ptr{Cvoid}, pointer(buf))) +Base.convert(::Type{Ptr{T}}, mem::UnifiedSharedMemory) where {T} = + convert(Ptr{T}, reinterpret(Ptr{Cvoid}, pointer(mem))) -Base.convert(::Type{CLPtr{T}}, buf::UnifiedSharedMemory) where {T} = - convert(CLPtr{T}, pointer(buf)) +Base.convert(::Type{CLPtr{T}}, mem::UnifiedSharedMemory) where {T} = + convert(CLPtr{T}, pointer(mem)) ## memory operations @@ -184,7 +178,6 @@ end function enqueue_usm_fill(ptr::Union{Ptr, CLPtr}, pattern::T, N::Integer; wait_for::Vector{Event}=Event[]) where {T} nbytes = N * sizeof(T) - nbytes == 0 && return pattern_size = sizeof(T) n_evts = length(wait_for) evt_ids = isempty(wait_for) ? 
C_NULL : [pointer(evt) for evt in wait_for] diff --git a/lib/cl/state.jl b/lib/cl/state.jl index a3628d49..de44bd5d 100644 --- a/lib/cl/state.jl +++ b/lib/cl/state.jl @@ -161,30 +161,72 @@ end abstract type AbstractMemoryBackend end struct SVMBackend <: AbstractMemoryBackend end struct USMBackend <: AbstractMemoryBackend end +struct BufferBackend <: AbstractMemoryBackend end -function default_memory_backend(dev::Device) - # determine if USM is supported - usm = if usm_supported(dev) - caps = usm_capabilities(dev) - caps.host.access && caps.device.access - else - false - end +function supported_memory_backends(dev::Device) + backends = AbstractMemoryBackend[] - # determine if SVM is available (if needed) - if !usm - caps = svm_capabilities(dev) - if !caps.coarse_grain_buffer - error("Device $dev does not support USM or coarse-grained SVM, either of which is required by OpenCL.jl") + # unified shared memory is the first choice, as it gives us separate host and device + # memory spaces that can be directly referenced by raw pointers. + if usm_supported(dev) + usm_caps = usm_capabilities(dev) + if usm_caps.host.access && usm_caps.device.access + push!(backends, USMBackend()) end end - usm ? USMBackend() : SVMBackend() + # plain old buffers are always supported, but we only want to use them if we have the + # buffer device address extension, which allows us to reference them by raw pointers. + if bda_supported(dev) + push!(backends, BufferBackend()) + end + + # shared virtual memory is last, because it comes at a performance cost. + svm_caps = svm_capabilities(dev) + if svm_caps.coarse_grain_buffer + push!(backends, SVMBackend()) + end + + if isempty(backends) + # as a last resort, use plain buffers without the ability to reference by pointer. + # this severely limits compatibility, but it's better than nothing. 
+ push!(backends, BufferBackend()) + end + + return backends +end + +function default_memory_backend(dev::Device) + supported_backends = supported_memory_backends(dev) + + backend_str = load_preference(OpenCL, "default_memory_backend") + backend_str === nothing && return first(supported_backends) + + backend = if backend_str == "usm" + USMBackend() + elseif backend_str == "svm" + SVMBackend() + elseif backend_str == "buffer" + BufferBackend() + else + error("Unknown memory backend '$backend_str' requested") + end + # honor the requested backend only if this device supports it; `nothing` lets `memory_backend()` raise its error + return in(backend, supported_backends) ? backend : nothing end function memory_backend() return get!(task_local_storage(), :CLMemoryBackend) do - default_memory_backend(device()) + dev = device() + backend = default_memory_backend(dev) + if backend === nothing + error("Device $(dev) does not support any of the available memory backends") + end + if backend === BufferBackend() && !bda_supported(dev) + @warn """Your device $(dev.name) does not support the necessary extensions for OpenCL.jl's memory management (requiring either USM, coarse-grained SVM, or BDA).
+ Falling back to plain OpenCL buffers, which severely limits compatibility with other OpenCL.jl, only supporting OpenCL C kernels.""" maxlog=1 _id="memory_backend_$(dev.name)" + end + backend end end diff --git a/src/OpenCL.jl b/src/OpenCL.jl index 23cc0abb..caa63696 100644 --- a/src/OpenCL.jl +++ b/src/OpenCL.jl @@ -7,6 +7,7 @@ using Adapt using Reexport using GPUArrays using Random +using Preferences using Core: LLVMPtr diff --git a/src/array.jl b/src/array.jl index fa8d1572..ee5ee637 100644 --- a/src/array.jl +++ b/src/array.jl @@ -52,7 +52,7 @@ mutable struct CLArray{T, N, M} <: AbstractGPUArray{T, N} maxsize end data = GPUArrays.cached_alloc((CLArray, cl.context(), M, bufsize)) do - buf = alloc(M, bufsize; alignment=Base.datatype_alignment(T)) + buf = managed_alloc(M, bufsize; alignment=Base.datatype_alignment(T)) DataRef(free, buf) end obj = new{T, N, M}(data, maxsize, 0, dims) @@ -96,8 +96,10 @@ const CLVecOrMat{T} = Union{CLVector{T}, CLMatrix{T}} function memory_type() if cl.memory_backend() == cl.USMBackend() return cl.UnifiedDeviceMemory - else + elseif cl.memory_backend() == cl.SVMBackend() return cl.SharedVirtualMemory + elseif cl.memory_backend() == cl.BufferBackend() + return cl.Buffer end end CLArray{T, N}(::UndefInitializer, dims::Dims{N}) where {T, N} = @@ -173,10 +175,14 @@ context(A::CLArray) = cl.context(A.data[].mem) memtype(x::CLArray) = memtype(typeof(x)) memtype(::Type{<:CLArray{<:Any, <:Any, M}}) where {M} = @isdefined(M) ? M : Any -is_device(a::CLArray) = memtype(a) == cl.UnifiedDeviceMemory -is_shared(a::CLArray) = memtype(a) == cl.UnifiedSharedMemory -is_host(a::CLArray) = memtype(a) == cl.UnifiedHostMemory -is_svm(a::CLArray) = memtype(a) == cl.SharedVirtualMemory +# can we read this array from the device (i.e. derive a CLPtr)? 
+is_device(a::CLArray) = + memtype(a) in (cl.UnifiedDeviceMemory, cl.UnifiedSharedMemory, cl.SharedVirtualMemory, cl.Buffer) +is_shared(a::CLArray) = + memtype(a) in (cl.UnifiedSharedMemory, cl.SharedVirtualMemory) +is_host(a::CLArray) = + memtype(a) in (cl.UnifiedHostMemory, cl.UnifiedSharedMemory, cl.SharedVirtualMemory) + ## derived types @@ -280,13 +286,16 @@ end ## interop with libraries function Base.unsafe_convert(::Type{Ptr{T}}, x::CLArray{T}) where {T} - if is_device(x) + if !is_host(x) throw(ArgumentError("cannot take the CPU address of a $(typeof(x))")) end return convert(Ptr{T}, x.data[]) + x.offset * Base.elsize(x) end function Base.unsafe_convert(::Type{CLPtr{T}}, x::CLArray{T}) where {T} + if !is_device(x) + throw(ArgumentError("cannot take the device address of a $(typeof(x))")) + end return convert(CLPtr{T}, x.data[]) + x.offset * Base.elsize(x) end @@ -379,8 +388,25 @@ for (srcty, dstty) in [(:Array, :CLArray), (:CLArray, :Array), (:CLArray, :CLArr cl.context!(context(device_array)) do if memtype(device_array) == cl.SharedVirtualMemory cl.enqueue_svm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) - else + elseif memtype(device_array) <: cl.UnifiedMemory cl.enqueue_usm_copy(pointer(dst, dst_off), pointer(src, src_off), nbytes; blocking) + else + if src isa CLArray && dst isa CLArray + cl.enqueue_copy(convert(cl.Buffer, dst.data[]), + (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), + convert(cl.Buffer, src.data[]), + (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), + nbytes; blocking) + elseif dst isa CLArray + cl.enqueue_write(convert(cl.Buffer, dst.data[]), + (dst.offset * Base.elsize(dst)) + (dst_off - 1) * sizeof(T), + pointer(src, src_off), nbytes; blocking) + elseif src isa CLArray + cl.enqueue_read(pointer(dst, dst_off), + convert(cl.Buffer, src.data[]), + (src.offset * Base.elsize(src)) + (src_off - 1) * sizeof(T), + nbytes; blocking) + end end end end @@ -417,12 +443,15 @@ fill(v, dims...) 
= fill!(CLArray{typeof(v)}(undef, dims...), v) fill(v, dims::Dims) = fill!(CLArray{typeof(v)}(undef, dims...), v) function Base.fill!(A::DenseCLArray{T}, val) where {T} + isempty(A) && return A cl.context!(context(A)) do GC.@preserve A begin if memtype(A) == cl.SharedVirtualMemory cl.enqueue_svm_fill(pointer(A), convert(T, val), length(A)) - else + elseif memtype(A) <: cl.UnifiedMemory cl.enqueue_usm_fill(pointer(A), convert(T, val), length(A)) + else + cl.enqueue_fill(convert(cl.Buffer, A.data[]), A.offset * Base.elsize(A), convert(T, val), length(A)) end end end @@ -493,15 +522,17 @@ function Base.resize!(a::CLVector{T}, n::Integer) where {T} # replace the data with a new CL. this 'unshares' the array. # as a result, we can safely support resizing unowned buffers. new_data = cl.context!(context(a)) do - mem = alloc(memtype(a), bufsize; alignment=Base.datatype_alignment(T)) + mem = managed_alloc(memtype(a), bufsize; alignment=Base.datatype_alignment(T)) ptr = convert(CLPtr{T}, mem) m = min(length(a), n) if m > 0 GC.@preserve a begin if memtype(a) == cl.SharedVirtualMemory cl.enqueue_svm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) - else + elseif memtype(a) <: cl.UnifiedMemory cl.enqueue_usm_copy(ptr, pointer(a), m*sizeof(T); blocking=false) + else + cl.enqueue_copy(convert(cl.Buffer, mem), 0, convert(cl.Buffer, a.data[]), a.offset * Base.elsize(a), m*sizeof(T); blocking=false) end end end diff --git a/src/memory.jl b/src/memory.jl index b975d142..94d8e44c 100644 --- a/src/memory.jl +++ b/src/memory.jl @@ -38,9 +38,10 @@ function maybe_synchronize(managed::Managed) return nothing end -function Base.convert(::Type{CLPtr{T}}, managed::Managed{M}) where {T, M} +function Base.convert(typ::Union{Type{<:CLPtr}, Type{cl.Buffer}}, managed::Managed) # let null pointers pass through as-is - ptr = convert(CLPtr{T}, managed.mem) + # XXX: does not work for buffers + ptr = convert(typ, managed.mem) if ptr == cl.CL_NULL return ptr end @@ -55,9 +56,9 @@ function 
Base.convert(::Type{CLPtr{T}}, managed::Managed{M}) where {T, M} return ptr end -function Base.convert(::Type{Ptr{T}}, managed::Managed{M}) where {T, M} +function Base.convert(typ::Type{<:Ptr}, managed::Managed{M}) where {M} # let null pointers pass through as-is - ptr = convert(Ptr{T}, managed.mem) + ptr = convert(typ, managed.mem) if ptr == C_NULL return ptr end @@ -112,6 +113,13 @@ end ## public interface +function managed_alloc(t::Type{T}, bytes::Int; kwargs...) where T + if bytes == 0 + return Managed(T()) + else + alloc(t, bytes; kwargs...) + end +end function alloc(::Type{cl.UnifiedDeviceMemory}, bytes::Int; alignment::Int = 0) mem = cl.device_alloc(bytes; alignment) @@ -134,7 +142,14 @@ function alloc(::Type{cl.SharedVirtualMemory}, bytes::Int; alignment::Int = 0) return Managed(mem) end -function free(managed::Managed{<:cl.AbstractMemory}) +function alloc(::Type{cl.Buffer}, bytes::Int; alignment::Int = 0) + # TODO: use alignment + buf = cl.Buffer(bytes) + return Managed(buf) +end + +function free(managed::Managed) + sizeof(managed) == 0 && return mem = managed.mem cl.context!(cl.context(mem)) do # "`clSVMFree` does not wait for previously enqueued commands that may be using @@ -148,8 +163,10 @@ function free(managed::Managed{<:cl.AbstractMemory}) if mem isa cl.SharedVirtualMemory cl.svm_free(mem) - else + elseif mem isa cl.UnifiedMemory cl.usm_free(mem) + else + cl.release(mem) end end diff --git a/src/util.jl b/src/util.jl index 85baf052..70a002ff 100644 --- a/src/util.jl +++ b/src/util.jl @@ -53,6 +53,19 @@ function versioninfo(io::IO=stdout) println(io) end + prefs = [ + "default_memory_backend" => load_preference(OpenCL, "default_memory_backend"), + ] + if any(x->!isnothing(x[2]), prefs) + println(io, "Preferences:") + for (key, val) in prefs + if !isnothing(val) + println(io, "- $key: $val") + end + end + println(io) + end + println(io, "Available platforms: ", length(cl.platforms())) for platform in cl.platforms() println(io, " - $(platform.name)") 
@@ -67,12 +80,31 @@ function versioninfo(io::IO=stdout) # show a list of tags tags = [] - ## memory back-end - backend = cl.default_memory_backend(device) - if backend == cl.SVMBackend() - push!(tags, "svm") - elseif backend == cl.USMBackend() - push!(tags, "usm") + ## memory back-ends + let + svm_tags = [] + svm_caps = cl.svm_capabilities(device) + if svm_caps.coarse_grain_buffer + push!(svm_tags, "c") + end + if svm_caps.fine_grain_buffer + push!(svm_tags, "f") + end + push!(tags, "svm:"*join(svm_tags, "+")) + end + if cl.usm_supported(device) + usm_tags = [] + usm_caps = cl.usm_capabilities(device) + if usm_caps.host.access + push!(usm_tags, "h") + end + if usm_caps.device.access + push!(usm_tags, "d") + end + push!(tags, "usm:"*join(usm_tags, "+")) + end + if cl.bda_supported(device) + push!(tags, "bda") end ## relevant extensions if in("cl_khr_fp16", device.extensions) diff --git a/test/buffer.jl b/test/buffer.jl index c2d0e913..34245c9c 100644 --- a/test/buffer.jl +++ b/test/buffer.jl @@ -1,14 +1,11 @@ @testset "Buffer" begin # simple buffer - let buf = cl.Buffer{Int}(1) - @test ndims(buf) == 1 - @test eltype(buf) == Int - @test length(buf) == 1 + let buf = cl.Buffer(sizeof(Int)) @test sizeof(buf) == sizeof(Int) end # memory copy - let buf = cl.Buffer{Int}(1) + let buf = cl.Buffer(sizeof(Int)) src = [42] cl.enqueue_write(buf, pointer(src), sizeof(src); blocking=true) @@ -18,7 +15,7 @@ end # host accessible, mapped - let buf = cl.Buffer{Int}(1; host_accessible=true) + let buf = cl.Buffer(sizeof(Int); host_accessible=true) src = [42] cl.enqueue_write(buf, pointer(src), sizeof(src); blocking=true) @@ -68,7 +65,7 @@ end # fill - let buf = cl.Buffer{Int}(3) + let buf = cl.Buffer(3*sizeof(Int)) cl.enqueue_fill(buf, 42, 3) arr = Vector{Int}(undef, 3) diff --git a/test/memory.jl b/test/memory.jl index 7ca07f81..6adc081c 100644 --- a/test/memory.jl +++ b/test/memory.jl @@ -17,8 +17,8 @@ buf = create_test_buffer() expectations = [ - (:mem_type, 
cl.CL_MEM_OBJECT_BUFFER), - (:mem_flags, (:rw, :copy)), + (:type, cl.CL_MEM_OBJECT_BUFFER), + (:flags, (:rw, :copy)), (:size, sizeof(buf)), (:reference_count, 1), (:map_count, 0)