diff --git a/LocalPreferences.toml b/LocalPreferences.toml new file mode 100644 index 00000000..f5638dd8 --- /dev/null +++ b/LocalPreferences.toml @@ -0,0 +1,4 @@ +[GPUCompiler] +# whether caching of object files should be enabled. If the disk cache is enabled +# cache files are stored in scratch memory. +#disk_cache = "false" diff --git a/Project.toml b/Project.toml index 7030b1ed..1efc4463 100644 --- a/Project.toml +++ b/Project.toml @@ -9,17 +9,23 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +Preferences = "21216c6a-2e73-6563-6e65-726566657250" Scratch = "6c6a2e73-6563-6170-7368-637461726353" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] ExprTools = "0.1" InteractiveUtils = "1" +LLVM = "8" Libdl = "1" Logging = "1" UUIDs = "1" -LLVM = "8" +Preferences = "1" Scratch = "1" +Serialization = "1" +TOML = "1" TimerOutputs = "0.5" julia = "1.8" diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 56ea5cd0..fa5dc32d 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -9,7 +9,9 @@ using ExprTools: splitdef, combinedef using Libdl +using Serialization using Scratch: @get_scratch! +using Preferences const CC = Core.Compiler using Core: MethodInstance, CodeInstance, CodeInfo diff --git a/src/execution.jl b/src/execution.jl index 3ed48353..6dfaaa49 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -62,6 +62,54 @@ end ## cached compilation +### Notes on interactions with package images and disk cache. +# Julia uses package images (pkgimg) to cache both the result of inference, +# and the result of native code emissions. 
Up until Julia v1.11 neither the +# inferred nor the native code of foreign abstract interpreters was cached +# across sessions. Julia v1.11 allows for caching of inference results across +# sessions as long as those inference results are created during precompilation. +# +# Julia cache hierarchy is roughly as follows: +# Function (name of a thing) +# -> Method (particular piece of code to dispatch to with a signature) +# -> MethodInstance (A particular Method + particular signature) +# -> CodeInstance (A MethodInstance compiled for a world) +# +# In order to cache code across sessions we need to insert CodeInstance(owner=GPUCompilerCacheToken) +# into the internal cache. Once we have done so we know that a particular CodeInstance is unique in +# the system. (During pkgimg loading conflicts will be resolved). +# +# When a pkgimg is loaded we check its validity; this means checking that all dependencies are the same, +# the pkgimg was created for the right set of compiler flags, and that all source code that was used +# to create this pkgimg is the same. When a CodeInstance is inside a pkgimg we can extend the chain of +# validity even for GPU code, we cannot verify a "runtime" CodeInstance in the same way. +# +# Therefore when we see a compilation request for a CodeInstance that is originating from a pkgimg +# we can use it as part of the hash for the on-disk cache. (see `cache_file`) + +""" + disk_cache_enabled() + +Query if caching to disk is enabled. +""" +disk_cache_enabled() = parse(Bool, @load_preference("disk_cache", "false")) + +""" + enable_disk_cache!(state::Bool=true) + +Activate the GPUCompiler disk cache in the current environment. +You will need to restart your Julia environment for it to take effect. + +!!! 
note + The cache functionality requires Julia 1.11 +""" +function enable_disk_cache!(state::Bool=true) + @set_preferences!("disk_cache"=>string(state)) +end + +disk_cache_path() = @get_scratch!("disk_cache") +clear_disk_cache!() = rm(disk_cache_path(); recursive=true, force=true) + const cache_lock = ReentrantLock() """ @@ -108,6 +156,37 @@ function cached_compilation(cache::AbstractDict{<:Any,V}, return obj::V end +@noinline function cache_file(ci::CodeInstance, cfg::CompilerConfig) + h = hash(Base.objectid(ci)) + @static if isdefined(Base, :object_build_id) + bid = Base.object_build_id(ci) + if bid === nothing # CI is from a runtime compilation, not worth caching on disk + return nothing + else + bid = bid % UInt64 # The upper 64bit are a checksum, unavailable during precompilation + end + h = hash(bid, h) + end + h = hash(cfg, h) + + gpucompiler_buildid = Base.module_build_id(@__MODULE__) + if (gpucompiler_buildid >> 64) % UInt64 == 0xffffffffffffffff + return nothing # Don't cache during precompilation of GPUCompiler + end + + return joinpath( + disk_cache_path(), + # bifurcate the cache by build id of GPUCompiler + string(gpucompiler_buildid), + string(h, ".jls")) +end + +struct DiskCacheEntry + src::Type # Originally MethodInstance, but upon deserialize they were not uniqued... 
+ cfg::CompilerConfig + asm +end + @noinline function actual_compilation(cache::AbstractDict, src::MethodInstance, world::UInt, cfg::CompilerConfig, compiler::Function, linker::Function) job = CompilerJob(src, cfg, world) @@ -117,20 +196,64 @@ end ci = ci_cache_lookup(ci_cache(job), src, world, world)::Union{Nothing,CodeInstance} if ci !== nothing key = (ci, cfg) - if haskey(cache, key) - obj = cache[key] - end + obj = get(cache, key, nothing) end # slow path: compile and link if obj === nothing || compile_hook[] !== nothing - # TODO: consider loading the assembly from an on-disk cache here - asm = compiler(job) + asm = nothing + path = nothing + ondisk_hit = false + @static if VERSION >= v"1.11.0-" + # Don't try to hit the disk cache if we are here for a *compile* hook + # TODO: + # - Should we hit disk cache if Base.generating_output() + # - Should we allow backend to opt out? + if ci !== nothing && obj === nothing && disk_cache_enabled() + path = cache_file(ci, cfg) + @debug "Looking for on-disk cache" job path + if path !== nothing && isfile(path) + ondisk_hit = true + try + @debug "Loading compiled kernel" job path + # The MI we deserialize here didn't get uniqued... + entry = deserialize(path)::DiskCacheEntry + if entry.src == src.specTypes && entry.cfg == cfg + asm = entry.asm + else + @show entry.src == src.specTypes + @show entry.cfg == cfg + @warn "Cache mismatch" src.specTypes cfg entry.src entry.cfg + end + catch ex + @warn "Failed to load compiled kernel" job path exception=(ex, catch_backtrace()) + end + end + end + end + if asm === nothing || compile_hook[] !== nothing + # Run the compiler in case we need to hook it. 
+ asm = compiler(job) + end if obj !== nothing # we got here because of a *compile* hook; don't bother linking return obj end + + @static if VERSION >= v"1.11.0-" + if !ondisk_hit && path !== nothing && disk_cache_enabled() + @debug "Writing out on-disk cache" job path + tmppath, io = mktemp(;cleanup=false) + entry = DiskCacheEntry(src.specTypes, cfg, asm) + serialize(io, entry) + close(io) + # atomic move + mkpath(dirname(path)) + Base.rename(tmppath, path, force=true) + end + end + obj = linker(job, asm) if ci === nothing diff --git a/src/jlgen.jl b/src/jlgen.jl index c5054547..38150356 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -587,6 +587,27 @@ macro in_world(world, ex) end end +""" + precompile(job::CompilerJob) + +Compile the GPUCompiler job. In particular this will run inference using the foreign +abstract interpreter. +""" +function Base.precompile(@nospecialize(job::CompilerJob)) + if job.source.def.primary_world > job.world || job.world > job.source.def.deleted_world + error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)") + end + + # populate the cache + interp = get_interpreter(job) + cache = CC.code_cache(interp) + if ci_cache_lookup(cache, job.source, job.world, job.world) === nothing + ci_cache_populate(interp, cache, job.source, job.world, job.world) + return ci_cache_lookup(cache, job.source, job.world, job.world) !== nothing + end + return true +end + function compile_method_instance(@nospecialize(job::CompilerJob)) if job.source.def.primary_world > job.world || job.world > job.source.def.deleted_world error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)") diff --git a/test/native_tests.jl b/test/native_tests.jl index c80d53eb..016c390a 100644 --- a/test/native_tests.jl +++ b/test/native_tests.jl @@ -549,20 +549,21 @@ 
precompile_test_harness("Inference caching") do load_path import GPUCompiler using PrecompileTools - function kernel() + function kernel(A, x) + A[1] = x return end let - job, _ = NativeCompiler.create_job(kernel, ()) - GPUCompiler.code_typed(job) + job, _ = NativeCompiler.create_job(kernel, (Vector{Int}, Int)) + precompile(job) end # identity is foreign @setup_workload begin job, _ = NativeCompiler.create_job(identity, (Int,)) @compile_workload begin - GPUCompiler.code_typed(job) + precompile(job) end end end) |> string) @@ -578,20 +579,35 @@ precompile_test_harness("Inference caching") do load_path job, _ = NativeCompiler.create_job(identity, (Int,)) GPUCompiler.ci_cache_token(job) end - ci = isdefined(identity_mi, :cache) ? identity_mi.cache : nothing - while ci !== nothing - @test ci.owner !== token - ci = isdefined(ci, :next) ? ci.next : nothing - end + @test !check_presence(identity_mi, token) using InferenceCaching # Check that kernel survived - kernel_mi = GPUCompiler.methodinstance(typeof(InferenceCaching.kernel), Tuple{}) + kernel_mi = GPUCompiler.methodinstance(typeof(InferenceCaching.kernel), Tuple{Vector{Int}, Int}) @test check_presence(kernel_mi, token) # check that identity survived @test check_presence(identity_mi, token) + + GPUCompiler.clear_disk_cache!() + @test GPUCompiler.disk_cache_enabled() == false + + GPUCompiler.enable_disk_cache!() + @test GPUCompiler.disk_cache_enabled() == true + + job, _ = NativeCompiler.create_job(InferenceCaching.kernel, (Vector{Int}, Int)) + @assert job.source == kernel_mi + ci = GPUCompiler.ci_cache_lookup(GPUCompiler.ci_cache(job), job.source, job.world, job.world) + @assert ci !== nothing + @assert ci.inferred !== nothing + path = GPUCompiler.cache_file(ci, job.config) + @test path !== nothing + @test !ispath(path) + NativeCompiler.cached_execution(InferenceCaching.kernel, (Vector{Int}, Int)) + @test ispath(path) + GPUCompiler.clear_disk_cache!() + @test !ispath(path) end end diff --git a/test/native_testsetup.jl 
b/test/native_testsetup.jl index 973b300d..3406276c 100644 --- a/test/native_testsetup.jl +++ b/test/native_testsetup.jl @@ -71,4 +71,22 @@ function code_execution(@nospecialize(func), @nospecialize(types); kwargs...) end end +const runtime_cache = Dict{Any, Any}() + +function compiler(job) + JuliaContext() do ctx + GPUCompiler.compile(:asm, job, validate=false) + end +end + +function linker(job, asm) + asm +end + +# simulates cached codegen +function cached_execution(@nospecialize(func), @nospecialize(types); kwargs...) + job, kwargs = create_job(func, types; kwargs...) + GPUCompiler.cached_compilation(runtime_cache, job.source, job.config, compiler, linker) +end + end diff --git a/test/ptx_tests.jl b/test/ptx_tests.jl index c4d47525..c059ba60 100644 --- a/test/ptx_tests.jl +++ b/test/ptx_tests.jl @@ -339,14 +339,14 @@ precompile_test_harness("Inference caching") do load_path let job, _ = PTXCompiler.create_job(kernel, ()) - GPUCompiler.code_typed(job) + precompile(job) end # identity is foreign @setup_workload begin job, _ = PTXCompiler.create_job(identity, (Int,)) @compile_workload begin - GPUCompiler.code_typed(job) + precompile(job) end end end) |> string)