diff --git a/LocalPreferences.toml b/LocalPreferences.toml new file mode 100644 index 00000000..f5638dd8 --- /dev/null +++ b/LocalPreferences.toml @@ -0,0 +1,4 @@ +[GPUCompiler] +# whether caching of object files should be enabled. If the disk cache is enabled +# cache files are stored in scratch memory. +#disk_cache = "false" diff --git a/Project.toml b/Project.toml index 7030b1ed..1efc4463 100644 --- a/Project.toml +++ b/Project.toml @@ -9,17 +9,23 @@ InteractiveUtils = "b77e0a4c-d291-57a0-90e8-8db25a27a240" LLVM = "929cbde3-209d-540e-8aea-75f648917ca0" Libdl = "8f399da3-3557-5675-b5ff-fb832c97cbdb" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +Preferences = "21216c6a-2e73-6563-6e65-726566657250" Scratch = "6c6a2e73-6563-6170-7368-637461726353" +Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" +TOML = "fa267f1f-6049-4f14-aa54-33bafae1ed76" TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f" UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4" [compat] ExprTools = "0.1" InteractiveUtils = "1" +LLVM = "8" Libdl = "1" Logging = "1" UUIDs = "1" -LLVM = "8" +Preferences = "1" Scratch = "1" +Serialization = "1" +TOML = "1" TimerOutputs = "0.5" julia = "1.8" diff --git a/src/GPUCompiler.jl b/src/GPUCompiler.jl index 56ea5cd0..fa5dc32d 100644 --- a/src/GPUCompiler.jl +++ b/src/GPUCompiler.jl @@ -9,7 +9,9 @@ using ExprTools: splitdef, combinedef using Libdl +using Serialization using Scratch: @get_scratch! +using Preferences const CC = Core.Compiler using Core: MethodInstance, CodeInstance, CodeInfo diff --git a/src/execution.jl b/src/execution.jl index 3ed48353..6dfaaa49 100644 --- a/src/execution.jl +++ b/src/execution.jl @@ -62,6 +62,54 @@ end ## cached compilation +### Notes on interactions with package images and disk cache. +# Julia uses package images (pkgimg) to cache both the result of inference, +# and the result of native code emissions. 
Up until Julia v1.11 neither the +# inferred nor the native code of foreign abstract interpreters was cached +# across sessions. Julia v1.11 allows for caching of inference results across +# sessions as long as those inference results are created during precompilation. +# +# Julia cache hierarchy is roughly as follows: +# Function (name of a thing) +# -> Method (particular piece of code to dispatch to with a signature) +# -> MethodInstance (A particular Method + particular signature) +# -> CodeInstance (A MethodInstance compiled for a world) +# +# In order to cache code across sessions we need to insert CodeInstance(owner=GPUCompilerCacheToken) +# into the internal cache. Once we have done so we know that a particular CodeInstance is unique in +# the system. (During pkgimg loading conflicts will be resolved). +# +# When a pkgimg is loaded we check its validity; this means checking that all dependencies are the same, +# the pkgimg was created for the right set of compiler flags, and that all source code that was used +# to create this pkgimg is the same. When a CodeInstance is inside a pkgimg we can extend the chain of +# validity even for GPU code, we cannot verify a "runtime" CodeInstance in the same way. +# +# Therefore when we see a compilation request for a CodeInstance that is originating from a pkgimg +# we can use it as part of the hash for the on-disk cache. (see `cache_file`) + +""" + disk_cache_enabled() + +Query if caching to disk is enabled. +""" +disk_cache_enabled() = parse(Bool, @load_preference("disk_cache", "false")) + +""" + enable_disk_cache!(state::Bool=true) + +Activate the GPUCompiler disk cache in the current environment. +You will need to restart your Julia environment for it to take effect. + +!!! 
note + The cache functionality requires Julia 1.11 +""" +function enable_disk_cache!(state::Bool=true) + @set_preferences!("disk_cache"=>string(state)) +end + +disk_cache_path() = @get_scratch!("disk_cache") +clear_disk_cache!() = rm(disk_cache_path(); recursive=true, force=true) + const cache_lock = ReentrantLock() """ @@ -108,6 +156,37 @@ function cached_compilation(cache::AbstractDict{<:Any,V}, return obj::V end +@noinline function cache_file(ci::CodeInstance, cfg::CompilerConfig) + h = hash(Base.objectid(ci)) + @static if isdefined(Base, :object_build_id) + bid = Base.object_build_id(ci) + if bid === nothing # CI is from a runtime compilation, not worth caching on disk + return nothing + else + bid = bid % UInt64 # The upper 64bit are a checksum, unavailable during precompilation + end + h = hash(bid, h) + end + h = hash(cfg, h) + + gpucompiler_buildid = Base.module_build_id(@__MODULE__) + if (gpucompiler_buildid >> 64) % UInt64 == 0xffffffffffffffff + return nothing # Don't cache during precompilation of GPUCompiler + end + + return joinpath( + disk_cache_path(), + # bifurcate the cache by build id of GPUCompiler + string(gpucompiler_buildid), + string(h, ".jls")) +end + +struct DiskCacheEntry + src::Type # Originally MethodInstance, but upon deserialize they were not uniqued... 
+ cfg::CompilerConfig + asm +end + @noinline function actual_compilation(cache::AbstractDict, src::MethodInstance, world::UInt, cfg::CompilerConfig, compiler::Function, linker::Function) job = CompilerJob(src, cfg, world) @@ -117,20 +196,64 @@ end ci = ci_cache_lookup(ci_cache(job), src, world, world)::Union{Nothing,CodeInstance} if ci !== nothing key = (ci, cfg) - if haskey(cache, key) - obj = cache[key] - end + obj = get(cache, key, nothing) end # slow path: compile and link if obj === nothing || compile_hook[] !== nothing - # TODO: consider loading the assembly from an on-disk cache here - asm = compiler(job) + asm = nothing + path = nothing + ondisk_hit = false + @static if VERSION >= v"1.11.0-" + # Don't try to hit the disk cache if we are here for a *compile* hook + # TODO: + # - Should we hit disk cache if Base.generating_output() + # - Should we allow backend to opt out? + if ci !== nothing && obj === nothing && disk_cache_enabled() + path = cache_file(ci, cfg) + @debug "Looking for on-disk cache" job path + if path !== nothing && isfile(path) + ondisk_hit = true + try + @debug "Loading compiled kernel" job path + # The MI we deserialize here didn't get uniqued... + entry = deserialize(path)::DiskCacheEntry + if entry.src == src.specTypes && entry.cfg == cfg + asm = entry.asm + else + @show entry.src == src.specTypes + @show entry.cfg == cfg + @warn "Cache mismatch" src.specTypes cfg entry.src entry.cfg + end + catch ex + @warn "Failed to load compiled kernel" job path exception=(ex, catch_backtrace()) + end + end + end + end + if asm === nothing || compile_hook[] !== nothing + # Run the compiler in case we need to hook it. 
+ asm = compiler(job) + end if obj !== nothing # we got here because of a *compile* hook; don't bother linking return obj end + + @static if VERSION >= v"1.11.0-" + if !ondisk_hit && path !== nothing && disk_cache_enabled() + @debug "Writing out on-disk cache" job path + tmppath, io = mktemp(;cleanup=false) + entry = DiskCacheEntry(src.specTypes, cfg, asm) + serialize(io, entry) + close(io) + # atomic move + mkpath(dirname(path)) + Base.rename(tmppath, path, force=true) + end + end + obj = linker(job, asm) if ci === nothing diff --git a/src/jlgen.jl b/src/jlgen.jl index c5054547..38150356 100644 --- a/src/jlgen.jl +++ b/src/jlgen.jl @@ -587,6 +587,27 @@ macro in_world(world, ex) end end +""" + precompile(job::CompilerJob) + +Compile the GPUCompiler job. In particular this will run inference using the foreign +abstract interpreter. +""" +function Base.precompile(@nospecialize(job::CompilerJob)) + if job.source.def.primary_world > job.world || job.world > job.source.def.deleted_world + error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)") + end + + # populate the cache + interp = get_interpreter(job) + cache = CC.code_cache(interp) + if ci_cache_lookup(cache, job.source, job.world, job.world) === nothing + ci_cache_populate(interp, cache, job.source, job.world, job.world) + return ci_cache_lookup(cache, job.source, job.world, job.world) !== nothing + end + return true +end + function compile_method_instance(@nospecialize(job::CompilerJob)) if job.source.def.primary_world > job.world || job.world > job.source.def.deleted_world error("Cannot compile $(job.source) for world $(job.world); method is only valid in worlds $(job.source.def.primary_world) to $(job.source.def.deleted_world)") diff --git a/test/native_tests.jl b/test/native_tests.jl index c80d53eb..016c390a 100644 --- a/test/native_tests.jl +++ b/test/native_tests.jl @@ -549,20 +549,21 @@ 
precompile_test_harness("Inference caching") do load_path import GPUCompiler using PrecompileTools - function kernel() + function kernel(A, x) + A[1] = x return end let - job, _ = NativeCompiler.create_job(kernel, ()) - GPUCompiler.code_typed(job) + job, _ = NativeCompiler.create_job(kernel, (Vector{Int}, Int)) + precompile(job) end # identity is foreign @setup_workload begin job, _ = NativeCompiler.create_job(identity, (Int,)) @compile_workload begin - GPUCompiler.code_typed(job) + precompile(job) end end end) |> string) @@ -578,20 +579,35 @@ precompile_test_harness("Inference caching") do load_path job, _ = NativeCompiler.create_job(identity, (Int,)) GPUCompiler.ci_cache_token(job) end - ci = isdefined(identity_mi, :cache) ? identity_mi.cache : nothing - while ci !== nothing - @test ci.owner !== token - ci = isdefined(ci, :next) ? ci.next : nothing - end + @test !check_presence(identity_mi, token) using InferenceCaching # Check that kernel survived - kernel_mi = GPUCompiler.methodinstance(typeof(InferenceCaching.kernel), Tuple{}) + kernel_mi = GPUCompiler.methodinstance(typeof(InferenceCaching.kernel), Tuple{Vector{Int}, Int}) @test check_presence(kernel_mi, token) # check that identity survived @test check_presence(identity_mi, token) + + GPUCompiler.clear_disk_cache!() + @test GPUCompiler.disk_cache_enabled() == false + + GPUCompiler.enable_disk_cache!() + @test GPUCompiler.disk_cache_enabled() == true + + job, _ = NativeCompiler.create_job(InferenceCaching.kernel, (Vector{Int}, Int)) + @assert job.source == kernel_mi + ci = GPUCompiler.ci_cache_lookup(GPUCompiler.ci_cache(job), job.source, job.world, job.world) + @assert ci !== nothing + @assert ci.inferred !== nothing + path = GPUCompiler.cache_file(ci, job.config) + @test path !== nothing + @test !ispath(path) + NativeCompiler.cached_execution(InferenceCaching.kernel, (Vector{Int}, Int)) + @test ispath(path) + GPUCompiler.clear_disk_cache!() + @test !ispath(path) end end diff --git a/test/native_testsetup.jl 
b/test/native_testsetup.jl index 973b300d..3406276c 100644 --- a/test/native_testsetup.jl +++ b/test/native_testsetup.jl @@ -71,4 +71,22 @@ function code_execution(@nospecialize(func), @nospecialize(types); kwargs...) end end +const runtime_cache = Dict{Any, Any}() + +function compiler(job) + JuliaContext() do ctx + GPUCompiler.compile(:asm, job, validate=false) + end +end + +function linker(job, asm) + asm +end + +# simulates cached codegen +function cached_execution(@nospecialize(func), @nospecialize(types); kwargs...) + job, kwargs = create_job(func, types; kwargs...) + GPUCompiler.cached_compilation(runtime_cache, job.source, job.config, compiler, linker) +end + end diff --git a/test/ptx_tests.jl b/test/ptx_tests.jl index c4d47525..c059ba60 100644 --- a/test/ptx_tests.jl +++ b/test/ptx_tests.jl @@ -339,14 +339,14 @@ precompile_test_harness("Inference caching") do load_path let job, _ = PTXCompiler.create_job(kernel, ()) - GPUCompiler.code_typed(job) + precompile(job) end # identity is foreign @setup_workload begin job, _ = PTXCompiler.create_job(identity, (Int,)) @compile_workload begin - GPUCompiler.code_typed(job) + precompile(job) end end end) |> string)