diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b55f4618f2..3f8edc57ec 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -39,3 +39,46 @@ julia:nightly:
   tags:
     - nvidia
   allow_failure: true
+
+# ROCm CI
+
+julia-rocm:1.0:
+  image: rocm/dev-ubuntu-18.04
+  extends:
+    - .julia:1.0
+    - .test
+  tags:
+    - rocm
+
+julia-rocm:1.1:
+  image: rocm/dev-ubuntu-18.04
+  extends:
+    - .julia:1.1
+    - .test
+  tags:
+    - rocm
+
+julia-rocm:1.2:
+  image: rocm/dev-ubuntu-18.04
+  extends:
+    - .julia:1.2
+    - .test
+  tags:
+    - rocm
+
+julia-rocm:1.3:
+  image: rocm/dev-ubuntu-18.04
+  extends:
+    - .julia:1.3
+    - .test
+  tags:
+    - rocm
+
+julia-rocm:nightly:
+  image: rocm/dev-ubuntu-18.04
+  extends:
+    - .julia:nightly
+    - .test
+  tags:
+    - rocm
+  allow_failure: true
diff --git a/Manifest.toml b/Manifest.toml
index bb48887979..605cf26bf7 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -1,5 +1,11 @@
 # This file is machine-generated - editing it directly is not advised
 
+[[AMDGPUnative]]
+deps = ["Adapt", "BinaryProvider", "HSARuntime", "InteractiveUtils", "LLVM", "Libdl"]
+git-tree-sha1 = "3c2d1b869b9036d4743a282dfa8e56b43f475356"
+uuid = "12f4821f-d7ee-5ba6-b76b-566925c5fcc5"
+version = "0.1.0"
+
 [[AbstractFFTs]]
 deps = ["LinearAlgebra"]
 git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40"
@@ -92,6 +98,11 @@ git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
 uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
 version = "1.3.0"
 
+[[ConstructionBase]]
+git-tree-sha1 = "a2a6a5fea4d6f730ec4c18a76d27ec10e8ec1c50"
+uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
+version = "1.0.0"
+
 [[CuArrays]]
 deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
 git-tree-sha1 = "4757376a85ffb27d4c4f6cdf9635261e6c3a5fec"
@@ -162,6 +173,12 @@ git-tree-sha1 = "a0a3b927b1a06e63fb8b91950cc7df340b7d912c"
 uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
 version = "2.0.0"
 
+[[HSARuntime]]
+deps = ["CEnum", "Libdl", "Setfield"]
+git-tree-sha1 = "ea40d80684b6f986599f19b1840abc00af4a399b"
+uuid = "2c364e2c-59fb-59c3-96f3-194112e690e0"
+version = "0.2.5"
+
 [[IRTools]]
 deps = ["InteractiveUtils", "MacroTools", "Test"]
 git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
@@ -191,6 +208,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
 version = "1.3.2"
 
 [[LibGit2]]
+deps = ["Printf"]
 uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"
 
 [[Libdl]]
@@ -268,6 +286,12 @@ uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
 deps = ["InteractiveUtils", "Markdown", "Sockets"]
 uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"
 
+[[ROCArrays]]
+deps = ["AMDGPUnative", "Adapt", "CEnum", "GPUArrays", "HSARuntime", "Libdl", "LinearAlgebra"]
+git-tree-sha1 = "406f9dd59b38763f97370ec7ab7297782d2aec18"
+uuid = "ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc"
+version = "0.1.0"
+
 [[Random]]
 deps = ["Serialization"]
 uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -290,6 +314,12 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
 [[Serialization]]
 uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"
 
+[[Setfield]]
+deps = ["ConstructionBase", "MacroTools"]
+git-tree-sha1 = "f4adc8effec88c1dc109190f22370cb9648a5ced"
+uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
+version = "0.5.2"
+
 [[SharedArrays]]
 deps = ["Distributed", "Mmap", "Random", "Serialization"]
 uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
diff --git a/Project.toml b/Project.toml
index 7f4ab46477..69bf612092 100644
--- a/Project.toml
+++ b/Project.toml
@@ -14,6 +14,7 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09" NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7" +ROCArrays = "ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Reexport = "189a3867-3050-52da-a836-e630ba90ab69" SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce" @@ -26,6 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f" [compat] CuArrays = "1.4.3" NNlib = "0.6" +ROCArrays = "0.1" Zygote = "0.4" julia = "1" diff --git a/docs/src/gpu.md b/docs/src/gpu.md index bb13fdd18d..e9857fbf3c 100644 --- a/docs/src/gpu.md +++ b/docs/src/gpu.md @@ -1,15 +1,17 @@ # GPU Support -NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) readme. +NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) readme. AMD GPU support should work out of the box on systems with ROCm and external libraries installed. For more details see the [ROCArrays](https://github.com/jpsamaroo/ROCArrays.jl) readme. ## GPU Usage -Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl). Flux is agnostic to array types, so we simply need to move model weights and data to the GPU and Flux will handle it. +Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [ROCArrays](https://github.com/jpsamaroo/ROCArrays.jl). Flux is agnostic to array types, so we simply need to move model weights and data to the GPU and Flux will handle it. For example, we can use `CuArrays` (with the `cu` converter) to run our [basic example](models/basics.md) on an NVIDIA GPU. (Note that you need to have CUDA available to use CuArrays – please see the [CuArrays.jl](https://github.com/JuliaGPU/CuArrays.jl) instructions for more details.) +(Note that the following examples should work on AMD GPUs by loading `ROCArrays` instead of `CuArrays` and replacing `cu` with `roc`. The `gpu` function will automatically use ROCArrays if possible.) + ```julia using CuArrays diff --git a/src/Flux.jl b/src/Flux.jl index d0e0d5bf0f..c8bd33bc8d 100644 --- a/src/Flux.jl +++ b/src/Flux.jl @@ -11,7 +11,7 @@ export gradient export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool, DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm, - SkipConnection, params, fmap, cpu, gpu, f32, f64 + SkipConnection, params, fmap, cpu, gpu, cugpu, rocgpu, f32, f64 include("optimise/Optimise.jl") using .Optimise @@ -23,6 +23,8 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp, using CuArrays const use_cuda = Ref(false) +using ROCArrays +const use_rocm = Ref(false) include("utils.jl") include("onehot.jl") @@ -52,6 +54,12 @@ function __init__() @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available." 
     end
   end
+  if !ROCArrays.configured
+    # nothing to do here, and either ROCArrays or one of its dependencies will have warned
+  else
+    use_rocm[] = true
+    include(joinpath(@__DIR__, "rocm/rocm.jl"))
+  end
 end
 
 end # module
diff --git a/src/functor.jl b/src/functor.jl
index a36b5765f4..061aa64631 100644
--- a/src/functor.jl
+++ b/src/functor.jl
@@ -73,7 +73,10 @@ end
 
 cpu(m) = fmap(x -> adapt(Array, x), m)
 
-gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
+gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) :
+  (use_rocm[] ? fmap(ROCArrays.roc, x) : x)
+cugpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
+rocgpu(x) = use_rocm[] ? fmap(ROCArrays.roc, x) : x
 
 # Precision
 
diff --git a/src/layers/stateless.jl b/src/layers/stateless.jl
index 5f9c10904b..46cc089104 100644
--- a/src/layers/stateless.jl
+++ b/src/layers/stateless.jl
@@ -1,4 +1,5 @@
 using CuArrays
+using ROCArrays
 using NNlib: logsoftmax, logσ
 
 # Cost functions
@@ -36,8 +37,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability.
 """
 binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
 
-# Re-definition to fix interaction with CuArrays.
+# Re-definition to fix interaction with CuArrays and ROCArrays.
 CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
+# FIXME: ROCArrays.@rocfunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
 
 """
     logitbinarycrossentropy(logŷ, y)
diff --git a/src/onehot.jl b/src/onehot.jl
index 754d060740..db0ecb8667 100644
--- a/src/onehot.jl
+++ b/src/onehot.jl
@@ -38,9 +38,12 @@ import Adapt: adapt, adapt_structure
 
 adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))
 import .CuArrays: CuArray, cudaconvert
+import .ROCArrays: ROCArray, rocconvert
 import Base.Broadcast: BroadcastStyle, ArrayStyle
 BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
+BroadcastStyle(::Type{<:OneHotMatrix{<:ROCArray}}) = ArrayStyle{ROCArray}()
 cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
+rocconvert(x::OneHotMatrix{<:ROCArray}) = OneHotMatrix(x.height, rocconvert(x.data))
 
 """
     onehot(l, labels[, unk])
diff --git a/src/rocm/rocm.jl b/src/rocm/rocm.jl
new file mode 100644
index 0000000000..bd08eaa095
--- /dev/null
+++ b/src/rocm/rocm.jl
@@ -0,0 +1,7 @@
+module ROCm
+
+using ..ROCArrays
+
+# TODO: MIOpen stuff included here
+
+end
diff --git a/test/cuda/cuda.jl b/test/cuda/cuda.jl
index ddd92e1ed3..826bb92063 100644
--- a/test/cuda/cuda.jl
+++ b/test/cuda/cuda.jl
@@ -2,7 +2,7 @@ using Flux, Test
 using Flux.CuArrays
 using Flux: gpu
 
-@info "Testing GPU Support"
+@info "Testing CUDA GPU Support"
 
 @testset "CuArrays" begin
 
diff --git a/test/rocm/miopen.jl b/test/rocm/miopen.jl
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/test/rocm/rocm.jl b/test/rocm/rocm.jl
new file mode 100644
index 0000000000..d997cc04fa
--- /dev/null
+++ b/test/rocm/rocm.jl
@@ -0,0 +1,65 @@
+using Flux, Test
+using Flux.ROCArrays
+using Flux: rocgpu
+
+@info "Testing ROCm GPU Support"
+
+@testset "ROCArrays" begin
+
+ROCArrays.allowscalar(false)
+
+x = randn(5, 5)
+cx = rocgpu(x)
+@test cx isa ROCArray
+
+@test Flux.onecold(rocgpu([1.0, 2.0, 3.0])) == 3
+
+x = Flux.onehotbatch([1, 2, 3], 1:3)
+cx = rocgpu(x)
+@test cx isa Flux.OneHotMatrix && cx.data isa ROCArray
+@test (cx .+ 1) isa ROCArray
+
+m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
+cm = rocgpu(m)
+
+@test all(p isa ROCArray for p in params(cm))
+@test cm(rocgpu(rand(10, 10))) isa ROCArray{Float32,2}
+
+x = [1,2,3]
+cx = rocgpu(x)
+@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
+@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
+@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=roc([1.0;2.0;3.0]))
+
+x = σ.([-1.1491, 0.8619, 0.3127])
+y = [1, 1, 0.]
+@test Flux.binarycrossentropy.(x,y) ≈ Flux.binarycrossentropy.(roc(x),roc(y))
+
+xs = rand(5, 5)
+ys = Flux.onehotbatch(1:5,1:5)
+@test collect(roc(xs) .+ roc(ys)) ≈ collect(xs .+ ys)
+
+c = rocgpu(Conv((2,2),3=>4))
+x = rocgpu(rand(10, 10, 3, 2))
+l = c(rocgpu(rand(10,10,3,2)))
+@test gradient(x -> sum(c(x)), x)[1] isa ROCArray
+
+c = rocgpu(CrossCor((2,2),3=>4))
+x = rocgpu(rand(10, 10, 3, 2))
+l = c(rocgpu(rand(10,10,3,2)))
+@test gradient(x -> sum(c(x)), x)[1] isa ROCArray
+
+end
+
+@testset "onecold rocgpu" begin
+  y = Flux.onehotbatch(ones(3), 1:10) |> rocgpu;
+  @test Flux.onecold(y) isa ROCArray
+  @test y[3,:] isa ROCArray
+end
+
+if isdefined(ROCArrays, :MIOpen)
+  @info "Testing Flux/MIOpen"
+  include("miopen.jl")
+else
+  @warn "MIOpen unavailable, not testing GPU DNN support"
+end
diff --git a/test/runtests.jl b/test/runtests.jl
index 1505e96a7c..814e138a80 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -24,6 +24,11 @@ if Flux.use_cuda[]
 else
   @warn "CUDA unavailable, not testing GPU support"
 end
+if Flux.use_rocm[]
+  include("rocm/rocm.jl")
+else
+  @warn "ROCm unavailable, not testing AMD GPU support"
+end
 
 if VERSION >= v"1.2"
   doctest(Flux)
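Usage sketch of the API this patch adds, assuming a working ROCm stack (i.e. `ROCArrays.configured` is `true`, so `Flux.__init__` sets `use_rocm[]`). The model, layer sizes, and input below are illustrative only, not part of the patch:

```julia
using Flux            # after this patch, Flux exports gpu, cugpu, and rocgpu
using Flux.ROCArrays  # ROCArrays is loaded by Flux, so it is reachable here

# Build a small model on the CPU.
m = Chain(Dense(10, 5, relu), Dense(5, 2), softmax)

# `rocgpu` targets AMD GPUs explicitly; `gpu` tries CUDA first, then ROCm,
# and otherwise returns its argument unchanged.
m_roc = rocgpu(m)

# `roc` is ROCArrays' analogue of CuArrays' `cu` for moving data.
x = ROCArrays.roc(rand(Float32, 10))

m_roc(x)  # forward pass runs on the AMD GPU
```

Note the precedence baked into `gpu` in src/functor.jl: when both backends are functional, CUDA wins, which is why the explicit `cugpu` and `rocgpu` variants are exported alongside it.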