[WIP] Add support for AMD GPUs via ROCArrays #938

Closed · wants to merge 4 commits

43 changes: 43 additions & 0 deletions .gitlab-ci.yml
@@ -39,3 +39,46 @@ julia:nightly:
  tags:
    - nvidia
  allow_failure: true

# ROCm CI

julia-rocm:1.0:
  image: rocm/dev-ubuntu-18.04
  extends:
    - .julia:1.0
    - .test
  tags:
    - rocm

julia-rocm:1.1:
  image: rocm/dev-ubuntu-18.04
  extends:
    - .julia:1.1
    - .test
  tags:
    - rocm

julia-rocm:1.2:
  image: rocm/dev-ubuntu-18.04
  extends:
    - .julia:1.2
    - .test
  tags:
    - rocm

julia-rocm:1.3:
  image: rocm/dev-ubuntu-18.04
  extends:
    - .julia:1.3
    - .test
  tags:
    - rocm

julia-rocm:nightly:
  image: rocm/dev-ubuntu-18.04
  extends:
    - .julia:nightly
    - .test
  tags:
    - rocm
  allow_failure: true
30 changes: 30 additions & 0 deletions Manifest.toml
@@ -1,5 +1,11 @@
# This file is machine-generated - editing it directly is not advised

[[AMDGPUnative]]
deps = ["Adapt", "BinaryProvider", "HSARuntime", "InteractiveUtils", "LLVM", "Libdl"]
git-tree-sha1 = "3c2d1b869b9036d4743a282dfa8e56b43f475356"
uuid = "12f4821f-d7ee-5ba6-b76b-566925c5fcc5"
version = "0.1.0"

[[AbstractFFTs]]
deps = ["LinearAlgebra"]
git-tree-sha1 = "380e36c66edfa099cd90116b24c1ce8cafccac40"
@@ -92,6 +98,11 @@ git-tree-sha1 = "9a11d428dcdc425072af4aea19ab1e8c3e01c032"
uuid = "8f4d0f93-b110-5947-807f-2305c1781a2d"
version = "1.3.0"

[[ConstructionBase]]
git-tree-sha1 = "a2a6a5fea4d6f730ec4c18a76d27ec10e8ec1c50"
uuid = "187b0558-2788-49d3-abe0-74a17ed4e7c9"
version = "1.0.0"

[[CuArrays]]
deps = ["AbstractFFTs", "Adapt", "CEnum", "CUDAapi", "CUDAdrv", "CUDAnative", "DataStructures", "GPUArrays", "Libdl", "LinearAlgebra", "MacroTools", "NNlib", "Printf", "Random", "Requires", "SparseArrays", "TimerOutputs"]
git-tree-sha1 = "4757376a85ffb27d4c4f6cdf9635261e6c3a5fec"
@@ -162,6 +173,12 @@ git-tree-sha1 = "a0a3b927b1a06e63fb8b91950cc7df340b7d912c"
uuid = "0c68f7d7-f131-5f86-a1c3-88cf8149b2d7"
version = "2.0.0"

[[HSARuntime]]
deps = ["CEnum", "Libdl", "Setfield"]
git-tree-sha1 = "ea40d80684b6f986599f19b1840abc00af4a399b"
uuid = "2c364e2c-59fb-59c3-96f3-194112e690e0"
version = "0.2.5"

[[IRTools]]
deps = ["InteractiveUtils", "MacroTools", "Test"]
git-tree-sha1 = "72421971e60917b8cd7737f9577c4f0f87eab306"
@@ -191,6 +208,7 @@ uuid = "929cbde3-209d-540e-8aea-75f648917ca0"
version = "1.3.2"

[[LibGit2]]
deps = ["Printf"]
uuid = "76f85450-5226-5b5a-8eaa-529ad045b433"

[[Libdl]]
@@ -268,6 +286,12 @@ uuid = "9abbd945-dff8-562f-b5e8-e1ebf5ef1b79"
deps = ["InteractiveUtils", "Markdown", "Sockets"]
uuid = "3fa0cd96-eef1-5676-8a61-b3b8758bbffb"

[[ROCArrays]]
deps = ["AMDGPUnative", "Adapt", "CEnum", "GPUArrays", "HSARuntime", "Libdl", "LinearAlgebra"]
git-tree-sha1 = "406f9dd59b38763f97370ec7ab7297782d2aec18"
uuid = "ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc"
version = "0.1.0"

[[Random]]
deps = ["Serialization"]
uuid = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -290,6 +314,12 @@ uuid = "ea8e919c-243c-51af-8825-aaa63cd721ce"
[[Serialization]]
uuid = "9e88b42a-f829-5b0c-bbe9-9e923198166b"

[[Setfield]]
deps = ["ConstructionBase", "MacroTools"]
git-tree-sha1 = "f4adc8effec88c1dc109190f22370cb9648a5ced"
uuid = "efcf1570-3423-57d1-acb7-fd33fddbac46"
version = "0.5.2"

[[SharedArrays]]
deps = ["Distributed", "Mmap", "Random", "Serialization"]
uuid = "1a1011a3-84de-559e-8e89-a11a2f7dc383"
2 changes: 2 additions & 0 deletions Project.toml
@@ -14,6 +14,7 @@ MacroTools = "1914dd2f-81c6-5fcd-8719-6d5c9610ff09"
NNlib = "872c559c-99b0-510c-b3b7-b6c96a88d5cd"
Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ROCArrays = "ddf941ca-5d6a-11e9-36cc-a3fed13dd2fc"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SHA = "ea8e919c-243c-51af-8825-aaa63cd721ce"
@@ -26,6 +27,7 @@ Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[compat]
CuArrays = "1.4.3"
NNlib = "0.6"
ROCArrays = "0.1"
Zygote = "0.4"
julia = "1"

6 changes: 4 additions & 2 deletions docs/src/gpu.md
@@ -1,15 +1,17 @@
# GPU Support

NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) readme.
NVIDIA GPU support should work out of the box on systems with CUDA and CUDNN installed. For more details see the [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) readme. AMD GPU support should work out of the box on systems with ROCm and the required ROCm libraries installed. For more details see the [ROCArrays](https://github.com/jpsamaroo/ROCArrays.jl) readme.

## GPU Usage

Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl). Flux is agnostic to array types, so we simply need to move model weights and data to the GPU and Flux will handle it.
Support for array operations on other hardware backends, like GPUs, is provided by external packages like [CuArrays](https://github.com/JuliaGPU/CuArrays.jl) and [ROCArrays](https://github.com/jpsamaroo/ROCArrays.jl). Flux is agnostic to array types, so we simply need to move model weights and data to the GPU and Flux will handle it.

For example, we can use `CuArrays` (with the `cu` converter) to run our [basic example](models/basics.md) on an NVIDIA GPU.

(Note that you need to have CUDA available to use CuArrays – please see the [CuArrays.jl](https://github.com/JuliaGPU/CuArrays.jl) instructions for more details.)

(Note that the following examples also work on AMD GPUs: load `ROCArrays` instead of `CuArrays` and replace `cu` with `roc`. The `gpu` function will automatically use ROCArrays when CUDA is unavailable.)

```julia
using CuArrays

# …
```
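For readers who want to try the AMD path, here is a minimal sketch of what the note above describes. It is hypothetical usage assuming a configured ROCm stack, with `roc` as the ROCArrays counterpart of `cu` (see the `src/functor.jl` change below):

```julia
# Hypothetical sketch: the docs' basic example on an AMD GPU.
# Assumes a working ROCm installation; `roc` mirrors CuArrays' `cu`.
using ROCArrays

W = roc(rand(2, 5))  # a 2×5 ROCArray
b = roc(rand(2))

predict(x) = W*x .+ b
loss(x, y) = sum((predict(x) .- y).^2)

x, y = roc(rand(5)), roc(rand(2))  # dummy data, also on the GPU
loss(x, y)  # runs on the AMD device
```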
10 changes: 9 additions & 1 deletion src/Flux.jl
@@ -11,7 +11,7 @@ export gradient

export Chain, Dense, Maxout, RNN, LSTM, GRU, Conv, CrossCor, ConvTranspose, MaxPool, MeanPool,
DepthwiseConv, Dropout, AlphaDropout, LayerNorm, BatchNorm, InstanceNorm, GroupNorm,
SkipConnection, params, fmap, cpu, gpu, f32, f64
SkipConnection, params, fmap, cpu, gpu, cugpu, rocgpu, f32, f64

include("optimise/Optimise.jl")
using .Optimise
@@ -23,6 +23,8 @@ export SGD, Descent, ADAM, Momentum, Nesterov, RMSProp,

using CuArrays
const use_cuda = Ref(false)
using ROCArrays
const use_rocm = Ref(false)

include("utils.jl")
include("onehot.jl")
@@ -52,6 +54,12 @@ function __init__()
      @warn "CuArrays.jl did not find libcudnn. Some functionality will not be available."
    end
  end
  if !ROCArrays.configured
    # Nothing to do here; ROCArrays or one of its dependencies will already have warned.
  else
    use_rocm[] = true
    include(joinpath(@__DIR__, "rocm/rocm.jl"))
  end
end

end # module
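Since `use_cuda` and `use_rocm` are plain `Ref`s set once in `__init__`, downstream code can check which backend was picked up at load time (the flags are unexported but reachable, which is how `test/runtests.jl` below gates the test suites). A small sketch:

```julia
using Flux

# Query the backend flags Flux set during __init__.
if Flux.use_cuda[]
    @info "CUDA backend active: gpu() will call CuArrays.cu"
elseif Flux.use_rocm[]
    @info "ROCm backend active: gpu() will call ROCArrays.roc"
else
    @info "No GPU backend configured: gpu() is a no-op"
end
```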
5 changes: 4 additions & 1 deletion src/functor.jl
@@ -73,7 +73,10 @@ end

cpu(m) = fmap(x -> adapt(Array, x), m)

gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
gpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) :
         (use_rocm[] ? fmap(ROCArrays.roc, x) : x)
cugpu(x) = use_cuda[] ? fmap(CuArrays.cu, x) : x
rocgpu(x) = use_rocm[] ? fmap(ROCArrays.roc, x) : x

# Precision

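Note the precedence these definitions encode: `gpu` prefers CUDA and only falls back to ROCm when CUDA is absent, while `cugpu` and `rocgpu` each pin one backend. A usage sketch:

```julia
using Flux

m = Dense(10, 5)

m_auto = gpu(m)     # CUDA if configured, else ROCm, else returned unchanged
m_cuda = cugpu(m)   # CUDA or unchanged; never ROCm
m_amd  = rocgpu(m)  # ROCm or unchanged; never CUDA

x = rand(Float32, 10)
m_auto(gpu(x))      # move data with the same converter as the model
```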
4 changes: 3 additions & 1 deletion src/layers/stateless.jl
@@ -1,4 +1,5 @@
using CuArrays
using ROCArrays
using NNlib: logsoftmax, logσ

# Cost functions
@@ -36,8 +37,9 @@ Return `-y*log(ŷ + ϵ) - (1-y)*log(1-ŷ + ϵ)`. The ϵ term provides numerical stability.
"""
binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)

# Re-definition to fix interaction with CuArrays.
# Re-definition to fix interaction with CuArrays and ROCArrays
CuArrays.@cufunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)
# FIXME: ROCArrays.@rocfunc binarycrossentropy(ŷ, y; ϵ=eps(ŷ)) = -y*log(ŷ + ϵ) - (1 - y)*log(1 - ŷ + ϵ)

"""
logitbinarycrossentropy(logŷ, y)
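The `@cufunc` redefinition above exists so that broadcasting `binarycrossentropy` over `CuArray`s substitutes CUDA-compatible scalar functions for `log`; until a `@rocfunc` counterpart exists (the FIXME), ROCm goes through the generic broadcast path. A sketch of the call this serves, assuming a CUDA device:

```julia
using Flux, CuArrays

ŷ = cu(rand(Float32, 3))        # predictions on the GPU
y = cu(Float32[1, 0, 1])        # binary targets on the GPU
Flux.binarycrossentropy.(ŷ, y)  # broadcast picks up the @cufunc version
```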
3 changes: 3 additions & 0 deletions src/onehot.jl
@@ -38,9 +38,12 @@ import Adapt: adapt, adapt_structure
adapt_structure(T, xs::OneHotMatrix) = OneHotMatrix(xs.height, adapt(T, xs.data))

import .CuArrays: CuArray, cudaconvert
import .ROCArrays: ROCArray, rocconvert
import Base.Broadcast: BroadcastStyle, ArrayStyle
BroadcastStyle(::Type{<:OneHotMatrix{<:CuArray}}) = ArrayStyle{CuArray}()
BroadcastStyle(::Type{<:OneHotMatrix{<:ROCArray}}) = ArrayStyle{ROCArray}()
cudaconvert(x::OneHotMatrix{<:CuArray}) = OneHotMatrix(x.height, cudaconvert(x.data))
rocconvert(x::OneHotMatrix{<:ROCArray}) = OneHotMatrix(x.height, rocconvert(x.data))

"""
onehot(l, labels[, unk])
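These `BroadcastStyle` and `rocconvert` methods are what let a `OneHotMatrix` backed by a `ROCArray` participate in GPU broadcasting, mirroring the existing `CuArray` path. A sketch, assuming a configured ROCm device:

```julia
using Flux

y = Flux.onehotbatch([1, 2, 3], 1:3) |> rocgpu
y .+ 1  # BroadcastStyle routes this to the ROCArray backend,
        # so the result is a ROCArray, not a CPU Array
```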
7 changes: 7 additions & 0 deletions src/rocm/rocm.jl
@@ -0,0 +1,7 @@
module ROCm

using ..ROCArrays

# TODO: MIOpen integration will be included here

end
2 changes: 1 addition & 1 deletion test/cuda/cuda.jl
@@ -2,7 +2,7 @@ using Flux, Test
using Flux.CuArrays
using Flux: gpu

@info "Testing GPU Support"
@info "Testing CUDA GPU Support"

@testset "CuArrays" begin

Empty file added test/rocm/miopen.jl
65 changes: 65 additions & 0 deletions test/rocm/rocm.jl
@@ -0,0 +1,65 @@
using Flux, Test
using Flux.ROCArrays
using Flux: rocgpu

@info "Testing ROCm GPU Support"

@testset "ROCArrays" begin

ROCArrays.allowscalar(false)

x = randn(5, 5)
cx = rocgpu(x)
@test cx isa ROCArray

@test Flux.onecold(rocgpu([1.0, 2.0, 3.0])) == 3

x = Flux.onehotbatch([1, 2, 3], 1:3)
cx = rocgpu(x)
@test cx isa Flux.OneHotMatrix && cx.data isa ROCArray
@test (cx .+ 1) isa ROCArray

m = Chain(Dense(10, 5, tanh), Dense(5, 2), softmax)
cm = rocgpu(m)

@test all(p isa ROCArray for p in params(cm))
@test cm(rocgpu(rand(10, 10))) isa ROCArray{Float32,2}

x = [1,2,3]
cx = rocgpu(x)
@test Flux.crossentropy(x,x) ≈ Flux.crossentropy(cx,cx)
@test Flux.crossentropy(x,x, weight=1.0) ≈ Flux.crossentropy(cx,cx, weight=1.0)
@test Flux.crossentropy(x,x, weight=[1.0;2.0;3.0]) ≈ Flux.crossentropy(cx,cx, weight=rocgpu([1.0;2.0;3.0]))

x = σ.([-1.1491, 0.8619, 0.3127])
y = [1, 1, 0.]
@test Flux.binarycrossentropy.(x,y) ≈ Flux.binarycrossentropy.(rocgpu(x),rocgpu(y))

xs = rand(5, 5)
ys = Flux.onehotbatch(1:5,1:5)
@test collect(rocgpu(xs) .+ rocgpu(ys)) ≈ collect(xs .+ ys)

c = rocgpu(Conv((2,2),3=>4))
x = rocgpu(rand(10, 10, 3, 2))
l = c(rocgpu(rand(10,10,3,2)))
@test gradient(x -> sum(c(x)), x)[1] isa ROCArray

c = rocgpu(CrossCor((2,2),3=>4))
x = rocgpu(rand(10, 10, 3, 2))
l = c(rocgpu(rand(10,10,3,2)))
@test gradient(x -> sum(c(x)), x)[1] isa ROCArray

end

@testset "onecold rocgpu" begin
y = Flux.onehotbatch(ones(3), 1:10) |> rocgpu;
@test Flux.onecold(y) isa ROCArray
@test y[3,:] isa ROCArray
end

if isdefined(ROCArrays, :MIOpen)
@info "Testing Flux/MIOpen"
include("miopen.jl")
else
@warn "MIOpen unavailable, not testing GPU DNN support"
end
5 changes: 5 additions & 0 deletions test/runtests.jl
@@ -24,6 +24,11 @@ if Flux.use_cuda[]
else
  @warn "CUDA unavailable, not testing GPU support"
end
if Flux.use_rocm[]
  include("rocm/rocm.jl")
else
  @warn "ROCm unavailable, not testing ROCm GPU support"
end

if VERSION >= v"1.2"
doctest(Flux)