diff --git a/.github/workflows/ci-julia-nightly.yml b/.github/workflows/ci-julia-nightly.yml
new file mode 100644
index 0000000..9c1eabd
--- /dev/null
+++ b/.github/workflows/ci-julia-nightly.yml
@@ -0,0 +1,49 @@
+name: CI (Julia nightly)
+on:
+  pull_request:
+    branches:
+      - master
+  push:
+    branches:
+      - master
+    tags: '*'
+jobs:
+  test-julia-nightly:
+    name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        JULIA_NUM_THREADS:
+          - '1'
+          - '6'
+        version:
+          - 'nightly'
+        os:
+          - ubuntu-latest
+        arch:
+          - x64
+    steps:
+      - uses: actions/checkout@v2
+      - uses: julia-actions/setup-julia@v1
+        with:
+          version: ${{ matrix.version }}
+          arch: ${{ matrix.arch }}
+      - uses: actions/cache@v1
+        env:
+          cache-name: cache-artifacts
+        with:
+          path: ~/.julia/artifacts
+          key: ${{ runner.os }}-test-${{ env.cache-name }}-${{ hashFiles('**/Project.toml') }}
+          restore-keys: |
+            ${{ runner.os }}-test-${{ env.cache-name }}-
+            ${{ runner.os }}-test-
+            ${{ runner.os }}-
+      - uses: julia-actions/julia-buildpkg@v1
+      - uses: julia-actions/julia-runtest@v1
+        env:
+          JULIA_NUM_THREADS: ${{ matrix.JULIA_NUM_THREADS }}
+      - uses: julia-actions/julia-processcoverage@v1
+      - uses: codecov/codecov-action@v1
+        with:
+          file: lcov.info
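A rough way to reproduce this thread matrix locally, for debugging failures outside CI. This is only a sketch, not part of the workflow file: it assumes `julia` is on the PATH and that the working directory is the package root. `JULIA_NUM_THREADS` must be set before Julia starts, so each test run is launched as a fresh process.

```julia
# Local approximation of the JULIA_NUM_THREADS matrix above (illustrative only).
for nthreads in ("1", "6")
    withenv("JULIA_NUM_THREADS" => nthreads) do
        run(`julia --project=. -e 'using Pkg; Pkg.test()'`)
    end
end
```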
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index c3ef421..b927098 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -11,14 +11,15 @@ jobs:
   test:
     name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
     runs-on: ${{ matrix.os }}
-    continue-on-error: ${{ matrix.version == 'nightly' }}
     strategy:
       fail-fast: false
       matrix:
+        JULIA_NUM_THREADS:
+          - '1'
+          - '6'
         version:
-          - '1.4' # Replace this with the minimum Julia version that your package supports. E.g. if your package requires Julia 1.5 or higher, change this to '1.5'.
-          - '1' # Leave this line unchanged. '1' will automatically expand to the latest stable 1.x release of Julia.
-          - 'nightly'
+          - '1.4'
+          - '1' # automatically expands to the latest stable 1.x release of Julia
         os:
           - ubuntu-latest
         arch:
@@ -41,6 +42,8 @@ jobs:
             ${{ runner.os }}-
       - uses: julia-actions/julia-buildpkg@v1
       - uses: julia-actions/julia-runtest@v1
+        env:
+          JULIA_NUM_THREADS: ${{ matrix.JULIA_NUM_THREADS }}
       - uses: julia-actions/julia-processcoverage@v1
       - uses: codecov/codecov-action@v1
         with:
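What the new matrix entries exercise is Tullio's threaded versus serial code paths, as described in the README below: on large enough arrays `@tullio` divides the work with `Threads.@spawn`, and `threads=false` forces the single-threaded path. A minimal sketch, with arbitrary array sizes:

```julia
using Tullio
# Under JULIA_NUM_THREADS > 1 the first call may split the work across threads;
# the second always runs serially. Both should agree.
A = rand(500, 500); B = rand(500, 500);
@tullio C[i,k] := A[i,j] * B[j,k]
@tullio threads=false D[i,k] := A[i,j] * B[j,k]
C ≈ D
```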
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index cb2da3d..0000000
--- a/.travis.yml
+++ /dev/null
@@ -1,22 +0,0 @@
-language: julia
-
-os:
-  - linux
-julia:
-  - 1.4
-  - 1
-  - nightly
-env:
-  - JULIA_NUM_THREADS=1
-  - JULIA_NUM_THREADS=6
-
-notifications:
-  email: false
-
-jobs:
-  allow_failures:
-    - julia: nightly
-
-cache:
-  directories:
-    - $HOME/.julia/artifacts
diff --git a/Project.toml b/Project.toml
index e03dacc..10afac0 100644
--- a/Project.toml
+++ b/Project.toml
@@ -9,8 +9,19 @@ LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Requires = "ae029012-a4dd-5104-9daa-d747884805df"
[compat]
+CUDA = "1, 2"
DiffRules = "1"
+FillArrays = "0.10"
+ForwardDiff = "0.10"
+KernelAbstractions = "0.4"
+LoopVectorization = "0.8.26, 0.9.7"
+NamedDims = "0.2"
+OffsetArrays = "1"
Requires = "1"
+TensorOperations = "3"
+Tracker = "0.2"
+VectorizationBase = "0.12.33, 0.13.10"
+Zygote = "0.5"
julia = "1.3"
[extras]
@@ -27,7 +38,8 @@ Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
TensorOperations = "6aa20fa7-93e2-5fca-9bc0-fbd0db3c71a2"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
Tracker = "9f7883ad-71c0-57eb-9f7f-b5c9e6d3789c"
+VectorizationBase = "3d5dd08c-fd9d-11e8-17fa-ed2836048c2f"
Zygote = "e88e6eb3-aa80-5325-afca-941959d7151f"
[targets]
-test = ["Test", "CUDA", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "Zygote"]
+test = ["Test", "CUDA", "FillArrays", "ForwardDiff", "KernelAbstractions", "LinearAlgebra", "LoopVectorization", "NamedDims", "OffsetArrays", "Printf", "Random", "TensorOperations", "Tracker", "VectorizationBase", "Zygote"]
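The new `[compat]` entries use Julia's standard semver ranges, so `"0.8.26, 0.9.7"` is a union of two caret ranges, roughly `[0.8.26, 0.9.0)` and `[0.9.7, 0.10.0)`. A quick illustrative check, using `Pkg.Types.semver_spec`, which is an internal Pkg helper and only shown here as a sanity check of how the bound is read:

```julia
using Pkg
# The LoopVectorization bound added above.
spec = Pkg.Types.semver_spec("0.8.26, 0.9.7")
v"0.8.30" in spec  # true
v"0.9.7"  in spec  # true
v"0.9.1"  in spec  # false: too old within the 0.9 series
```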
diff --git a/README.md b/README.md
index 0067e77..00c0a15 100644
--- a/README.md
+++ b/README.md
@@ -1,8 +1,7 @@
Tullio.jl
-
-[](https://github.com/mcabbott/Tullio.jl/actions)
+[](https://github.com/mcabbott/Tullio.jl/actions?query=workflow%3ACI)
[](https://gitlab.com/JuliaGPU/Tullio.jl/-/pipelines)
[](https://github.com/mcabbott/Tullio.jl/releases)
@@ -21,7 +20,7 @@ Tullio is a very flexible einsum macro. It understands many array operations wri
Used by itself the macro writes ordinary nested loops much like [`Einsum.@einsum`](https://github.com/ahwillia/Einsum.jl).
One difference is that it can parse more expressions (such as the convolution `M`, and worse).
-Another is that it will use multi-threading (via [`Threads.@spawn`](https://julialang.org/blog/2019/07/multithreading/)) and recursive tiling, on large enough arrays.
+Another is that it will use multi-threading (via [`Threads.@spawn`](https://julialang.org/blog/2019/07/multithreading/)) and recursive tiling, on large enough arrays.
But it also co-operates with various other packages, provided they are loaded before the macro is called:
* It uses [`LoopVectorization.@avx`](https://github.com/chriselrod/LoopVectorization.jl) to speed many things up. (Disable with `avx=false`.) On a good day this will match the speed of OpenBLAS for matrix multiplication.
@@ -48,18 +47,18 @@ The expression need not be just one line, for example:
Here the macro cannot infer the range of the output's indices `x,y`, so they must be provided explicitly.
(If writing into an existing array, with `out[x,y] = begin ...` or `+=`, then ranges would be taken from there.)
-Because it sees assignment being made, it does not attempt to sum over `a,b`, and it assumes that indices could go out of bounds so does not add `@inbounds` for you.
+Because it sees assignment being made, it does not attempt to sum over `a,b`, and it assumes that indices could go out of bounds so does not add `@inbounds` for you.
(Although in fact `mod(x+a) == mod(x+a, axes(mat,1))` is safe.)
It will also not be able to take a symbolic derivative, but dual numbers will work fine.
Pipe operators `|>` or `<|` indicate functions to be performed *outside* the sum, for example:
```julia
-@tullio lse[j] := log <| exp(mat[i,j]) # vec(log.(sum(exp.(mat), dims=1)))
+@tullio lse[j] := log <| exp(mat[i,j]) # vec(log.(sum(exp.(mat), dims=1)))
```
The option `@tullio verbose=true` will cause it to print index ranges, symbolic derivatives,
-and notices when it is unable to use the packages mentioned above.
+and notices when it is unable to use the packages mentioned above.
And `verbose=2` will print everything.
Notation
@@ -93,7 +92,7 @@ S = [0,1,0,0, 0,0,0,0]
fft(S) ≈ @tullio F[k] := S[x] * exp(-im*pi/8 * (k-1) * x) (k ∈ axes(S,1))
# Finalisers <| or |> are applied after sum (the two are equivalent):
-@tullio N2[j] := sqrt <| M[i,j]^2 # N2 ≈ map(norm, eachcol(M))
+@tullio N2[j] := sqrt <| M[i,j]^2 # N2 ≈ map(norm, eachcol(M))
@tullio n3[_] := A[i]^3 |> (_)^(1/3) # n3[1] ≈ norm(A,3), with _ anon. func.
# Reduction over any function:
@@ -115,8 +114,8 @@ using NamedDims, AxisKeys # Dimension names, plus pretty printing:
Fast & slow
-When used with LoopVectorization, on straightforward matrix multiplication of real numbers,
-`@tullio` tends to be about as fast as OpenBLAS. Depending on the size, and on your computer.
+When used with LoopVectorization, on straightforward matrix multiplication of real numbers,
+`@tullio` tends to be about as fast as OpenBLAS. Depending on the size, and on your computer.
Here's a speed comparison on mine: [v2.5](https://github.com/mcabbott/Tullio.jl/blob/master/benchmarks/02/matmul-0.2.5-Float64-1.5.0.png).
This is a useful diagnostic, but isn't really the goal. Two things `@tullio` is often
@@ -146,7 +145,7 @@ X = rand(1000,1000);
Complex numbers aren't handled by LoopVectorization, so will be much slower.
Chained multiplication is also very slow, because it doesn't know there's a better
-algorithm. Here it just makes 4 loops, instead of multiplying sequentially,
+algorithm. Here it just makes 4 loops, instead of multiplying sequentially,
`30^4` instead of `2 * 30^3` operations:
```julia
@@ -155,7 +154,7 @@ M1, M2, M3 = randn(30,30), randn(30,30), randn(30,30);
@btime @tullio M4[i,l] := $M1[i,j] * $M2[j,k] * $M3[k,l]; # 30.401 μs
```
-At present indices using `pad`, `clamp` or `mod` are also slow. These result in extra
+At present indices using `pad`, `clamp` or `mod` are also slow. These result in extra
checks or operations at every iteration, not just around the edges:
```julia
@@ -169,7 +168,7 @@ x100 = rand(100,100); k7 = randn(7,7);
@btime conv3($x100, $k7); # 283.634 μs
using Flux
-x104 = reshape(x100,(100,100,1,1)); k74 = reshape(k7,(7,7,1,1));
+x104 = reshape(x100,(100,100,1,1)); k74 = reshape(k7,(7,7,1,1));
conv1(x100, k7) ≈ @btime CrossCor($k74, false)($x104) # 586.694 μs
conv2(x100, k7) ≈ @btime Conv($k74, false, stride=2)($x104) # 901.573 μs
conv3(x100, k7) ≈ @btime Conv($k74, false, pad=3)($x104) # 932.658 μs
@@ -180,7 +179,7 @@ conv3(x100, k7) ≈ @btime Conv($k74, false, pad=3)($x104) # 932.658 μs
```julia
using Tullio
-mul(A, B) = @tullio C[i,k] := A[i,j] * B[j,k]
+mul(A, B) = @tullio C[i,k] := A[i,j] * B[j,k]
A = rand(3,40); B = rand(40,500);
A * B ≈ mul(A, B) # true
@@ -219,7 +218,7 @@ end (x in 1:10, y in 1:10) # and prevents range of x from being inferred.
# A stencil?
offsets = [(a,b) for a in -2:2 for b in -2:2 if a>=b] # vector of tuples
-@tullio out[x,y,1] = begin
+@tullio out[x,y,1] = begin
     a,b = offsets[k]
     i = clamp(x+a, extrema(axes(mat,1))...)
     # j = clamp(y+b, extrema(axes(mat,2))...) # can be written clamp(y+b)
@@ -241,11 +240,11 @@ Zygote.gradient(sum∘rowmap, fs, ones(3,2))
Options
The default setting is:
-```@tullio threads=true fastmath=true avx=true tensor=true cuda=256 grad=Base verbose=false A[i,j] := ...```
+```@tullio threads=true fastmath=true avx=true tensor=true cuda=256 grad=Base verbose=false A[i,j] := ...```
* `threads=false` turns off threading, while `threads=64^3` sets a threshold size at which to divide the work (replacing the macro's best guess).
* `avx=false` turns off the use of `LoopVectorization`, while `avx=4` inserts `@avx unroll=4 for i in ...`.
* `grad=false` turns off gradient calculation, and `grad=Dual` switches it to use `ForwardDiff` (which must be loaded).
-* `nograd=A` turns of the gradient calculation just for `A`, and `nograd=(A,B,C)` does this for several arrays.
+* `nograd=A` turns off the gradient calculation just for `A`, and `nograd=(A,B,C)` does this for several arrays.
* `tensor=false` turns off the use of `TensorOperations`.
* Assignment `xi = ...` removes `xi` from the list of indices: its range is not calculated, and it will not be summed over. It also disables `@inbounds` since this is now up to you.
* `verbose=true` prints things like the index ranges inferred, and gradient calculations. `verbose=2` prints absolutely everything.
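A small usage sketch combining a few of the keyword options documented in the list above (arrays and sizes are arbitrary, and the options go before the expression, as in the default-settings line):

```julia
using Tullio
A = rand(100, 100); B = rand(100, 100);
@tullio verbose=true C[i,k] := A[i,j] * B[j,k]             # prints the index ranges it infers
@tullio threads=64^3 avx=false D[i,k] := A[i,j] * B[j,k]   # custom thread threshold, no @avx
C ≈ D
```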
@@ -256,20 +255,20 @@ The default setting is:
Implicit:
* Indices without shifts must have the same range everywhere they appear, but those with shifts (even `A[i+0]`) run over the intersection of possible ranges.
* Shifted output indices must start at 1, unless `OffsetArrays` is visible in the calling module.
-* The use of `@avx`, and the calculation of gradients, are switched off by sufficiently complex syntax (such as arrays of arrays).
+* The use of `@avx`, and the calculation of gradients, are switched off by sufficiently complex syntax (such as arrays of arrays).
* Gradient hooks are attached for any or all of `ReverseDiff`, `Tracker` & `Zygote`. These packages need not be loaded when the macro is run.
* Gradients are only defined for reductions over `(+)` (default) and `min`, `max`.
* GPU kernels are only constructed when both `KernelAbstractions` and `CUDA` are visible. The default `cuda=256` is passed to `kernel(CUDA(), 256)`.
* The CPU kernels from `KernelAbstractions` are called only when `threads=false`; they are not at present very fast, but perhaps useful for testing.
Extras:
-* `A[i] := i^2 (i in 1:10)` is how you specify a range for indices when this can't be inferred.
+* `A[i] := i^2 (i in 1:10)` is how you specify a range for indices when this can't be inferred.
* `A[i] := B[i, $col] - C[i, 2]` is how you fix one index to a constant (to prevent `col` being summed over).
-* `A[i] := $d * B[i]` is the preferred way to include other constants. Note that no gradient is calculated for `d`.
+* `A[i] := $d * B[i]` is the preferred way to include other constants. Note that no gradient is calculated for `d`.
* Within indexing, `A[mod(i), clamp(j)]` both maps `i` & `j` to lie within `axes(A)`, and disables inference of their ranges from `A`.
* Similarly, `A[pad(i,3)]` extends the range of `i`, inserting zeros outside of `A`. Instead of zero, `pad=NaN` uses this value as padding. The implementation of this (and `mod`, `clamp`) is not very fast at present.
* On the left, when making a new array, an underscore like `A[i+_] :=` inserts whatever shift is needed to make `A` one-based.
-* `Tullio.@printgrad (x+y)*log(x/z) x y z` prints out how symbolic derivatives will be done.
+* `Tullio.@printgrad (x+y)*log(x/z) x y z` prints out how symbolic derivatives will be done.
Interals
@@ -386,7 +385,7 @@ function ∇act!(::Type, ΔC, ΔA, ΔB, C, A, B, ax_i, ax_j, ax_k, keep)
end
```
-Writing `@tullio verbose=2` will print all of these functions out.
+Writing `@tullio verbose=2` will print all of these functions out.
Scalar reductions, such as `@tullio s := A[i,j] * log(B[j,i])`, are slightly different in that the `act!` function simply returns the sum, i.e. the variable `acc` above.
@@ -395,7 +394,7 @@ Scalar reductions, such as `@tullio s := A[i,j] * log(B[j,i])`, are slightly dif
Back-end friends & relatives:
-* [LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl) is used here, if available.
+* [LoopVectorization.jl](https://github.com/chriselrod/LoopVectorization.jl) is used here, if available.
* [Gaius.jl](https://github.com/MasonProtter/Gaius.jl) and [PaddedMatrices.jl](https://github.com/chriselrod/PaddedMatrices.jl) build on that.
@@ -415,7 +414,7 @@ Front-end near-lookalikes:
Things you can't run:
-* [Tortilla.jl](https://www.youtube.com/watch?v=Rp7sTl9oPNI) seems to exist, publicly, only in this very nice talk.
+* [Tortilla.jl](https://www.youtube.com/watch?v=Rp7sTl9oPNI) seems to exist, publicly, only in this very nice talk.
* [ArrayMeta.jl](https://github.com/shashi/ArrayMeta.jl) was a Julia 0.5 take on some of this.
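Several of the README hunks above touch the `$`-interpolation rules (passing a constant factor, and fixing one index to a constant). A brief runnable sketch of those two cases, with made-up data:

```julia
using Tullio
B = collect(1.0:5.0); d = 2.5
@tullio A[i] := $d * B[i]      # $d enters as a constant, with no gradient
A == d .* B                    # true

M = rand(4, 3); col = 2
@tullio v[i] := M[i, $col]     # fixes the second index, so col is not summed over
v == M[:, col]                 # true
```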
diff --git a/test/runtests.jl b/test/runtests.jl
index fddeed7..cc62603 100644
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -205,17 +205,17 @@ _gradient(x...) = Yota.grad(x...)[2]
 t8 = time()
 using LoopVectorization
+using VectorizationBase
-if isdefined(LoopVectorization, :SVec) # version 0.8, for Julia ⩽1.5
-    using LoopVectorization.VectorizationBase: SVec, Mask
-else # version 0.9, supports Julia 1.6
-    using LoopVectorization.VectorizationBase: Vec, Mask
-    SVec{N,T} = Vec{N,T}
+@static if Base.VERSION >= v"1.5"
+    const Vec = VectorizationBase.Vec
+else
+    const Vec = VectorizationBase.SVec
 end
 @testset "LoopVectorization onlyone" begin
-    ms = Mask{UInt8}(0x03); # Mask{8,Bool}<1, 1, 0, 0, 0, 0, 0, 0>
-    sv = SVec{4,Int}(1,2,3,4) # SVec{4,Int64}<1, 2, 3, 4>
+    ms = mask(Val(8), 2) # Mask{8,Bool}<1, 1, 0, 0, 0, 0, 0, 0>
+    sv = Vec{4,Int}(1,2,3,4) # Vec{4,Int64}<1, 2, 3, 4>
     # preliminaries:
     @test Tullio.allzero(sv) === false