Skip to content

Commit 5439bc3

Browse files
authored
Merge pull request #57 from mcabbott/avxci2
More LoopVectorization tests & checks
2 parents cf5e2aa + 09b60f1 commit 5439bc3

File tree

12 files changed

+228
-185
lines changed

12 files changed

+228
-185
lines changed

.buildkite/pipeline.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
env:
2-
JULIA_NUM_THREADS: "1"
2+
JULIA_NUM_THREADS: "6"
33
# SECRET_CODECOV_TOKEN: "..."
44

55
steps:

.github/workflows/ci-julia-nightly.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
tags: '*'
1010
jobs:
1111
test-julia-nightly:
12-
name: NIGHTLY/t-${{ matrix.threads }}/group-${{ matrix.group }}/${{ github.event_name }}/${{ matrix.arch }}+${{ matrix.os }}
12+
name: NIGHTLY -t${{ matrix.threads }} / group-${{ matrix.group }} / ${{ github.event_name }} / ${{ matrix.os }}+${{ matrix.arch }}
1313
runs-on: ${{ matrix.os }}
1414
strategy:
1515
fail-fast: false
@@ -24,7 +24,7 @@ jobs:
2424
- ubuntu-latest
2525
threads:
2626
- '1'
27-
- '2'
27+
- '6'
2828
version:
2929
- 'nightly'
3030
steps:

.github/workflows/ci.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ on:
99
tags: '*'
1010
jobs:
1111
test:
12-
name: v${{ matrix.version }}/t-${{ matrix.threads }}/group-${{ matrix.group }}/${{ github.event_name }}/${{ matrix.arch }}+${{ matrix.os }}
12+
name: v${{ matrix.version }} -t${{ matrix.threads }} / group-${{ matrix.group }} / ${{ github.event_name }} / ${{ matrix.os }}+${{ matrix.arch }}
1313
runs-on: ${{ matrix.os }}
1414
strategy:
1515
fail-fast: false
@@ -24,7 +24,7 @@ jobs:
2424
- ubuntu-latest
2525
threads:
2626
- '1'
27-
- '2'
27+
- '6' # t>2 might be ignored on Julia <= 1.5
2828
version:
2929
- '1.4'
3030
- '1' # automatically expands to the latest stable 1.x release of Julia

Project.toml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@ CUDA = "1, 2"
1313
DiffRules = "1"
1414
FillArrays = "0.10"
1515
ForwardDiff = "0.10"
16-
KernelAbstractions = "0.4"
17-
LoopVectorization = "0.8.26, 0.9.7"
16+
KernelAbstractions = "0.5.2"
17+
LoopVectorization = "0.8.26, 0.9.20"
1818
NamedDims = "0.2"
1919
OffsetArrays = "1"
2020
Requires = "1"
2121
TensorOperations = "3"
2222
Tracker = "0.2"
23-
VectorizationBase = "0.12.33, 0.13.10"
24-
Zygote = "0.5"
23+
VectorizationBase = "0.12.33, 0.15.7"
24+
Zygote = "0.6"
2525
julia = "1.3"
2626

2727
[extras]

README.md

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@ using Tullio
9999
A = [abs2(i - 11) for i in 1:21]
100100

101101
# Downsample -- range of i is that allowed by both terms:
102-
@tullio D[i] := (A[2i] + A[2i+1])/2 # 1:10 == intersect(1:10, 0:10)
102+
@tullio B[i] := (A[2i] + A[2i+1])/2 # 1:10 == intersect(1:10, 0:10)
103103

104104
# Shifts -- range of i calculated in terms of that given for j:
105105
@tullio M[i,j] := A[i+j-1] (j in 1:15) # i in 1:7
@@ -129,6 +129,9 @@ fft(S) ≈ @tullio F[k] := S[x] * exp(-im*pi/8 * (k-1) * x) (k ∈ axes(S,1))
129129
@tullio (*) P[i] := A[i+k] (k in 0:2) # product
130130
@tullio (max) X[i,_] := D[i,j] # maximum(D, dims=2), almost
131131

132+
min1(x,y) = ifelse(first(x) < first(y), x, y); # findmin(D, dims=1), almost:
133+
@tullio (min1) Ts[j+_] := (D[i,j], (i,j)) init=(typemax(Int), (0,0))
134+
132135
# Access to fields & arrays -- this uses j ∈ eachindex(first(N).c)
133136
N = [(a=i, b=i^2, c=fill(i^3,3)) for i in 1:10]
134137
@tullio T[i,j] := (N[i].a // 1, N[i].c[j])
@@ -449,7 +452,7 @@ Front-end near-lookalikes:
449452

450453
* [Einsum.jl](https://github.com/ahwillia/Einsum.jl) makes simple loops. See [tests/einsum.jl](https://github.com/mcabbott/Tullio.jl/blob/master/test/einsum.jl) where `using Tullio: @einsum` is an almost-seamless replacement.
451454

452-
* [TensorOperations.jl](https://github.com/Jutho/TensorOperations.jl) and [OMEinsum.jl](https://github.com/under-Peter/OMEinsum.jl) identify patterns on which they can call various basic operations.
455+
* [TensorOperations.jl](https://github.com/Jutho/TensorOperations.jl) and [OMEinsum.jl](https://github.com/under-Peter/OMEinsum.jl) identify patterns on which they can call various basic operations. [TensorRules.jl](https://github.com/ho-oto/TensorRules.jl) makes `@tensor` differentiable; see also [TensorGrad.jl](https://github.com/mcabbott/TensorGrad.jl) and [TensorTrack.jl](https://github.com/mcabbott/TensorTrack.jl) for earlier attempts.
453456

454457
* [TensorCast.jl](https://github.com/mcabbott/TensorCast.jl) expresses everything as Julia array operations, broadcasting and reduction. (OMEinsum.jl also treats some cases as a special lazy broadcast-reduction.)
455458

src/eval.jl

Lines changed: 9 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -55,29 +55,27 @@ using Requires
5555
using .LoopVectorization
5656
if isdefined(LoopVectorization, :SVec) # version 0.8, for Julia ⩽1.5
5757
using .LoopVectorization.VectorizationBase: SVec, Mask, prevpow2
58+
@require ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" begin
59+
# Dual numbers + svec, not needed on version 0.9
60+
include("grad/avxdual.jl")
61+
end
5862
else # version 0.9, supports Julia 1.6
5963
using .LoopVectorization.VectorizationBase: Vec, Mask, prevpow2
6064
SVec{N,T} = Vec{N,T}
6165
end
62-
66+
#=
6367
# Functions needed for safe vectorised max gradient
6468
@inline Tullio.onlyone(cond::Bool, seen::SVec) = cond && allzero(seen)
6569
6670
@inline Tullio.onlyone(cond::Mask{W}) where {W} = Mask{W}(prevpow2(cond.u))
6771
@inline Tullio.onlyone(cond::Mask, seen::Union{Int,SVec}) =
6872
Tullio.allzero(seen) ? Tullio.onlyone(cond) : zero(cond)
6973
70-
@inline allzero(seen::Int) = iszero(seen)
71-
@inline allzero(seen::SVec{N,Int}) where {N} = iszero((!iszero(seen)).u)
72-
73-
# @inline Tullio.anyone(cond::Mask) = cond != zero(cond)
74-
@inline Tullio.anyone(cond::Mask) = cond.u != zero(cond).u # for v0.9
74+
@inline allzero(seen::Integer) = iszero(seen)
75+
@inline allzero(seen::SVec) = iszero((!iszero(seen)).u)
7576
76-
@require ForwardDiff = "f6369f11-7733-5829-9624-2563aa707210" begin
77-
# Dual numbers + svec, should live in PaddedMatricesForwardDiff?
78-
# (And where would the conditional loading go, still here?)
79-
include("grad/avxdual.jl")
80-
end
77+
@inline Tullio.anyone(cond::Mask) = !iszero(cond.u)
78+
=#
8179
end
8280

8381
#========== CuArrays ==========#

src/forward.jl

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ function insert_forward_gradient(axislist, store)
1717

1818
defineepsilons, readepsilons = [], []
1919
for (d, (Aepsilon, Aex)) in enumerate(epsilondict)
20-
basis = [i==d ? :(one($TYP)) : :(zero($TYP)) for i in 1:length(epsilondict)]
21-
push!(defineepsilons, :($Aepsilon = ForwardDiff.Dual(zero($TYP), ($(basis...),))))
20+
basis = [i==d ? :($one($TYP)) : :($zero($TYP)) for i in 1:length(epsilondict)]
21+
push!(defineepsilons, :($Aepsilon = ForwardDiff.Dual($zero($TYP), ($(basis...),))))
2222
push!(readepsilons, :($Aex = $Aex + ForwardDiff.partials($ZED, $d) * $dZ[$(store.leftraw...)]))
2323
end
2424

src/macro.jl

Lines changed: 10 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -466,7 +466,7 @@ padmodclamp_replace(s, store, inside=false) = s
466466
padmodclamp_replace(ex::Expr, store, inside=false) =
467467
if ex.head == :(=) && @capture_(ex.args[1], A_[inds__])
468468
# This tricky case is 𝛥A[pad(i,2)] = 𝛥A[pad(i,2)] + ...
469-
Aex, fun = padmodclamp_pair(A, inds, store)
469+
Aex, fun = padmodclamp_pair(A, inds, store, true)
470470
right = if fun != identity
471471
padmodclamp_replace(ex.args[2], store, true)
472472
else
@@ -481,7 +481,7 @@ padmodclamp_replace(ex::Expr, store, inside=false) =
481481
Expr(ex.head, args...)
482482
end
483483

484-
padmodclamp_pair(A, inds, store) = begin
484+
padmodclamp_pair(A, inds, store, assign=false) = begin
485485
nopadif = []
486486
inds4 = map(enumerate(inds)) do (d,ex)
487487
isexpr(ex, :call) || return ex
@@ -494,7 +494,8 @@ padmodclamp_pair(A, inds, store) = begin
494494
elseif ex.args[1] == :pad && length(ex.args) >= 2
495495
i = ex.args[2]
496496
if !all(==(0), ex.args[3:end]) || length(ex.args) == 2
497-
push!(nopadif, :($i ∈ axes($A,$d)))
497+
# push!(nopadif, :($i >= first(axes($A,$d))), :($i <= last(axes($A,$d)))) # allows avx
498+
push!(nopadif, :($i >= first(axes($A,$d))), :($i <= Base.last(axes($A,$d)))) # allows avx... but LV 0.8, Julia 1.4, needs Base?
498499
end
499500
return i
500501
end
@@ -508,8 +509,10 @@ padmodclamp_pair(A, inds, store) = begin
508509
for c2 in nopadif[2:end]
509510
cond = :($cond & $c2)
510511
end
511-
if store.padkeyword == TYP # default
512-
ex -> :($cond ? $ex : $zero($eltype($A)))
512+
if assign # for gradients, this wraps 𝛥A[pad(i,2)] = 𝛥A[pad(i,2)] + ...
513+
ex -> :($cond && $ex)
514+
elseif store.padkeyword == TYP # default, pad with zero
515+
ex -> :($cond ? $ex : zero(eltype($A)))
513516
else
514517
ex -> :($cond ? $ex : $convert($eltype($A), $(store.padkeyword)))
515518
end
@@ -1070,16 +1073,15 @@ function make_many_actors(act!, args, ex1, outer::Vector, ex3, inner::Vector, ex
10701073
safe = if act! == ACT!
10711074
isempty(store.unsafeleft)
10721075
else # working on ∇act!
1073-
isempty(store.unsaferight) &&
1074-
store.redfun == :+ && # Disable @avx for min/max grad, #53
1075-
store.grad != :Dual # and for use with ForwardDiff
1076+
isempty(store.unsaferight)
10761077
end
10771078

10781079
if safe && store.avx != false && isdefined(store.mod, :LoopVectorization)
10791080
unroll = store.avx == true ? 0 : store.avx # unroll=0 is the default setting
10801081
info1 = store.verbose>0 ? :(@info "running LoopVectorization actor $($note)" maxlog=3 _id=$(hash(store))) : nothing
10811082
check1 = store.verbose>0 ? :(LoopVectorization.check_args($(store.arrays...)) || @error "rejected by LoopVectorization's check_args! $($note)" maxlog=3 _id=$(hash(store))) : nothing
10821083
try
1084+
act! == ACT! || store.redfun == :+ || throw("use of LoopVectorization for min/max gradients is disabled")
10831085
lex = if isnothing(exloopfinal)
10841086
quote
10851087

0 commit comments

Comments
 (0)