
Commit 053c5ec

Apple silicon workaround (#196)
* avoid reductions with custom operators for MPI
* avoid MultiScalar reduction only on non-intel
* move changes to mapreduce
* small tweak
* fix mpi errors
* clean up
* use MPI logical operator directly
* Remove MultiScalar from sort_into_targets!
  fciqmc_col! may now return stats as a vector
* Revert "Remove MultiScalar from sort_into_targets!"
  This reverts commit b54868f.

---------

Co-authored-by: Joachim Brand <[email protected]>
1 parent 8819209 commit 053c5ec
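For orientation (not part of the commit itself): on Apple silicon and other non-Intel hosts, MPI.jl cannot wrap arbitrary Julia functions into custom MPI reduction operators (see JuliaParallel/MPI.jl#404), so the changes below route Rimu's MPI reductions through operators that map to built-in MPI operations such as `+`, `*`, and `MPI.LAND`. A minimal sketch of the distinction, assuming an MPI-launched Julia session:

using MPI

MPI.Init()
comm = MPI.COMM_WORLD

# A built-in function like `+` is translated to the predefined MPI.SUM operation
# and works on every architecture:
total = MPI.Allreduce(1, +, comm)

# An arbitrary Julia function would need a custom MPI.Op, which MPI.jl cannot
# build on non-Intel architectures such as Apple silicon:
# total = MPI.Allreduce(1, (a, b) -> a + b, comm)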

7 files changed, +52 / -19 lines

src/RMPI/RMPI.jl

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ const mpi_registry = Dict{Int,Any}()
abstract type DistributeStrategy end

include("mpidata.jl")
+include("multiscalar.jl")
include("helpers.jl")
include("noexchange.jl")
include("pointtopoint.jl")

src/RMPI/helpers.jl

Lines changed: 1 addition & 1 deletion
@@ -87,7 +87,7 @@ end

function sort_into_targets!(dtarget::MPIData, w::AbstractDVec, stats)
    # single threaded MPI version
-    mpi_combine_walkers!(dtarget,w) # combine walkers from different MPI ranks
+    mpi_combine_walkers!(dtarget, w) # combine walkers from different MPI ranks
    res_stats = MPI.Allreduce(Rimu.MultiScalar(stats), +, dtarget.comm)
    return dtarget, w, res_stats
end
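For context (not part of the diff): `stats` holds the per-rank spawning statistics, and wrapping it in a `MultiScalar` lets a single `Allreduce` with `+` sum every entry across ranks. A sketch of that element-wise semantics with made-up values (the numbers are hypothetical):

using Rimu

rank0_stats = Rimu.MultiScalar((10, 2.5))  # hypothetical stats gathered on rank 0
rank1_stats = Rimu.MultiScalar((4, 1.0))   # hypothetical stats gathered on rank 1

# MPI.Allreduce(ms, +, comm) leaves every rank holding the element-wise sum:
@assert Tuple(rank0_stats + rank1_stats) == (14, 3.5)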

src/RMPI/mpidata.jl

Lines changed: 20 additions & 1 deletion
@@ -92,7 +92,26 @@ end

function Base.mapreduce(f, op, it::MPIDataIterator; kwargs...)
    res = mapreduce(f, op, it.iter; kwargs...)
-    return MPI.Allreduce(res, op, it.data.comm)
+    T = typeof(res)
+    if T <: Bool # MPI.jl does not support Bool reductions
+        res = convert(UInt8, res)
+    end
+    return T(MPI.Allreduce(res, op, it.data.comm))
+end
+
+# Special case for `sum`, which uses a custom (type-widening) reduction operator `add_sum`.
+# Replacing it by `+` is necessary for non-Intel architectures due to a limitation of
+# MPI.jl. On Intel processors, it might be more performant.
+# See https://github.com/JuliaParallel/MPI.jl/issues/404
+function Base.mapreduce(f, op::typeof(Base.add_sum), it::MPIDataIterator; kwargs...)
+    res = mapreduce(f, op, it.iter; kwargs...)
+    return MPI.Allreduce(res, +, it.data.comm)
+end
+
+# Special case for `prod`, which uses a custom (type-widening) reduction operator `mul_prod`.
+function Base.mapreduce(f, op::typeof(Base.mul_prod), it::MPIDataIterator; kwargs...)
+    res = mapreduce(f, op, it.iter; kwargs...)
+    return MPI.Allreduce(res, *, it.data.comm)
end

Base.IteratorSize(::MPIDataIterator) = Base.SizeUnknown()
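For context (not part of the diff): the two extra methods exist because `sum` and `prod` do not call `mapreduce` with `+` and `*` but with Base's private widening operators, which would otherwise reach `MPI.Allreduce` as custom operators. A quick sketch of the lowering this dispatch relies on:

xs = [1, 2, 3]

# `sum` and `prod` lower to `mapreduce` with the widening operators intercepted above:
@assert sum(xs) == mapreduce(identity, Base.add_sum, xs)
@assert prod(xs) == mapreduce(identity, Base.mul_prod, xs)

# Replacing them with `+` and `*` lets MPI.jl use the predefined MPI.SUM and
# MPI.PROD operations instead of building a custom MPI.Op.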

src/RMPI/multiscalar.jl

Lines changed: 10 additions & 0 deletions
@@ -0,0 +1,10 @@
+# Make MPI reduction of a `MultiScalar` work on non-Intel processors.
+# The `MultiScalar` is converted into a vector before sending through MPI.Allreduce.
+# Testing shows that this is about the same speed or even a bit faster on Intel processors
+# than reducing the MultiScalar directly via a custom reduction operator.
+# Defining the method in RMPI is strictly type piracy as MultiScalar belongs to Rimu and
+# not to RMPI. Might clean this up later.
+function MPI.Allreduce(ms::Rimu.MultiScalar{T}, op, comm::MPI.Comm) where {T<:Tuple}
+    result_vector = MPI.Allreduce([ms...], op, comm)
+    return Rimu.MultiScalar(T(result_vector))
+end
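A short usage sketch of the new method (assuming an MPI-launched session with Rimu's RMPI submodule loaded): the tuple is flattened to a `Vector` for the transfer, so MPI only sees the built-in `+` reduction, and the result is rewrapped with the original element types.

using MPI, Rimu, Rimu.RMPI

MPI.Init()
ms = Rimu.MultiScalar((1, 2.5))  # e.g. an Int counter and a Float64 statistic

# Summed element-wise across all ranks without a custom MPI operator:
total = MPI.Allreduce(ms, +, MPI.COMM_WORLD)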

src/helpers.jl

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ end
MultiScalar(args...) = MultiScalar(args)
MultiScalar(v::SVector) = MultiScalar(Tuple(v))
MultiScalar(m::MultiScalar) = m
+MultiScalar{T}(m::MultiScalar{T}) where T<:Tuple = m
MultiScalar(arg) = MultiScalar((arg,))

Base.getindex(m::MultiScalar, i) = m.tuple[i]
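For context (not part of the diff), a sketch of what the added one-line method does: it turns the fully parameterised constructor into a no-op when it is handed a `MultiScalar` that already has the requested tuple type (presumably such a call previously failed for lack of a matching conversion):

using Rimu

ms = Rimu.MultiScalar((1, 2.0))
# With the new method this returns `ms` itself rather than erroring:
@assert Rimu.MultiScalar{Tuple{Int,Float64}}(ms) === ms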

test/RMPI.jl

Lines changed: 11 additions & 11 deletions
@@ -6,17 +6,17 @@ using Test

@testset "DistributeStrategies" begin
    # `DistributeStrategy`s
-    ham = HubbardReal1D(BoseFS((1,2,3)))
+    ham = HubbardReal1D(BoseFS((1, 2, 3)))
    for setup in [RMPI.mpi_no_exchange, RMPI.mpi_all_to_all, RMPI.mpi_point_to_point]
-        dv = DVec(starting_address(ham)=>10; style=IsDynamicSemistochastic())
+        dv = DVec(starting_address(ham) => 10; style=IsDynamicSemistochastic())
        v = MPIData(dv; setup)
-        df, state = lomc!(ham,v)
+        df, state = lomc!(ham, v)
        @test size(df) == (100, 12)
    end
    # need to do mpi_one_sided separately
-    dv = DVec(starting_address(ham)=>10; style=IsDynamicSemistochastic())
-    v = RMPI.mpi_one_sided(dv; capacity = 1000)
-    df, state = lomc!(ham,v)
+    dv = DVec(starting_address(ham) => 10; style=IsDynamicSemistochastic())
+    v = RMPI.mpi_one_sided(dv; capacity=1000)
+    df, state = lomc!(ham, v)
    @test size(df) == (100, 12)
end

@@ -29,13 +29,13 @@ end
    counts = zeros(Int, k)
    displs = zeros(Int, k)

-    RMPI.sort_and_count!(counts, displs, vals, ordfun.(vals), (0, k-1))
+    RMPI.sort_and_count!(counts, displs, vals, ordfun.(vals), (0, k - 1))
    @test issorted(vals, by=ordfun)
    @test sum(counts) == l

-    for i in 0:(k - 1)
-        c = counts[i + 1]
-        d = displs[i + 1]
+    for i in 0:(k-1)
+        c = counts[i+1]
+        d = displs[i+1]
        r = (1:c) .+ d
        ords = ordfun.(vals)
        @test all(ords[r] .== i)
@@ -79,7 +79,7 @@ end
    @testset "dot" begin
        @test dot(dv1, dv2) == 0
        @test dot(dv1, dv1) == dot(localpart(dv1), dv1)
-        rand_ham = MatrixHamiltonian(rand(ComplexF64, 4,4))
+        rand_ham = MatrixHamiltonian(rand(ComplexF64, 4, 4))
        ldv1 = localpart(dv1)
        @test norm(dot(dv1, rand_ham, dv1)) ≈ norm(dot(ldv1, rand_ham, ldv1))
    end

test/mpi_runtests.jl

Lines changed: 8 additions & 6 deletions
@@ -71,7 +71,7 @@ end
end
@testset "Single component $type" begin
    for i in 1:N_REPEATS
-        add = BoseFS((0,0,10,0,0))
+        add = BoseFS((0, 0, 10, 0, 0))
        H = HubbardMom1D(add)
        Random.seed!(7350 * i)
        v, dv = setup_dv(
@@ -98,7 +98,7 @@ end
    @test sum(values(v)) ≈ sum(values(dv))
    f((k, v)) = (k == add) + v > 0
    @test mapreduce(f, |, pairs(v); init=true) ==
-          mapreduce(f, |, pairs(dv); init=true)
+        mapreduce(f, |, pairs(dv); init=true)
end

@testset "Operations" begin
@@ -127,7 +127,7 @@ end
end
@testset "Two-component $type" begin
    for i in 1:N_REPEATS
-        add = BoseFS2C((0,0,10,0,0), (0,0,2,0,0))
+        add = BoseFS2C((0, 0, 10, 0, 0), (0, 0, 2, 0, 0))
        H = BoseHubbardMom1D2C(add)
        Random.seed!(7350 * i)
        v, dv = setup_dv(
@@ -225,7 +225,7 @@ end
    (RMPI.mpi_one_sided, (; capacity=1000)),
)
@testset "Regular with $setup and post-steps" begin
-    H = HubbardReal1D(BoseFS((1,1,1,1,1,1,1)); u=6.0)
+    H = HubbardReal1D(BoseFS((1, 1, 1, 1, 1, 1, 1)); u=6.0)
    dv = MPIData(
        DVec(starting_address(H) => 3; style=IsDynamicSemistochastic());
        setup,
@@ -253,7 +253,7 @@ end
    @test all(0 .≤ df.loneliness .≤ 1)
end
@testset "Initiator with $setup" begin
-    H = HubbardMom1D(BoseFS((0,0,0,7,0,0,0)); u=6.0)
+    H = HubbardMom1D(BoseFS((0, 0, 0, 7, 0, 0, 0)); u=6.0)
    dv = MPIData(
        InitiatorDVec(starting_address(H) => 3);
        setup,
@@ -295,7 +295,9 @@ end

# Make sure all ranks came this far.
@testset "Finish" begin
-    @test MPI.Allreduce(true, &, mpi_comm())
+    # MPI.jl currently doesn't properly map logical operators (MPI v0.20.8)
+    @test MPI.Allreduce(true, MPI.LAND, mpi_comm())
+    # @test MPI.Allreduce(true, &, mpi_comm())
end
end
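For context (not part of the diff): `MPI.LAND` is the predefined logical-AND reduction, so passing it directly sidesteps MPI.jl's translation of the Julia function `&`, which the committed comment flags as not mapped properly as of MPI.jl v0.20.8. A minimal sketch of the same check, assuming an MPI-launched session:

using MPI

MPI.Init()
comm = MPI.COMM_WORLD

# Every rank contributes `true`; the predefined logical AND needs no operator mapping:
all_ok = MPI.Allreduce(true, MPI.LAND, comm)
@assert all_ok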
