Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
66 changes: 66 additions & 0 deletions p2p_bw.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
include: macros/bench_frame.def
include: macros/bench_p2p.def
include: macros/mtest.def

# Run the caller-supplied BLOCK once per slot j of the current window,
# pointing `buf` at a per-slot offset inside gbuf first.
subcode: bw_loop
$for j=0:window
# The offset is (j * size) modulo (MAX_BUFSIZE - size), so offset + size stays below MAX_BUFSIZE.
# NOTE(review): the modulus is zero when size == MAX_BUFSIZE; the size loops visible here stop before that -- confirm for other callers.
buf = (char *) gbuf + j * size % (MAX_BUFSIZE - size)
BLOCK

# Point-to-point bandwidth test. For each window the sender posts `window`
# nonblocking sends and the receiver posts the matching nonblocking receives;
# a zero-byte message from the receiver back to the sender closes the window.
page: p2p_bw, bench_frame
    MAX_BUFSIZE: 10000000
    WINDOW_MAX: 64
    data: buf, size, MPI_CHAR

    &call foreach_size
        bench_p2p(comm, gsrc, gdst, size)

    subcode: send_side
        # reqs is sized for the largest window calc_window can return
        $my MPI_Request reqs[$(WINDOW_MAX)]
        &call bw_loop
            MPI_Isend($(data), dst, TAG, comm, &reqs[j])
        MPI_Waitall(window, reqs, MPI_STATUSES_IGNORE)
        # Zero-byte acknowledgement from the receiver. A predefined datatype is
        # used because MPI_DATATYPE_NULL is not a valid datatype argument for
        # point-to-point calls, even when count is 0.
        MPI_Recv(NULL, 0, MPI_CHAR, dst, TAG, comm, MPI_STATUS_IGNORE)

    subcode: recv_side
        $my MPI_Request reqs[$(WINDOW_MAX)]
        &call bw_loop
            MPI_Irecv($(data), src, TAG, comm, &reqs[j])
        MPI_Waitall(window, reqs, MPI_STATUSES_IGNORE)
        # Tell the sender that every receive of this window has completed
        MPI_Send(NULL, 0, MPI_CHAR, src, TAG, comm)

# One-sided bandwidth test using MPI_Get with fence synchronization.
page: get_bw, bench_frame
IS_RMA: 1
WINDOW_MAX: 100
data: buf, size, MPI_CHAR

# Message sizes double from 1 while below MAX_BUFSIZE; bench_p2p is called with src=0 and dst=1.
$for int size = 1; size < MAX_BUFSIZE; size *= 2
bench_p2p(comm, 0, 1, size)

subcode: send_side
# Each batch of MPI_Get calls issued through bw_loop is enclosed by a pair of fences.
MPI_Win_fence(0, win)
&call bw_loop
MPI_Get($(data), dst, 0, size, MPI_CHAR, win)
MPI_Win_fence(0, win)

subcode: recv_side
# The other side only calls the two fences that match those in send_side.
MPI_Win_fence(0, win)
MPI_Win_fence(0, win)

# One-sided bandwidth test using MPI_Put with fence synchronization.
page: put_bw, bench_frame
IS_RMA: 1
WINDOW_MAX: 100
data: buf, size, MPI_CHAR

# Message sizes double from 1 while below MAX_BUFSIZE; bench_p2p is called with src=0 and dst=1.
$for int size = 1; size < MAX_BUFSIZE; size *= 2
bench_p2p(comm, 0, 1, size)

subcode: send_side
# Each batch of MPI_Put calls issued through bw_loop is enclosed by a pair of fences.
MPI_Win_fence(0, win)
&call bw_loop
MPI_Put($(data), dst, 0, size, MPI_CHAR, win)
MPI_Win_fence(0, win)

subcode: recv_side
# The other side only calls the two fences that match those in send_side.
MPI_Win_fence(0, win)
MPI_Win_fence(0, win)
5 changes: 4 additions & 1 deletion src/mpid/ch4/src/ch4_self.c
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,10 @@ int MPIDI_Self_finalize(void)
MPIR_Datatype_get_size_macro(recvtype, rdata_sz); \
sdata_sz *= sendcnt; \
rdata_sz *= recvcnt; \
MPIR_Localcopy(sendbuf, sendcnt, sendtype, recvbuf, recvcnt, recvtype); \
MPIR_Localcopy_gpu(sendbuf, sendcnt, sendtype, 0, NULL, \
recvbuf, recvcnt, recvtype, 0, NULL, \
MPL_GPU_COPY_DIRECTION_NONE, \
MPL_GPU_ENGINE_TYPE_COPY_HIGH_BANDWIDTH, 1); \
status.MPI_SOURCE = 0; \
status.MPI_TAG = tag; \
if (sdata_sz > rdata_sz) { \
Expand Down
5 changes: 4 additions & 1 deletion test/mpi/bench/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,11 @@ noinst_PROGRAMS = \
get_bw \
put_bw \
p2p_one \
self_one \
self_bw \
barrier \
bcast
bcast \
allreduce

.def.c:
mydef_page $<
32 changes: 0 additions & 32 deletions test/mpi/bench/bcast.def

This file was deleted.

36 changes: 36 additions & 0 deletions test/mpi/bench/coll_latency.def
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
include: macros/bench_frame.def
include: macros/bench_coll.def
include: macros/mtest.def

# MPI_Bcast latency test; NEED_ROOT makes bench_coll declare and pass a root rank.
page: bcast, bench_frame
data: buf, size, MPI_CHAR
NEED_ROOT: 1

# The first branch is guarded by the constant 0, so the $(else) branch
# (bench_coll with measure_bcast) is the one selected.
$(if:0)
&call measure_with_barrier
MPI_Bcast($(data), root, comm)
$(else)
$call bench_coll, measure_bcast

# MPI_Allreduce latency test on MPI_INT data.
page: allreduce, bench_frame
# Sizes start at 4; the element count passed to MPI is size / sizeof(int).
# NOTE(review): presumably 4 is chosen so the smallest message holds one int -- confirm.
SIZE_MIN: 4
data: buf, size/sizeof(int), MPI_INT

$call bench_coll, measure_allreduce

#----------------------------------------
# Shared body of the measure_* functions: runs BLOCK inside
# measure_coll_latency (defined outside this view) and copies the
# tf_min/tf_max/tf_avg/tf_sigma results out through the pf_* arguments.
subcode: measure_common
&call measure_coll_latency, iter
# Per-iteration buffer offset: (i * size) modulo (MAX_BUFSIZE - size)
void *buf = (char *) gbuf + i * size % (MAX_BUFSIZE - size)
BLOCK
# Expands to one assignment per statistic, e.g. *pf_min = tf_min
$(for:min,max,avg,sigma)
*pf_$1 = tf_$1

# Measure MPI_Bcast from `root` on `comm` via measure_common; statistics are returned through pf_min, pf_max, pf_avg and pf_sigma.
fncode: measure_bcast(int iter, int root, comm, size, pf_min, pf_max, pf_avg, pf_sigma)
&call measure_common
MPI_Bcast($(data), root, comm)

# Measure an in-place MPI_Allreduce with MPI_SUM on `comm` via measure_common; statistics are returned through pf_min, pf_max, pf_avg and pf_sigma.
fncode: measure_allreduce(int iter, comm, size, pf_min, pf_max, pf_avg, pf_sigma)
&call measure_common
MPI_Allreduce(MPI_IN_PLACE, $(data), MPI_SUM, comm)

25 changes: 25 additions & 0 deletions test/mpi/bench/macros/bench_coll.def
Original file line number Diff line number Diff line change
@@ -1,3 +1,28 @@
# Driver for collective latency tests: loops over message sizes, calls
# $(measure_func) with the assembled parameter list, and reports from rank 0.
subcode: bench_coll(measure_func)
# Tests that define NEED_ROOT get a root rank (0) added to the parameter list.
$(if:NEED_ROOT)
int root = 0
$(set:params=iter, root, comm, size)
$(else)
$(set:params=iter, comm, size)
# Without SIZE_MIN the sizes run 0, 1, 2, 4, ...; with SIZE_MIN they start there and double.
$(if:!SIZE_MIN)
$(set:SIZE_MIN=0)
$(set:SIZE_STEP=size=(size==0)?1:size*2)
$(else)
$(set:SIZE_STEP=size*=2)
$if grank == 0
$call header_coll_latency
$for int size = $(SIZE_MIN); size < $(MAX_BUFSIZE); $(SIZE_STEP)
$my tf_min, tf_max, tf_avg, tf_sigma
# NOTE(review): MIN_ITER is presumably consumed by warm_up, which is not visible here -- confirm.
$(set:MIN_ITER=0.001/tf_max)
&call coll_warmup
$(measure_func)($(params), &tf_min, &tf_max, &tf_avg, &tf_sigma)
tf_dur = tf_max
# iter is raised to at least 100
$if iter < 100
iter = 100
$(measure_func)($(params), &tf_min, &tf_max, &tf_avg, &tf_sigma)
$if grank == 0
$call report_coll_latency, size

subcode: coll_warmup
$if grank == 0
&call warm_up, iter, tf_dur
Expand Down
32 changes: 23 additions & 9 deletions test/mpi/bench/macros/bench_frame.def
Original file line number Diff line number Diff line change
Expand Up @@ -23,19 +23,17 @@ subcode: bench_frame
$global grank, gsize: int
MPI_Comm_rank(MPI_COMM_WORLD, &grank);
MPI_Comm_size(MPI_COMM_WORLD, &gsize);
$(if:MIN_PROCS)
$if gsize < $(MIN_PROCS)
printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n");
return 1

$call check_launch

MPI_Comm comm = MPI_COMM_WORLD;

$my void *buf
$global void *gbuf
$(if:HAS_MTEST)
$call mtest_malloc, MAX_BUFSIZE
$call mtest_malloc, gbuf, MAX_BUFSIZE
$(else)
buf = malloc(MAX_BUFSIZE)
$if !buf
gbuf = malloc(MAX_BUFSIZE)
$if !gbuf
printf("! Failed to allocate buffer (size=%d)\n", MAX_BUFSIZE)
return 1

Expand All @@ -44,7 +42,7 @@ subcode: bench_frame
$if grank == 0
MPI_Win_create(NULL, 0, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win)
$else
MPI_Win_create(buf, MAX_BUFSIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win)
MPI_Win_create(gbuf, MAX_BUFSIZE, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &win)

$if grank == 0
printf("TEST $(_pagename):\n")
Expand All @@ -60,6 +58,22 @@ subcode: bench_frame
MTest_Finalize(0);
$(else)
MPI_Finalize();
# -----
# Validate the number of launched processes and, for tests whose MEM_TYPES is
# sendrecv, assign every rank to an even/odd (gsrc, gdst) pair.
subcode: check_launch
    $(if:MIN_PROCS)
        $if gsize < $(MIN_PROCS)
            printf("! Test $(_pagename) requires $(MIN_PROCS) processes !\n");
            return 1
    $(if:MEM_TYPES=sendrecv)
        $if gsize % 2 == 1
            printf("! Test $(_pagename) requires even number of processes to form even/odd pairs !\n");
            # Bail out like the MIN_PROCS check does: with an odd count the last
            # even rank would get gdst == gsize, which is not a valid rank.
            return 1
        $global gsrc, gdst: int
        # Even ranks are the source of their pair, odd ranks the destination
        $if grank % 2 == 0
            gsrc = grank
            gdst = grank + 1
        $else
            gsrc = grank - 1
            gdst = grank

macros:
use_double: 1
Expand Down
46 changes: 35 additions & 11 deletions test/mpi/bench/macros/bench_p2p.def
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@
*/

macros:
MIN_PROCS: 2
MEM_TYPES: sendrecv

subcode: _autoload
Expand All @@ -29,49 +28,74 @@ subcode: _autoload
subcode: report_header
$call header_latency

fncode: bench_p2p(comm, src, dst, buf, size)
fncode: bench_p2p(comm, src, dst, size)
int rank;
MPI_Comm_rank(comm, &rank)

$(if:!MULTIPLICITY)
$(set:MULTIPLICITY=1)

$if rank == src
iter = bench_warmup(comm, dst, buf, size)
iter = bench_warmup(comm, dst, size)
$call adjust_iter_for_window, iter
&call run_stat, NUM_REPEAT, tf_latency
tf_latency = bench_send(iter, comm, dst, buf, size)
tf_latency = bench_send(iter, comm, dst, size)
tf_latency /= iter
$call report_latency, size, $(MULTIPLICITY)
$call send_stop
$elif rank == dst
bench_recv(comm, src, buf, size)
bench_recv(comm, src, size)

subcode: send_stop
iter = 0;
MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm)

#----------------------------------------
fncode: bench_send(int iter, comm, dst, buf, size)
# Compute the window (batch) size for an iteration count: iter / 10,
# raised to 1 when the quotient is 0 and capped at $(WINDOW_MAX).
fncode: calc_window(int iter)
window = iter / 10
$if window == 0
window = 1
$elif window > $(WINDOW_MAX)
window = $(WINDOW_MAX)
return window

# When WINDOW_MAX is defined, declare `window` for the given iteration count
# and round that count down to a whole number of windows. Expands to nothing
# otherwise.
subcode: adjust_iter_for_window(iter)
    $(if:WINDOW_MAX)
        int window = calc_window($(iter))
        # Go through the macro parameter so the subcode also works when the
        # caller's variable is not literally named `iter`
        $(iter) -= $(iter) % window

# Run BLOCK in a loop over the given iteration count. When WINDOW_MAX is
# defined the loop index advances by `window` each pass, so BLOCK is expected
# to handle a whole window itself; otherwise BLOCK runs once per iteration.
subcode: loop_with_window(iter)
    $(if:WINDOW_MAX)
        window = calc_window($(iter))
        # Bounds use the macro parameter rather than a hard-coded variable name
        $for i=0:$(iter):window
            BLOCK
    $(else)
        $for i=0:$(iter)
            BLOCK

fncode: bench_send(int iter, comm, dst, size)
# synchronize with receiver
MPI_Send(&iter, 1, MPI_INT, dst, SYNC_TAG, comm);

&call measure, iter
tf_start = MPI_Wtime()
&call loop_with_window, iter
$call @send_side
tf_dur = MPI_Wtime() - tf_start

return tf_dur

fncode: bench_recv(comm, src, buf, size)
fncode: bench_recv(comm, src, size)
$while 1
int iter;
# synchronize with sender
MPI_Recv(&iter, 1, MPI_INT, src, SYNC_TAG, comm, MPI_STATUS_IGNORE);
$if iter == 0
# time to quit
break
$for i=0:iter
&call loop_with_window, iter
$call @recv_side

fncode: bench_warmup(comm, dst, buf, size): int
fncode: bench_warmup(comm, dst, size): int
&call warm_up, iter, tf_dur
tf_dur = bench_send(iter, comm, dst, buf, size)
tf_dur = bench_send(iter, comm, dst, size)
return iter
12 changes: 6 additions & 6 deletions test/mpi/bench/macros/mtest.def
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
macros:
HAS_MTEST: 1

subcode: mtest_malloc(size)
subcode: mtest_malloc(buf, size)
MTestArgList *head = MTestArgListCreate(argc, argv)
$(if:MEM_TYPES=sendrecv)
int send_rank = 0, recv_rank = 1;
$(for:a in send,recv)
$if grank == $(a)_rank
$call alloc_mem_dev, $(a)mem, $(a)dev
$if grank == gsrc
$call alloc_mem_dev, sendmem, senddev
$elif grank == gdst
$call alloc_mem_dev, recvmem, recvdev
$(else)
# all procs allocating the same memory types
$call alloc_mem_dev, memtype, device
Expand All @@ -17,5 +17,5 @@ subcode: mtest_malloc(size)
$my mtest_mem_type_e memtype, int device
memtype = MTestArgListGetMemType(head, "$(memtype)")
device = MTestArgListGetInt_with_default(head, "$(memdev)", grank)
MTestMalloc($(size), memtype, NULL, &buf, device)
MTestMalloc($(size), memtype, NULL, &$(buf), device)
MTestPrintfMsg(1, "[%d] Allocating buffer: memtype=%s, device=%d, size=%d\n", grank, MTest_memtype_name(memtype), device, $(size))
Loading