Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
99 changes: 56 additions & 43 deletions fabtests/benchmarks/benchmark_shared.c
Original file line number Diff line number Diff line change
Expand Up @@ -94,15 +94,14 @@ void ft_benchmark_usage(void)
}

/* Pingpong latency test with pre-posted receive buffers. */
static int pingpong_pre_posted_rx(size_t inject_size)
static int pingpong_pre_posted_rx(size_t inject_size, union ft_timer *timer)
{
int ret, i;

if (opts.dst_addr) {
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();

ft_start(timer);
if (opts.transfer_size <= inject_size)
ret = ft_inject(ep, remote_fi_addr,
opts.transfer_size);
Expand All @@ -119,7 +118,7 @@ static int pingpong_pre_posted_rx(size_t inject_size)
} else {
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(timer);

ret = ft_rx(ep, opts.transfer_size);
if (ret)
Expand All @@ -135,20 +134,20 @@ static int pingpong_pre_posted_rx(size_t inject_size)
return ret;
}
}
ft_stop();
ft_stop(timer);

return FI_SUCCESS;
}

/* Pingpong latency test without pre-posted receive buffers. */
static int pingpong_no_pre_posted_rx(size_t inject_size)
static int pingpong_no_pre_posted_rx(size_t inject_size, union ft_timer *timer)
{
int ret, i;

if (opts.dst_addr) {
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(timer);

if (opts.transfer_size <= inject_size)
ret = ft_inject(ep, remote_fi_addr,
Expand All @@ -170,7 +169,7 @@ static int pingpong_no_pre_posted_rx(size_t inject_size)
} else {
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(timer);

ret = ft_post_rx(ep, opts.transfer_size, &rx_ctx);
if (ret)
Expand All @@ -197,14 +196,15 @@ static int pingpong_no_pre_posted_rx(size_t inject_size)
return ret;
}
}
ft_stop();
ft_stop(timer);

return FI_SUCCESS;
}

int pingpong(void)
{
int ret;
union ft_timer timer = {};
size_t inject_size = fi->tx_attr->inject_size;

ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE,
Expand Down Expand Up @@ -236,24 +236,24 @@ int pingpong(void)
return ret;
}

ret = pingpong_no_pre_posted_rx(inject_size);
ret = pingpong_no_pre_posted_rx(inject_size, &timer);
if (ret)
return ret;
} else {
ret = ft_sync();
if (ret)
return ret;

ret = pingpong_pre_posted_rx(inject_size);
ret = pingpong_pre_posted_rx(inject_size, &timer);
if (ret)
return ret;
}

if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2,
show_perf_mr(opts.transfer_size, opts.iterations, timer, 2,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2);
show_perf(NULL, opts.transfer_size, opts.iterations, timer, 2);

return 0;
}
Expand Down Expand Up @@ -296,6 +296,7 @@ int run_pingpong(void)
int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
{
int ret, i;
union ft_timer timer = {};
size_t inject_size = fi->tx_attr->inject_size;

ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE,
Expand Down Expand Up @@ -335,7 +336,7 @@ int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {

if (i == opts.warmup_iterations)
ft_start();
ft_start(&timer);

if (rma_op == FT_RMA_WRITE)
*(tx_buf + opts.transfer_size - 1) = (char)i;
Expand All @@ -357,7 +358,7 @@ int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
} else {
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(&timer);

ret = ft_rx_rma(i, rma_op, ep, opts.transfer_size);
if (ret)
Expand All @@ -377,20 +378,21 @@ int pingpong_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return ret;
}
}
ft_stop();
ft_stop(&timer);

if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 2,
show_perf_mr(opts.transfer_size, opts.iterations, timer, 2,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 2);
show_perf(NULL, opts.transfer_size, opts.iterations, timer, 2);

return 0;
}

int rma_tx_completion(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
{
int ret, i;
union ft_timer timer = {};
size_t inject_size = fi->tx_attr->inject_size;

ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE,
Expand Down Expand Up @@ -418,7 +420,7 @@ int rma_tx_completion(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
if (opts.dst_addr) {
for (i = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(&timer);

if (opts.transfer_size <= inject_size)
ret = ft_inject_rma(rma_op, remote, ep,
Expand All @@ -431,12 +433,12 @@ int rma_tx_completion(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return ret;
}

ft_stop();
ft_stop(&timer);
if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 1,
show_perf_mr(opts.transfer_size, opts.iterations, timer, 1,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 1);
show_perf(NULL, opts.transfer_size, opts.iterations, timer, 1);

/* Inform RMA target that the test has ended */
ret = ft_sync();
Expand All @@ -461,14 +463,19 @@ int rma_tx_completion(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return 0;
}

static int bw_tx_comp()
static int bw_tx_comp(union ft_timer *timer)
{
int ret;

ret = ft_get_tx_comp(tx_seq);
if (ret)
return ret;
return ft_rx(ep, FT_RMA_SYNC_MSG_BYTES);
if (timer)
ft_timer_pause(timer);
ret = ft_rx(ep, FT_RMA_SYNC_MSG_BYTES);
if (timer)
ft_timer_resume(timer);
return ret;
Comment on lines +473 to +478
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Could this give you false performance numbers? Depending on your completion level, the send may not have been completely sent, so you could be reporting higher bandwidth. Same on the receive side for the RMA ops. The RMA ops aren't going to generate completions, so if you stop the timer right after get_rx_comp you're not actually waiting for all the messages to come in. Unfortunately, for RMA ops you kind of have to wait for the sync message to ensure you're stopping the timer when all the data has arrived.

Copy link
Contributor Author

@alekswn alekswn Oct 21, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Makes sense. The rationale here was to exclude server -> client send latency from the time calculation. For the writedata benchmark, we observed synchronization overhead of the same order of magnitude as the overall timing, due to the connection setup delay for EFA.

The connection setup delay is only seen for the first few synchronizations, but we see a 30% error in bandwidth measurements attributed to connection setup latency. The delay is negligible after the first 5 synchronizations.

The reason the synchronization delay affects the writedata test so much is the asymmetry of the test. The client makes a lot of writes in the loop, but the server replies with just a single send at the end of each window. We are warming up the client->server path with the warm-up iterations, but do not warm up the server->client connection.

If we want to keep synchronization timing included, I think an alternative approach would be to warm up the server->client path as well. Does that sound like a better approach?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Gotcha, thanks for the context. I think it's ok to warm up both sides. In my opinion, it's a little weird that the writedata function has such a different model than the other functions. I understand why (because of the rx completion) but I think it would be ok to have all ops behave in the same pattern. Can we change writedata to be two sided? Would that help as well? Or are you explicitly looking to have it remain a one-sided test?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for the suggestion!
Will try to make it symmetric.

}

static int bw_rx_comp(int window)
Expand All @@ -493,16 +500,20 @@ static int bw_rx_comp(int window)
return ft_tx(ep, remote_fi_addr, FT_RMA_SYNC_MSG_BYTES, &tx_ctx);
}

static int rma_bw_rx_comp()
static int rma_bw_rx_comp(union ft_timer *timer)
{
int ret;

/* rx_seq is always one ahead */
ret = ft_get_rx_comp(rx_seq - 1);
if (ret)
return ret;

return ft_tx(ep, remote_fi_addr, FT_RMA_SYNC_MSG_BYTES, &tx_ctx);
if (timer)
ft_timer_pause(timer);
ret = ft_tx(ep, remote_fi_addr, FT_RMA_SYNC_MSG_BYTES, &tx_ctx);
if (timer)
ft_timer_resume(timer);
return ret;
}

static uint64_t set_fi_more_flag(int i, int j, uint64_t flags)
Expand All @@ -520,6 +531,7 @@ int bandwidth(void)
{
int ret, i, j;
uint64_t flags = 0;
union ft_timer timer = {};
size_t inject_size = fi->tx_attr->inject_size;

ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_MSG_SIZE,
Expand Down Expand Up @@ -549,7 +561,7 @@ int bandwidth(void)
if (opts.dst_addr) {
for (i = j = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(&timer);

if (ft_check_opts(FT_OPT_VERIFY_DATA)) {
ret = ft_fill_buf(tx_ctx_arr[j].buf,
Expand Down Expand Up @@ -578,19 +590,19 @@ int bandwidth(void)
return ret;

if (++j == opts.window_size) {
ret = bw_tx_comp();
ret = bw_tx_comp(NULL);
if (ret)
return ret;
j = 0;
}
}
ret = bw_tx_comp();
ret = bw_tx_comp(NULL);
if (ret)
return ret;
} else {
for (i = j = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(&timer);

if (opts.use_fi_more) {
flags = set_fi_more_flag(i, j, flags);
Expand Down Expand Up @@ -620,27 +632,27 @@ int bandwidth(void)
if (ret)
return ret;
}
ft_stop();
ft_stop(&timer);

if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 1,
show_perf_mr(opts.transfer_size, opts.iterations, timer, 1,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 1);
show_perf(NULL, opts.transfer_size, opts.iterations, timer, 1);

return 0;
}

static int bw_rma_comp(enum ft_rma_opcodes rma_op, int num_completions)
static int bw_rma_comp(enum ft_rma_opcodes rma_op, int num_completions, union ft_timer *timer)
{
int ret;

if (rma_op == FT_RMA_WRITEDATA) {
/* for writedata, only the client sends,
* and only the server verifies. */
if (opts.dst_addr)
return bw_tx_comp();
ret = rma_bw_rx_comp();
return bw_tx_comp(timer);
ret = rma_bw_rx_comp(timer);
} else {
ret = ft_get_tx_comp(tx_seq);
}
Expand All @@ -662,6 +674,7 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
{
int ret, i, j;
uint64_t flags = 0;
union ft_timer timer = {};
size_t offset, inject_size = fi->tx_attr->inject_size;

ret = fi_getopt(&ep->fid, FI_OPT_ENDPOINT, FI_OPT_INJECT_RMA_SIZE,
Expand Down Expand Up @@ -690,7 +703,7 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
MAX(ft_tx_prefix_size(), ft_rx_prefix_size());
for (i = j = 0; i < opts.iterations + opts.warmup_iterations; i++) {
if (i == opts.warmup_iterations)
ft_start();
ft_start(&timer);
if (j == 0) {
offset = offset_rma_start;
if (ft_check_opts(FT_OPT_VERIFY_DATA) && opts.transfer_size > 0) {
Expand Down Expand Up @@ -768,22 +781,22 @@ int bandwidth_rma(enum ft_rma_opcodes rma_op, struct fi_rma_iov *remote)
return ret;

if (++j == opts.window_size) {
ret = bw_rma_comp(rma_op, j);
ret = bw_rma_comp(rma_op, j, &timer);
if (ret)
return ret;
j = 0;
}
offset += opts.transfer_size;
}
ret = bw_rma_comp(rma_op, j);
ret = bw_rma_comp(rma_op, j, &timer);
if (ret)
return ret;
ft_stop();
ft_stop(&timer);

if (opts.machr)
show_perf_mr(opts.transfer_size, opts.iterations, &start, &end, 1,
show_perf_mr(opts.transfer_size, opts.iterations, timer, 1,
opts.argc, opts.argv);
else
show_perf(NULL, opts.transfer_size, opts.iterations, &start, &end, 1);
show_perf(NULL, opts.transfer_size, opts.iterations, timer, 1);
return 0;
}
Loading