80 changes: 76 additions & 4 deletions darshan-util/pydarshan/darshan/backend/cffi_backend.py
@@ -51,6 +51,26 @@
check_version(ffi, libdutil)


_mod_names = [
    "NULL",
    "POSIX",
    "MPI-IO",
    "H5F",
    "H5D",
    "PNETCDF_FILE",
    "PNETCDF_VAR",
    "BG/Q",
    "LUSTRE",
    "STDIO",
    "DXT_POSIX",
    "DXT_MPIIO",
    "MDHIM",
    "APXC",
    "APMPI",
    "HEATMAP",
]

def mod_name_to_idx(mod_name):
    return _mod_names.index(mod_name)
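As a quick illustration of the mapping (the indices follow the order of the list above; POSIX sits at index 1 because NULL occupies index 0):

    mod_name_to_idx("POSIX")   # -> 1
    mod_name_to_idx("MPI-IO")  # -> 2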

_structdefs = {
"BG/Q": "struct darshan_bgq_record **",
@@ -685,6 +705,8 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None):
fcounters_df = rec_dict["fcounters"]
counters_n_cols = counters_df.shape[1]
fcounters_n_cols = fcounters_df.shape[1]
id_col = counters_df.columns.get_loc("id")
rank_col = counters_df.columns.get_loc("rank")
if rec_index_of_interest is None:
num_recs = counters_df.shape[0]
# newer pandas versions can support ...
@@ -701,10 +723,60 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None):
    rec_arr.fcounters = fcounters_df.iloc[rec_index_of_interest, 2:].to_numpy()
    rec_arr.counters = counters_df.iloc[rec_index_of_interest, 2:].to_numpy()
    if num_recs > 1:
-       rec_arr.id = counters_df.iloc[rec_index_of_interest, 0].to_numpy().reshape((num_recs, 1))
-       rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1].to_numpy().reshape((num_recs, 1))
+       rec_arr.id = counters_df.iloc[rec_index_of_interest, id_col].to_numpy().reshape((num_recs, 1))
+       rec_arr.rank = counters_df.iloc[rec_index_of_interest, rank_col].to_numpy().reshape((num_recs, 1))
    else:
-       rec_arr.id = counters_df.iloc[rec_index_of_interest, 0]
-       rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1]
+       rec_arr.id = counters_df.iloc[rec_index_of_interest, id_col]
+       rec_arr.rank = counters_df.iloc[rec_index_of_interest, rank_col]
    buf = rec_arr.tobytes()
    return buf


def log_get_derived_metrics(rec_dict, mod_name, nprocs):
    """
    Passes a set of records (in pandas format) to the Darshan accumulator
    interface, and returns the corresponding derived metrics struct.

    Parameters:
        rec_dict: Dictionary containing the counter and fcounter dataframes.
        mod_name: Name of the Darshan module.
        nprocs: Number of processes participating in accumulation.

    Returns:
        darshan_derived_metrics struct (cdata object)
    """
    mod_idx = mod_name_to_idx(mod_name)
    darshan_accumulator = ffi.new("darshan_accumulator *")
    r = libdutil.darshan_accumulator_create(mod_idx, nprocs, darshan_accumulator)
    if r != 0:
        raise RuntimeError("A nonzero exit code was received from "
                           "darshan_accumulator_create() at the C level. "
                           f"This could mean that the {mod_name} module does not "
                           "support derived metric calculation, or that "
                           "another kind of error occurred. It may be possible "
                           "to retrieve additional information from the stderr "
                           "stream.")

    num_recs = rec_dict["fcounters"].shape[0]
    record_array = _df_to_rec(rec_dict, mod_name)

    r_i = libdutil.darshan_accumulator_inject(darshan_accumulator[0], record_array, num_recs)
    if r_i != 0:
        raise RuntimeError("A nonzero exit code was received from "
                           "darshan_accumulator_inject() at the C level. "
                           "It may be possible to retrieve additional "
                           "information from the stderr stream.")
    derived_metrics = ffi.new("struct darshan_derived_metrics *")
Collaborator comment:
The CFFI stuff is of course a bit awkward, especially given how many different entrypoints to the C layer we need to interact with just to get a single final struct back, but I'd probably be inclined to just live with it if we just operate on DataFrames or Arrow in-memory stuff in a few years anyway.

It occurs to me that for some things, like struct darshan_derived_metrics, we could probably avoid CFFI traversal by using the bytes from a NumPy recarray, but this would be more verbose with minimal gain for now I suspect.
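For illustration only, here is a minimal sketch of that recarray idea, with the caveat that the field names, types, and ordering below are assumptions for the sake of the example, not the actual layout of struct darshan_derived_metrics:

    import numpy as np

    # Hypothetical dtype covering just two assumed leading fields of the
    # struct; the real struct has more members, so this is NOT its true layout.
    dm_dtype = np.dtype([("total_bytes", np.int64),
                         ("agg_perf_by_slowest", np.float64)])

    def view_derived_metrics(derived_metrics_cdata):
        # copy the struct's leading bytes out of the cdata object and
        # reinterpret them as a structured NumPy array, avoiding
        # per-field CFFI traversal
        raw = bytes(ffi.buffer(derived_metrics_cdata))[:dm_dtype.itemsize]
        return np.frombuffer(raw, dtype=dm_dtype)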

    total_record = ffi.new(_structdefs[mod_name].replace("**", "*"))
    r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
                                          derived_metrics,
                                          total_record)
    libdutil.darshan_accumulator_destroy(darshan_accumulator[0])
Collaborator comment: I suppose we could check for a 0 retcode here, though the function appears to have no error handling anyway, not even in the form of a non-zero exit code.

Author reply: Yeah, I just punted on error checking there for a couple of reasons:

  • Internal knowledge that the destroy() call doesn't actually have an error path despite having a return code -- it's basically just freeing some stuff, so if there's an error it's likely going to be a crash.
  • We want to call destroy() before potentially raising an exception related to the emit() call returning a failure, just to make sure we clean up C memory. It becomes a little complicated/verbose to unwind the errors from back-to-back calls like that, so I opted for simplicity given the internal knowledge above.

    if r != 0:
        raise RuntimeError("A nonzero exit code was received from "
                           "darshan_accumulator_emit() at the C level. "
                           "It may be possible to retrieve additional "
                           "information from the stderr stream.")
    return derived_metrics
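Taken together with log_get_bytes_bandwidth() from darshan/lib/accum.py below, the intended call sequence looks like this sketch, which mirrors the pattern used in the tests added by this PR:

    import darshan
    from darshan.backend.cffi_backend import log_get_derived_metrics
    from darshan.lib.accum import log_get_bytes_bandwidth
    from darshan.log_utils import get_log_path

    log_path = get_log_path("imbalanced-io.darshan")
    with darshan.DarshanReport(log_path, read_all=True) as report:
        # re-read the module's records in pandas format
        report.mod_read_all_records("POSIX", dtype="pandas")
        rec_dict = report.records["POSIX"][0]
        nprocs = report.metadata["job"]["nprocs"]

    derived_metrics = log_get_derived_metrics(rec_dict, "POSIX", nprocs)
    print(log_get_bytes_bandwidth(derived_metrics, "POSIX"))
    # I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s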
3 changes: 2 additions & 1 deletion darshan-util/pydarshan/darshan/cli/base.html
@@ -49,7 +49,8 @@ <h3>${fig_title}</h3>
        <figcaption>${fig.fig_description}</figcaption>
    % else:
        <!-- temporary handling for DXT-disabled cases -->
-       <figcaption style="font-weight: bold; color: red; width: 400px;">
+       <!-- now also handles the bandwidth text... -->
+       <figcaption style="font-weight: bold; color: ${fig.text_only_color}; width: 400px;">
            ${fig.fig_description}
        </figcaption>
    % endif
44 changes: 44 additions & 0 deletions darshan-util/pydarshan/darshan/cli/summary.py
@@ -14,6 +14,8 @@

import darshan
import darshan.cli
from darshan.backend.cffi_backend import log_get_derived_metrics
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.experimental.plots import (
    plot_dxt_heatmap,
    plot_io_cost,
@@ -53,6 +55,11 @@ def __init__(
        fig_args: dict,
        fig_description: str = "",
        fig_width: int = 500,
        # when there is no HTML data generated
        # for the figure (i.e., no image/plot),
        # we have the option of changing the caption
        # text color for warning/important standalone text
        text_only_color: str = "red",
    ):
        self.section_title = section_title
        if not fig_title:
@@ -65,7 +72,11 @@
        # temporary handling for DXT-disabled cases
        # so a special error message can be passed
        # in place of an encoded image
        # NOTE: this code path is now also
        # being used for adding the bandwidth
        # text, which doesn't really have an image...
        self.fig_html = None
        self.text_only_color = text_only_color
        if self.fig_func:
            self.generate_fig()

@@ -487,6 +498,39 @@ def register_figures(self):
            )
            self.figures.append(opcount_fig)

            try:
                if mod in ["POSIX", "MPI-IO", "STDIO"]:
                    # get the module's record dataframe and then pass it
                    # to the Darshan accumulator interface to generate a
                    # cumulative record and derived metrics
                    rec_dict = self.report.records[mod].to_df()
                    mod_name = mod
                    nprocs = self.report.metadata['job']['nprocs']
                    derived_metrics = log_get_derived_metrics(rec_dict, mod_name, nprocs)

                    # this is really just some text,
                    # so using ReportFigure feels awkward...
                    bandwidth_fig = ReportFigure(
                        section_title=sect_title,
                        fig_title="",
                        fig_func=None,
                        fig_args=None,
                        fig_description=log_get_bytes_bandwidth(derived_metrics=derived_metrics,
                                                                mod_name=mod),
                        text_only_color="blue")
                    self.figures.append(bandwidth_fig)
            except (RuntimeError, KeyError):
                # the module probably doesn't support derived metrics
                # calculations, but the C code doesn't distinguish other
                # types of errors;
                # the KeyError appears to be needed for a subset of logs
                # for which _structdefs lacks APMPI or APXC entries --
                # for example, `e3sm_io_heatmap_only.darshan` in the logs repo
                pass


        #########################
        # Data Access by Category
        if not {"POSIX", "STDIO"}.isdisjoint(set(self.report.modules)):
50 changes: 50 additions & 0 deletions darshan-util/pydarshan/darshan/lib/accum.py
@@ -0,0 +1,50 @@

def log_get_bytes_bandwidth(derived_metrics, mod_name: str) -> str:
    """
    Summarize I/O performance for a given Darshan module.

    Parameters
    ----------
    derived_metrics:
        structure (cdata object) describing metrics derived from a
        set of records passed to the Darshan accumulator interface
    mod_name: str
        Name of the Darshan module to summarize the I/O
        performance for.

    Returns
    -------
    out: str
        A short string summarizing the performance of the given module
        in the provided log file, including bandwidth and total data
        transferred.

    Raises
    ------
    RuntimeError
        When a provided module name is not supported by the accumulator
        interface for provision of the summary data, or when any other
        error occurs in the C/CFFI interface.
    ValueError
        When a provided module name does not exist in the log file.

    Examples
    --------

    >>> import darshan
    >>> from darshan.backend.cffi_backend import log_get_derived_metrics
    >>> from darshan.lib.accum import log_get_bytes_bandwidth
    >>> from darshan.log_utils import get_log_path

    >>> log_path = get_log_path("imbalanced-io.darshan")
    >>> with darshan.DarshanReport(log_path, read_all=True) as report:
    ...     report.mod_read_all_records("POSIX", dtype="pandas")
    ...     rec_dict = report.records["POSIX"][0]
    ...     nprocs = report.metadata["job"]["nprocs"]
    >>> derived_metrics = log_get_derived_metrics(rec_dict, "POSIX", nprocs)
    >>> log_get_bytes_bandwidth(derived_metrics, "POSIX")
    'I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s'
    """
    # get total bytes (in MiB) and bandwidth (in MiB/s) for
    # a given module -- this information was commonly reported
    # in the old perl-based summary reports
    total_mib = derived_metrics.total_bytes / 2 ** 20
    total_bw = derived_metrics.agg_perf_by_slowest
    ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
    return ret_str
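As a quick sanity check on the units in that conversion (the byte count here is purely illustrative):

    total_bytes = 106_730_147_021        # illustrative byte count
    total_mib = total_bytes / 2 ** 20    # -> ~101785.8 MiB
    # bandwidth (agg_perf_by_slowest) is already reported in MiB/s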
3 changes: 3 additions & 0 deletions darshan-util/pydarshan/darshan/report.py
@@ -661,6 +661,9 @@ def mod_read_all_records(self, mod, dtype=None, warnings=True):
        cn = backend.counter_names(mod)
        fcn = backend.fcounter_names(mod)

        if mod not in self._modules:
            raise ValueError(f"mod {mod} is not available in this DarshanReport object.")

        # update module metadata
        self._modules[mod]['num_records'] = 0
        if mod not in self.counters:
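The new guard surfaces unknown or absent modules early; a minimal sketch of the resulting behavior, following the same pattern as the tests below:

    import darshan
    from darshan.log_utils import get_log_path

    log_path = get_log_path("partial_data_stdio.darshan")
    with darshan.DarshanReport(log_path, read_all=True) as report:
        # "GARBAGE" is not a module in this log, so this raises:
        # ValueError: mod GARBAGE is not available in this DarshanReport object.
        report.mod_read_all_records("GARBAGE", dtype="pandas")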
94 changes: 94 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_lib_accum.py
@@ -0,0 +1,94 @@
import darshan
from darshan.backend.cffi_backend import log_get_derived_metrics
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.log_utils import get_log_path

import pytest


@pytest.mark.parametrize("log_path, mod_name, expected_str", [
    # the expected bytes/bandwidth strings are pasted
    # directly from the old perl summary reports;
    # exceptions noted below
    # in some cases we defer to darshan-parser for the expected
    # values; see discussion in gh-839
    ("imbalanced-io.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
    ("imbalanced-io.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
    # imbalanced-io.darshan does have LUSTRE data,
    # but it doesn't support derived metrics at time
    # of writing
    ("imbalanced-io.darshan",
     "LUSTRE",
     "RuntimeError"),
    # APMPI doesn't support derived metrics either
    ("e3sm_io_heatmap_only.darshan",
     "APMPI",
     "RuntimeError"),
    ("imbalanced-io.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
    ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
    ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
    ("e3sm_io_heatmap_only.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
    ("e3sm_io_heatmap_only.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
    ("partial_data_stdio.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
    ("partial_data_stdio.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
    # the C derived metrics code can't distinguish
    # between different kinds of errors at this time,
    # but we can still intercept in some cases...
    ("partial_data_stdio.darshan",
     "GARBAGE",
     "ValueError"),
    ("skew-app.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
    ("skew-app.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
])
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
    # test the basic scenario of retrieving
    # the total data transferred and bandwidth
    # for all records in a given module; the situation
    # of accumulating derived metrics with filtering
    # (i.e., for a single filename) is not tested here

    log_path = get_log_path(log_path)
    with darshan.DarshanReport(log_path, read_all=True) as report:
        if expected_str == "ValueError":
            with pytest.raises(ValueError,
                               match=f"mod {mod_name} is not available"):
                report.mod_read_all_records(mod_name, dtype="pandas")
        else:
            report.mod_read_all_records(mod_name, dtype="pandas")
            rec_dict = report.records[mod_name][0]
            nprocs = report.metadata['job']['nprocs']

            if expected_str == "RuntimeError":
                with pytest.raises(RuntimeError,
                                   match=f"{mod_name} module does not support derived"):
                    log_get_derived_metrics(rec_dict, mod_name, nprocs)
            else:
                derived_metrics = log_get_derived_metrics(rec_dict, mod_name, nprocs)
                actual_str = log_get_bytes_bandwidth(derived_metrics=derived_metrics,
                                                     mod_name=mod_name)
                assert actual_str == expected_str
6 changes: 6 additions & 0 deletions darshan-util/pydarshan/darshan/tests/test_summary.py
@@ -236,6 +236,12 @@ def test_main_all_logs_repo_files(tmpdir, log_filepath):
    else:
        assert actual_runtime_heatmap_titles == 0

    # check for presence of bandwidth summary strings
    # (more detailed per-module probes are present
    # in test_derived_metrics_bytes_and_bandwidth())
    assert "I/O performance estimate" in report_str
    assert "color: blue" in report_str


class TestReportData:
