diff --git a/darshan-util/pydarshan/darshan/backend/cffi_backend.py b/darshan-util/pydarshan/darshan/backend/cffi_backend.py index 6e6d76f4c..5076c4edc 100644 --- a/darshan-util/pydarshan/darshan/backend/cffi_backend.py +++ b/darshan-util/pydarshan/darshan/backend/cffi_backend.py @@ -51,6 +51,26 @@ check_version(ffi, libdutil) +_mod_names = [ + "NULL", + "POSIX", + "MPI-IO", + "H5F", + "H5D", + "PNETCDF_FILE", + "PNETCDF_VAR", + "BG/Q", + "LUSTRE", + "STDIO", + "DXT_POSIX", + "DXT_MPIIO", + "MDHIM", + "APXC", + "APMPI", + "HEATMAP", +] +def mod_name_to_idx(mod_name): + return _mod_names.index(mod_name) _structdefs = { "BG/Q": "struct darshan_bgq_record **", @@ -685,6 +705,8 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None): fcounters_df = rec_dict["fcounters"] counters_n_cols = counters_df.shape[1] fcounters_n_cols = fcounters_df.shape[1] + id_col = counters_df.columns.get_loc("id") + rank_col = counters_df.columns.get_loc("rank") if rec_index_of_interest is None: num_recs = counters_df.shape[0] # newer pandas versions can support ... 
@@ -701,10 +723,60 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None): rec_arr.fcounters = fcounters_df.iloc[rec_index_of_interest, 2:].to_numpy() rec_arr.counters = counters_df.iloc[rec_index_of_interest, 2:].to_numpy() if num_recs > 1: - rec_arr.id = counters_df.iloc[rec_index_of_interest, 0].to_numpy().reshape((num_recs, 1)) - rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1].to_numpy().reshape((num_recs, 1)) + rec_arr.id = counters_df.iloc[rec_index_of_interest, id_col].to_numpy().reshape((num_recs, 1)) + rec_arr.rank = counters_df.iloc[rec_index_of_interest, rank_col].to_numpy().reshape((num_recs, 1)) else: - rec_arr.id = counters_df.iloc[rec_index_of_interest, 0] - rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1] + rec_arr.id = counters_df.iloc[rec_index_of_interest, id_col] + rec_arr.rank = counters_df.iloc[rec_index_of_interest, rank_col] buf = rec_arr.tobytes() return buf + + +def log_get_derived_metrics(rec_dict, mod_name, nprocs): + """ + Passes a set of records (in pandas format) to the Darshan accumulator + interface, and returns the corresponding derived metrics struct. + + Parameters: + rec_dict: Dictionary containing the counter and fcounter dataframes. + mod_name: Name of the Darshan module. + nprocs: Number of processes participating in accumulation. + + Returns: + darshan_derived_metrics struct (cdata object) + """ + mod_idx = mod_name_to_idx(mod_name) + darshan_accumulator = ffi.new("darshan_accumulator *") + r = libdutil.darshan_accumulator_create(mod_idx, nprocs, darshan_accumulator) + if r != 0: + raise RuntimeError("A nonzero exit code was received from " + "darshan_accumulator_create() at the C level. " + f"This could mean that the {mod_name} module does not " + "support derived metric calculation, or that " + "another kind of error occurred. 
It may be possible " + "to retrieve additional information from the stderr " + "stream.") + + num_recs = rec_dict["fcounters"].shape[0] + record_array = _df_to_rec(rec_dict, mod_name) + + r_i = libdutil.darshan_accumulator_inject(darshan_accumulator[0], record_array, num_recs) + if r_i != 0: + raise RuntimeError("A nonzero exit code was received from " + "darshan_accumulator_inject() at the C level. " + "It may be possible " + "to retrieve additional information from the stderr " + "stream.") + derived_metrics = ffi.new("struct darshan_derived_metrics *") + total_record = ffi.new(_structdefs[mod_name].replace("**", "*")) + r = libdutil.darshan_accumulator_emit(darshan_accumulator[0], + derived_metrics, + total_record) + libdutil.darshan_accumulator_destroy(darshan_accumulator[0]) + if r != 0: + raise RuntimeError("A nonzero exit code was received from " + "darshan_accumulator_emit() at the C level. " + "It may be possible " + "to retrieve additional information from the stderr " + "stream.") + return derived_metrics diff --git a/darshan-util/pydarshan/darshan/cli/base.html b/darshan-util/pydarshan/darshan/cli/base.html index 99382cb34..302858046 100644 --- a/darshan-util/pydarshan/darshan/cli/base.html +++ b/darshan-util/pydarshan/darshan/cli/base.html @@ -49,7 +49,8 @@

${fig_title}

${fig.fig_description}
% else: -
+ +
${fig.fig_description}
% endif diff --git a/darshan-util/pydarshan/darshan/cli/summary.py b/darshan-util/pydarshan/darshan/cli/summary.py index 56050d5c0..f638cd6a3 100644 --- a/darshan-util/pydarshan/darshan/cli/summary.py +++ b/darshan-util/pydarshan/darshan/cli/summary.py @@ -14,6 +14,8 @@ import darshan import darshan.cli +from darshan.backend.cffi_backend import log_get_derived_metrics +from darshan.lib.accum import log_get_bytes_bandwidth from darshan.experimental.plots import ( plot_dxt_heatmap, plot_io_cost, @@ -53,6 +55,11 @@ def __init__( fig_args: dict, fig_description: str = "", fig_width: int = 500, + # when there is no HTML data generated + # for the figure (i.e., no image/plot), + # we have the option of changing the caption + # text color for a warning/important standalone text + text_only_color: str = "red", ): self.section_title = section_title if not fig_title: @@ -65,7 +72,11 @@ def __init__( # temporary handling for DXT disabled cases # so special error message can be passed # in place of an encoded image + # NOTE: this code path is now also + # being used for adding the bandwidth + # text, which doesn't really have an image... self.fig_html = None + self.text_only_color = text_only_color if self.fig_func: self.generate_fig() @@ -487,6 +498,39 @@ def register_figures(self): ) self.figures.append(opcount_fig) + try: + if mod in ["POSIX", "MPI-IO", "STDIO"]: + # get the module's record dataframe and then pass to + # Darshan accumulator interface to generate a cumulative + # record and derived metrics + rec_dict = self.report.records[mod].to_df() + mod_name = mod + nprocs = self.report.metadata['job']['nprocs'] + derived_metrics = log_get_derived_metrics(rec_dict, mod_name, nprocs) + + # this is really just some text + # so using ReportFigure feels awkward... 
+ bandwidth_fig = ReportFigure( + section_title=sect_title, + fig_title="", + fig_func=None, + fig_args=None, + fig_description=log_get_bytes_bandwidth(derived_metrics=derived_metrics, + mod_name=mod), + text_only_color="blue") + self.figures.append(bandwidth_fig) + except (RuntimeError, KeyError): + # the module probably doesn't support derived metrics + # calculations, but the C code doesn't distinguish other + # types of errors + + # the KeyError appears to be needed for a subset of logs + # for which _structdefs lacks APMPI or APXC entries; + # for example `e3sm_io_heatmap_only.darshan` in logs + # repo + pass + + ######################### # Data Access by Category if not {"POSIX", "STDIO"}.isdisjoint(set(self.report.modules)): diff --git a/darshan-util/pydarshan/darshan/lib/accum.py b/darshan-util/pydarshan/darshan/lib/accum.py new file mode 100644 index 000000000..d1fec2e2f --- /dev/null +++ b/darshan-util/pydarshan/darshan/lib/accum.py @@ -0,0 +1,50 @@ + +def log_get_bytes_bandwidth(derived_metrics, mod_name: str) -> str: + """ + Summarize I/O performance for a given darshan module. + + Parameters + ---------- + derived_metrics: + structure (cdata object) describing metrics derived from a + set of records passed to the Darshan accumulator interface + mod_name: str + Name of the darshan module to summarize the I/O + performance for. + + Returns + ------- + out: str + A short string summarizing the performance of the given module + in the provided log file, including bandwidth and total data + transferred. + + Raises + ------ + RuntimeError + When a provided module name is not supported for the accumulator + interface for provision of the summary data, or for any other + error that occurs in the C/CFFI interface. + ValueError + When a provided module name does not exist in the log file. 
+ + Examples + -------- + + >>> import darshan + >>> from darshan.log_utils import get_log_path + >>> from darshan.backend.cffi_backend import log_get_derived_metrics + >>> from darshan.lib.accum import log_get_bytes_bandwidth + + >>> log_path = get_log_path("imbalanced-io.darshan") + >>> with darshan.DarshanReport(log_path, read_all=True) as report: + ... nprocs = report.metadata["job"]["nprocs"] + ... derived_metrics = log_get_derived_metrics(report.records["POSIX"].to_df(), + ... "POSIX", nprocs) + >>> log_get_bytes_bandwidth(derived_metrics, "POSIX") + I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s + + >>> derived_metrics = log_get_derived_metrics(report.records["MPI-IO"].to_df(), "MPI-IO", nprocs) + >>> log_get_bytes_bandwidth(derived_metrics, "MPI-IO") + I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s + """ + # get total bytes (in MiB) and bandwidth (in MiB/s) for + # a given module -- this information was commonly reported + # in the old perl-based summary reports + total_mib = derived_metrics.total_bytes / 2 ** 20 + total_bw = derived_metrics.agg_perf_by_slowest + ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s" + return ret_str diff --git a/darshan-util/pydarshan/darshan/report.py b/darshan-util/pydarshan/darshan/report.py index 17b0e0e3c..047e4d568 100644 --- a/darshan-util/pydarshan/darshan/report.py +++ b/darshan-util/pydarshan/darshan/report.py @@ -661,6 +661,9 @@ def mod_read_all_records(self, mod, dtype=None, warnings=True): cn = backend.counter_names(mod) fcn = backend.fcounter_names(mod) + if mod not in self._modules: + raise ValueError(f"mod {mod} is not available in this DarshanReport object.") + # update module metadata self._modules[mod]['num_records'] = 0 if mod not in self.counters: diff --git a/darshan-util/pydarshan/darshan/tests/test_lib_accum.py b/darshan-util/pydarshan/darshan/tests/test_lib_accum.py new file mode 100644 index 000000000..be3a55b56 --- /dev/null +++ b/darshan-util/pydarshan/darshan/tests/test_lib_accum.py @@ -0,0 +1,94 @@ +import darshan +from darshan.backend.cffi_backend import log_get_derived_metrics +from darshan.lib.accum import log_get_bytes_bandwidth +from darshan.log_utils import get_log_path + +import pytest + + +@pytest.mark.parametrize("log_path, mod_name, expected_str", [ + #
the expected bytes/bandwidth strings are pasted + # directly from the old perl summary reports; + # exceptions noted below + # in some cases we defer to darshan-parser for the expected + # values; see discussion in gh-839 + ("imbalanced-io.darshan", + "STDIO", + "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"), + ("imbalanced-io.darshan", + "MPI-IO", + "I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"), + # imbalanced-io.darshan does have LUSTRE data, + # but it doesn't support derived metrics at time + # of writing + ("imbalanced-io.darshan", + "LUSTRE", + "RuntimeError"), + # APMPI doesn't support derived metrics either + ("e3sm_io_heatmap_only.darshan", + "APMPI", + "RuntimeError"), + ("imbalanced-io.darshan", + "POSIX", + "I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"), + ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan", + "STDIO", + "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"), + ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan", + "POSIX", + "I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"), + ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan", + "STDIO", + "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"), + ("e3sm_io_heatmap_only.darshan", + "STDIO", + "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"), + ("e3sm_io_heatmap_only.darshan", + "MPI-IO", + "I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"), + ("partial_data_stdio.darshan", + "MPI-IO", + "I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"), + ("partial_data_stdio.darshan", + "STDIO", + "I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"), + # the C derived metrics code can't distinguish 
+ # between different kinds of errors at this time, + # but we can still intercept in some cases... + ("partial_data_stdio.darshan", + "GARBAGE", + "ValueError"), + ("skew-app.darshan", + "POSIX", + "I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"), + ("skew-app.darshan", + "MPI-IO", + "I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"), +]) +def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str): + # test the basic scenario of retrieving + # the total data transferred and bandwidth + # for all records in a given module; the situation + # of accumulating derived metrics with filtering + # (i.e., for a single filename) is not tested here + + log_path = get_log_path(log_path) + with darshan.DarshanReport(log_path, read_all=True) as report: + if expected_str == "ValueError": + with pytest.raises(ValueError, + match=f"mod {mod_name} is not available"): + report.mod_read_all_records(mod_name, dtype="pandas") + else: + report.mod_read_all_records(mod_name, dtype="pandas") + rec_dict = report.records[mod_name][0] + nprocs = report.metadata['job']['nprocs'] + + if expected_str == "RuntimeError": + with pytest.raises(RuntimeError, + match=f"{mod_name} module does not support derived"): + log_get_derived_metrics(rec_dict, mod_name, nprocs) + else: + derived_metrics = log_get_derived_metrics(rec_dict, mod_name, nprocs) + actual_str = log_get_bytes_bandwidth(derived_metrics=derived_metrics, + mod_name=mod_name) + assert actual_str == expected_str diff --git a/darshan-util/pydarshan/darshan/tests/test_summary.py b/darshan-util/pydarshan/darshan/tests/test_summary.py index 282ab7d36..1c1acd2fa 100644 --- a/darshan-util/pydarshan/darshan/tests/test_summary.py +++ b/darshan-util/pydarshan/darshan/tests/test_summary.py @@ -236,6 +236,12 @@ def test_main_all_logs_repo_files(tmpdir, log_filepath): else: assert actual_runtime_heatmap_titles == 0 + # check for presence of bandwidth 
summary strings + # (more detailed per-module probes are present + # in test_derived_metrics_bytes_and_bandwidth()) + assert "I/O performance estimate" in report_str + assert "color: blue" in report_str + class TestReportData: