updated derived metrics interface #898
@@ -51,6 +51,26 @@
 check_version(ffi, libdutil)
 
 
+_mod_names = [
+    "NULL",
+    "POSIX",
+    "MPI-IO",
+    "H5F",
+    "H5D",
+    "PNETCDF_FILE",
+    "PNETCDF_VAR",
+    "BG/Q",
+    "LUSTRE",
+    "STDIO",
+    "DXT_POSIX",
+    "DXT_MPIIO",
+    "MDHIM",
+    "APXC",
+    "APMPI",
+    "HEATMAP",
+]
+def mod_name_to_idx(mod_name):
+    return _mod_names.index(mod_name)
 
 _structdefs = {
     "BG/Q": "struct darshan_bgq_record **",

@@ -685,6 +705,8 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None):
     fcounters_df = rec_dict["fcounters"]
     counters_n_cols = counters_df.shape[1]
     fcounters_n_cols = fcounters_df.shape[1]
+    id_col = counters_df.columns.get_loc("id")
+    rank_col = counters_df.columns.get_loc("rank")
     if rec_index_of_interest is None:
         num_recs = counters_df.shape[0]
         # newer pandas versions can support ...

@@ -701,10 +723,60 @@ def _df_to_rec(rec_dict, mod_name, rec_index_of_interest=None):
     rec_arr.fcounters = fcounters_df.iloc[rec_index_of_interest, 2:].to_numpy()
     rec_arr.counters = counters_df.iloc[rec_index_of_interest, 2:].to_numpy()
     if num_recs > 1:
-        rec_arr.id = counters_df.iloc[rec_index_of_interest, 0].to_numpy().reshape((num_recs, 1))
-        rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1].to_numpy().reshape((num_recs, 1))
+        rec_arr.id = counters_df.iloc[rec_index_of_interest, id_col].to_numpy().reshape((num_recs, 1))
+        rec_arr.rank = counters_df.iloc[rec_index_of_interest, rank_col].to_numpy().reshape((num_recs, 1))
     else:
-        rec_arr.id = counters_df.iloc[rec_index_of_interest, 0]
-        rec_arr.rank = counters_df.iloc[rec_index_of_interest, 1]
+        rec_arr.id = counters_df.iloc[rec_index_of_interest, id_col]
+        rec_arr.rank = counters_df.iloc[rec_index_of_interest, rank_col]
     buf = rec_arr.tobytes()
     return buf
+
+
+def log_get_derived_metrics(rec_dict, mod_name, nprocs):
+    """
+    Passes a set of records (in pandas format) to the Darshan accumulator
+    interface, and returns the corresponding derived metrics struct.
+
+    Parameters:
+        rec_dict: Dictionary containing the counter and fcounter dataframes.
+        mod_name: Name of the Darshan module.
+        nprocs: Number of processes participating in accumulation.
+
+    Returns:
+        darshan_derived_metrics struct (cdata object)
+    """
+    mod_idx = mod_name_to_idx(mod_name)
+    darshan_accumulator = ffi.new("darshan_accumulator *")
+    r = libdutil.darshan_accumulator_create(mod_idx, nprocs, darshan_accumulator)
+    if r != 0:
+        raise RuntimeError("A nonzero exit code was received from "
+                           "darshan_accumulator_create() at the C level. "
+                           f"This could mean that the {mod_name} module does not "
+                           "support derived metric calculation, or that "
+                           "another kind of error occurred. It may be possible "
+                           "to retrieve additional information from the stderr "
+                           "stream.")
+
+    num_recs = rec_dict["fcounters"].shape[0]
+    record_array = _df_to_rec(rec_dict, mod_name)
+
+    r_i = libdutil.darshan_accumulator_inject(darshan_accumulator[0], record_array, num_recs)
+    if r_i != 0:
+        raise RuntimeError("A nonzero exit code was received from "
+                           "darshan_accumulator_inject() at the C level. "
+                           "It may be possible "
+                           "to retrieve additional information from the stderr "
+                           "stream.")
+    derived_metrics = ffi.new("struct darshan_derived_metrics *")
+    total_record = ffi.new(_structdefs[mod_name].replace("**", "*"))
+    r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
+                                          derived_metrics,
+                                          total_record)
+    libdutil.darshan_accumulator_destroy(darshan_accumulator[0])
[Inline review comments on the darshan_accumulator_destroy() call above]
"I suppose we could check for a ..."
"Yeah, I just punted on error checking that for a couple of reasons: ..."
+
+    if r != 0:
+        raise RuntimeError("A nonzero exit code was received from "
+                           "darshan_accumulator_emit() at the C level. "
+                           "It may be possible "
+                           "to retrieve additional information from the stderr "
+                           "stream.")
+    return derived_metrics
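For orientation, here is a minimal usage sketch of the new accumulator entry point, mirroring the test added later in this PR; it assumes the imbalanced-io.darshan sample log is available via darshan.log_utils.get_log_path, and only reads the two struct fields that the PR itself uses (total_bytes and agg_perf_by_slowest):

import darshan
from darshan.backend.cffi_backend import log_get_derived_metrics
from darshan.log_utils import get_log_path

# read all POSIX records from a sample log as pandas DataFrames,
# then feed them through the accumulator interface
log_path = get_log_path("imbalanced-io.darshan")
with darshan.DarshanReport(log_path, read_all=True) as report:
    report.mod_read_all_records("POSIX", dtype="pandas")
    rec_dict = report.records["POSIX"][0]
    nprocs = report.metadata["job"]["nprocs"]
    derived_metrics = log_get_derived_metrics(rec_dict, "POSIX", nprocs)

# the returned cdata struct exposes the aggregated fields directly
print(derived_metrics.total_bytes, derived_metrics.agg_perf_by_slowest)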
@@ -0,0 +1,50 @@
def log_get_bytes_bandwidth(derived_metrics, mod_name: str) -> str:
    """
    Summarize I/O performance for a given darshan module.

    Parameters
    ----------
    derived_metrics:
        structure (cdata object) describing metrics derived from a
        set of records passed to the Darshan accumulator interface
    mod_name: str
        Name of the darshan module to summarize the I/O
        performance for.

    Returns
    -------
    out: str
        A short string summarizing the performance of the given module,
        including bandwidth and total data transferred.

    Raises
    ------
    RuntimeError
        When the provided module is not supported by the accumulator
        interface, or when any other error occurs in the C/CFFI interface.
    ValueError
        When the provided module name does not exist in the log file.

    Examples
    --------

    >>> import darshan
    >>> from darshan.backend.cffi_backend import log_get_derived_metrics
    >>> from darshan.lib.accum import log_get_bytes_bandwidth
    >>> from darshan.log_utils import get_log_path

    >>> log_path = get_log_path("imbalanced-io.darshan")
    >>> with darshan.DarshanReport(log_path, read_all=True) as report:
    ...     nprocs = report.metadata["job"]["nprocs"]
    ...     report.mod_read_all_records("POSIX", dtype="pandas")
    ...     posix_recs = report.records["POSIX"][0]
    ...     report.mod_read_all_records("MPI-IO", dtype="pandas")
    ...     mpiio_recs = report.records["MPI-IO"][0]

    >>> derived_metrics = log_get_derived_metrics(posix_recs, "POSIX", nprocs)
    >>> log_get_bytes_bandwidth(derived_metrics, "POSIX")
    I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s

    >>> derived_metrics = log_get_derived_metrics(mpiio_recs, "MPI-IO", nprocs)
    >>> log_get_bytes_bandwidth(derived_metrics, "MPI-IO")
    I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s
    """
    # get total bytes (in MiB) and bandwidth (in MiB/s) for
    # a given module -- this information was commonly reported
    # in the old perl-based summary reports
    total_mib = derived_metrics.total_bytes / 2 ** 20
    total_bw = derived_metrics.agg_perf_by_slowest
    ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
    return ret_str
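Since the function only reads the total_bytes and agg_perf_by_slowest fields, it can be exercised without a real log. The snippet below is purely illustrative: fake_metrics is a made-up stand-in object (512 MiB transferred at 128 MiB/s), not an actual derived metrics struct from the accumulator interface:

from types import SimpleNamespace
from darshan.lib.accum import log_get_bytes_bandwidth

# stand-in for the cdata struct, with invented values
fake_metrics = SimpleNamespace(total_bytes=512 * 2 ** 20,
                               agg_perf_by_slowest=128.0)
print(log_get_bytes_bandwidth(fake_metrics, "POSIX"))
# I/O performance estimate (at the POSIX layer): transferred 512.0 MiB at 128.00 MiB/s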
@@ -0,0 +1,94 @@
import darshan
from darshan.backend.cffi_backend import log_get_derived_metrics
from darshan.lib.accum import log_get_bytes_bandwidth
from darshan.log_utils import get_log_path

import pytest


@pytest.mark.parametrize("log_path, mod_name, expected_str", [
    # the expected bytes/bandwidth strings are pasted
    # directly from the old perl summary reports;
    # exceptions noted below
    # in some cases we defer to darshan-parser for the expected
    # values; see discussion in gh-839
    ("imbalanced-io.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
    ("imbalanced-io.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
    # imbalanced-io.darshan does have LUSTRE data,
    # but it doesn't support derived metrics at time
    # of writing
    ("imbalanced-io.darshan",
     "LUSTRE",
     "RuntimeError"),
    # APMPI doesn't support derived metrics either
    ("e3sm_io_heatmap_only.darshan",
     "APMPI",
     "RuntimeError"),
    ("imbalanced-io.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
    ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
    ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
    ("e3sm_io_heatmap_only.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
    ("e3sm_io_heatmap_only.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
    ("partial_data_stdio.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
    ("partial_data_stdio.darshan",
     "STDIO",
     "I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
    # the C derived metrics code can't distinguish
    # between different kinds of errors at this time,
    # but we can still intercept in some cases...
    ("partial_data_stdio.darshan",
     "GARBAGE",
     "ValueError"),
    ("skew-app.darshan",
     "POSIX",
     "I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
    ("skew-app.darshan",
     "MPI-IO",
     "I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
    ])
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
    # test the basic scenario of retrieving
    # the total data transferred and bandwidth
    # for all records in a given module; the situation
    # of accumulating derived metrics with filtering
    # (i.e., for a single filename) is not tested here
    log_path = get_log_path(log_path)
    with darshan.DarshanReport(log_path, read_all=True) as report:
        if expected_str == "ValueError":
            with pytest.raises(ValueError,
                               match=f"mod {mod_name} is not available"):
                report.mod_read_all_records(mod_name, dtype="pandas")
        else:
            report.mod_read_all_records(mod_name, dtype="pandas")
            rec_dict = report.records[mod_name][0]
            nprocs = report.metadata['job']['nprocs']

            if expected_str == "RuntimeError":
                with pytest.raises(RuntimeError,
                                   match=f"{mod_name} module does not support derived"):
                    log_get_derived_metrics(rec_dict, mod_name, nprocs)
            else:
                derived_metrics = log_get_derived_metrics(rec_dict, mod_name, nprocs)
                actual_str = log_get_bytes_bandwidth(derived_metrics=derived_metrics,
                                                     mod_name=mod_name)
                assert actual_str == expected_str
Review comment:

The CFFI stuff is of course a bit awkward, especially given how many different entrypoints to the C layer we need to interact with just to get a single final struct back, but I'd probably be inclined to just live with it if we'll just be operating on DataFrames or Arrow in-memory stuff in a few years anyway.

It occurs to me that for some things, like struct darshan_derived_metrics, we could probably avoid CFFI traversal by using the bytes from a NumPy recarray, but this would be more verbose with minimal gain for now, I suspect.
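To make that recarray idea concrete, here is a small self-contained sketch of the pattern; the two-field dtype is a toy stand-in, and an actual implementation would have to mirror the full layout of struct darshan_derived_metrics exactly (including any padding):

import numpy as np

# toy layout, illustrative only -- not the real struct definition
toy_dtype = np.dtype([("total_bytes", np.int64),
                      ("agg_perf_by_slowest", np.float64)])

# pretend these raw bytes came back from the C layer (e.g. via ffi.buffer(...))
buf = np.array([(512 * 2 ** 20, 128.0)], dtype=toy_dtype).tobytes()

# read named fields directly from the bytes, with no per-field CFFI access
rec = np.frombuffer(buf, dtype=toy_dtype)
print(rec["total_bytes"][0], rec["agg_perf_by_slowest"][0])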