MAINT: PR 839 revisions

tylerjereddy · tylerjereddy · commit 96806fa2310e · 2022-12-16T11:12:59.000-07:00
* `cffi_backend` module changes requested from PR review - remove a spurious `darshan_free` from `_log_get_heatmap_record()` - fix the scoping of the `darshan_free` of `buf` object used with `darshan_accumulator_inject` in `log_get_derived_metrics` - adding a missing `log_close()` to `log_get_derived_metrics` (maybe we can wrap in Python contexts in the future though) - use a separate buffer for `darshan_accumulator_emit()` inside `log_get_derived_metrics` * note that making the above CFFI/free-related changes caused a segfault in the testuite, so in the end I adjusted the location of the memory freeing as I saw fit to avoid segfaults--I'd say at this point please provide concrete evidence with a memory leak plot or failing test for additional adjustments there, or just push the change in * in the end, there is a slightly more concise usage of `darshan_free()` but no meaningful change in the free operations * I also reverted the suggested changed to `darshan_accumulator_emit()` usage--there was no testable evidence of an issue, and it was also causing segfaults.. * address many of the discussion points that came up in gh-868: - `log_get_derived_metrics()` now uses an LRU cache, which effectively means that we use memoization to return derived metrics data rather than doing another pass over the log file if the same log path and module name have already been accumulated from; we still need to pass over a given log twice in most cases--once at initial read-in and once for using `log_get_derived_metrics`; how we decide to add filtering of records prior to accumulation interface in Python is probably a deeper discussion/for later - `log_get_bytes_bandwidth()` and its associated testing have been migrated to modules not named after "CFFI", like the in the above PR, because I think we should only use the "CFFI" named modules for direct CFFI interaction/testing, and for other analyses we should probably use more distinct names. Also, to some extent everything depends on the CFFI layer, so trying to restrict "CFFI" modules to direct rather than direct interaction will help keep them manageably sized, especially given the proclivity for surprising memory issues/segfaults in those parts of the code. - add a proper docstring with examples for `log_get_bytes_bandwidth()`
diff --git a/darshan-util/pydarshan/.gitignore b/darshan-util/pydarshan/.gitignore
@@ -24,7 +24,6 @@ dist/
 downloads/
 eggs/
 .eggs/
-lib/
 lib64/
 parts/
 sdist/
diff --git a/darshan-util/pydarshan/darshan/backend/cffi_backend.py b/darshan-util/pydarshan/darshan/backend/cffi_backend.py
@@ -633,7 +633,6 @@ def _log_get_heatmap_record(log):
     buf = ffi.new("void **")
     r = libdutil.darshan_log_get_record(log['handle'], modules[mod_name]['idx'], buf)
     if r < 1:
-        libdutil.darshan_free(buf[0])
         return None
     
     filerec = ffi.cast(mod_type, buf)
@@ -660,6 +659,7 @@ def _log_get_heatmap_record(log):
     return rec
 
 
+@functools.lru_cache()
 def log_get_derived_metrics(log_path: str, mod_name: str):
     """
     Returns the darshan_derived_metrics struct from CFFI/C accumulator code.
@@ -716,24 +716,12 @@ def log_get_derived_metrics(log_path: str, mod_name: str):
     r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
                                           darshan_derived_metrics,
                                           rbuf[0])
+    libdutil.darshan_free(buf[0])
     if r != 0:
-        libdutil.darshan_free(buf[0])
         raise RuntimeError("A nonzero exit code was received from "
                            "darshan_accumulator_emit() at the C level. "
                            "It may be possible "
                            "to retrieve additional information from the stderr "
                            "stream.")
-    libdutil.darshan_free(buf[0])
+    log_close(log_handle)
     return darshan_derived_metrics
-
-
-def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
-    # get total bytes (in MiB) and bandwidth (in MiB/s) for
-    # a given module -- this information was commonly reported
-    # in the old perl-based summary reports
-    darshan_derived_metrics = log_get_derived_metrics(log_path=log_path,
-                                                      mod_name=mod_name)
-    total_mib = darshan_derived_metrics.total_bytes / 2 ** 20
-    total_bw = darshan_derived_metrics.agg_perf_by_slowest
-    ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
-    return ret_str
diff --git a/darshan-util/pydarshan/darshan/cli/summary.py b/darshan-util/pydarshan/darshan/cli/summary.py
@@ -17,7 +17,7 @@
 
 import darshan
 import darshan.cli
-from darshan.backend.cffi_backend import log_get_bytes_bandwidth
+from darshan.lib.accum import log_get_bytes_bandwidth
 from darshan.experimental.plots import (
     plot_dxt_heatmap,
     plot_io_cost,
diff --git a/darshan-util/pydarshan/darshan/lib/accum.py b/darshan-util/pydarshan/darshan/lib/accum.py
@@ -0,0 +1,53 @@
+from darshan.backend.cffi_backend import log_get_derived_metrics
+
+
+def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
+    """
+    Summarize I/O performance for a given darshan module.
+
+    Parameters
+    ----------
+    log_path : str
+            Path to the darshan binary log file.
+    mod_name : str
+            Name of the darshan module to summarize the I/O
+            performance for.
+
+    Returns
+    -------
+    out: str
+        A short string summarizing the performance of the given module
+        in the provided log file, including bandwidth and total data
+        transferred.
+
+    Raises
+    ------
+    RuntimeError
+        When a provided module name is not supported for the accumulator
+        interface for provision of the summary data, or for any other
+        error that occurs in the C/CFFI interface.
+    ValueError
+        When a provided module name does not exist in the log file.
+
+    Examples
+    --------
+
+    >>> from darshan.log_utils import get_log_path
+    >>> from darshan.lib.accum import log_get_bytes_bandwidth
+
+    >>> log_path = get_log_path("imbalanced-io.darshan")
+    >>> log_get_bytes_bandwidth(log_path, "POSIX")
+    I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s
+
+    >>> log_get_bytes_bandwidth(log_path, "MPI-IO")
+    I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s
+    """
+    # get total bytes (in MiB) and bandwidth (in MiB/s) for
+    # a given module -- this information was commonly reported
+    # in the old perl-based summary reports
+    darshan_derived_metrics = log_get_derived_metrics(log_path=log_path,
+                                                      mod_name=mod_name)
+    total_mib = darshan_derived_metrics.total_bytes / 2 ** 20
+    total_bw = darshan_derived_metrics.agg_perf_by_slowest
+    ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
+    return ret_str
diff --git a/darshan-util/pydarshan/darshan/tests/test_cffi_misc.py b/darshan-util/pydarshan/darshan/tests/test_cffi_misc.py
@@ -159,92 +159,3 @@ def test_log_get_generic_record(dtype):
         # make sure the returned key/column names agree
         assert actual_counter_names == expected_counter_names
         assert actual_fcounter_names == expected_fcounter_names
-
-
-@pytest.mark.parametrize("log_path, mod_name, expected_str", [
-    # the expected bytes/bandwidth strings are pasted
-    # directly from the old perl summary reports;
-    # exceptions noted below
-    # in some cases we defer to darshan-parser for the expected
-    # values; see discussion in gh-839
-    ("imbalanced-io.darshan",
-     "STDIO",
-     "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
-    ("imbalanced-io.darshan",
-     "MPI-IO",
-     "I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
-    # imbalanced-io.darshan does have LUSTRE data,
-    # but it doesn't support derived metrics at time
-    # of writing
-    ("imbalanced-io.darshan",
-     "LUSTRE",
-     "RuntimeError"),
-    ("imbalanced-io.darshan",
-     "POSIX",
-     "I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
-    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
-     "STDIO",
-     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
-    ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
-     "POSIX",
-     "I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
-    ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
-     "STDIO",
-     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
-    ("e3sm_io_heatmap_only.darshan",
-     "STDIO",
-     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
-    ("e3sm_io_heatmap_only.darshan",
-     "MPI-IO",
-     "I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
-    ("partial_data_stdio.darshan",
-     "MPI-IO",
-     "I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
-    ("partial_data_stdio.darshan",
-     "STDIO",
-     "I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
-    # the C derived metrics code can't distinguish
-    # between different kinds of errors at this time,
-    # but we can still intercept in some cases...
-    ("partial_data_stdio.darshan",
-     "GARBAGE",
-     "ValueError"),
-    # TODO: determine if the lack of APMPI and
-    # any other "add-ons" in _structdefs is a bug
-    # in the control flow for `log_get_derived_metrics()`?
-    ("e3sm_io_heatmap_only.darshan",
-     "APMPI",
-     "KeyError"),
-    ("skew-app.darshan",
-     "POSIX",
-     "I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
-    ("skew-app.darshan",
-     "MPI-IO",
-     "I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
-])
-def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
-    # test the basic scenario of retrieving
-    # the total data transferred and bandwidth
-    # for all records in a given module; the situation
-    # of accumulating drived metrics with filtering
-    # (i.e., for a single filename) is not tested here
-
-    log_path = get_log_path(log_path)
-    if expected_str == "RuntimeError":
-        with pytest.raises(RuntimeError,
-                           match=f"{mod_name} module does not support derived"):
-            backend.log_get_bytes_bandwidth(log_path=log_path,
-                                            mod_name=mod_name)
-    elif expected_str == "ValueError":
-        with pytest.raises(ValueError,
-                           match=f"{mod_name} is not in the available log"):
-            backend.log_get_bytes_bandwidth(log_path=log_path,
-                                            mod_name=mod_name)
-    elif expected_str == "KeyError":
-        with pytest.raises(KeyError, match=f"{mod_name}"):
-            backend.log_get_bytes_bandwidth(log_path=log_path,
-                                            mod_name=mod_name)
-    else:
-        actual_str = backend.log_get_bytes_bandwidth(log_path=log_path,
-                                                     mod_name=mod_name)
-        assert actual_str == expected_str
diff --git a/darshan-util/pydarshan/darshan/tests/test_lib_accum.py b/darshan-util/pydarshan/darshan/tests/test_lib_accum.py
@@ -0,0 +1,93 @@
+from darshan.lib.accum import log_get_bytes_bandwidth
+from darshan.log_utils import get_log_path
+
+import pytest
+
+
+@pytest.mark.parametrize("log_path, mod_name, expected_str", [
+    # the expected bytes/bandwidth strings are pasted
+    # directly from the old perl summary reports;
+    # exceptions noted below
+    # in some cases we defer to darshan-parser for the expected
+    # values; see discussion in gh-839
+    ("imbalanced-io.darshan",
+     "STDIO",
+     "I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
+    ("imbalanced-io.darshan",
+     "MPI-IO",
+     "I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
+    # imbalanced-io.darshan does have LUSTRE data,
+    # but it doesn't support derived metrics at time
+    # of writing
+    ("imbalanced-io.darshan",
+     "LUSTRE",
+     "RuntimeError"),
+    ("imbalanced-io.darshan",
+     "POSIX",
+     "I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
+    ("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
+     "STDIO",
+     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
+    ("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
+     "POSIX",
+     "I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
+    ("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
+     "STDIO",
+     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
+    ("e3sm_io_heatmap_only.darshan",
+     "STDIO",
+     "I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
+    ("e3sm_io_heatmap_only.darshan",
+     "MPI-IO",
+     "I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
+    ("partial_data_stdio.darshan",
+     "MPI-IO",
+     "I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
+    ("partial_data_stdio.darshan",
+     "STDIO",
+     "I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
+    # the C derived metrics code can't distinguish
+    # between different kinds of errors at this time,
+    # but we can still intercept in some cases...
+    ("partial_data_stdio.darshan",
+     "GARBAGE",
+     "ValueError"),
+    # TODO: determine if the lack of APMPI and
+    # any other "add-ons" in _structdefs is a bug
+    # in the control flow for `log_get_derived_metrics()`?
+    ("e3sm_io_heatmap_only.darshan",
+     "APMPI",
+     "KeyError"),
+    ("skew-app.darshan",
+     "POSIX",
+     "I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
+    ("skew-app.darshan",
+     "MPI-IO",
+     "I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
+])
+def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
+    # test the basic scenario of retrieving
+    # the total data transferred and bandwidth
+    # for all records in a given module; the situation
+    # of accumulating drived metrics with filtering
+    # (i.e., for a single filename) is not tested here
+
+    log_path = get_log_path(log_path)
+    if expected_str == "RuntimeError":
+        with pytest.raises(RuntimeError,
+                           match=f"{mod_name} module does not support derived"):
+            log_get_bytes_bandwidth(log_path=log_path,
+                                    mod_name=mod_name)
+    elif expected_str == "ValueError":
+        with pytest.raises(ValueError,
+                           match=f"{mod_name} is not in the available log"):
+            log_get_bytes_bandwidth(log_path=log_path,
+                                    mod_name=mod_name)
+    elif expected_str == "KeyError":
+        with pytest.raises(KeyError, match=f"{mod_name}"):
+            log_get_bytes_bandwidth(log_path=log_path,
+                                    mod_name=mod_name)
+    else:
+        actual_str = log_get_bytes_bandwidth(log_path=log_path,
+                                             mod_name=mod_name)
+        assert actual_str == expected_str

-Original file line number
+Diff line change
 downloads/
 eggs/
 .eggs/
 -lib/
 lib64/
 parts/
 sdist/