Skip to content

Commit 96806fa

Browse files
committed
MAINT: PR 839 revisions
* `cffi_backend` module changes requested from PR review - remove a spurious `darshan_free` from `_log_get_heatmap_record()` - fix the scoping of the `darshan_free` of the `buf` object used with `darshan_accumulator_inject` in `log_get_derived_metrics` - add a missing `log_close()` to `log_get_derived_metrics` (maybe we can wrap in Python contexts in the future though) - use a separate buffer for `darshan_accumulator_emit()` inside `log_get_derived_metrics` * note that making the above CFFI/free-related changes caused a segfault in the test suite, so in the end I adjusted the location of the memory freeing as I saw fit to avoid segfaults--I'd say at this point please provide concrete evidence with a memory leak plot or failing test for additional adjustments there, or just push the change in * in the end, there is a slightly more concise usage of `darshan_free()` but no meaningful change in the free operations * I also reverted the suggested changes to `darshan_accumulator_emit()` usage--there was no testable evidence of an issue, and it was also causing segfaults. * address many of the discussion points that came up in gh-868: - `log_get_derived_metrics()` now uses an LRU cache, which effectively means that we use memoization to return derived metrics data rather than doing another pass over the log file if the same log path and module name have already been accumulated from; we still need to pass over a given log twice in most cases--once at initial read-in and once for using `log_get_derived_metrics`; how we decide to add filtering of records prior to accumulation in the Python interface is probably a deeper discussion for later - `log_get_bytes_bandwidth()` and its associated testing have been migrated to modules not named after "CFFI", as in the above PR, because I think we should only use the "CFFI" named modules for direct CFFI interaction/testing, and for other analyses we should probably use more distinct names. 
Also, to some extent everything depends on the CFFI layer, so trying to restrict "CFFI" modules to direct rather than indirect interaction will help keep them manageably sized, especially given the proclivity for surprising memory issues/segfaults in those parts of the code. - add a proper docstring with examples for `log_get_bytes_bandwidth()`
1 parent f1bac18 commit 96806fa

File tree

6 files changed

+150
-106
lines changed

6 files changed

+150
-106
lines changed

darshan-util/pydarshan/.gitignore

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,6 @@ dist/
2424
downloads/
2525
eggs/
2626
.eggs/
27-
lib/
2827
lib64/
2928
parts/
3029
sdist/

darshan-util/pydarshan/darshan/backend/cffi_backend.py

Lines changed: 3 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -633,7 +633,6 @@ def _log_get_heatmap_record(log):
633633
buf = ffi.new("void **")
634634
r = libdutil.darshan_log_get_record(log['handle'], modules[mod_name]['idx'], buf)
635635
if r < 1:
636-
libdutil.darshan_free(buf[0])
637636
return None
638637

639638
filerec = ffi.cast(mod_type, buf)
@@ -660,6 +659,7 @@ def _log_get_heatmap_record(log):
660659
return rec
661660

662661

662+
@functools.lru_cache()
663663
def log_get_derived_metrics(log_path: str, mod_name: str):
664664
"""
665665
Returns the darshan_derived_metrics struct from CFFI/C accumulator code.
@@ -716,24 +716,12 @@ def log_get_derived_metrics(log_path: str, mod_name: str):
716716
r = libdutil.darshan_accumulator_emit(darshan_accumulator[0],
717717
darshan_derived_metrics,
718718
rbuf[0])
719+
libdutil.darshan_free(buf[0])
719720
if r != 0:
720-
libdutil.darshan_free(buf[0])
721721
raise RuntimeError("A nonzero exit code was received from "
722722
"darshan_accumulator_emit() at the C level. "
723723
"It may be possible "
724724
"to retrieve additional information from the stderr "
725725
"stream.")
726-
libdutil.darshan_free(buf[0])
726+
log_close(log_handle)
727727
return darshan_derived_metrics
728-
729-
730-
def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
731-
# get total bytes (in MiB) and bandwidth (in MiB/s) for
732-
# a given module -- this information was commonly reported
733-
# in the old perl-based summary reports
734-
darshan_derived_metrics = log_get_derived_metrics(log_path=log_path,
735-
mod_name=mod_name)
736-
total_mib = darshan_derived_metrics.total_bytes / 2 ** 20
737-
total_bw = darshan_derived_metrics.agg_perf_by_slowest
738-
ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
739-
return ret_str

darshan-util/pydarshan/darshan/cli/summary.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717

1818
import darshan
1919
import darshan.cli
20-
from darshan.backend.cffi_backend import log_get_bytes_bandwidth
20+
from darshan.lib.accum import log_get_bytes_bandwidth
2121
from darshan.experimental.plots import (
2222
plot_dxt_heatmap,
2323
plot_io_cost,
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
from darshan.backend.cffi_backend import log_get_derived_metrics
2+
3+
4+
def log_get_bytes_bandwidth(log_path: str, mod_name: str) -> str:
5+
"""
6+
Summarize I/O performance for a given darshan module.
7+
8+
Parameters
9+
----------
10+
log_path : str
11+
Path to the darshan binary log file.
12+
mod_name : str
13+
Name of the darshan module to summarize the I/O
14+
performance for.
15+
16+
Returns
17+
-------
18+
out: str
19+
A short string summarizing the performance of the given module
20+
in the provided log file, including bandwidth and total data
21+
transferred.
22+
23+
Raises
24+
------
25+
RuntimeError
26+
When a provided module name is not supported for the accumulator
27+
interface for provision of the summary data, or for any other
28+
error that occurs in the C/CFFI interface.
29+
ValueError
30+
When a provided module name does not exist in the log file.
31+
32+
Examples
33+
--------
34+
35+
>>> from darshan.log_utils import get_log_path
36+
>>> from darshan.lib.accum import log_get_bytes_bandwidth
37+
38+
>>> log_path = get_log_path("imbalanced-io.darshan")
39+
>>> log_get_bytes_bandwidth(log_path, "POSIX")
40+
I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s
41+
42+
>>> log_get_bytes_bandwidth(log_path, "MPI-IO")
43+
I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s
44+
"""
45+
# get total bytes (in MiB) and bandwidth (in MiB/s) for
46+
# a given module -- this information was commonly reported
47+
# in the old perl-based summary reports
48+
darshan_derived_metrics = log_get_derived_metrics(log_path=log_path,
49+
mod_name=mod_name)
50+
total_mib = darshan_derived_metrics.total_bytes / 2 ** 20
51+
total_bw = darshan_derived_metrics.agg_perf_by_slowest
52+
ret_str = f"I/O performance estimate (at the {mod_name} layer): transferred {total_mib:.1f} MiB at {total_bw:.2f} MiB/s"
53+
return ret_str

darshan-util/pydarshan/darshan/tests/test_cffi_misc.py

Lines changed: 0 additions & 89 deletions
Original file line numberDiff line numberDiff line change
@@ -159,92 +159,3 @@ def test_log_get_generic_record(dtype):
159159
# make sure the returned key/column names agree
160160
assert actual_counter_names == expected_counter_names
161161
assert actual_fcounter_names == expected_fcounter_names
162-
163-
164-
@pytest.mark.parametrize("log_path, mod_name, expected_str", [
165-
# the expected bytes/bandwidth strings are pasted
166-
# directly from the old perl summary reports;
167-
# exceptions noted below
168-
# in some cases we defer to darshan-parser for the expected
169-
# values; see discussion in gh-839
170-
("imbalanced-io.darshan",
171-
"STDIO",
172-
"I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
173-
("imbalanced-io.darshan",
174-
"MPI-IO",
175-
"I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
176-
# imbalanced-io.darshan does have LUSTRE data,
177-
# but it doesn't support derived metrics at time
178-
# of writing
179-
("imbalanced-io.darshan",
180-
"LUSTRE",
181-
"RuntimeError"),
182-
("imbalanced-io.darshan",
183-
"POSIX",
184-
"I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
185-
("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
186-
"STDIO",
187-
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
188-
("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
189-
"POSIX",
190-
"I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
191-
("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
192-
"STDIO",
193-
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
194-
("e3sm_io_heatmap_only.darshan",
195-
"STDIO",
196-
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
197-
("e3sm_io_heatmap_only.darshan",
198-
"MPI-IO",
199-
"I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
200-
("partial_data_stdio.darshan",
201-
"MPI-IO",
202-
"I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
203-
("partial_data_stdio.darshan",
204-
"STDIO",
205-
"I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
206-
# the C derived metrics code can't distinguish
207-
# between different kinds of errors at this time,
208-
# but we can still intercept in some cases...
209-
("partial_data_stdio.darshan",
210-
"GARBAGE",
211-
"ValueError"),
212-
# TODO: determine if the lack of APMPI and
213-
# any other "add-ons" in _structdefs is a bug
214-
# in the control flow for `log_get_derived_metrics()`?
215-
("e3sm_io_heatmap_only.darshan",
216-
"APMPI",
217-
"KeyError"),
218-
("skew-app.darshan",
219-
"POSIX",
220-
"I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
221-
("skew-app.darshan",
222-
"MPI-IO",
223-
"I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
224-
])
225-
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
226-
# test the basic scenario of retrieving
227-
# the total data transferred and bandwidth
228-
# for all records in a given module; the situation
229-
# of accumulating derived metrics with filtering
230-
# (i.e., for a single filename) is not tested here
231-
232-
log_path = get_log_path(log_path)
233-
if expected_str == "RuntimeError":
234-
with pytest.raises(RuntimeError,
235-
match=f"{mod_name} module does not support derived"):
236-
backend.log_get_bytes_bandwidth(log_path=log_path,
237-
mod_name=mod_name)
238-
elif expected_str == "ValueError":
239-
with pytest.raises(ValueError,
240-
match=f"{mod_name} is not in the available log"):
241-
backend.log_get_bytes_bandwidth(log_path=log_path,
242-
mod_name=mod_name)
243-
elif expected_str == "KeyError":
244-
with pytest.raises(KeyError, match=f"{mod_name}"):
245-
backend.log_get_bytes_bandwidth(log_path=log_path,
246-
mod_name=mod_name)
247-
else:
248-
actual_str = backend.log_get_bytes_bandwidth(log_path=log_path,
249-
mod_name=mod_name)
250-
assert actual_str == expected_str
Lines changed: 93 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,93 @@
1+
from darshan.lib.accum import log_get_bytes_bandwidth
2+
from darshan.log_utils import get_log_path
3+
4+
import pytest
5+
6+
7+
@pytest.mark.parametrize("log_path, mod_name, expected_str", [
8+
# the expected bytes/bandwidth strings are pasted
9+
# directly from the old perl summary reports;
10+
# exceptions noted below
11+
# in some cases we defer to darshan-parser for the expected
12+
# values; see discussion in gh-839
13+
("imbalanced-io.darshan",
14+
"STDIO",
15+
"I/O performance estimate (at the STDIO layer): transferred 1.1 MiB at 0.01 MiB/s"),
16+
("imbalanced-io.darshan",
17+
"MPI-IO",
18+
"I/O performance estimate (at the MPI-IO layer): transferred 126326.8 MiB at 101.58 MiB/s"),
19+
# imbalanced-io.darshan does have LUSTRE data,
20+
# but it doesn't support derived metrics at time
21+
# of writing
22+
("imbalanced-io.darshan",
23+
"LUSTRE",
24+
"RuntimeError"),
25+
("imbalanced-io.darshan",
26+
"POSIX",
27+
"I/O performance estimate (at the POSIX layer): transferred 101785.8 MiB at 164.99 MiB/s"),
28+
("laytonjb_test1_id28730_6-7-43012-2131301613401632697_1.darshan",
29+
"STDIO",
30+
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 4.22 MiB/s"),
31+
("runtime_and_dxt_heatmaps_diagonal_write_only.darshan",
32+
"POSIX",
33+
"I/O performance estimate (at the POSIX layer): transferred 0.0 MiB at 0.02 MiB/s"),
34+
("treddy_mpi-io-test_id4373053_6-2-60198-9815401321915095332_1.darshan",
35+
"STDIO",
36+
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 16.47 MiB/s"),
37+
("e3sm_io_heatmap_only.darshan",
38+
"STDIO",
39+
"I/O performance estimate (at the STDIO layer): transferred 0.0 MiB at 3.26 MiB/s"),
40+
("e3sm_io_heatmap_only.darshan",
41+
"MPI-IO",
42+
"I/O performance estimate (at the MPI-IO layer): transferred 73880.2 MiB at 105.69 MiB/s"),
43+
("partial_data_stdio.darshan",
44+
"MPI-IO",
45+
"I/O performance estimate (at the MPI-IO layer): transferred 32.0 MiB at 2317.98 MiB/s"),
46+
("partial_data_stdio.darshan",
47+
"STDIO",
48+
"I/O performance estimate (at the STDIO layer): transferred 16336.0 MiB at 2999.14 MiB/s"),
49+
# the C derived metrics code can't distinguish
50+
# between different kinds of errors at this time,
51+
# but we can still intercept in some cases...
52+
("partial_data_stdio.darshan",
53+
"GARBAGE",
54+
"ValueError"),
55+
# TODO: determine if the lack of APMPI and
56+
# any other "add-ons" in _structdefs is a bug
57+
# in the control flow for `log_get_derived_metrics()`?
58+
("e3sm_io_heatmap_only.darshan",
59+
"APMPI",
60+
"KeyError"),
61+
("skew-app.darshan",
62+
"POSIX",
63+
"I/O performance estimate (at the POSIX layer): transferred 41615.8 MiB at 157.49 MiB/s"),
64+
("skew-app.darshan",
65+
"MPI-IO",
66+
"I/O performance estimate (at the MPI-IO layer): transferred 41615.8 MiB at 55.22 MiB/s"),
67+
])
68+
def test_derived_metrics_bytes_and_bandwidth(log_path, mod_name, expected_str):
69+
# test the basic scenario of retrieving
70+
# the total data transferred and bandwidth
71+
# for all records in a given module; the situation
72+
# of accumulating derived metrics with filtering
73+
# (i.e., for a single filename) is not tested here
74+
75+
log_path = get_log_path(log_path)
76+
if expected_str == "RuntimeError":
77+
with pytest.raises(RuntimeError,
78+
match=f"{mod_name} module does not support derived"):
79+
log_get_bytes_bandwidth(log_path=log_path,
80+
mod_name=mod_name)
81+
elif expected_str == "ValueError":
82+
with pytest.raises(ValueError,
83+
match=f"{mod_name} is not in the available log"):
84+
log_get_bytes_bandwidth(log_path=log_path,
85+
mod_name=mod_name)
86+
elif expected_str == "KeyError":
87+
with pytest.raises(KeyError, match=f"{mod_name}"):
88+
log_get_bytes_bandwidth(log_path=log_path,
89+
mod_name=mod_name)
90+
else:
91+
actual_str = log_get_bytes_bandwidth(log_path=log_path,
92+
mod_name=mod_name)
93+
assert actual_str == expected_str

0 commit comments

Comments
 (0)