Skip to content

Commit d6d7692

Browse files
authored
NetCDF IO cleanup, including in-memory reads/writes with netCDF4 (#10624)
* Rewrite DataTree.to_netcdf and support netCDF4 in-memory This PR includes a handful of significant changes: 1. It refactors the internal structure of `DataTree.to_netcdf()` and `DataTree.to_zarr()` to use lower-level interfaces, rather than calling `Dataset` methods. This allows for properly supporting `compute=False` (and likely various other improvements). 2. Reading and writing in-memory data with netCDF4-python is now supported, including DataTree. 3. The `engine` argument in `DataTree.to_netcdf()` is now set consistently with `Dataset.to_netcdf()`, preferring `netcdf4` to `h5netcdf`. 4. Calling `Dataset.to_netcdf()` without a target now always returns a `memoryview` object, *including* in the case where `engine='scipy'` is used (which currently returns `bytes`). This is a breaking change, rather than merely issuing a warning as is done in #10571. I believe it probably makes sense to do this as a breaking change because (1) it offers significant performance benefits, (2) the default behavior without specifying an engine will already change (because `netcdf4` is preferred to the `scipy` backend) and (3) restoring previous behavior is easy (by wrapping the memoryview with `bytes()`).
1 parent 9a7e33d commit d6d7692

15 files changed

+680
-351
lines changed

doc/whats-new.rst

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,10 @@ v2025.09.1 (unreleased)
1313
New Features
1414
~~~~~~~~~~~~
1515

16+
- ``engine='netcdf4'`` now supports reading and writing in-memory netCDF files.
17+
All of Xarray's netCDF backends now support in-memory reads and writes
18+
(:pull:`10624`).
19+
By `Stephan Hoyer <https://github.com/shoyer>`_.
1620

1721
Breaking changes
1822
~~~~~~~~~~~~~~~~
@@ -22,13 +26,28 @@ Breaking changes
2226
dataset in-place. (:issue:`10167`)
2327
By `Maximilian Roos <https://github.com/max-sixty>`_.
2428

29+
- The default ``engine`` when reading/writing netCDF files in-memory is now
30+
netCDF4, consistent with Xarray's default ``engine`` when reading/writing netCDF
31+
files to disk (:pull:`10624`).
32+
By `Stephan Hoyer <https://github.com/shoyer>`_.
33+
2534
Deprecations
2635
~~~~~~~~~~~~
2736

2837

2938
Bug fixes
3039
~~~~~~~~~
3140

41+
- Xarray objects opened from file-like objects with ``engine='h5netcdf'`` can
42+
now be pickled, as long as the underlying file-like object also supports
43+
pickle (:issue:`10712`).
44+
By `Stephan Hoyer <https://github.com/shoyer>`_.
45+
46+
- Closing Xarray objects opened from file-like objects with ``engine='scipy'``
47+
no longer closes the underlying file, consistent with the h5netcdf backend
48+
(:pull:`10624`).
49+
By `Stephan Hoyer <https://github.com/shoyer>`_.
50+
3251
- Fix the ``align_chunks`` parameter on the :py:meth:`~xarray.Dataset.to_zarr` method, which was not being
3352
passed to the underlying :py:meth:`~xarray.backends.api` method (:issue:`10501`, :pull:`10516`).
3453
- Fix error when encoding an empty :py:class:`numpy.datetime64` array

xarray/backends/api.py

Lines changed: 31 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -99,7 +99,7 @@
9999

100100
def get_default_netcdf_write_engine(
101101
format: T_NetcdfTypes | None,
102-
to_fileobject_or_memoryview: bool,
102+
to_fileobject: bool,
103103
) -> Literal["netcdf4", "h5netcdf", "scipy"]:
104104
"""Return the default netCDF library to use for writing a netCDF file."""
105105
module_names = {
@@ -118,7 +118,7 @@ def get_default_netcdf_write_engine(
118118
else:
119119
raise ValueError(f"unexpected {format=}")
120120

121-
if to_fileobject_or_memoryview:
121+
if to_fileobject:
122122
candidates.remove("netcdf4")
123123

124124
for engine in candidates:
@@ -545,14 +545,12 @@ def open_dataset(
545545
cache: bool | None = None,
546546
decode_cf: bool | None = None,
547547
mask_and_scale: bool | Mapping[str, bool] | None = None,
548-
decode_times: bool
549-
| CFDatetimeCoder
550-
| Mapping[str, bool | CFDatetimeCoder]
551-
| None = None,
552-
decode_timedelta: bool
553-
| CFTimedeltaCoder
554-
| Mapping[str, bool | CFTimedeltaCoder]
555-
| None = None,
548+
decode_times: (
549+
bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None
550+
) = None,
551+
decode_timedelta: (
552+
bool | CFTimedeltaCoder | Mapping[str, bool | CFTimedeltaCoder] | None
553+
) = None,
556554
use_cftime: bool | Mapping[str, bool] | None = None,
557555
concat_characters: bool | Mapping[str, bool] | None = None,
558556
decode_coords: Literal["coordinates", "all"] | bool | None = None,
@@ -788,10 +786,9 @@ def open_dataarray(
788786
cache: bool | None = None,
789787
decode_cf: bool | None = None,
790788
mask_and_scale: bool | None = None,
791-
decode_times: bool
792-
| CFDatetimeCoder
793-
| Mapping[str, bool | CFDatetimeCoder]
794-
| None = None,
789+
decode_times: (
790+
bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None
791+
) = None,
795792
decode_timedelta: bool | CFTimedeltaCoder | None = None,
796793
use_cftime: bool | None = None,
797794
concat_characters: bool | None = None,
@@ -1018,14 +1015,12 @@ def open_datatree(
10181015
cache: bool | None = None,
10191016
decode_cf: bool | None = None,
10201017
mask_and_scale: bool | Mapping[str, bool] | None = None,
1021-
decode_times: bool
1022-
| CFDatetimeCoder
1023-
| Mapping[str, bool | CFDatetimeCoder]
1024-
| None = None,
1025-
decode_timedelta: bool
1026-
| CFTimedeltaCoder
1027-
| Mapping[str, bool | CFTimedeltaCoder]
1028-
| None = None,
1018+
decode_times: (
1019+
bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None
1020+
) = None,
1021+
decode_timedelta: (
1022+
bool | CFTimedeltaCoder | Mapping[str, bool | CFTimedeltaCoder] | None
1023+
) = None,
10291024
use_cftime: bool | Mapping[str, bool] | None = None,
10301025
concat_characters: bool | Mapping[str, bool] | None = None,
10311026
decode_coords: Literal["coordinates", "all"] | bool | None = None,
@@ -1260,14 +1255,12 @@ def open_groups(
12601255
cache: bool | None = None,
12611256
decode_cf: bool | None = None,
12621257
mask_and_scale: bool | Mapping[str, bool] | None = None,
1263-
decode_times: bool
1264-
| CFDatetimeCoder
1265-
| Mapping[str, bool | CFDatetimeCoder]
1266-
| None = None,
1267-
decode_timedelta: bool
1268-
| CFTimedeltaCoder
1269-
| Mapping[str, bool | CFTimedeltaCoder]
1270-
| None = None,
1258+
decode_times: (
1259+
bool | CFDatetimeCoder | Mapping[str, bool | CFDatetimeCoder] | None
1260+
) = None,
1261+
decode_timedelta: (
1262+
bool | CFTimedeltaCoder | Mapping[str, bool | CFTimedeltaCoder] | None
1263+
) = None,
12711264
use_cftime: bool | Mapping[str, bool] | None = None,
12721265
concat_characters: bool | Mapping[str, bool] | None = None,
12731266
decode_coords: Literal["coordinates", "all"] | bool | None = None,
@@ -1523,10 +1516,9 @@ def _remove_path(
15231516

15241517

15251518
def open_mfdataset(
1526-
paths: str
1527-
| os.PathLike
1528-
| ReadBuffer
1529-
| NestedSequence[str | os.PathLike | ReadBuffer],
1519+
paths: (
1520+
str | os.PathLike | ReadBuffer | NestedSequence[str | os.PathLike | ReadBuffer]
1521+
),
15301522
chunks: T_Chunks = None,
15311523
concat_dim: (
15321524
str
@@ -1540,10 +1532,9 @@ def open_mfdataset(
15401532
compat: CompatOptions | CombineKwargDefault = _COMPAT_DEFAULT,
15411533
preprocess: Callable[[Dataset], Dataset] | None = None,
15421534
engine: T_Engine = None,
1543-
data_vars: Literal["all", "minimal", "different"]
1544-
| None
1545-
| list[str]
1546-
| CombineKwargDefault = _DATA_VARS_DEFAULT,
1535+
data_vars: (
1536+
Literal["all", "minimal", "different"] | None | list[str] | CombineKwargDefault
1537+
) = _DATA_VARS_DEFAULT,
15471538
coords=_COORDS_DEFAULT,
15481539
combine: Literal["by_coords", "nested"] = "by_coords",
15491540
parallel: bool = False,
@@ -2068,8 +2059,8 @@ def to_netcdf(
20682059
path_or_file = _normalize_path(path_or_file)
20692060

20702061
if engine is None:
2071-
to_fileobject_or_memoryview = not isinstance(path_or_file, str)
2072-
engine = get_default_netcdf_write_engine(format, to_fileobject_or_memoryview)
2062+
to_fileobject = isinstance(path_or_file, IOBase)
2063+
engine = get_default_netcdf_write_engine(format, to_fileobject)
20732064

20742065
# validate Dataset keys, DataArray names, and attr keys/values
20752066
_validate_dataset_names(dataset)

0 commit comments

Comments
 (0)