From 01e405db24302f2bad936f6f942e70e378e413e8 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Fri, 7 Mar 2025 15:19:40 +0200 Subject: [PATCH 1/3] Initial commit --- xarray/backends/api.py | 4 ++-- xarray/core/chunk.py | 12 ++++++++++ xarray/tests/test_backends_datatree.py | 31 ++++++++++++++++++++++++++ 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/xarray/backends/api.py b/xarray/backends/api.py index 019c5d11ed0..03f3a35255e 100644 --- a/xarray/backends/api.py +++ b/xarray/backends/api.py @@ -35,7 +35,7 @@ from xarray.backends.locks import _get_scheduler from xarray.coders import CFDatetimeCoder, CFTimedeltaCoder from xarray.core import indexing -from xarray.core.chunk import _get_chunk, _maybe_chunk +from xarray.core.chunk import _get_chunk, _maybe_chunk, _maybe_get_path_chunk from xarray.core.combine import ( _infer_concat_order_from_positions, _nested_combine, @@ -450,7 +450,7 @@ def _datatree_from_backend_datatree( node.dataset, filename_or_obj, engine, - chunks, + _maybe_get_path_chunk(node.path, chunks), overwrite_encoded_chunks, inline_array, chunked_array_type, diff --git a/xarray/core/chunk.py b/xarray/core/chunk.py index e8ceba30e4e..d1c1d5f5cfb 100644 --- a/xarray/core/chunk.py +++ b/xarray/core/chunk.py @@ -145,3 +145,15 @@ def _maybe_chunk( return var else: return var + + +def _maybe_get_path_chunk(path: str, chunks: int | dict | Any) -> int | dict | Any: + """Returns path-specific chunks from a chunks dictionary, if path is a key of chunks. + Otherwise, returns chunks as is""" + if isinstance(chunks, dict): + try: + return chunks[path] + except KeyError: + pass + + return chunks diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index efc1e131722..5474803dcff 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -256,6 +256,37 @@ def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None: assert_chunks_equal(tree, original_tree, enforce_dask=True) + @requires_dask + def test_open_datatree_path_chunks(self, tmpdir, simple_datatree) -> None: + filepath = tmpdir / "test.nc" + + root_chunks = {"x": 2, "y": 1} + set1_chunks = {"x": 1, "y": 2} + set2_chunks = {"x": 2, "y": 3} + + root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}) + set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])}) + set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])}) + original_tree = DataTree.from_dict( + { + "/": root_data.chunk(root_chunks), + "/group1": set1_data.chunk(set1_chunks), + "/group2": set2_data.chunk(set2_chunks), + } + ) + original_tree.to_netcdf(filepath, engine="netcdf4") + + chunks = { + "/": root_chunks, + "/group1": set1_chunks, + "/group2": set2_chunks, + } + + with open_datatree(filepath, engine="netcdf4", chunks=chunks) as tree: + xr.testing.assert_identical(tree, original_tree) + + assert_chunks_equal(tree, original_tree, enforce_dask=True) + def test_open_groups(self, unaligned_datatree_nc) -> None: """Test `open_groups` with a netCDF4 file with an unaligned group hierarchy.""" unaligned_dict_of_datasets = open_groups(unaligned_datatree_nc) From 5379b4a552409b3fbdaa6b2c8a1502b74cb8133f Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Fri, 7 Mar 2025 15:28:38 +0200 Subject: [PATCH 2/3] Add Zarr test case --- xarray/tests/test_backends_datatree.py | 30 ++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/xarray/tests/test_backends_datatree.py b/xarray/tests/test_backends_datatree.py index 5474803dcff..664ecafda0a 100644 --- a/xarray/tests/test_backends_datatree.py +++ b/xarray/tests/test_backends_datatree.py @@ -580,6 +580,36 @@ def test_open_datatree_chunks(self, tmpdir, simple_datatree) -> None: # from each node. xr.testing.assert_identical(tree.compute(), original_tree) + def test_open_datatree_path_chunks(self, tmpdir, simple_datatree) -> None: + filepath = tmpdir / "test.zarr" + + root_chunks = {"x": 2, "y": 1} + set1_chunks = {"x": 1, "y": 2} + set2_chunks = {"x": 2, "y": 3} + + root_data = xr.Dataset({"a": ("y", [6, 7, 8]), "set0": ("x", [9, 10])}) + set1_data = xr.Dataset({"a": ("y", [-1, 0, 1]), "b": ("x", [-10, 6])}) + set2_data = xr.Dataset({"a": ("y", [1, 2, 3]), "b": ("x", [0.1, 0.2])}) + original_tree = DataTree.from_dict( + { + "/": root_data.chunk(root_chunks), + "/group1": set1_data.chunk(set1_chunks), + "/group2": set2_data.chunk(set2_chunks), + } + ) + original_tree.to_zarr(filepath) + + chunks = { + "/": root_chunks, + "/group1": set1_chunks, + "/group2": set2_chunks, + } + + with open_datatree(filepath, engine="zarr", chunks=chunks) as tree: + xr.testing.assert_identical(tree, original_tree) + assert_chunks_equal(tree, original_tree, enforce_dask=True) + xr.testing.assert_identical(tree.compute(), original_tree) + def test_open_groups(self, unaligned_datatree_zarr) -> None: """Test `open_groups` with a zarr store of an unaligned group hierarchy.""" From 4d5383b878fb4605d9ccf379278ba14849a84978 Mon Sep 17 00:00:00 2001 From: Simon Perkins Date: Fri, 7 Mar 2025 15:37:52 +0200 Subject: [PATCH 3/3] Expand the T_Chunks type definition to include a {"path": {"dim": size}} option --- xarray/core/types.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/xarray/core/types.py b/xarray/core/types.py index 186738ed718..99fa00f52b2 100644 --- a/xarray/core/types.py +++ b/xarray/core/types.py @@ -217,7 +217,9 @@ def copy( T_ChunkDimFreq: TypeAlias = Union["TimeResampler", T_ChunkDim] T_ChunksFreq: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDimFreq] # We allow the tuple form of this (though arguably we could transition to named dims only) -T_Chunks: TypeAlias = T_ChunkDim | Mapping[Any, T_ChunkDim] +T_Chunks: TypeAlias = ( + T_ChunkDim | Mapping[Any, T_ChunkDim] | Mapping[Any, Mapping[Any, T_ChunkDim]] +) T_NormalizedChunks = tuple[tuple[int, ...], ...] DataVars = Mapping[Any, Any]