Skip to content

Commit ad1d3e6

Browse files
authored
adds MagView.rechunk method (#1342)
* adds MagView.rechunk method * changelog * test * pr feedback
1 parent 76764b3 commit ad1d3e6

File tree

6 files changed

+250
-72
lines changed

6 files changed

+250
-72
lines changed

webknossos/Changelog.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,10 @@ For upgrade instructions, please check the respective _Breaking Changes_ section
1515
### Breaking Changes
1616

1717
### Added
18+
- Added the `MagView.rechunk` method to allow for rechunking of datasets. [#1342](https://github.com/scalableminds/webknossos-libs/pull/1342)
1819

1920
### Changed
21+
- Enforces that `chunk_shape` and `shard_shape` have power-of-two values. This assumption was used in the code previously, but not explicitly enforced. [#1342](https://github.com/scalableminds/webknossos-libs/pull/1342)
2022

2123
### Fixed
2224

webknossos/tests/dataset/test_dataset.py

Lines changed: 110 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ def for_each_chunking_advanced(ds: Dataset, view: View) -> None:
164164
.get_view(absolute_offset=offset, size=size)
165165
)
166166
chunk_data = chunk.read()
167-
assert np.array_equal(
167+
np.testing.assert_array_equal(
168168
np.ones(chunk_data.shape, dtype=np.dtype("uint8"))
169169
* (sum(chunk.bounding_box.topleft) % 256),
170170
chunk_data,
@@ -566,7 +566,7 @@ def test_view_write(data_format: DataFormat, output_path: UPath) -> None:
566566
wk_view.write(write_data, allow_unaligned=True)
567567

568568
data = wk_view.read(absolute_offset=(0, 0, 0), size=(10, 10, 10))
569-
assert np.array_equal(data, write_data)
569+
np.testing.assert_array_equal(data, write_data)
570570

571571

572572
@pytest.mark.parametrize("output_path", [TESTOUTPUT_DIR, REMOTE_TESTOUTPUT_DIR])
@@ -581,13 +581,13 @@ def test_direct_zarr_access(output_path: UPath, data_format: DataFormat) -> None
581581
write_data = (np.random.rand(3, 10, 10, 10) * 255).astype(np.uint8)
582582
mag.get_zarr_array()[:, 0:10, 0:10, 0:10].write(write_data).result()
583583
data = mag.read(absolute_offset=(0, 0, 0), size=(10, 10, 10))
584-
assert np.array_equal(data, write_data)
584+
np.testing.assert_array_equal(data, write_data)
585585

586586
# write: wk, read: zarr
587587
write_data = (np.random.rand(3, 10, 10, 10) * 255).astype(np.uint8)
588588
mag.write(write_data, absolute_offset=(0, 0, 0), allow_unaligned=True)
589589
data = mag.get_zarr_array()[:, 0:10, 0:10, 0:10].read().result()
590-
assert np.array_equal(data, write_data)
590+
np.testing.assert_array_equal(data, write_data)
591591

592592

593593
@pytest.mark.parametrize("data_format,output_path", DATA_FORMATS_AND_OUTPUT_PATHS)
@@ -852,7 +852,7 @@ def test_write_multi_channel_uint8(data_format: DataFormat, output_path: UPath)
852852

853853
mag.write(data, allow_resize=True)
854854

855-
assert np.array_equal(data, mag.read())
855+
np.testing.assert_array_equal(data, mag.read())
856856

857857
assure_exported_properties(ds)
858858

@@ -876,7 +876,7 @@ def test_wkw_write_multi_channel_uint16(
876876
mag.write(data, allow_resize=True)
877877
written_data = mag.read()
878878

879-
assert np.array_equal(data, written_data)
879+
np.testing.assert_array_equal(data, written_data)
880880

881881
assure_exported_properties(ds)
882882

@@ -993,7 +993,7 @@ def test_read_padded_data(data_format: DataFormat, output_path: UPath) -> None:
993993
data = mag.read(absolute_offset=(0, 0, 0), size=(10, 10, 10))
994994

995995
assert data.shape == (3, 10, 10, 10)
996-
assert np.array_equal(data, np.zeros((3, 10, 10, 10)))
996+
np.testing.assert_array_equal(data, np.zeros((3, 10, 10, 10)))
997997

998998

999999
@pytest.mark.parametrize("data_format,output_path", DATA_FORMATS_AND_OUTPUT_PATHS)
@@ -1272,7 +1272,7 @@ def test_chunking_wk(data_format: DataFormat, output_path: UPath) -> None:
12721272
chunk_shape=shard_shape,
12731273
executor=executor,
12741274
)
1275-
assert np.array_equal(original_data + 50, mag.get_view().read()[0])
1275+
np.testing.assert_array_equal(original_data + 50, mag.get_view().read()[0])
12761276

12771277
# Reset the data
12781278
mag.write(absolute_offset=(70, 80, 90), data=original_data, allow_resize=True)
@@ -1282,7 +1282,7 @@ def test_chunking_wk(data_format: DataFormat, output_path: UPath) -> None:
12821282
chunk_job,
12831283
chunk_shape=shard_shape,
12841284
)
1285-
assert np.array_equal(original_data + 50, mag.get_view().read()[0])
1285+
np.testing.assert_array_equal(original_data + 50, mag.get_view().read()[0])
12861286

12871287
assure_exported_properties(ds)
12881288

@@ -1410,7 +1410,7 @@ def test_changing_layer_bounding_box(
14101410
assert tuple(bbox_size) == (12, 12, 10)
14111411
less_data = mag.read(absolute_offset=(0, 0, 0), size=bbox_size)
14121412
assert less_data.shape == (3, 12, 12, 10)
1413-
assert np.array_equal(original_data[:, :12, :12, :10], less_data)
1413+
np.testing.assert_array_equal(original_data[:, :12, :12, :10], less_data)
14141414

14151415
layer.bounding_box = layer.bounding_box.with_size(
14161416
[36, 48, 60]
@@ -1420,7 +1420,7 @@ def test_changing_layer_bounding_box(
14201420
assert tuple(bbox_size) == (36, 48, 60)
14211421
more_data = mag.read(absolute_offset=(0, 0, 0), size=bbox_size)
14221422
assert more_data.shape == (3, 36, 48, 60)
1423-
assert np.array_equal(more_data[:, :24, :24, :24], original_data)
1423+
np.testing.assert_array_equal(more_data[:, :24, :24, :24], original_data)
14241424

14251425
assert tuple(ds.get_layer("color").bounding_box.topleft) == (0, 0, 0)
14261426

@@ -1432,12 +1432,12 @@ def test_changing_layer_bounding_box(
14321432
new_bbox_size = ds.get_layer("color").bounding_box.size
14331433
assert tuple(new_bbox_offset) == (10, 10, 0)
14341434
assert tuple(new_bbox_size) == (14, 14, 24)
1435-
assert np.array_equal(
1435+
np.testing.assert_array_equal(
14361436
original_data,
14371437
mag.read(absolute_offset=(0, 0, 0), size=mag.bounding_box.bottomright),
14381438
)
14391439

1440-
assert np.array_equal(
1440+
np.testing.assert_array_equal(
14411441
original_data[:, 10:, 10:, :],
14421442
mag.read(absolute_offset=(10, 10, 0), size=(14, 14, 24)),
14431443
)
@@ -1447,7 +1447,7 @@ def test_changing_layer_bounding_box(
14471447
layer.bounding_box = BoundingBox((0, 0, 0), new_bbox_size)
14481448
new_data = mag.read()
14491449
assert new_data.shape == (3, 14, 14, 24)
1450-
assert np.array_equal(original_data[:, :14, :14, :], new_data)
1450+
np.testing.assert_array_equal(original_data[:, :14, :14, :], new_data)
14511451

14521452
assure_exported_properties(ds)
14531453

@@ -1635,11 +1635,11 @@ def test_writing_subset_of_compressed_data_multi_channel(
16351635
view.write(relative_offset=(10, 20, 30), data=write_data2)
16361636
view.write(relative_offset=(10, 20, 30), data=write_data2, allow_unaligned=True)
16371637

1638-
assert np.array_equal(
1638+
np.testing.assert_array_equal(
16391639
write_data2,
16401640
compressed_mag.read(relative_offset=(60, 80, 100), size=(10, 10, 10)),
16411641
) # the new data was written
1642-
assert np.array_equal(
1642+
np.testing.assert_array_equal(
16431643
write_data1[:, :60, :80, :100],
16441644
compressed_mag.read(relative_offset=(0, 0, 0), size=(60, 80, 100)),
16451645
) # the old data is still there
@@ -1681,11 +1681,11 @@ def test_writing_subset_of_compressed_data_single_channel(
16811681
view.write(relative_offset=(10, 20, 30), data=write_data2)
16821682
view.write(relative_offset=(10, 20, 30), data=write_data2, allow_unaligned=True)
16831683

1684-
assert np.array_equal(
1684+
np.testing.assert_array_equal(
16851685
write_data2,
16861686
compressed_mag.read(absolute_offset=(60, 80, 100), size=(10, 10, 10))[0],
16871687
) # the new data was written
1688-
assert np.array_equal(
1688+
np.testing.assert_array_equal(
16891689
write_data1[:60, :80, :100],
16901690
compressed_mag.read(absolute_offset=(0, 0, 0), size=(60, 80, 100))[0],
16911691
) # the old data is still there
@@ -1861,7 +1861,7 @@ def test_add_layer_as_ref(data_format: DataFormat, output_path: UPath) -> None:
18611861
(np.random.rand(3, 10, 10, 10) * 255).astype(np.uint8), allow_unaligned=True
18621862
)
18631863

1864-
assert np.array_equal(
1864+
np.testing.assert_array_equal(
18651865
mag.read(absolute_offset=(0, 0, 0), size=(10, 10, 10)),
18661866
original_mag.read(absolute_offset=(0, 0, 0), size=(10, 10, 10)),
18671867
)
@@ -1963,7 +1963,7 @@ def test_add_mag_as_ref(data_format: DataFormat, output_path: UPath) -> None:
19631963
assert not layer.get_mag(1).read_only
19641964
assert ref_mag_2.read_only
19651965

1966-
assert np.array_equal(
1966+
np.testing.assert_array_equal(
19671967
ref_mag_2.read(absolute_offset=(0, 0, 0), size=(10, 10, 10))[0],
19681968
original_layer.get_mag(2).read(absolute_offset=(0, 0, 0), size=(10, 10, 10))[0],
19691969
)
@@ -2051,10 +2051,10 @@ def test_add_mag_as_copy(data_format: DataFormat, output_path: UPath) -> None:
20512051
allow_unaligned=True,
20522052
)
20532053

2054-
assert np.array_equal(
2054+
np.testing.assert_array_equal(
20552055
copy_mag.read(absolute_offset=(0, 0, 0), size=(5, 5, 5))[0], new_data
20562056
)
2057-
assert np.array_equal(original_mag.read()[0], original_data)
2057+
np.testing.assert_array_equal(original_mag.read()[0], original_data)
20582058

20592059
assure_exported_properties(original_ds)
20602060
assure_exported_properties(copy_ds)
@@ -2100,10 +2100,10 @@ def test_add_fs_copy_mag(data_format: DataFormat, output_path: UPath) -> None:
21002100
allow_unaligned=True,
21012101
)
21022102

2103-
assert np.array_equal(
2103+
np.testing.assert_array_equal(
21042104
copy_mag.read(absolute_offset=(0, 0, 0), size=(5, 5, 5))[0], new_data
21052105
)
2106-
assert np.array_equal(original_mag.read()[0], original_data)
2106+
np.testing.assert_array_equal(original_mag.read()[0], original_data)
21072107

21082108
assure_exported_properties(original_ds)
21092109
assure_exported_properties(copy_ds)
@@ -2127,7 +2127,7 @@ def test_search_dataset_also_for_long_layer_name(
21272127
write_data = (np.random.rand(10, 10, 10) * 255).astype(np.uint8)
21282128
mag.write(write_data, absolute_offset=(20, 20, 20), allow_resize=True)
21292129

2130-
assert np.array_equal(
2130+
np.testing.assert_array_equal(
21312131
mag.read(absolute_offset=(20, 20, 20), size=(20, 20, 20)),
21322132
np.expand_dims(write_data, 0),
21332133
)
@@ -2146,7 +2146,7 @@ def test_search_dataset_also_for_long_layer_name(
21462146
# when opening the dataset, it searches both for the long and the short path
21472147
layer = Dataset.open(ds_path).get_layer("color")
21482148
mag = layer.get_mag("2")
2149-
assert np.array_equal(
2149+
np.testing.assert_array_equal(
21502150
mag.read(absolute_offset=(20, 20, 20), size=(20, 20, 20)),
21512151
np.expand_dims(write_data, 0),
21522152
)
@@ -2318,7 +2318,7 @@ def test_dataset_conversion_wkw_only() -> None:
23182318
assert origin_info.compression_mode == converted_info.compression_mode
23192319
assert origin_info.chunk_shape == converted_info.chunk_shape
23202320
assert origin_info.data_format == converted_info.data_format
2321-
assert np.array_equal(
2321+
np.testing.assert_array_equal(
23222322
origin_ds.layers[layer_name].mags[mag].read(),
23232323
converted_ds.layers[layer_name].mags[mag].read(),
23242324
)
@@ -2412,7 +2412,7 @@ def test_for_zipped_chunks(data_format: DataFormat) -> None:
24122412
executor=executor,
24132413
)
24142414

2415-
assert np.array_equal(
2415+
np.testing.assert_array_equal(
24162416
source_view.read() + 50,
24172417
target_view.read(),
24182418
)
@@ -2470,6 +2470,25 @@ def test_for_zipped_chunks_invalid_target_chunk_shape_wk(
24702470
assure_exported_properties(ds)
24712471

24722472

2473+
@pytest.mark.parametrize("output_path", OUTPUT_PATHS)
2474+
def test_invalid_chunk_shard_shape(output_path: UPath) -> None:
2475+
ds_path = prepare_dataset_path(
2476+
DEFAULT_DATA_FORMAT, output_path, "invalid_chunk_shape"
2477+
)
2478+
ds = Dataset(ds_path, voxel_size=(1, 1, 1))
2479+
layer = ds.add_layer("color", COLOR_CATEGORY, data_format=DEFAULT_DATA_FORMAT)
2480+
2481+
with pytest.raises(ValueError, match=".*must be a power of two.*"):
2482+
layer.add_mag("1", chunk_shape=(3, 4, 4))
2483+
2484+
with pytest.raises(ValueError, match=".*must be a multiple.*"):
2485+
layer.add_mag("1", chunk_shape=(16, 16, 16), shard_shape=(8, 16, 16))
2486+
2487+
with pytest.raises(ValueError, match=".*must be a multiple.*"):
2488+
# also not a power-of-two shard shape
2489+
layer.add_mag("1", chunk_shape=(16, 16, 16), shard_shape=(53, 16, 16))
2490+
2491+
24732492
@pytest.mark.parametrize("data_format,output_path", DATA_FORMATS_AND_OUTPUT_PATHS)
24742493
def test_read_only_view(data_format: DataFormat, output_path: UPath) -> None:
24752494
ds_path = prepare_dataset_path(data_format, output_path, "read_only_view")
@@ -2590,12 +2609,69 @@ def test_compression(data_format: DataFormat, output_path: UPath) -> None:
25902609
)
25912610
mag1 = Dataset.open(compressed_dataset_path).get_layer("color").get_mag(1)
25922611
else:
2593-
mag1.compress()
2612+
with get_executor("sequential") as executor:
2613+
mag1.compress(executor=executor)
25942614

25952615
assert mag1._is_compressed()
25962616
assert mag1.info.data_format == data_format
25972617

2598-
assert np.array_equal(
2618+
np.testing.assert_array_equal(
2619+
write_data, mag1.read(absolute_offset=(60, 80, 100), size=(10, 20, 30))
2620+
)
2621+
2622+
# writing unaligned data to a compressed dataset works because the data gets padded, but it prints a warning
2623+
mag1.write(
2624+
(np.random.rand(3, 10, 20, 30) * 255).astype(np.uint8), allow_resize=True
2625+
)
2626+
2627+
assure_exported_properties(mag1.layer.dataset)
2628+
2629+
2630+
@pytest.mark.parametrize("data_format,output_path", DATA_FORMATS_AND_OUTPUT_PATHS)
2631+
def test_rechunking(data_format: DataFormat, output_path: UPath) -> None:
2632+
new_dataset_path = prepare_dataset_path(data_format, output_path)
2633+
ds = Dataset(new_dataset_path, voxel_size=(2, 2, 1))
2634+
mag1 = ds.add_layer(
2635+
"color", COLOR_CATEGORY, num_channels=3, data_format=data_format
2636+
).add_mag(
2637+
1,
2638+
compress=False,
2639+
chunk_shape=(16, 16, 16),
2640+
shard_shape=(16, 16, 16) if data_format == DataFormat.Zarr else (64, 64, 64),
2641+
)
2642+
2643+
# writing unaligned data to an uncompressed dataset
2644+
write_data = (np.random.rand(3, 10, 20, 30) * 255).astype(np.uint8)
2645+
mag1.write(write_data, absolute_offset=(60, 80, 100), allow_resize=True)
2646+
2647+
assert not mag1._is_compressed()
2648+
2649+
if output_path == REMOTE_TESTOUTPUT_DIR:
2650+
# Remote datasets require a `target_path` for rechunking
2651+
with pytest.raises(AssertionError):
2652+
mag1.rechunk()
2653+
2654+
compressed_dataset_path = (
2655+
REMOTE_TESTOUTPUT_DIR / f"simple_{data_format}_dataset_compressed"
2656+
)
2657+
with pytest.warns(UserWarning, match=".*can be slow.*"):
2658+
mag1.rechunk(
2659+
target_path=compressed_dataset_path,
2660+
)
2661+
mag1 = Dataset.open(compressed_dataset_path).get_layer("color").get_mag(1)
2662+
else:
2663+
with get_executor("sequential") as executor:
2664+
mag1.rechunk(executor=executor)
2665+
2666+
assert mag1.info.data_format == data_format
2667+
assert mag1._is_compressed()
2668+
assert mag1.info.chunk_shape == Vec3Int.full(32)
2669+
if data_format == DataFormat.Zarr:
2670+
assert mag1.info.shard_shape == Vec3Int.full(32)
2671+
else:
2672+
assert mag1.info.shard_shape == Vec3Int.full(1024)
2673+
2674+
np.testing.assert_array_equal(
25992675
write_data, mag1.read(absolute_offset=(60, 80, 100), size=(10, 20, 30))
26002676
)
26012677

@@ -2816,7 +2892,7 @@ def test_read_bbox() -> None:
28162892
allow_resize=True,
28172893
)
28182894

2819-
assert np.array_equal(
2895+
np.testing.assert_array_equal(
28202896
mag.read(absolute_offset=(20, 30, 40), size=(40, 50, 60)),
28212897
mag.read(
28222898
absolute_bounding_box=BoundingBox(topleft=(20, 30, 40), size=(40, 50, 60))
@@ -2863,7 +2939,7 @@ def test_add_layer_as_copy(data_format: DataFormat, output_path: UPath) -> None:
28632939
assert color_layer.mags.keys() == original_color_layer.mags.keys()
28642940
assert len(color_layer.mags.keys()) >= 1
28652941
for mag in color_layer.mags.keys():
2866-
assert np.array_equal(
2942+
np.testing.assert_array_equal(
28672943
color_layer.get_mag(mag).read(), original_color_layer.get_mag(mag).read()
28682944
)
28692945
# Test if the copied layer contains actual data
@@ -2912,7 +2988,7 @@ def test_rename_layer(data_format: DataFormat, output_path: UPath) -> None:
29122988
assert ds.get_layer("color2").data_format == data_format
29132989

29142990
# The "mag" object which was created before renaming the layer is still valid
2915-
assert np.array_equal(mag.read()[0], write_data)
2991+
np.testing.assert_array_equal(mag.read()[0], write_data)
29162992

29172993
assure_exported_properties(ds)
29182994

@@ -3107,7 +3183,7 @@ def test_pickle_view() -> None:
31073183

31083184
# Make sure that the pickled mag can still read data
31093185
assert pickled_mag1._cached_array is None
3110-
assert np.array_equal(
3186+
np.testing.assert_array_equal(
31113187
data_to_write,
31123188
pickled_mag1.read(relative_offset=(0, 0, 0), size=data_to_write.shape[-3:]),
31133189
)

webknossos/webknossos/annotation/annotation.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -187,7 +187,7 @@ class Annotation:
187187
# The following underscored attributes are just for initialization
188188
# in case the skeleton is not given. They are always None as attributes.
189189
_dataset_name: str | None = None
190-
_voxel_size: Vector3 | None = None
190+
_voxel_size: VoxelSize | Vector3 | None = None
191191
_organization_id: str | None = None
192192
_description: str | None = None
193193
owner_name: str | None = None

webknossos/webknossos/dataset/layer.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -596,6 +596,16 @@ def add_mag(
596596
+ f"performance in WEBKNOSSOS. Got {chunk_shape}."
597597
)
598598

599+
if not shard_shape._is_power_of_two():
600+
raise ValueError(
601+
f"The shard shape must be a power of two. Got {shard_shape}."
602+
)
603+
604+
if not chunk_shape._is_power_of_two():
605+
raise ValueError(
606+
f"The chunk shape must be a power of two. Got {chunk_shape}."
607+
)
608+
599609
self._assert_mag_does_not_exist_yet(mag)
600610
mag_path = self._create_dir_for_mag(mag)
601611

0 commit comments

Comments
 (0)