diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5d6e313..ba618d5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -14,7 +14,7 @@ jobs: fail-fast: false matrix: os: [ubuntu-latest, windows-latest, macos-latest] - python-version: ["3.8", "3.9", "3.10", "3.11", "3.12"] + python-version: ["3.9", "3.10", "3.11", "3.12"] runs-on: ${{ matrix.os }} steps: @@ -25,11 +25,13 @@ jobs: uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} + - name: Install dependencies shell: bash run: | python -m pip install --upgrade pip python -m pip install flake8 pytest + - name: Lint with flake8 shell: bash run: | @@ -37,11 +39,52 @@ jobs: flake8 . --count --select=E9,F63,F7,F82 --show-source --statistics # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide flake8 . --count --ignore=F401,W503,E203 --max-complexity=99 --max-line-length=127 --statistics + - name: Install h5json shell: bash run: | pip install -e . + + - name: Checkout HSDS + uses: actions/checkout@v4 + with: + repository: HDFGroup/hsds + path: ${{github.workspace}}/hsds + + - name: Install HSDS + working-directory: ${{github.workspace}}/hsds + shell: bash + run: | + pip install -e . + + - name: Start HSDS + shell: bash + working-directory: ${{github.workspace}}/hsds + run: | + mkdir hsds_root + mkdir hsds_root/hsds_bucket + cp admin/config/groups.default admin/config/groups.txt + cp admin/config/passwd.default admin/config/passwd.txt + hsds --root_dir hsds_root --host localhost --port 5101 --password_file admin/config/passwd.txt --logfile hs.log --loglevel DEBUG --config_dir=admin/config --count=4 & + + - name: Wait for node startup + shell: bash + run: | + sleep 30 + + - name: HSDS Setup + shell: bash + env: + ADMIN_PASSWORD: admin + ADMIN_USERNAME: admin + working-directory: ${{github.workspace}}/hsds + run: | + python tests/integ/setup_test.py + - name: Run tests shell: bash + HS_ENDPOINT: http://localhost:5101 + HS_USERNAME: test_user1 + HS_PASSWORD: test run: | python testall.py diff --git a/data/hdf5/dset_creationprop.h5 b/data/hdf5/dset_creationprop.h5 index ff5b7a7..12b7a32 100644 Binary files a/data/hdf5/dset_creationprop.h5 and b/data/hdf5/dset_creationprop.h5 differ diff --git a/data/json/nullspace_dset.json b/data/json/nullspace_dset.json deleted file mode 100644 index 8808f21..0000000 --- a/data/json/nullspace_dset.json +++ /dev/null @@ -1,34 +0,0 @@ -{ - "apiVersion": "1.1.0", - "datasets": { - "23d3e919-7b53-11e4-961d-3c15c2da029e": { - "alias": [ - "/DS1" - ], - "shape": { - "class": "H5S_NULL" - }, - "type": { - "base": "H5T_STD_I32LE", - "class": "H5T_INTEGER" - }, - "value": null - } - }, - "groups": { - "23d2e06b-7b53-11e4-9910-3c15c2da029e": { - "alias": [ - "/" - ], - "links": [ - { - "class": "H5L_TYPE_HARD", - "collection": "datasets", - "id": "23d3e919-7b53-11e4-961d-3c15c2da029e", - "title": "DS1" - } - ] - } - }, - "root": "23d2e06b-7b53-11e4-9910-3c15c2da029e" -} diff --git a/pyproject.toml b/pyproject.toml index bcba820..879e7ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -9,7 +9,6 @@ classifiers = [ "Topic :: Software Development :: Build Tools", "License :: OSI Approved :: BSD License", "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.8", "Programming Language :: Python :: 3.9", "Programming Language :: Python :: 3.10", "Programming Language :: Python :: 3.11", @@ -17,19 +16,19 @@ classifiers = [ ] authors = [{ "name" = "The HDF Group", "email" = "help@hdfgroup.org" }] keywords = ["json", "hdf5", "multidimensional array", "data", "datacube"] -requires-python = ">=3.8" +requires-python = ">=3.9" dependencies = [ - "h5py >=3.10", + "h5py >= 3.10", "numpy >= 2.0; python_version>='3.9'", "jsonschema >=4.4.0", "tomli; python_version<'3.11'", - "numpy >=1.20,<2.0.0; python_version=='3.8'", ] + dynamic = ["version"] [project.urls] -Homepage = "https://hdf5-json.readthedocs.io" -Documentation = "https://hdf5-json.readthedocs.io" +Homepage = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" +Documentation = "https://support.hdfgroup.org/documentation/hdf5-json/latest/" Source = "https://github.com/HDFGroup/hdf5-json" "Bug Reports" = "https://github.com/HDFGroup/hdf5-json/issues" Social = "https://twitter.com/hdf5" @@ -52,6 +51,9 @@ build-backend = "setuptools.build_meta" package-dir = { "" = "src" } packages = [ "h5json", + "h5json.jsonstore", + "h5json.h5pystore", + "h5json.hsdsstore", "h5json.h5tojson", "h5json.jsontoh5", "h5json.schema", diff --git a/setup.cfg b/setup.cfg new file mode 100644 index 0000000..b2f3e82 --- /dev/null +++ b/setup.cfg @@ -0,0 +1,6 @@ +[flake8] +max-line-length = 120 +# E402: module level import not at top of file +# C901: too complex +# F401: unused exports are necessary in __init__.py +ignore = E402, C901, F401 diff --git a/src/h5json/__init__.py b/src/h5json/__init__.py index 704d241..d4a7f78 100644 --- a/src/h5json/__init__.py +++ b/src/h5json/__init__.py @@ -21,6 +21,14 @@ from .hdf5dtype import getTypeResponse from .hdf5dtype import getItemSize from .hdf5dtype import createDataType +from .objid import createObjId +from .objid import getCollectionForId +from .objid import isObjId +from .objid import isS3ObjKey +from .objid import getS3Key +from .objid import getObjId +from .objid import isSchema2Id +from .objid import isRootObjId from .hdf5db import Hdf5db from . import _version diff --git a/src/h5json/array_util.py b/src/h5json/array_util.py new file mode 100644 index 0000000..cb39cd5 --- /dev/null +++ b/src/h5json/array_util.py @@ -0,0 +1,713 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import math +import base64 +import binascii +import numpy as np + +from .hdf5dtype import isVlen + +MAX_VLEN_ELEMENT = 1_000_000 # restrict largest vlen element to one million + + +def bytesArrayToList(data): + """ + Convert list that may contain bytes type elements to list of string elements + + TBD: Need to deal with non-string byte data (hexencode?) + """ + if type(data) in (bytes, str): + is_list = False + elif isinstance(data, (np.ndarray, np.generic)): + if len(data.shape) == 0: + is_list = False + data = data.tolist() # tolist will return a scalar in this case + if type(data) in (list, tuple, np.ndarray): + is_list = True + else: + is_list = False + else: + is_list = True + elif type(data) in (list, tuple): + is_list = True + else: + is_list = False + if is_list: + out = [] + for item in data: + try: + rec_item = bytesArrayToList(item) # recursive call + out.append(rec_item) + except ValueError as err: + raise err + elif type(data) is bytes: + try: + out = data.decode("utf-8") + except UnicodeDecodeError as err: + raise ValueError(err) + else: + out = data + + return out + + +def toTuple(rank, data, encoding=None): + """ + Convert a list to a tuple, recursively. + Example. [[1,2],[3,4]] -> ((1,2),(3,4)) + """ + if type(data) in (list, tuple): + if rank > 0: + return list(toTuple(rank - 1, x) for x in data) + else: + return tuple(toTuple(rank - 1, x) for x in data) + else: + if encoding: + data = data.encode(encoding, "surrogateesacpe") + return data + + +def getArraySize(arr): + """ + Get size in bytes of a numpy array. + """ + nbytes = arr.dtype.itemsize + for n in arr.shape: + nbytes *= n + return nbytes + + +def getNumElements(dims): + """ + Get num elements defined by a shape + """ + num_elements = 0 + if isinstance(dims, int): + num_elements = dims + elif isinstance(dims, (list, tuple)): + num_elements = 1 + for dim in dims: + num_elements *= dim + else: + raise ValueError("Unexpected argument") + return num_elements + + +def jsonToArray(data_shape, data_dtype, data_json): + """ + Return numpy array from the given json array. + """ + + # print(f"jsonToArray - data_shape: {data_shape} dtype: {data_dtype} data: {data_json}") + + def get_array(data, rank, dtype): + # helper function to create an array with encoding if needed + try: + arr = np.array(data, dtype=dtype) + except UnicodeEncodeError: + # Unable to encode data, encode as utf8 with surrogate escaping + data = toTuple(rank, data, encoding="utf8") + arr = np.array(data, dtype=dtype) + return arr + + if data_json is None: + return np.array([]).astype(data_dtype) + + if isinstance(data_json, (list, tuple)): + if None in data_json: + return np.array([]).astype(data_dtype) + + # need some special conversion for compound types -- + # each element must be a tuple, but the JSON decoder + # gives us a list instead. + if len(data_dtype) > 0 and not isinstance(data_json, (list, tuple)): + raise TypeError("expected list data for compound data type") + npoints = getNumElements(data_shape) + np_shape_rank = len(data_shape) + + if type(data_json) in (list, tuple): + data_json = toTuple(np_shape_rank, data_json) + + if isVlen(data_dtype): + # for vlen data we need to initialize of zero numpy array to ensure the right shape + arr = np.zeros(data_shape, dtype=data_dtype) + arr[...] = data_json + else: + try: + arr = get_array(data_json, np_shape_rank, data_dtype) + except ValueError: + if npoints <= 1 and isinstance(data_json, list): + # try converting data to a tuple + arr = get_array(tuple(data_json), np_shape_rank, data_dtype) + else: + raise + + # raise an exception of the array shape doesn't match the selection shape + # allow if the array is a scalar and the selection shape is one element, + # numpy is ok with this + if arr.size != npoints: + msg = "Input data doesn't match selection number of elements" + msg += f" Expected {npoints}, but received: {arr.size}" + # try adding an extra dimension to data_json + # for cases where e.g. compound types are not getting interpreted correctly + data_json = toTuple(np_shape_rank, [data_json, ]) + arr = get_array(data_json, np_shape_rank, data_dtype) + if arr.size != npoints: + # still no good, raise error + raise ValueError(msg) + + if arr.shape != tuple(data_shape): + arr = arr.reshape(tuple(data_shape)) + + return arr + + +def getElementSize(e, dt): + """ + Get number of byte needed to given element as a bytestream + """ + # print(f"getElementSize - e: {e} dt: {dt} metadata: {dt.metadata}") + if len(dt) > 1: + count = 0 + for name in dt.names: + field_dt = dt[name] + field_val = e[name] + count += getElementSize(field_val, field_dt) + elif not dt.base.metadata or "vlen" not in dt.base.metadata: + count = dt.itemsize # fixed size element + else: + # variable length element + vlen = dt.base.metadata["vlen"] + if isinstance(e, int): + if e == 0: + count = 4 # non-initialized element + else: + raise ValueError(f"Unexpected value: {e}") + elif isinstance(e, bytes): + count = len(e) + 4 + elif isinstance(e, str): + count = len(e.encode("utf-8")) + 4 + elif isinstance(e, np.ndarray): + nElements = math.prod(e.shape) + if e.dtype.kind != "O": + count = e.dtype.itemsize * nElements + else: + arr1d = e.reshape((nElements,)) + count = 0 + for item in arr1d: + count += getElementSize(item, dt) + count += 4 # byte count + elif isinstance(e, list) or isinstance(e, tuple): + if not e: + # empty list, just add byte count + count = 4 + else: + # not sure how to deal with this + count = len(e) * vlen.itemsize + 4 # +4 for byte count + else: + raise TypeError("unexpected type: {}".format(type(e))) + # print("getElementSize returning:", count) + return count + + +def getByteArraySize(arr): + """ + Get number of bytes needed to store given numpy array as a bytestream + """ + if not isVlen(arr.dtype): + return arr.itemsize * math.prod(arr.shape) + nElements = math.prod(arr.shape) + # reshape to 1d for easier iteration + arr1d = arr.reshape((nElements,)) + dt = arr1d.dtype + count = 0 + for e in arr1d: + count += getElementSize(e, dt) + return count + + +def copyBuffer(src, des, offset): + """ + Copy to buffer at given offset + """ + # print(f"copyBuffer - src: {src} offset: {offset}") + # TBD: just do: des[offset:] = src[:] ? + for i in range(len(src)): + des[i + offset] = src[i] + + # print("returning:", offset + len(src)) + return offset + len(src) + + +def copyElement(e, dt, buffer, offset): + """ + Copy element to bytearray + """ + + # print(f"copyElement - dt: {dt} offset: {offset}") + if len(dt) > 1: + for name in dt.names: + field_dt = dt[name] + field_val = e[name] + offset = copyElement(field_val, field_dt, buffer, offset) + elif not dt.base.metadata or "vlen" not in dt.base.metadata: + # print(f"no vlen: {e} type: {type(e)} e.dtype: {e.dtype} itemsize: {dt.itemsize}") + e_buf = np.asarray(e, dtype=dt).tobytes() + if len(e_buf) < dt.itemsize: + # extend the buffer for fixed size strings + e_buf_ex = bytearray(dt.itemsize) + for i in range(len(e_buf)): + e_buf_ex[i] = e_buf[i] + e_buf = bytes(e_buf_ex) + + offset = copyBuffer(e_buf, buffer, offset) + else: + # variable length element + vlen = dt.base.metadata["vlen"] + if isinstance(e, int): + if e == 0: + # write 4-byte integer 0 to buffer + offset = copyBuffer(b"\x00\x00\x00\x00", buffer, offset) + else: + raise ValueError("Unexpected value: {}".format(e)) + elif isinstance(e, bytes): + count = np.int32(len(e)) + if count > MAX_VLEN_ELEMENT: + raise ValueError("vlen element too large") + offset = copyBuffer(count.tobytes(), buffer, offset) + offset = copyBuffer(e, buffer, offset) + elif isinstance(e, str): + text = e.encode("utf-8") + count = np.int32(len(text)) + if count > MAX_VLEN_ELEMENT: + raise ValueError("vlen element too large") + offset = copyBuffer(count.tobytes(), buffer, offset) + offset = copyBuffer(text, buffer, offset) + + elif isinstance(e, np.ndarray): + nElements = math.prod(e.shape) + + if e.dtype.kind != "O": + count = np.int32(e.dtype.itemsize * nElements) + if count > MAX_VLEN_ELEMENT: + raise ValueError("vlen element too large") + offset = copyBuffer(count.tobytes(), buffer, offset) + offset = copyBuffer(e.tobytes(), buffer, offset) + else: + arr1d = e.reshape((nElements,)) + for item in arr1d: + offset = copyElement(item, dt, buffer, offset) + + elif isinstance(e, list) or isinstance(e, tuple): + # print("cooyBuffer list/tuple vlen:", vlen, "e:", e) + count = np.int32(len(e) * vlen.itemsize) + offset = copyBuffer(count.tobytes(), buffer, offset) + if isinstance(e, np.ndarray): + arr = e + else: + arr = np.asarray(e, dtype=vlen) + offset = copyBuffer(arr.tobytes(), buffer, offset) + + else: + raise TypeError("unexpected type: {}".format(type(e))) + return offset + + +def getElementCount(buffer, offset=0): + """ + Get the count value from persisted vlen array + """ + + n = offset + m = offset + 4 + count_bytes = bytes(buffer[n:m]) + + try: + count = int(np.frombuffer(count_bytes, dtype=" MAX_VLEN_ELEMENT: + # expect variable length element to be between 0 and 1mb + raise ValueError("varlen element size expected to be less than 1MB") + return count + + +def readElement(buffer, offset, arr, index, dt): + """ + Read a single element from buffer into array. + + Parameters: + buffer (bytearray): Byte array to read an element from. + offset (int): Starting offset in the buffer. + arr (numpy.ndarray): Array to store the element. + index (int): Index in 'arr' at which to store the element. + dt (numpy.dtype): Numpy datatype of the element. + + Note: If the provided datatype is a variable-length sequence, + this function will read the byte count from the first 4 bytes + of the buffer, and then read the entire sequence. + + Returns: + int: The updated offset value after reading the element. + """ + # print("readElement, offset:", offset) + if len(dt) > 1: + e = arr[index] + for name in dt.names: + field_dt = dt[name] + offset = readElement(buffer, offset, e, name, field_dt) + elif not dt.base.metadata or "vlen" not in dt.base.metadata: + count = dt.itemsize + n = offset + m = offset + count + e_buffer = buffer[n:m] + offset += count + try: + e = np.frombuffer(bytes(e_buffer), dtype=dt) + arr[index] = e[0] + + except ValueError: + # print(f"ValueError setting {e_buffer} and dtype: {dt}") + raise + else: + # variable length element + vlenBaseType = dt.base.metadata["vlen"] + e = arr[index] + + if isinstance(e, np.ndarray): + nelements = math.prod(dt.shape) + e.reshape((nelements,)) + for i in range(nelements): + offset = readElement(buffer, offset, e, i, dt) + e.reshape(dt.shape) + else: + # total number of bytes in the vlen sequence/variable-length string + count = getElementCount(buffer, offset=offset) + offset += 4 + n = offset + m = offset + count + if count > 0: + e_buffer = buffer[n:m] + offset += count + + if vlenBaseType is bytes: + arr[index] = bytes(e_buffer) + elif vlenBaseType is str: + s = e_buffer.decode("utf-8") + arr[index] = s + else: + try: + e = np.frombuffer(bytes(e_buffer), dtype=vlenBaseType) + except ValueError: + msg = f"Failed to parse vlen data: {e_buffer} with dtype: {vlenBaseType}" + raise ValueError(msg) + arr[index] = e + return offset + + +def encodeData(data, encoding="base64"): + """ Encode given data """ + if encoding != "base64": + raise ValueError("only base64 encoding is supported") + try: + if isinstance(data, str): + data = data.encode("utf8") + except UnicodeEncodeError: + raise ValueError("can not encode string value") + if not isinstance(data, bytes): + msg = "Expected str or bytes type to encodeData, " + msg += f"but got: {type(data)}" + raise TypeError(msg) + try: + encoded_data = base64.b64encode(data) + except Exception as e: + # TBD: what exceptions can be raised? + raise ValueError(f"Unable to encode: {e}") + return encoded_data + + +def decodeData(data, encoding="base64"): + if encoding != "base64": + raise ValueError("only base64 decoding is supported") + try: + decoded_data = base64.b64decode(data) + except Exception as e: + # TBD: catch actual exception + raise ValueError(f"Unable to decode: {e}") + return decoded_data + + +def arrayToBytes(arr, encoding=None): + """ + Return byte representation of numpy array + """ + + if isVlen(arr.dtype): + nSize = getByteArraySize(arr) + buffer = bytearray(nSize) + offset = 0 + nElements = math.prod(arr.shape) + arr1d = arr.reshape((nElements,)) + for e in arr1d: + offset = copyElement(e, arr1d.dtype, buffer, offset) + data = bytes(buffer) + else: + # fixed length type + data = arr.tobytes() + + if encoding: + data = encodeData(data) + return data + + +def bytesToArray(data, dt, shape, encoding=None): + """ + Create numpy array based on byte representation + """ + if encoding: + # decode the data + # will raise ValueError if non-decodable + data = decodeData(data) + if not isVlen(dt): + # regular numpy from string + arr = np.frombuffer(data, dtype=dt) + else: + nElements = getNumElements(shape) + + arr = np.zeros((nElements,), dtype=dt) + offset = 0 + for index in range(nElements): + offset = readElement(data, offset, arr, index, dt) + if shape is not None: + arr = arr.reshape(shape) + # check that we can update the array if needed + # Note: this seems to have been required starting with numpuy v 1.17 + # Setting the flag directly is not recommended. + # cf: https://github.com/numpy/numpy/issues/9440 + + if not arr.flags["WRITEABLE"]: + arr_copy = arr.copy() + arr = arr_copy + + return arr + + +def getNumpyValue(value, dt=None, encoding=None): + """ + Return value as numpy type for given dtype and encoding + Encoding is expected to be one of None or "base64" + """ + # create a scalar numpy array + arr = np.zeros((), dtype=dt) + + if encoding and not isinstance(value, str): + msg = "Expected value to be string to use encoding" + raise ValueError(msg) + + if encoding == "base64": + try: + data = base64.decodebytes(value.encode("utf-8")) + except binascii.Error: + msg = "Unable to decode base64 string: {value}" + # log.warn(msg) + raise ValueError(msg) + arr = bytesToArray(data, dt, dt.shape) + else: + if isinstance(value, list): + # convert to tuple + value = tuple(value) + elif dt.kind == "f" and isinstance(value, str) and value == "nan": + value = np.nan + else: + # use as is + pass + arr = np.asarray(value, dtype=dt.base) + return arr[()] + + +def squeezeArray(data): + """ + Reduce dimensions by removing any 1-extent dimensions. + Just return input if no 1-extent dimensions + + Note: only works with ndarrays (for now at least) + """ + if not isinstance(data, np.ndarray): + raise TypeError("expected ndarray") + if len(data.shape) <= 1: + return data + can_reduce = True + for extent in data.shape: + if extent == 1: + can_reduce = True + break + if can_reduce: + data = data.squeeze() + return data + + +class IndexIterator(object): + """ + Class to iterate through list of chunks of a given dataset + """ + + def __init__(self, shape, sel=None): + self._shape = shape + self._rank = len(self._shape) + self._stop = False + + if self._rank < 1: + raise ValueError("IndexIterator can not be used on arrays of zero rank") + + if sel is None: + # select over entire dataset + slices = [] + for dim in range(self._rank): + slices.append(slice(0, self._shape[dim])) + self._sel = tuple(slices) + else: + if isinstance(sel, slice): + self._sel = (sel,) + else: + self._sel = sel + if len(self._sel) != self._rank: + raise ValueError("Invalid selection - selection region must have same rank as shape") + self._index = [] + for dim in range(self._rank): + s = self._sel[dim] + if s.start < 0 or s.stop > self._shape[dim] or s.stop <= s.start: + raise ValueError( + "Invalid selection - selection region must be within dataset space" + ) + self._index.append(s.start) + + def __iter__(self): + return self + + def __next__(self): + if self._stop: + raise StopIteration() + # bump up the last index and carry forward if we run outside the selection + dim = self._rank - 1 + ret_index = self._index.copy() + while True: + s = self._sel[dim] + if s.step: + step = s.step + else: + step = 1 + self._index[dim] += step + + if self._index[dim] < s.stop: + # we still have room to extend along this dimensions + break + + # reset to the start and continue iterating with higher dimension + self._index[dim] = s.start + dim -= 1 + if dim < 0: + # ran past last index, stop iteration on next run + self._stop = True + + return tuple(ret_index) + + +def ndarray_compare(arr1, arr2): + # compare two numpy arrays. + # return true if the same (exclusive of null vs. empty array) + # false otherwise + # TBD: this is slow for multi-megabyte vlen arrays, needs to be optimized + if not isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): + if not isinstance(arr1, np.void) and not isinstance(arr2, np.void): + return arr1 == arr2 + if isinstance(arr1, np.void) and not isinstance(arr2, np.void): + if arr1.size == 0 and not arr2: + return True + else: + return False + if not isinstance(arr1, np.void) and isinstance(arr2, np.void): + if not arr1 and arr2.size == 0: + return True + else: + return False + # both np.voids + if arr1.size != arr2.size: + return False + + if len(arr1) != len(arr2): + return False + + for i in range(len(arr1)): + if not ndarray_compare(arr1[i], arr2[i]): + return False + return True + + if isinstance(arr1, np.ndarray) and not isinstance(arr2, np.ndarray): + # same only if arr1 is empty and arr2 is 0 + if arr1.size == 0 and not arr2: + return True + else: + return False + if not isinstance(arr1, np.ndarray) and isinstance(arr2, np.ndarray): + # same only if arr1 is empty and arr2 size is 0 + if not arr1 and arr2.size == 0: + return True + else: + return False + + # two ndarrays... + if arr1.shape != arr2.shape: + return False + if arr2.dtype != arr2.dtype: + return False + + if isVlen(arr1.dtype): + # need to compare element by element + + nElements = np.prod(arr1.shape) + arr1 = arr1.reshape((nElements,)) + arr2 = arr2.reshape((nElements,)) + for i in range(nElements): + if not ndarray_compare(arr1[i], arr2[i]): + return False + return True + else: + # can just us np array_compare + return np.array_equal(arr1, arr2) + + +def getBroadcastShape(mshape, element_count): + # if element_count is less than the number of elements + # defined by mshape, return a numpy compatible broadcast + # shape that contains element_count elements. + # If non exists return None + + if np.prod(mshape) == element_count: + return None + + if element_count == 1: + # this always works + return [1,] + + bcshape = [] + rank = len(mshape) + for n in range(rank - 1): + bcshape.insert(0, mshape[rank - n - 1]) + if element_count == np.prod(bcshape): + return bcshape # have a match + + return None # no broadcast found diff --git a/src/h5json/config.py b/src/h5json/config.py new file mode 100755 index 0000000..b7602ff --- /dev/null +++ b/src/h5json/config.py @@ -0,0 +1,213 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import os +import json + + +class Config: + """ + User Config state + """ + _cfg = {} # global state + + def __init__(self, config_file=None, **kwargs): + if Config._cfg: + return # already initialized + if config_file: + self._config_file = config_file + elif os.path.isfile(".hscfg"): + self._config_file = ".hscfg" + else: + self._config_file = os.path.expanduser("~/.hscfg") + # process config file if found + if os.path.isfile(self._config_file): + line_number = 0 + with open(self._config_file) as f: + for line in f: + line_number += 1 + s = line.strip() + if not s: + continue + if s[0] == '#': + # comment line + continue + fields = s.split('=') + if len(fields) < 2: + print(f"config file: {self._config_file} line: {line_number} is not valid") + continue + k = fields[0].strip() + v = fields[1].strip() + if k == "complex_names": + self.complex_names = v + elif k == "bool_names": + self.bool_names = v + elif k == "track_order": + self.track_order = v + else: + Config._cfg[k] = v + + # add standard keys if not already picked up + for k in ("hs_endpoint", "hs_username", "hs_password", "hs_api_key"): + if k not in Config._cfg: + Config._cfg[k] = "" + + # override any config values with environment variable if found + for k in Config._cfg.keys(): + if k.upper() in os.environ: + Config._cfg[k] = os.environ[k.upper()] + + # update any values that are passed in to the constructor + for k in kwargs.keys(): + Config._cfg[k] = kwargs[k] + + # finally, set defaults for any expected keys that are not already set + for k in ("hs_endpoint", "hs_username", "hs_endpoint"): + if k not in Config._cfg: + Config._cfg[k] = None + if "bool_names" not in Config._cfg: + Config._cfg["bool_names"] = (b"FALSE", b"TRUE") + if "complex_names" not in Config._cfg: + Config._cfg["complex_names"] = ("r", "i") + if "track_order" not in Config._cfg: + Config._cfg["track_order"] = False + + def __getitem__(self, name): + """ Get a config item """ + if name not in Config._cfg: + if name.upper() in os.environ: + Config._cfg[name] = os.environ[name.upper()] + else: + return None + return Config._cfg[name] + + def get(self, name, default): + """ return config value for name or default if None """ + val = self.__getitem__(name) + if val is None: + return default + else: + return default + + def __setitem__(self, name, obj): + """ set config item """ + Config._cfg[name] = obj + + def __delitem__(self, name): + """ Delete option. """ + del Config._cfg[name] + + def __len__(self): + return len(Config._cfg) + + def __iter__(self): + """ Iterate over config names """ + keys = Config._cfg.keys() + for key in keys: + yield key + + def __contains__(self, name): + return name in Config._cfg + + def __repr__(self): + return json.dumps(Config._cfg) + + def keys(self): + return Config._cfg.keys() + + @property + def hs_endpoint(self): + return Config._cfg.get("hs_endpoint") + + @property + def hs_username(self): + return Config._cfg.get("hs_username") + + @property + def hs_password(self): + return Config._cfg.get("hs_password") + + @property + def hs_api_key(self): + return Config._cfg.get("hs_api_key") + + @property + def bool_names(self): + if "bool_names" in Config._cfg: + names = Config._cfg["bool_names"] + else: + names = (b"FALSE", b"TRUE") + return names + + @bool_names.setter + def bool_names(self, value): + if isinstance(value, str): + names = value.split(()) + if len(names) < 2: + raise ValueError("bool_names must have two items") + elif len(names) == 2: + pass + else: + names = names[:2] # just use the first two items + elif len(value) != 2: + raise ValueError("expected two-element list for bool_names") + else: + names = value + Config._cfg["bool_names"] = tuple(names) + + @property + def complex_names(self): + if "complex_names" in Config._cfg: + names = Config._cfg["complex_names"] + else: + names = ("r", "i") + return names + + @complex_names.setter + def complex_names(self, value): + if isinstance(value, str): + names = value.split() + if len(names) < 2: + raise ValueError("complex_names must have two items") + elif len(names) == 2: + pass + else: + names = names[:2] # just use the first two items + elif len(value) != 2: + raise ValueError("complex_names must have two values") + else: + names = value + + Config._cfg["complex_names"] = tuple(names) + + @property + def track_order(self): + if "track_order" in Config._cfg: + track = Config._cfg["track_order"] + else: + track = False + return track + + @track_order.setter + def track_order(self, value): + if isinstance(value, str): + tokens = value.split() + if len(tokens) == 0: + track = False + else: + track = bool(tokens[0]) # strip any comments + else: + track = bool(value) + Config._cfg["track_order"] = track + + +def get_config(config_file=None, **kwargs): + return Config(config_file=config_file, **kwargs) diff --git a/src/h5json/dset_util.py b/src/h5json/dset_util.py new file mode 100644 index 0000000..5b10323 --- /dev/null +++ b/src/h5json/dset_util.py @@ -0,0 +1,42 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import time + + +def resize_dataset(dset_json, shape): + shape_json = dset_json["shape"] + shape_class = shape_json["class"] + if shape_class != "H5S_SIMPLE": + raise TypeError(f"dataset with shape class: {shape_class} cannot be resized") + if len(shape_json["dims"]) != len(shape): + raise ValueError("Resize shape parameter doesn't match dataset's rank") + if "maxdims" not in shape_json: + raise ValueError("Dataset is not resizable") + dims = shape_json["dims"] + maxdims = shape_json["maxdims"] + + if shape_json["dims"] == list(shape): + # no change, just return + return + for i in range(len(dims)): + extent = shape[i] + if extent < 0: + raise ValueError("dimensions can't be negative") + if maxdims[i] == "H5S_UNLIMITED": + # any positive extent is ok + continue + if extent > maxdims[i]: + raise ValueError(f"extent for dimension {i} can't be larger than {maxdims[i]}") + + shape_json["dims"] = list(shape) + dset_json["modified"] = time.time() diff --git a/src/h5json/filters.py b/src/h5json/filters.py new file mode 100644 index 0000000..cda3817 --- /dev/null +++ b/src/h5json/filters.py @@ -0,0 +1,55 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import h5py + +_HDF_FILTERS = { + 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, + 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, + 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, + 4: { + "class": "H5Z_FILTER_SZIP", + "alias": "szip", + "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], + }, + 5: {"class": "H5Z_FILTER_NBIT"}, + 6: { + "class": "H5Z_FILTER_SCALEOFFSET", + "alias": "scaleoffset", + "options": ["scaleType", "scaleOffset"], + }, + 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, +} + +_HDF_FILTER_OPTION_ENUMS = { + "coding": { + h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", + h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", + }, + "scaleType": { + h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", + h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", + h5py.h5z.SO_INT: "H5Z_SO_INT", + }, +} + +# h5py supported filters +_H5PY_FILTERS = { + "gzip": 1, + "shuffle": 2, + "fletcher32": 3, + "szip": 4, + "scaleoffset": 6, + "lzf": 32000, +} + +_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") diff --git a/src/h5json/h5py_util.py b/src/h5json/h5py_util.py new file mode 100644 index 0000000..ebe2dbd --- /dev/null +++ b/src/h5json/h5py_util.py @@ -0,0 +1,109 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import h5py +import numpy as np + +from . import hdf5dtype + + +def is_reference(val): + """ Return True if the type or value is a Reference """ + + if isinstance(val, object) and val.__class__.__name__ == "Reference": + return True + elif isinstance(val, type) and val.__name__ == "Reference": + return True + else: + return False + + +def is_regionreference(val): + """ Return True if the type or value is a RegionReference """ + + if isinstance(val, object) and val.__class__.__name__ == "RegionReference": + return True + elif isinstance(val, type) and val.__name__ == "RegionReference": + return True + + return False + + +def has_reference(dtype): + """ return True if the dtype (or a sub-type) is a Reference type """ + has_ref = False + if not isinstance(dtype, np.dtype): + return False + if len(dtype) > 0: + for name in dtype.fields: + item = dtype.fields[name] + if has_reference(item[0]): + has_ref = True + break + elif dtype.metadata and "ref" in dtype.metadata: + basedt = dtype.metadata["ref"] + has_ref = is_reference(basedt) + elif dtype.metadata and "vlen" in dtype.metadata: + basedt = dtype.metadata["vlen"] + has_ref = has_reference(basedt) + return has_ref + + +def convert_dtype(srcdt, to_h5py=True): + """Return a dtype based on input dtype, converting any Reference types from + h5py style to h5json and vice-versa. + """ + + if len(srcdt) > 0: + fields = [] + for name in srcdt.fields: + item = srcdt.fields[name] + # item is a tuple of dtype and integer offset + field_dt = convert_dtype(item[0], to_h5py=to_h5py) + fields.append((name, field_dt)) + tgt_dt = np.dtype(fields) + else: + # check if this a "special dtype" + if srcdt.metadata and "ref" in srcdt.metadata: + ref = srcdt.metadata["ref"] + if is_reference(ref): + if to_h5py: + tgt_dt = h5py.special_dtype(ref=h5py.Reference) + else: + tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.Reference) + elif is_regionreference(ref): + if to_h5py: + tgt_dt = h5py.special_dtype(ref=h5py.RegionReference) + else: + tgt_dt = hdf5dtype.special_dtype(ref=hdf5dtype.RegionReference) + else: + msg = f"Unexpected ref type: {srcdt}" + raise TypeError(msg) + elif srcdt.metadata and "vlen" in srcdt.metadata: + src_vlen = srcdt.metadata["vlen"] + if isinstance(src_vlen, np.dtype): + tgt_base = convert_dtype(src_vlen, to_h5py=to_h5py) + else: + tgt_base = src_vlen + if to_h5py: + tgt_dt = h5py.special_dtype(vlen=tgt_base) + else: + tgt_dt = hdf5dtype.special_dtype(vlen=tgt_base) + elif srcdt.kind == "U": + # use vlen for unicode strings + if to_h5py: + tgt_dt = h5py.special_dtype(vlen=str) + else: + tgt_dt = hdf5dtype.special_dtype(vlen=str) + else: + tgt_dt = srcdt + return tgt_dt diff --git a/src/h5json/h5pystore/__init__.py b/src/h5json/h5pystore/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/h5json/h5pystore/h5py_reader.py b/src/h5json/h5pystore/h5py_reader.py new file mode 100644 index 0000000..bc4b582 --- /dev/null +++ b/src/h5json/h5pystore/h5py_reader.py @@ -0,0 +1,516 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import h5py +import numpy as np +import logging + +from ..objid import createObjId, getCollectionForId +from ..hdf5dtype import getTypeItem, isOpaqueDtype +from ..array_util import bytesArrayToList +from .. import selections +from .. import filters + +from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype +from ..h5reader import H5Reader + + +class H5pyReader(H5Reader): + """ + This class can be used by HDF5DB to read content from an HDF5 file (using h5py) + """ + + def _copy_element(self, val, src_dt, tgt_dt, fin=None): + """ convert the given dataset or attribute element from h5py to h5json equivalent """ + + out = None + if len(src_dt) > 0: + out_fields = [] + i = 0 + for name in src_dt.fields: + field_src_dt = src_dt.fields[name][0] + field_tgt_dt = tgt_dt.fields[name][0] + field_val = val[i] + i += 1 + out_field = self._copy_element(field_val, field_src_dt, field_tgt_dt, fin=fin) + out_fields.append(out_field) + out = tuple(out_fields) + elif src_dt.metadata and "ref" in src_dt.metadata: + if not tgt_dt.metadata or "ref" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be ref, but got: {tgt_dt}") + ref = tgt_dt.metadata["ref"] + if is_reference(ref): + # initialize out to null ref + out = h5py.Reference() # null h5py ref + + if ref and val: + try: + fin_obj = fin[val] + except AttributeError as ae: + msg = f"Unable able to get obj for ref value: {ae}" + self.log.error(msg) + raise ValueError(msg) + + addr = h5py.h5o.get_info(fin_obj.id).addr + if addr not in self._addr_map: + msg = f"No object found for ref object: {fin_obj.name}" + self.log.warning(msg) + out = "" + else: + obj_id = self._addr_map[addr] + collection = getCollectionForId(obj_id) + out = f"{collection}/{obj_id}" + + elif is_regionreference(ref): + self.log.warning("region reference not supported") + # TBD: just return a null region reference till we have support + out = "" + else: + raise TypeError(f"Unexpected ref type: {type(ref)}") + elif src_dt.metadata and "vlen" in src_dt.metadata: + if not isinstance(val, np.ndarray): + raise TypeError(f"Expecting ndarray or vlen element, but got: {type(val)}") + if not tgt_dt.metadata or "vlen" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be vlen, but got: {tgt_dt}") + src_vlen_dt = src_dt.metadata["vlen"] + tgt_vlen_dt = tgt_dt.metadata["vlen"] + if has_reference(src_vlen_dt): + if len(val.shape) == 0: + # scalar array + e = val[()] + v = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fin=fin) + out = np.array(v, dtype=tgt_dt) + else: + out = np.zeros(val.shape, dtype=tgt_dt) + for i in range(len(out)): + e = val[i] + out[i] = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fin=fin) + else: + # can just directly copy the array + out = np.zeros(val.shape, dtype=tgt_dt) + out[...] = val[...] + else: + out = val # can just copy as is + return out + + def _copy_array(self, src_arr, fin=None): + """Copy the numpy array to a new array. + Convert any reference type to point to item in the target's hierarchy. + """ + + if not isinstance(src_arr, np.ndarray): + raise TypeError(f"Expecting ndarray, but got: {src_arr}") + tgt_dt = convert_dtype(src_arr.dtype, to_h5py=False) + tgt_arr = np.zeros(src_arr.shape, dtype=tgt_dt) + + if has_reference(src_arr.dtype): + # flatten array to simplify iteration + count = int(np.prod(src_arr.shape)) + tgt_arr_flat = tgt_arr.reshape((count,)) + src_arr_flat = src_arr.reshape((count,)) + for i in range(count): + e = src_arr_flat[i] + element = self._copy_element(e, src_arr.dtype, tgt_dt, fin=fin) + tgt_arr_flat[i] = element + tgt_arr = tgt_arr_flat.reshape(src_arr.shape) + else: + # can just copy the entire array + tgt_arr[...] = src_arr[...] + return tgt_arr + + def visit(self, path, obj): + name = obj.__class__.__name__ + self.log.info(f"visit: {path} name: {name}") + + obj_id = createObjId(obj_type=name, root_id=self._root_id) # create uuid + + self._id_map[obj_id] = obj + + addr = h5py.h5o.get_info(obj.id).addr + self._addr_map[addr] = obj_id + + def __init__( + self, + filepath, + app_logger=None + ): + self._id_map = {} + self._addr_map = {} + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + if not h5py.is_hdf5(filepath): + self.log.warn(f"File: {filepath} is not an HDF5 file") + raise IOError("not an HDF5 file") + super().__init__(filepath, app_logger=app_logger) + self._f = None + self._root_id = None + + def open(self): + if self._f: + return # already open + if self._id_map: + return # objects already loaded + if not self._root_id: + # get the root id from db if available + if self.db.root_id: + self.log.info("H5pyReader: got root_id from db") + self._root_id = self.db.root_id + else: + self.log.info("H5pyReader: creating root id") + self._root_id = createObjId(obj_type="groups") + + f = h5py.File(self.filepath) + self._f = f + self._id_map[self._root_id] = f + addr = h5py.h5o.get_info(f.id).addr + self._addr_map[addr] = self._root_id + f.visititems(self.visit) + + return self._root_id + + def close(self): + if self._f: + self._f.close() + self._f = None + + def isClosed(self): + return False if self._f else True + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getObjIdByAddress(self, addr): + if addr in self._addr_map: + return self._addr_map[addr] + else: + return None + + def getAttribute(self, obj_id, name, include_data=True): + """ Return JSON for the given attribute """ + + obj = self._id_map[obj_id] + + if name not in obj.attrs: + msg = f"Attribute: [{name}] not found in object: {obj.name}" + self.log.info(msg) + return None + + # get the attribute! + attrObj = h5py.h5a.open(obj.id, np.bytes_(name)) + + item = {} + + # check if the dataset is using a committed type + typeid = attrObj.get_type() + type_item = None + if h5py.h5t.TypeID.committed(typeid): + type_uuid = None + addr = h5py.h5o.get_info(typeid).addr + type_uuid = self.getObjIdByAddress(addr) + committedType = self._id_map[type_uuid] + type_item = getTypeItem(committedType.dtype) + type_item["id"] = type_uuid + else: + type_item = getTypeItem(attrObj.dtype) + item["type"] = type_item + + shape_item = {} + if attrObj.shape is None or attrObj.get_storage_size() == 0: + # If storage size is 0, assume this is a null space obj + # See: h5py issue https://github.com/h5py/h5py/issues/279 + shape_item["class"] = "H5S_NULL" + else: + if attrObj.shape: + shape_item["class"] = "H5S_SIMPLE" + shape_item["dims"] = attrObj.shape + else: + shape_item["class"] = "H5S_SCALAR" + + item["shape"] = shape_item + if shape_item["class"] == "H5S_NULL": + include_data = False + elif isinstance(type_item, dict) and type_item["class"] == "H5T_OPAQUE": + # TBD - don't include data for OPAQUE until JSON serialization + # issues are addressed + include_data = False + else: + pass # use include_data parameter + + if include_data: + try: + data = obj.attrs[name] + # convert from h5py to h5json + data = self._copy_array(data, fin=obj.file) + except TypeError: + self.log.warning("type error reading attribute") + + if include_data and data is not None: + value = bytesArrayToList(data) + item["value"] = value + else: + pass # no data + + # timestamps will be added by getAttributeItem() + return item + + def getAttributes(self, obj_id, include_data=True): + h5obj = self._id_map[obj_id] + self.log.info(f"getAttributes: {obj_id} include_data={include_data}") + items = {} # with python 3.7+, this will maintain the attribute order we got from h5py + attrs = h5obj.attrs + for name in attrs: + item = self.getAttribute(obj_id, name, include_data=include_data) + items[name] = item + + return items + + def _getLink(self, parent, link_name): + if link_name not in parent: + return None + + item = {"title": link_name} + # get the link object, one of HardLink, SoftLink, or ExternalLink + try: + linkObj = parent.get(link_name, None, False, True) + linkClass = linkObj.__class__.__name__ + except TypeError: + # UDLink? set class as 'user' + linkClass = "UDLink" # user defined links + item["class"] = "H5L_TYPE_USER_DEFINED" + if linkClass == "SoftLink": + item["class"] = "H5L_TYPE_SOFT" + item["h5path"] = linkObj.path + elif linkClass == "ExternalLink": + item["class"] = "H5L_TYPE_EXTERNAL" + item["h5path"] = linkObj.path + item["file"] = linkObj.filename + elif linkClass == "HardLink": + # Hardlink doesn't have any properties itself, just get the linked + # object + obj = parent[link_name] + addr = h5py.h5o.get_info(obj.id).addr + item["class"] = "H5L_TYPE_HARD" + if addr not in self._addr_map: + self.log.error(f"expected to find addr for link {link_name} in addr_map") + item["id"] = None + else: + item["id"] = self._addr_map[addr] + + return item + + def _getLinks(self, grp): + items = {} # with python 3.7+, this will maintain the link order we got from h5py + for link_name in grp: + item = self._getLink(grp, link_name) + items[link_name] = item + return items + + def _getGroup(self, grp, include_links=True): + self.log.info(f"_getGroup alias: [{grp.name}]") + + item = {"alias": grp.name} + + if include_links: + links = self._getLinks(grp) + item["links"] = links + return item + + def _getDatatype(self, ctype, include_attrs=True): + self.log.info(f"getDatatype alias: ]{ctype.name}") + item = {"alias": ctype.name} + item["type"] = getTypeItem(ctype.dtype) + + return item + + def _getHDF5DatasetCreationProperties(self, dset, type_class): + """ Get dataset creation properties maintained by HDF5 library """ + + # + # Fill in creation properties + # + creationProps = {} + plist = h5py.h5d.DatasetID.get_create_plist(dset.id) + + # alloc time + nAllocTime = plist.get_alloc_time() + if nAllocTime == h5py.h5d.ALLOC_TIME_DEFAULT: + creationProps["allocTime"] = "H5D_ALLOC_TIME_DEFAULT" + elif nAllocTime == h5py.h5d.ALLOC_TIME_LATE: + creationProps["allocTime"] = "H5D_ALLOC_TIME_LATE" + elif nAllocTime == h5py.h5d.ALLOC_TIME_EARLY: + creationProps["allocTime"] = "H5D_ALLOC_TIME_EARLY" + elif nAllocTime == h5py.h5d.ALLOC_TIME_INCR: + creationProps["allocTime"] = "H5D_ALLOC_TIME_INCR" + else: + self.log.warning(f"Unknown alloc time value: {nAllocTime}") + + # fill time + nFillTime = plist.get_fill_time() + if nFillTime == h5py.h5d.FILL_TIME_ALLOC: + creationProps["fillTime"] = "H5D_FILL_TIME_ALLOC" + elif nFillTime == h5py.h5d.FILL_TIME_NEVER: + creationProps["fillTime"] = "H5D_FILL_TIME_NEVER" + elif nFillTime == h5py.h5d.FILL_TIME_IFSET: + creationProps["fillTime"] = "H5D_FILL_TIME_IFSET" + else: + self.log.warning(f"unknown fill time value: {nFillTime}") + + if type_class == "H5T_OPAQUE": + # TBD: store opaque fill value as a hex string + self.log.warning("Opaque fill value not supported") + else: + if plist.fill_value_defined() == h5py.h5d.FILL_VALUE_USER_DEFINED: + creationProps["fillValue"] = bytesArrayToList(dset.fillvalue) + + # layout + nLayout = plist.get_layout() + if nLayout == h5py.h5d.COMPACT: + creationProps["layout"] = {"class": "H5D_COMPACT"} + elif nLayout == h5py.h5d.CONTIGUOUS: + creationProps["layout"] = {"class": "H5D_CONTIGUOUS"} + elif nLayout == h5py.h5d.CHUNKED: + creationProps["layout"] = {"class": "H5D_CHUNKED", "dims": dset.chunks} + else: + self.log.warning(f"Unknown layout value: {nLayout}") + + num_filters = plist.get_nfilters() + filter_props = [] + if num_filters: + for n in range(num_filters): + filter_info = plist.get_filter(n) + opt_values = filter_info[2] + filter_prop = {} + filter_id = filter_info[0] + filter_prop["id"] = filter_id + if filter_info[3]: + filter_prop["name"] = bytesArrayToList(filter_info[3]) + if filter_id in filters._HDF_FILTERS: + hdf_filter = filters._HDF_FILTERS[filter_id] + filter_prop["class"] = hdf_filter["class"] + if "options" in hdf_filter: + filter_opts = hdf_filter["options"] + for i in range(len(filter_opts)): + if len(opt_values) <= i: + break # end of option values + opt_value = opt_values[i] + opt_value_enum = None + option_name = filter_opts[i] + if option_name in filters._HDF_FILTER_OPTION_ENUMS: + option_enums = filters._HDF_FILTER_OPTION_ENUMS[option_name] + if opt_value in option_enums: + opt_value_enum = option_enums[opt_value] + if opt_value_enum: + filter_prop[option_name] = opt_value_enum + else: + filter_prop[option_name] = opt_value + else: + # custom filter + filter_prop["class"] = "H5Z_FILTER_USER" + if opt_values: + filter_prop["parameters"] = opt_values + filter_props.append(filter_prop) + creationProps["filters"] = filter_props + + return creationProps + + def _getDataset(self, dset): + self.log.info(f"getDataset alias: [{dset.name}]") + + item = {"alias": dset.name} + + typeid = dset.id.get_type() + if h5py.h5t.TypeID.committed(typeid): + type_uuid = None + addr = h5py.h5o.get_info(typeid).addr + type_uuid = self.getObjIdByAddress(addr) + committedType = self.getObjectById(type_uuid) + type_item = committedType["type"] + type_item["id"] = type_uuid + else: + type_item = getTypeItem(dset.dtype) + item["type"] = type_item + + shape_item = {} + if dset.shape is None: + # new with h5py 2.6, null space datasets will return None for shape + shape_item["class"] = "H5S_NULL" + elif len(dset.shape) == 0: + shape_item["class"] = "H5S_SCALAR" + else: + shape_item["class"] = "H5S_SIMPLE" + shape_item["dims"] = list(dset.shape) + maxshape = [] + include_maxdims = False + for i in range(len(dset.shape)): + extent = 0 + if len(dset.maxshape) > i: + extent = dset.maxshape[i] + if extent is None: + extent = 0 + if extent > dset.shape[i] or extent == 0: + include_maxdims = True + maxshape.append(extent) + if include_maxdims: + shape_item["maxdims"] = maxshape + item["shape"] = shape_item + + item["cpl"] = self._getHDF5DatasetCreationProperties(dset, type_item["class"]) + + return item + + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + if obj_id not in self._id_map: + raise KeyError(f"{obj_id} not found") + h5obj = self._id_map[obj_id] + if isinstance(h5obj, h5py.Group): + obj_json = self._getGroup(h5obj, include_links=include_links) + elif isinstance(h5obj, h5py.Dataset): + obj_json = self._getDataset(h5obj) + elif isinstance(h5obj, h5py.Datatype): + obj_json = self._getDatatype(h5obj) + else: + raise TypeError(f"unexpected object type: {type(h5obj)}") + + if include_attrs: + attributes = self.getAttributes(obj_id) + obj_json["attributes"] = attributes + + return obj_json + + def getDatasetValues(self, dset_id, sel=None, dtype=None): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + + dset = self._id_map[dset_id] + self.log.info(f"getDatasetValues: {dset_id}") + if dset.shape is None: + # TBD: return something like h5py.Empty in this case? + return None + if isOpaqueDtype(dset.dtype): + # TBD: Opaque data not supported yet + return None + if sel is None or sel.select_type == selections.H5S_SELECT_ALL: + arr = dset[...] + elif isinstance(sel, selections.SimpleSelection): + arr = dset[sel.slices] + else: + raise NotImplementedError("selection type not supported") + + # convert any h5py references to h5json references + arr = self._copy_array(arr, fin=dset.file) + return arr diff --git a/src/h5json/h5pystore/h5py_writer.py b/src/h5json/h5pystore/h5py_writer.py new file mode 100644 index 0000000..14942c1 --- /dev/null +++ b/src/h5json/h5pystore/h5py_writer.py @@ -0,0 +1,462 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import h5py +import numpy as np +import time + +from ..objid import getCollectionForId, isValidUuid, createObjId +from ..hdf5dtype import createDataType +from ..h5py_util import is_reference, is_regionreference, has_reference, convert_dtype +from ..array_util import jsonToArray +from .. import selections +from .. import filters +from ..h5writer import H5Writer + + +class H5pyWriter(H5Writer): + """ + This class saves state from the Hdf5Db class into an HDF5 file. + """ + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) + self._id_map = {} + if append: + self._init = False + else: + self._init = True + self._flush_time = 0.0 + self._f = None # h5py file handle + + def _copy_element(self, val, src_dt, tgt_dt, fout=None): + """ convert the given dataset or attribute element to h5py equivalent """ + out = None + if len(src_dt) > 0: + out_fields = [] + i = 0 + for name in src_dt.fields: + field_src_dt = src_dt.fields[name][0] + field_tgt_dt = tgt_dt.fields[name][0] + field_val = val[i] + i += 1 + out_field = self._copy_element(field_val, field_src_dt, field_tgt_dt) + out_fields.append(out_field) + out = tuple(out_fields) + elif src_dt.metadata and "ref" in src_dt.metadata: + if not tgt_dt.metadata or "ref" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be ref, but got: {tgt_dt}") + ref = tgt_dt.metadata["ref"] + if is_reference(ref): + # initialize out to null ref + out = h5py.Reference() # null h5py ref + + if ref and val: + if isinstance(val, bytes): + val = val.decode("ascii") + # strip out collection prefix if present + parts = val.split("/") + obj_uuid = parts[-1] + if not isValidUuid(obj_uuid): + msg = f"invalid uuid: {obj_uuid}" + self.log.warning(msg) + elif obj_uuid not in self._id_map: + self.log.warning(f"ref object {obj_uuid} not found") + else: + h5path = self._id_map[obj_uuid] + try: + obj = fout[h5path] + out = obj.ref + except KeyError: + self.log.warning(f"referenced object: {h5path} not found") + + elif is_regionreference(ref): + self.log.warning("region reference not supported") + # TBD: just return a null region reference till we have support + out = h5py.RegionReference() + else: + raise TypeError(f"Unexpected ref type: {type(ref)}") + elif src_dt.metadata and "vlen" in src_dt.metadata: + if not tgt_dt.metadata or "vlen" not in tgt_dt.metadata: + raise TypeError(f"Expected tgt dtype to be vlen, but got: {tgt_dt}") + src_vlen_dt = src_dt.metadata["vlen"] + tgt_vlen_dt = tgt_dt.metadata["vlen"] + + if has_reference(src_vlen_dt): + if isinstance(val, np.ndarray) and val.shape == (): + val = val[()] + if isinstance(val, np.ndarray) or isinstance(val, list) or isinstance(val, tuple): + count = len(val) + out = np.zeros((count,), dtype=tgt_dt) + for i in range(count): + e = val[i] + out[i] = self._copy_element(e, src_vlen_dt, tgt_vlen_dt, fout=fout) + else: + # scalar array + v = self._copy_element(val, src_vlen_dt, tgt_vlen_dt, fout=fout) + out = np.array(v, dtype=tgt_dt) + else: + # can just directly copy the array + out = np.zeros(val.shape, dtype=tgt_dt) + out[...] = val[...] + else: + out = val # can just copy as is + return out + + def _copy_array(self, src_arr, fout=None): + """Copy the numpy array to a new array. + Convert any reference type to point to item in the target's hierarchy. + """ + if not isinstance(src_arr, np.ndarray): + raise TypeError(f"Expecting ndarray, but got: {src_arr}") + tgt_dt = convert_dtype(src_arr.dtype, to_h5py=True) + tgt_arr = np.zeros(src_arr.shape, dtype=tgt_dt) + + if has_reference(src_arr.dtype): + # flatten array to simplify iteration + count = int(np.prod(src_arr.shape)) + tgt_arr_flat = tgt_arr.reshape((count,)) + src_arr_flat = src_arr.reshape((count,)) + for i in range(count): + e = src_arr_flat[i] + element = self._copy_element(e, src_arr.dtype, tgt_dt, fout=fout) + tgt_arr_flat[i] = element + tgt_arr = tgt_arr_flat.reshape(src_arr.shape) + else: + # can just copy the entire array + tgt_arr[...] = src_arr[...] + return tgt_arr + + def _createGroup(self, parent, grp_json, name=None): + """ create the group and any links it contains """ + grp = parent.create_group(name) + return grp + + def _createDataset(self, parent, dset_json, name=None): + """ create a dataset object """ + + dtype = self.db.getDtype(dset_json) + + kwargs = {"dtype": dtype} + shape_json = dset_json["shape"] + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": + # skip the shape keyword to create a null space dataset + pass + elif shape_class == "H5S_SCALAR": + kwargs["shape"] = () + else: + kwargs["shape"] = shape_json["dims"] + if "dcpl" in dset_json and shape_class != "H5S_NULL": + creation_props = dset_json["dcpl"] + if "fillValue" in creation_props: + fillvalue = creation_props["fillValue"] + if fillvalue and len(dtype) > 1 and type(fillvalue) in (list, tuple): + # for compound types, need to convert from list to dataset compatible element + + if len(dtype) != len(fillvalue): + msg = "fillvalue has incorrect number of elements" + self.log.warning(msg) + raise ValueError(msg) + + fillvalue = jsonToArray((), dtype, fillvalue) + + kwargs["fillvalue"] = fillvalue + + if "trackTimes" in creation_props: + kwargs["track_times"] = creation_props["trackTimes"] + if "layout" in creation_props: + layout = creation_props["layout"] + if "dims" in layout: + kwargs["chunks"] = tuple(layout["dims"]) + if "filters" in creation_props: + filter_props = creation_props["filters"] + for filter_prop in filter_props: + if "id" not in filter_prop: + self.log.warning("filter id not provided") + continue + filter_id = filter_prop["id"] + if filter_id not in filters._HDF_FILTERS: + self.log.warning(f"unknown filter id: {filter_id} ignoring") + continue + + hdf_filter = filters._HDF_FILTERS[filter_id] + + self.log.info(f"got filter: {filter_id}") + if "alias" not in hdf_filter: + self.log.warning(f"unsupported filter id: {filter_id} ignoring") + continue + + filter_alias = hdf_filter["alias"] + if not h5py.h5z.filter_avail(filter_id): + msg = "compression filter not available, filter: {filter_alias}, ignoring" + self.log.warning(msg) + continue + if filter_alias in filters._H5PY_COMPRESSION_FILTERS: + if kwargs.get("compression"): + msg = f"compression filter already set for {filter_alias}, ignoring" + self.log.info(msg) + continue + + kwargs["compression"] = filter_alias + self.log.info("setting compression filter to: {filter_alias}") + if filter_alias == "gzip": + # check for an optional compression value + if "level" in filter_prop: + kwargs["compression_opts"] = filter_prop["level"] + elif filter_alias == "szip": + bitsPerPixel = None + coding = "nn" + + if "bitsPerPixel" in filter_prop: + bitsPerPixel = filter_prop["bitsPerPixel"] + if "coding" in filter_prop: + if filter_prop["coding"] == "H5_SZIP_EC_OPTION_MASK": + coding = "ec" + elif filter_prop["coding"] == "H5_SZIP_NN_OPTION_MASK": + coding = "nn" + else: + self.log.warning("invalid szip option: 'coding'") + # note: pixelsPerBlock, and pixelsPerScanline not supported by h5py, + # so these options will be ignored + if "pixelsPerBlock" in filter_props: + self.log.info("ignoring szip option: 'pixelsPerBlock'") + if "pixelsPerScanline" in filter_props: + self.log.info("ignoring szip option: 'pixelsPerScanline'") + if bitsPerPixel: + kwargs["compression_opts"] = (coding, bitsPerPixel) + else: + if filter_alias == "shuffle": + kwargs["shuffle"] = True + elif filter_alias == "fletcher32": + kwargs["fletcher32"] = True + elif filter_alias == "scaleoffset": + if "scaleOffset" not in filter_prop: + msg = "No scale_offset provided for scale offset filter, ignoring" + self.log(msg) + continue + kwargs["scaleoffset"] = filter_prop["scaleOffset"] + else: + self.log.info(f"Unexpected filter name: {filter_alias}, ignoring") + + dset = parent.create_dataset(name, **kwargs) + return dset + + def _createDatatype(self, parent, ctype_json, name=None): + """ create a datatype object """ + + type_item = ctype_json["type"] + dtype = createDataType(type_item) + parent[name] = dtype + return parent[name] + + def _createObjects(self, parent, links_json, visited=set()): + """ create child object in the given group, recurse for any sub-groups """ + + for title in links_json: + link_json = links_json[title] + link_class = link_json["class"] + if link_class == "H5L_TYPE_SOFT" and title not in parent: + h5path = link_json["h5path"] + parent[title] = h5py.SoftLink(h5path) + elif link_class == "H5L_TYPE_EXTERNAL" and title not in parent: + h5path = link_json["h5path"] + filename = link_json["file"] + parent[title] = h5py.ExternalLink(filename, h5path) + elif link_class == "H5L_TYPE_USER_DEFINED" and title not in parent: + self.log.warning("unable to create user-defined link: {title}") + elif link_class == "H5L_TYPE_HARD": + tgt_id = link_json["id"] + + collection = getCollectionForId(tgt_id) + + obj_json = self.db.getObjectById(tgt_id) + + if tgt_id in self._id_map: + # object has already been created + tgt_path = self._id_map[tgt_id] + tgt_obj = parent[tgt_path] + if title not in parent: + parent[title] = tgt_obj + if collection == "groups" and tgt_id not in visited: + # recurse over sub-objects to pick up any new links + grp_links = obj_json["links"] + visited.add(tgt_id) + self._createObjects(tgt_obj, grp_links, visited=visited) + else: + # need to create tgt_id object + parent_path = parent.name + if parent_path[-1] != '/': + parent_path += '/' + self._id_map[tgt_id] = parent_path + title + kwds = {"name": title} + if collection == "groups": + tgt_grp = self._createGroup(parent, obj_json, **kwds) + if "links" in obj_json: + grp_links = obj_json["links"] + visited.add(tgt_id) + self._createObjects(tgt_grp, grp_links, visited=visited) + elif collection == "datasets": + self._createDataset(parent, obj_json, **kwds) + elif collection == "datatypes": + self._createDatatype(parent, obj_json, **kwds) + else: + self.log.warning(f"unexpected collection: {collection}") + visited.add(tgt_id) + + else: + self.log.warning(f"unexpected link class: {link_class}") + + def updateDatasetValues(self, dset_id, dset): + """ write any pending dataset values """ + dset_json = self.db.getObjectById(dset_id) + if "updates" not in dset_json: + return + updates = dset_json["updates"] + for (sel, val) in updates: + slices = [] + for dim in range(len(sel.shape)): + start = sel.start[dim] + stop = start + sel.count[dim] + step = sel.step[dim] + slices.append(slice(start, stop, step)) + slices = tuple(slices) + dset[slices] = val + self.log.debug(f"h5py_writer dset {dset.name} updated") + + def initializeDatasetValues(self, dset_id, dset): + """ write all dataset values """ + + if dset.shape is None: + return # null space dataset + + sel_all = selections.select(dset.shape, ...) + arr = self.db.getDatasetValues(dset_id, sel_all) + if arr is not None: + dset[...] = arr + + def createAttribute(self, obj, name, attr_json): + """ add the given attribute to obj """ + + src_dt = self.db.getDtype(attr_json) + + # handle special case of null space attribute here + shape_json = attr_json["shape"] + shape_class = shape_json["class"] + if shape_class == "H5S_NULL": + obj.attrs[name] = h5py.Empty(convert_dtype(src_dt, to_h5py=True)) + return + + if shape_class == "H5S_SCALAR": + dims = () + else: + dims = shape_json["dims"] + src_arr = jsonToArray(dims, src_dt, attr_json["value"]) + if not isinstance(src_arr, np.ndarray): + raise TypeError("Unexpected type for src_arr") + tgt_arr = self._copy_array(src_arr, fout=obj.file) + obj.attrs[name] = tgt_arr + + def updateAttributes(self, obj_id, obj): + """ create/replace any modified attributes """ + + obj_json = self.db.getObjectById(obj_id) + + if "attributes" not in obj_json: + # no attributes + return + + attrs = obj_json["attributes"] + for name in attrs: + attr_json = attrs[name] + if "created" in attr_json and attr_json["created"] < self._flush_time: + # attribute should be saved already + continue + self.createAttribute(obj, name, attr_json) + + def flush(self): + """ Write dirty items """ + if self.closed: + # no db set yet + self.log.warning("h5py_writer - flush called but no db") + return False + if not self._f: + self.log.warning("h5py_writer file not open") + raise IOError("open not called") + + self.log.info("h5py_writer.flush()") + + root_id = self.db.root_id + self._id_map[root_id] = "/" + + if self.db.new_objects or self._init: + root_json = self.db.getObjectById(root_id) + + if "links" in root_json: + root_links = root_json["links"] + self._createObjects(self._f, root_links, visited=set((root_id,))) + + # update attributes, dataset values + for obj_id in self._id_map: + if self.db.is_dirty(obj_id) or self._init: + h5path = self._id_map[obj_id] + obj = self._f[h5path] + self.updateAttributes(obj_id, obj) + collection = getCollectionForId(obj_id) + if collection == "datasets" and not self.no_data: + if self._init: + self.initializeDatasetValues(obj_id, obj) + else: + self.updateDatasetValues(obj_id, obj) + # mark time write is complete + # updates before this time will not need to be written + # TBD: possible race condition with multithreading + self._flush_time = time.time() + + self._init = False # done with init after first flush + return True # all objects written successfully + + def open(self): + """ open HDF5 file """ + self.log.debug("h5pyWriter open") + if self.db is None: + # no db set yet + self.log.warning("no self.db db_ref") + raise ValueError("no db") + mode = 'a' if self._append else 'w' + self.log.info(f"creating h5py file: {self._filepath} mode: {mode}") + self._f = h5py.File(self._filepath, mode=mode) + self._append = True # switch to append mode for next file open + if self.db.root_id: + self._root_id = self.db.root_id + else: + self._root_id = createObjId(obj_type="groups") + return self._root_id + + def close(self): + """ close storage handle """ + self.log.debug("h5py_writer.close()") + if not self._f: + # no open on file + return + self.flush() + self._f.close() + self._f = None + + def isClosed(self): + """ return closed status """ + return False if self._f else True diff --git a/src/h5json/h5reader.py b/src/h5json/h5reader.py new file mode 100644 index 0000000..3bf49ca --- /dev/null +++ b/src/h5json/h5reader.py @@ -0,0 +1,94 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +from abc import ABC, abstractmethod +import weakref + +import logging + + +class H5Reader(ABC): + """ + This abstract class defines properties and methods that the Hdf5db class uses for reading from an HDF5 + compatible storage medium. + """ + + def __init__( + self, + filepath, + app_logger=None + ): + self._filepath = filepath + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + def set_db(self, db): + self._db_ref = weakref.ref(db) + + @property + def db(self): + if not self._db_ref: + raise ValueError("db not available") + return self._db_ref() + + @property + def filepath(self): + """ return filepath """ + return self._filepath + + @property + def closed(self): + """ return True if the reader handle is closed (or never opened) """ + return self.isClosed() + + @abstractmethod + def get_root_id(self): + """ Return root id """ + pass + + @abstractmethod + def getObjectById(self, obj_id, include_attrs=True, include_links=True): + """ return object with given id """ + pass + + @abstractmethod + def getAttribute(self, obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + pass + + @abstractmethod + def getDatasetValues(self, obj_id, sel=None, dtype=None): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + pass + + @abstractmethod + def open(self): + """ Open data source for reading """ + pass + + @abstractmethod + def close(self): + """ close any open handles to the storage """ + pass + + @abstractmethod + def isClosed(self): + """ return True if handle is closed """ + pass diff --git a/src/h5json/h5tojson/h5tojson.py b/src/h5json/h5tojson/h5tojson.py index 89a65bd..284de84 100755 --- a/src/h5json/h5tojson/h5tojson.py +++ b/src/h5json/h5tojson/h5tojson.py @@ -10,235 +10,44 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import sys -import json -import argparse import os.path as op -import tempfile import logging -import logging.handlers -from h5json import Hdf5db -from h5json import hdf5dtype - - -class DumpJson: - """ - DumpJson - return json representation of all objects within the given file - """ - - def __init__(self, db, app_logger=None, options=None): - self.options = options - self.db = db - if app_logger: - self.log = app_logger - else: - self.log = logging.getLogger() - self.json = {} - - def dumpAttribute(self, col_name, uuid, attr_name): - self.log.info("dumpAttribute: [" + attr_name + "]") - item = self.db.getAttributeItem(col_name, uuid, attr_name) - response = {"name": attr_name} - typeItem = item["type"] - response["type"] = hdf5dtype.getTypeResponse(typeItem) - response["shape"] = item["shape"] - if not self.options.D: - if "value" not in item: - self.log.warning("no value key in attribute: " + attr_name) - else: - response["value"] = item[ - "value" - ] # dump values unless header -D was passed - return response - - def dumpAttributes(self, col_name, uuid): - attr_list = self.db.getAttributeItems(col_name, uuid) - self.log.info("dumpAttributes: " + uuid) - items = [] - for attr in attr_list: - item = self.dumpAttribute(col_name, uuid, attr["name"]) - items.append(item) - - return items - - def dumpLink(self, uuid, name): - item = self.db.getLinkItemByUuid(uuid, name) - for key in ("ctime", "mtime", "href"): - if key in item: - del item[key] - return item - - def dumpLinks(self, uuid): - link_list = self.db.getLinkItems(uuid) - items = [] - for link in link_list: - item = self.dumpLink(uuid, link["title"]) - items.append(item) - return items - - def dumpGroup(self, uuid): - item = self.db.getGroupItemByUuid(uuid) - if "alias" in item: - alias = item["alias"] - if alias: - self.log.info("dumpGroup alias: [" + alias[0] + "]") - for key in ("ctime", "mtime", "linkCount", "attributeCount", "id"): - if key in item: - del item[key] - attributes = self.dumpAttributes("groups", uuid) - if attributes: - item["attributes"] = attributes - links = self.dumpLinks(uuid) - if links: - item["links"] = links - return item - - def dumpGroups(self): - groups = {} - item = self.dumpGroup(self.root_uuid) - groups[self.root_uuid] = item - uuids = self.db.getCollection("groups") - for uuid in uuids: - item = self.dumpGroup(uuid) - groups[uuid] = item - - self.json["groups"] = groups - - def dumpDataset(self, uuid): - response = {} - self.log.info("dumpDataset: " + uuid) - item = self.db.getDatasetItemByUuid(uuid) - if "alias" in item: - alias = item["alias"] - if alias: - self.log.info("dumpDataset alias: [" + alias[0] + "]") - response["alias"] = item["alias"] - - typeItem = item["type"] - response["type"] = hdf5dtype.getTypeResponse(typeItem) - shapeItem = item["shape"] - shape_rsp = {} - num_elements = 1 - shape_rsp["class"] = shapeItem["class"] - if "dims" in shapeItem: - shape_rsp["dims"] = shapeItem["dims"] - for dim in shapeItem["dims"]: - num_elements *= dim - if "maxdims" in shapeItem: - maxdims = [] - for dim in shapeItem["maxdims"]: - if dim == 0: - maxdims.append("H5S_UNLIMITED") - else: - maxdims.append(dim) - shape_rsp["maxdims"] = maxdims - response["shape"] = shape_rsp - - if "creationProperties" in item: - response["creationProperties"] = item["creationProperties"] - - attributes = self.dumpAttributes("datasets", uuid) - if attributes: - response["attributes"] = attributes - - if not (self.options.D or self.options.d): - if num_elements > 0: - value = self.db.getDatasetValuesByUuid(uuid) - response["value"] = value # dump values unless header flag was passed - else: - response["value"] = [] # empty list - return response - def dumpDatasets(self): - uuids = self.db.getCollection("datasets") - if uuids: - datasets = {} - for uuid in uuids: - item = self.dumpDataset(uuid) - datasets[uuid] = item - - self.json["datasets"] = datasets - - def dumpDatatype(self, uuid): - response = {} - item = self.db.getCommittedTypeItemByUuid(uuid) - response["alias"] = item["alias"] - typeItem = item["type"] - response["type"] = hdf5dtype.getTypeResponse(typeItem) - attributes = self.dumpAttributes("datatypes", uuid) - if attributes: - response["attributes"] = attributes - return response - - def dumpDatatypes(self): - uuids = self.db.getCollection("datatypes") - if uuids: - datatypes = {} - for uuid in uuids: - item = self.dumpDatatype(uuid) - datatypes[uuid] = item - - self.json["datatypes"] = datatypes - - def dumpFile(self): - - self.root_uuid = self.db.getUUIDByPath("/") - - db_version_info = self.db.getVersionInfo() - - self.json["apiVersion"] = db_version_info["hdf5-json-version"] - self.json["root"] = self.root_uuid - - self.dumpGroups() - - self.dumpDatasets() - - self.dumpDatatypes() - - print(json.dumps(self.json, sort_keys=True, indent=4)) - - -def getTempFileName(): - """ - Generate a temporary filename to avoid problems with trying to create a dbfile - in a read-only directory. (See: https://github.com/HDFGroup/h5serv/issues/37) - """ - f = tempfile.NamedTemporaryFile(delete=False) - f.close() - return f.name +from h5json import Hdf5db +from h5json.jsonstore.h5json_writer import H5JsonWriter +from h5json.h5pystore.h5py_reader import H5pyReader def main(): - parser = argparse.ArgumentParser(usage="%(prog)s [-h] [-D|-d] ") - parser.add_argument("-D", action="store_true", help="surpress all data output") - parser.add_argument( - "-d", - action="store_true", - help="surpress data output for" + " datasets (but not attribute values)", - ) - parser.add_argument("filename", nargs="+", help="HDF5 to be converted to json") - args = parser.parse_args() + if len(sys.argv) < 2 or sys.argv[1] in ("-h", "--help"): + print(f"usage: {sys.argv[0]} [-h] [--nodata] ") + sys.exit(0) + + no_data = False + filename = None + for i in range(1, len(sys.argv)): + if sys.argv[i] == "--nodata": + no_data = True + else: + filename = sys.argv[i] # create logger - log = logging.getLogger("h5serv") - # log.setLevel(logging.WARN) - log.setLevel(logging.INFO) - # add log handler - handler = logging.FileHandler("./h5tojson.log") - - # add handler to logger - log.addHandler(handler) + logfname = "h5tojson.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + log = logging.getLogger() - filename = args.filename[0] + # check that the input file exists if not op.isfile(filename): - sys.exit("Cannot find file: %s" % filename) + sys.exit(f"Cannot find file: {filename}") - log.info("h5tojson " + filename) + log.info(f"h5tojson {filename}") - dbFilename = getTempFileName() - log.info("Using dbFile: " + dbFilename) - with Hdf5db(filename, dbFilePath=dbFilename, readonly=True, app_logger=log) as db: - dumper = DumpJson(db, app_logger=log, options=args) - dumper.dumpFile() + db = Hdf5db(app_logger=log) + db.reader = H5pyReader(filename, app_logger=log) + db.writer = H5JsonWriter(None, no_data=no_data, app_logger=log) + db.open() # read HDF5 data into db + db.close() # close will trigger write to json file if __name__ == "__main__": diff --git a/src/h5json/h5writer.py b/src/h5json/h5writer.py new file mode 100644 index 0000000..3dfb8da --- /dev/null +++ b/src/h5json/h5writer.py @@ -0,0 +1,85 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +from abc import ABC, abstractmethod +import weakref +import logging + + +class H5Writer(ABC): + """ + This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 + compatible storage medium. + """ + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + self._filepath = filepath + self._append = append + self._no_data = no_data + self._filepath = filepath + self._db_ref = None + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + def set_db(self, db): + self._db_ref = weakref.ref(db) + self.log.debug("writer set db ref") + + @property + def filepath(self): + return self._filepath + + @property + def closed(self): + return self.isClosed() + + @property + def db(self): + if not self._db_ref: + self.log.debug("db not available") + return None + return self._db_ref() + + @property + def append(self): + return self._append + + @property + def no_data(self): + return self._no_data + + @abstractmethod + def open(self): + """ open storage handle, return root_id""" + return None + + @abstractmethod + def flush(self): + """ Write dirty items """ + pass + + @abstractmethod + def close(self): + """ close storage handle """ + pass + + @abstractmethod + def isClosed(self): + """ return True if handle is closed """ + pass diff --git a/src/h5json/hdf5db.py b/src/h5json/hdf5db.py index 27f2094..581399f 100644 --- a/src/h5json/hdf5db.py +++ b/src/h5json/hdf5db.py @@ -9,3523 +9,782 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -import errno import time -import h5py import numpy as np -import uuid -import os.path as op -import os -import json import logging -from .hdf5dtype import getTypeItem, createDataType, getItemSize +from .hdf5dtype import getTypeItem, createDataType, Reference, special_dtype +from .array_util import jsonToArray, bytesArrayToList +from .dset_util import resize_dataset +from .objid import createObjId, getCollectionForId, isValidUuid, getUuidFromId +from . import selections from .apiversion import _apiver - - -# global dictionary to direct back to the Hdf5db instance by filename -# (needed for visititems callback) -# Will break in multi-threaded context -_db = {} - -UUID_LEN = 36 # length for uuid strings - -# standard compress filters -_HDF_FILTERS = { - 1: {"class": "H5Z_FILTER_DEFLATE", "alias": "gzip", "options": ["level"]}, - 2: {"class": "H5Z_FILTER_SHUFFLE", "alias": "shuffle"}, - 3: {"class": "H5Z_FILTER_FLETCHER32", "alias": "fletcher32"}, - 4: { - "class": "H5Z_FILTER_SZIP", - "alias": "szip", - "options": ["bitsPerPixel", "coding", "pixelsPerBlock", "pixelsPerScanLine"], - }, - 5: {"class": "H5Z_FILTER_NBIT"}, - 6: { - "class": "H5Z_FILTER_SCALEOFFSET", - "alias": "scaleoffset", - "options": ["scaleType", "scaleOffset"], - }, - 32000: {"class": "H5Z_FILTER_LZF", "alias": "lzf"}, -} - -_HDF_FILTER_OPTION_ENUMS = { - "coding": { - h5py.h5z.SZIP_EC_OPTION_MASK: "H5_SZIP_EC_OPTION_MASK", - h5py.h5z.SZIP_NN_OPTION_MASK: "H5_SZIP_NN_OPTION_MASK", - }, - "scaleType": { - h5py.h5z.SO_FLOAT_DSCALE: "H5Z_SO_FLOAT_DSCALE", - h5py.h5z.SO_FLOAT_ESCALE: "H5Z_SO_FLOAT_ESCALE", - h5py.h5z.SO_INT: "H5Z_SO_INT", - }, -} - -# h5py supported filters -_H5PY_FILTERS = { - "gzip": 1, - "shuffle": 2, - "fletcher32": 3, - "szip": 4, - "scaleoffset": 6, - "lzf": 32000, -} - -_H5PY_COMPRESSION_FILTERS = ("gzip", "lzf", "szip") - - -def visitObj(path, obj): - hdf5db = _db[obj.file.filename] - hdf5db.visit(path, obj) +from .h5reader import H5Reader +from .h5writer import H5Writer class Hdf5db: """ - This class is used to manage UUID lookup tables for primary HDF objects (Groups, Datasets, - and Datatypes). For HDF5 files that are read/write, this information is managed within - the file itself in the "__db__" group. For read-only files, the data is managed in - an external file (domain filename with ".db" extension). - - "___db__" ("root" for read-only case) - description: Group object (member of root group). Only objects below this group are used - for UUID data - members: "{groups}", "{datasets}", "{datatypes}", "{objects}", "{paths}" - attrs: 'rootUUID': UUID of the root group - - "{groups}" - description: contains map of UUID->group objects - members: hard link to each anonymous group (i.e. groups which are not - linked to by anywhere else). Link name is the UUID - attrs: group reference (or path for read-only files) to the group (for non- - anonymous groups). - - "{datasets}" - description: contains map of UUID->dataset objects - members: hard link to each anonymous dataset (i.e. datasets which are not - linked to by anywhere else). Link name is the UUID - attrs: dataset reference (or path for read-only files) to the dataset (for non- - anonymous datasets). - - "{dataset_props}: - description contains dataset creation properties" - members: sub-group with link name as UUID. Sub-group attributes are the creation props - - "{datatypes}" - description: contains map of UUID->datatyped objects - members: hard link to each anonymous datatype (i.e. datatypes which are not - linked to by anywhere else). Link name is the UUID - attrs: datatype reference (or path for read-only files) to the datatype (for non- - anonymous datatypes). - - "{addr}" - description: contains map of file offset to UUID. - members: none - attrs: map of file offset to UUID + This class is used to manage id lookup tables for primary HDF objects (Groups, Datasets, + and Datatypes). By default all data is held in-memory. Initialize with h5_reader to read from + an HDF5 compatible storage pool, and or, h5_writer to write to an HDF5 compatible storage pool. """ - @staticmethod - def createHDF5File(filePath): - # create an "empty" hdf5 file - # if op.isfile(filePath): - # raise IOError(errno.EEXIST, "Resource already exists") - - f = h5py.File(filePath, "w") - f.close() - @staticmethod def getVersionInfo(): versionInfo = {} versionInfo["hdf5-json-version"] = _apiver - versionInfo["h5py_version"] = h5py.version.version - versionInfo["hdf5_version"] = h5py.version.hdf5_version return versionInfo def __init__( self, - filePath, - dbFilePath=None, - readonly=False, + h5_reader: H5Reader = None, + h5_writer: H5Writer = None, app_logger=None, - root_uuid=None, - update_timestamps=True, - userid=None, ): if app_logger: self.log = app_logger else: self.log = logging.getLogger() - if len(filePath) == 0 or not op.isfile(filePath): - raise IOError(errno.ENXIO, "file not found") - if not h5py.is_hdf5(filePath): - raise IOError(errno.EINVAL, "not an HDF5 file") - - mode = "r" - if readonly: - self.readonly = True - else: - if not os.stat(filePath).st_mode & 0o200: - # file is read-only - self.readonly = True + + self._db = {} + + self._new_objects = set() # set of for newly created objects + self._dirty_objects = set() # set of modified objects + self._deleted_objects = set() # set of deleted objects + + self._root_id = None + + if h5_reader: + self._reader = h5_reader + self._reader.set_db(self) + else: + self._reader = None + + if h5_writer: + self._writer = h5_writer + self._writer.set_db(self) + else: + self._writer = None + + @property + def db(self): + """ return object db dictionary """ + return self._db + + @property + def reader(self): + """ return reader instance """ + return self._reader + + @reader.setter + def reader(self, value: H5Reader): + """ set the reader """ + if self._writer: + self.flush() + if self._reader: + self._reader.close() + self._reader = value + self._reader.set_db(self) + """ + root_id = value.get_root_id() + if not root_id: + raise ValueError(f"reader {type(value)} unable to return root_id") + group_json = value.getObjectById(root_id) + if not group_json: + raise ValueError(f"reader {type(value)} unable to return group json") + self._reader = value + self._db[root_id] = group_json + self._root_id = root_id + """ + + @property + def writer(self): + """ return writer instance """ + return self._writer + + @writer.setter + def writer(self, value: H5Writer): + """ set the writer """ + if self._writer: + self._writer.close() + self._writer = value + if self._writer: + self.log.debug("writer set_db") + self._writer.set_db(self) + + @property + def root_id(self): + """ return root uuid """ + return self._root_id + + def is_new(self, obj_id): + """ return true if this is a new object (has not been persisted) """ + return obj_id in self._new_objects + + def is_dirty(self, obj_id): + """ return true if this object has been modified """ + if self.is_new(obj_id): + return True + return obj_id in self._dirty_objects + + @property + def new_objects(self): + return self._new_objects + + @property + def dirty_objects(self): + return self._dirty_objects + + @property + def deleted_objects(self): + return self._deleted_objects + + def make_dirty(self, obj_id): + """ Mark the object as dirty and update the lastModified timestamp """ + if self.is_new(obj_id): + # object hasn't been initially written yet, just return + return + if obj_id not in self.db: + self.log.error("make dirty called on deleted object") + raise KeyError(f"obj_id: {obj_id} not found") + if self.db[obj_id] is None: + # object deleted, just return + return + obj_json = self.db[obj_id] + obj_json["lastModified"] = time.time() + self._dirty_objects.add(obj_id) + + def flush(self): + """ write out any changes """ + self.log.debug("db.flush()") + if not self.writer: + return # nothing to do + if not self.writer.flush(): + # flush not successful, don't clear dirty set + return + + # reset new and dirty sets + self._new_objects = set() + self._dirty_objects = set() + + def open(self): + """ open reader and writer if set """ + self.log.debug("db.open()") + if self.root_id: + self.log.debug("root id already set, re-open call") + if self.writer: + self.writer.open() + if self.reader: + self.reader.open() + else: + self.log.debug("db.open, getting root_id") + + if self.writer and self.writer.append: + # append mode for the writer, open writer and get the root id + self.log.debug("db.open, write append, getting root_id from writer") + self._root_id = self.writer.open() + if self.reader: + reader_root_id = self.reader.open() + if reader_root_id != self._root_id: + # TBD: need someway to reconcile if both reader and writer have + # an potentiated idea on what there root id is + self.log.warn("reader root_id does not match writer root_id") + elif self.reader: + self.log.debug("db.open, getting root_id from reader") + self._root_id = self.reader.open() + if self.writer: + writer_root_id = self.writer.open() + if writer_root_id != self._root_id: + # TBD: same as above, need to deal with inconsistent root ids + self.log.warning("writer root_id does not match reader root_id") else: - mode = "r+" - self.readonly = False - - self.log.info("init -- filePath: " + filePath + " mode: " + mode) - - self.update_timestamps = update_timestamps - - self.f = h5py.File(filePath, mode, libver="latest") - - self.root_uuid = root_uuid - - if self.readonly: - # for read-only files, add a dot in front of the name to be used as - # the db file. This won't collide with actual data files, since - # "." is not allowed as the first character in a domain name. - if not dbFilePath: - dirname = op.dirname(self.f.filename) - basename = op.basename(self.f.filename) - if len(dirname) > 0: - dbFilePath = dirname + "/." + basename - else: - dbFilePath = "." + basename - dbMode = "r+" - if not op.isfile(dbFilePath): - dbMode = "w" - self.log.info("dbFilePath: " + dbFilePath + " mode: " + dbMode) - self.dbf = h5py.File(dbFilePath, dbMode) - else: - self.dbf = None # for read only - # create a global reference to this class - # so visitObj can call back - _db[filePath] = self + # no root id set by writer or reader, initialize now + self._root_id = createObjId(obj_type="groups") + if self.writer: + # open writer in create mode now that we have a root id + self.writer.open() + + # create a root group just as a memory object + group_json = {"links": {}, "attributes": {}, "cpl": {}} + group_json["created"] = time.time() + self._db[self._root_id] = group_json + + self.log.debug(f"db.open() - returning root_id: {self._root_id}") + return self._root_id + + def close(self): + """ close reader and writer handles """ + self.log.info("Hdf5db __close") + self.flush() + if self.writer: + self.writer.close() + if self.reader: + self.reader.close() + + @property + def closed(self): + return False if self.root_id else True def __enter__(self): + """ called on package init """ self.log.info("Hdf5db __enter") return self def __exit__(self, type, value, traceback): + """ called on package exit """ self.log.info("Hdf5db __exit") - filename = self.f.filename - self.f.flush() - self.f.close() - if self.dbf: - self.dbf.flush() - self.dbf.close() - del _db[filename] - - def getTimeStampName(self, uuid, objType="object", name=None): - ts_name = uuid - if objType != "object": - if len(name) == 0: - self.log.error("empty name passed to setCreateTime") - raise Exception("bad setCreateTimeParameter") - if objType == "attribute": - ts_name += "_attr:[" - ts_name += name - ts_name += "]" - elif objType == "link": - ts_name += "_link:[" - ts_name += name - ts_name += "]" + self.close() + + def getObjectById(self, obj_id): + """ return object with given id """ + if obj_id not in self.db: + if self.reader: + # load the obj from the reader + obj_json = self.reader.getObjectById(obj_id) + self.db[obj_id] = obj_json else: - msg = "Bad objType passed to setCreateTime" - self.log.error(msg) - raise IOError(errno.EIO, msg) - return ts_name - - """ - setCreateTime - sets the create time timestamp for the - given object. - uuid - id of object - objtype - one of "object", "link", "attribute" - name - name (for attributes, links... ignored for objects) - timestamp - time (otherwise current time will be used) - - returns - nothing - - Note - should only be called once per object - """ - - def setCreateTime(self, uuid, objType="object", name=None, timestamp=None): - if not self.update_timestamps: - return - ctime_grp = self.dbGrp["{ctime}"] - ts_name = self.getTimeStampName(uuid, objType, name) - if timestamp is None: - timestamp = time.time() - if ts_name in ctime_grp.attrs: - self.log.warning("modifying create time for object: " + ts_name) - ctime_grp.attrs.create(ts_name, timestamp, dtype="int64") - - """ - getCreateTime - gets the create time timestamp for the - given object. - uuid - id of object - objtype - one of "object", "link", "attribute" - name - name (for attributes, links... ignored for objects) - useRoot - if true, use the time value for root object as default - - returns - create time for object, or create time for root if not set - """ - - def getCreateTime(self, uuid, objType="object", name=None, useRoot=True): - ctime_grp = self.dbGrp["{ctime}"] - ts_name = self.getTimeStampName(uuid, objType, name) - timestamp = None - if ts_name in ctime_grp.attrs: - timestamp = ctime_grp.attrs[ts_name] - elif useRoot: - # return root timestamp - root_uuid = self.dbGrp.attrs["rootUUID"] - if root_uuid in ctime_grp.attrs: - timestamp = ctime_grp.attrs[root_uuid] - return timestamp - - """ - setModifiedTime - sets the modified time timestamp for the - given object. - uuid - id of object - objtype - one of "object", "link", "attribute" - name - name (for attributes, links... ignored for objects) - timestamp - time (otherwise current time will be used) + raise KeyError(f"obj_id: {obj_id} not found") + obj_json = self.db[obj_id] - returns - nothing + return obj_json - """ + def getObjectIdByPath(self, h5path, parent_id=None): + """ Return id for the given link path starting from parent_id if set, + otherwise the root_id """ - def setModifiedTime(self, uuid, objType="object", name=None, timestamp=None): - if not self.update_timestamps: - return - mtime_grp = self.dbGrp["{mtime}"] - ts_name = self.getTimeStampName(uuid, objType, name) - if timestamp is None: - timestamp = time.time() - mtime_grp.attrs.create(ts_name, timestamp, dtype="int64") + if self.closed: + self.open() # initiate db - """ - getModifiedTime - gets the modified time timestamp for the - given object. - uuid - id of object - objtype - one of "object", "link", "attribute" - name - name (for attributes, links... ignored for objects) - useRoot - if true, use the time value for root object as default - - returns - create time for object, or create time for root if not set - """ + if h5path == "/": + return self.root_id # just return root id - def getModifiedTime(self, uuid, objType="object", name=None, useRoot=True): - mtime_grp = self.dbGrp["{mtime}"] - ts_name = self.getTimeStampName(uuid, objType, name) - timestamp = None - if ts_name in mtime_grp.attrs: - timestamp = mtime_grp.attrs[ts_name] - else: - # return create time if no modified time has been set - ctime_grp = self.dbGrp["{ctime}"] - if ts_name in ctime_grp.attrs: - timestamp = ctime_grp.attrs[ts_name] - elif useRoot: - # return root timestamp - root_uuid = self.dbGrp.attrs["rootUUID"] - timestamp = mtime_grp.attrs[root_uuid] - return timestamp - - """ - getAclGroup - return the db group "{acl}" if present, - otherwise return None - """ - - def getAclGroup(self, create=False): - if not self.dbGrp: - return None # file not initialized - if "{acl}" in self.dbGrp: - return self.dbGrp["{acl}"] - if not create: - return None - self.dbGrp.create_group("{acl}") - return self.dbGrp["{acl}"] - - """ - getAclDtype - return detype for ACL - """ + if parent_id is None: + parent_id = self.root_id + self.log.debug(f"getObjectIdDByPath(h5path: {h5path} parent_id: {parent_id}") - def getAclDtype(self): - fields = [] - fields.append(("userid", np.int32)) - fields.append(("create", np.int8)) - fields.append(("read", np.int8)) - fields.append(("update", np.int8)) - fields.append(("delete", np.int8)) - fields.append(("readACL", np.int8)) - fields.append(("updateACL", np.int8)) - dt = np.dtype(fields) - return dt + obj_json = self.getObjectById(parent_id) + if obj_json is None: + self.log.warning("getObjectIdDByPath - parent_id not found") + raise KeyError("parent_id: {parent_id} not found") - """ - getAclDataset - return ACL datset for given uuid - """ - - def getAclDataset(self, obj_uuid, create=False): - acl_group = self.getAclGroup(create=create) - - if acl_group is None: - return None - - if obj_uuid in acl_group: - return acl_group[obj_uuid] - - if not create: - return None - - # create dataset - dt = self.getAclDtype() - acl_group.create_dataset(obj_uuid, (0,), dtype=dt, maxshape=(None,)) - return acl_group[obj_uuid] - - """ - getNumAcls - return number of acls associatted with given uuid - """ + obj_id = parent_id + searched_ids = set(obj_id) - def getNumAcls(self, obj_uuid): - acl_group = self.getAclGroup() - if acl_group is None: - return 0 - if obj_uuid not in acl_group: - return 0 - acls = acl_group[obj_uuid] - return acls.shape[0] - - """ - convertAclNdArrayToDict - helper function - return acl item to dict - """ - - def convertAclNdArrayToDict(self, acl_ndarray): - fields = acl_ndarray.dtype.fields.keys() - acl = {} - for field in fields: - value = int(acl_ndarray[field]) - acl[field] = value - return acl - - def getDefaultAcl(self): - """Get default acl - returns dict obj""" - - dt = self.getAclDtype() - acl = {} - for field in dt.fields.keys(): - if field == "userid": - acl[field] = 0 - else: - acl[field] = 1 # default is allowed - return acl - - def getAcl(self, obj_uuid, userid): - """ - getAcl - return ACL for given uuid and userid - returns ACL associated with the given uuid, or if none exists, - the ACL associatted with the root group. - - If an ACL is not present for a userid/obj and ACL will be returned - via the following precedence: - - 1) obj_uuid, user_id - 2) root_uuid, user_id - 3) obj_uuid, 0 - 4) root_uuid, 0 - 5) 'all perm' ACL - """ - acl_grp = self.getAclGroup() - - if acl_grp is not None: - acl = self.getAclByObjAndUser(obj_uuid, userid) - if acl is not None: - return acl - - if obj_uuid != self.root_uuid and userid != 0: - # get the root acl for this user - acl = self.getAclByObjAndUser(self.root_uuid, userid) - if acl is not None: - return acl - - if userid != 0: - # get acl for default user - acl = self.getAclByObjAndUser(obj_uuid, 0) - if acl is not None: - return acl - - if obj_uuid != self.root_uuid: - # get root acl for default user - acl = self.getAclByObjAndUser(self.root_uuid, 0) - if acl is not None: - return acl - - # create an ACL with default permissions - acl = self.getDefaultAcl() - - return acl - - def getAclByObjAndUser(self, obj_uuid, userid): - """ - get ACL for specific uuid and user - return None if not found - """ - acl = None - acl_dset = self.getAclDataset(obj_uuid) - - if acl_dset: - # iterate through elements, looking for user_id - acls = acl_dset[...] - num_acls = acl_dset.shape[0] - acl = None - for i in range(num_acls): - item = acls[i] - if item["userid"] == userid: - acl = item - break - - if acl is not None: - acl = self.convertAclNdArrayToDict(acl) - return acl - - def getAcls(self, obj_uuid): - """ - getAcls - get all acls for given uuid - """ - acls = [] - acl_dset = self.getAclDataset(obj_uuid) - - if acl_dset: - # iterate through elements, looking for user_id - num_acls = acl_dset.shape[0] - - for i in range(num_acls): - item = acl_dset[i] - acl = self.convertAclNdArrayToDict(item) - acls.append(acl) - - return acls - - def setAcl(self, obj_uuid, acl): - """ - setAcl - set the acl for given uuid. - """ - acl_dset = self.getAclDataset(obj_uuid, create=True) - - if acl_dset is None: - msg = "Unexpected error acl not created for uuid:[" + obj_uuid + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - userid = acl["userid"] - - # iterate through elements, looking for user_id - acls = acl_dset[...] - num_acls = acl_dset.shape[0] - - user_index = None - - for i in range(num_acls): - item = acls[i] - if item["userid"] == userid: - # update this element - user_index = i - break - - if user_index is None: - # userid not found - add row - acl_dset.resize(((num_acls + 1),)) - user_index = num_acls - - # update the acl dataset - item = acl_dset[user_index] - for field in acl.keys(): - item[field] = acl[field] - acl_dset[user_index] = item # save back to the file - - def initFile(self): - # self.log.info("initFile") - if self.readonly: - self.dbGrp = self.dbf - if "{groups}" in self.dbf: - # file already initialized - self.root_uuid = self.dbGrp.attrs["rootUUID"] - return - - else: - if "__db__" in self.f: - # file already initialized - self.dbGrp = self.f["__db__"] - self.root_uuid = self.dbGrp.attrs["rootUUID"] - return # already initialized - self.dbGrp = self.f.create_group("__db__") - - self.log.info("initializing file") - if not self.root_uuid: - self.root_uuid = str(uuid.uuid1()) - self.dbGrp.attrs["rootUUID"] = self.root_uuid - self.dbGrp.create_group("{groups}") - self.dbGrp.create_group("{datasets}") - self.dbGrp.create_group("{datatypes}") - self.dbGrp.create_group("{addr}") # store object address - self.dbGrp.create_group("{ctime}") # stores create timestamps - self.dbGrp.create_group("{mtime}") # store modified timestamps - - mtime = op.getmtime(self.f.filename) - ctime = mtime - self.setCreateTime(self.root_uuid, timestamp=ctime) - self.setModifiedTime(self.root_uuid, timestamp=mtime) - - self.f.visititems(visitObj) - - def visit(self, path, obj): - name = obj.__class__.__name__ - if len(path) >= 6 and path[:6] == "__db__": - return # don't include the db objects - self.log.info("visit: " + path + " name: " + name) - col = None - if name == "Group": - col = self.dbGrp["{groups}"].attrs - elif name == "Dataset": - col = self.dbGrp["{datasets}"].attrs - elif name == "Datatype": - col = self.dbGrp["{datatypes}"].attrs - else: - msg = "Unknown object type: " + __name__ + " found during scan of HDF5 file" - self.log.error(msg) - raise IOError(errno.EIO, msg) - uuid1 = uuid.uuid1() # create uuid - id = str(uuid1) - addrGrp = self.dbGrp["{addr}"] - if not self.readonly: - # storing db in the file itself, so we can link to the object directly - col[id] = obj.ref # save attribute ref to object - else: - # store path to object - col[id] = obj.name - addr = h5py.h5o.get_info(obj.id).addr - # store reverse map as an attribute - addrGrp.attrs[str(addr)] = id - - # - # Get Datset creation properties - # - def getDatasetCreationProps(self, dset_uuid): - prop_list = {} - if "{dataset_props}" not in self.dbGrp: - # no, group, so no properties - return prop_list # return empty dict - dbPropsGrp = self.dbGrp["{dataset_props}"] - - if dset_uuid not in dbPropsGrp.attrs: - return prop_list # return empty dict - prop_str = dbPropsGrp.attrs[dset_uuid] - # expand json string - try: - prop_list = json.loads(prop_str) - except ValueError as ve: - msg = ( - "Unable to load creation properties for dataset:[" - + dset_uuid - + "]: " - + ve.message - ) - self.log.error(msg) - raise IOError(errno.EIO, msg) - - # fill in Filter class values - if "filters" in prop_list: - prop_filters = prop_list["filters"] - for prop_filter in prop_filters: - if "class" not in prop_filter: - filter_id = prop_filter["id"] - if filter_id in _HDF_FILTERS: - hdf_filter = _HDF_FILTERS[filter_id] - prop_filter["class"] = hdf_filter["class"] - else: - prop_filter["class"] = "H5Z_FILTER_USER" - - return prop_list - - # - # Set dataset creation property - # - def setDatasetCreationProps(self, dset_uuid, prop_dict): - self.log.info("setDataProp([" + dset_uuid + "]") - if not prop_dict: - # just ignore if empty dictionary - return - if "{dataset_props}" not in self.dbGrp: - self.dbGrp.create_group("{dataset_props}") - dbPropsGrp = self.dbGrp["{dataset_props}"] - if dset_uuid in dbPropsGrp.attrs: - # this should be write once - msg = ( - "Unexpected error setting dataset creation properties for dataset:[" - + dset_uuid - + "]" - ) - self.log.error(msg) - raise IOError(errno.EIO, msg) - prop_str = json.dumps(prop_dict) - dbPropsGrp.attrs[dset_uuid] = prop_str - - def getUUIDByAddress(self, addr): - if "{addr}" not in self.dbGrp: - self.log.error("expected to find {addr} group") - return None - addrGrp = self.dbGrp["{addr}"] - obj_uuid = None - if str(addr) in addrGrp.attrs: - obj_uuid = addrGrp.attrs[str(addr)] - if obj_uuid and type(obj_uuid) is not str: - # convert bytes to unicode - obj_uuid = obj_uuid.decode("utf-8") - return obj_uuid - - def getNumLinksToObjectInGroup(self, grp, obj): - """ - Get the number of links in a group to an object - """ - objAddr = h5py.h5o.get_info(obj.id).addr - numLinks = 0 - for name in grp: - try: - child = grp[name] - except KeyError: - # UDLink? Ignore for now - self.log.info("ignoring link (UDLink?): " + name) + link_names = h5path.split('/') + self.log.debug(f"link_names: {link_names}") + for link_name in link_names: + if not link_name: continue - - addr = h5py.h5o.get_info(child.id).addr - if addr == objAddr: - numLinks = numLinks + 1 - - return numLinks - - def getNumLinksToObject(self, obj): - """ - Get the number of links to the given object - """ - self.initFile() - groups = self.dbGrp["{groups}"] - numLinks = 0 - # iterate through each group in the file and unlink tgt if it is linked - # by the group - for uuidName in groups: - # iterate through anonymous groups - grp = groups[uuidName] - nLinks = self.getNumLinksToObjectInGroup(grp, obj) - if nLinks > 0: - numLinks += nLinks - for uuidName in groups.attrs: - # now non anonymous groups - grpRef = groups.attrs[uuidName] - grp = self.f[grpRef] # dereference - nLinks = self.getNumLinksToObjectInGroup(grp, obj) - if nLinks > 0: - numLinks += nLinks - # finally, check the root group - root = self.getObjByPath("/") - nLinks = self.getNumLinksToObjectInGroup(root, obj) - numLinks += nLinks - - return numLinks - - def getUUIDByPath(self, path): - self.initFile() - self.log.info("getUUIDByPath: [" + path + "]") - if len(path) >= 6 and path[:6] == "__db__": - msg = "getUUIDByPath called with invalid path: [" + path + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - if path == "/": - # just return the root UUID - root_uuid = self.dbGrp.attrs["rootUUID"] - if root_uuid and type(root_uuid) is not str: - # convert bytes to unicode - root_uuid = root_uuid.decode("utf-8") - return root_uuid - - obj = self.f[path] # will throw KeyError if object doesn't exist - addr = h5py.h5o.get_info(obj.id).addr - obj_uuid = self.getUUIDByAddress(addr) - return obj_uuid - - def getObjByPath(self, path): - if len(path) >= 6 and path[:6] == "__db__": - return None # don't include the db objects - obj = self.f[path] # will throw KeyError if object doesn't exist - return obj - - def getObjectByUuid(self, col_type, obj_uuid): - # col_type should be either "datasets", "groups", or "datatypes" - if col_type not in ("datasets", "groups", "datatypes"): - msg = "Unexpectd error, invalid col_type: [" + col_type + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - if col_type == "groups" and obj_uuid == self.dbGrp.attrs["rootUUID"]: - return self.f["/"] # returns root group - - obj = None # Group, Dataset, or Datatype - col_name = "{" + col_type + "}" - # get the collection group for this collection type - col = self.dbGrp[col_name] - if obj_uuid in col.attrs: - ref = col.attrs[obj_uuid] - obj = self.f[ref] # this works for read-only as well - elif obj_uuid in col: - # anonymous object - obj = col[obj_uuid] - - return obj - - def getDatasetObjByUuid(self, obj_uuid): - self.initFile() - self.log.info("getDatasetObjByUuid(" + obj_uuid + ")") - - obj = self.getObjectByUuid("datasets", obj_uuid) - - return obj - - def getGroupObjByUuid(self, obj_uuid): - self.initFile() - self.log.info("getGroupObjByUuid(" + obj_uuid + ")") - - obj = self.getObjectByUuid("groups", obj_uuid) - - return obj - - def getDatasetTypeItemByUuid(self, obj_uuid): - dset = self.getDatasetObjByUuid(obj_uuid) # throws exception if not found - item = {"id": obj_uuid} - item["type"] = getTypeItem(dset.dtype) - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - - return item - - def getNullReference(self): - """ - getNullReference - return a null object reference - """ - tmpGrp = None - if "{tmp}" not in self.dbGrp: - tmpGrp = self.dbGrp.create_group("{tmp}") - else: - tmpGrp = self.dbGrp["{tmp}"] - if "nullref" not in tmpGrp: - dt = h5py.special_dtype(ref=h5py.Reference) - tmpGrp.create_dataset("nullref", (1,), dtype=dt) - nullref_dset = tmpGrp["nullref"] - return nullref_dset[0] - - def getNullRegionReference(self): - """ - getNullRegionReference - return a null region reference - """ - tmpGrp = None - if "{tmp}" not in self.dbGrp: - tmpGrp = self.dbGrp.create_group("{tmp}") - else: - tmpGrp = self.dbGrp["{tmp}"] - if "nullregref" not in tmpGrp: - dt = h5py.special_dtype(ref=h5py.RegionReference) - tmpGrp.create_dataset("nullregref", (1,), dtype=dt) - nullregref_dset = tmpGrp["nullregref"] - return nullregref_dset[0] - - def getShapeItemByDsetObj(self, obj): - item = {} - if obj.shape is None: - # new with h5py 2.6, null space datasets will return None for shape - item["class"] = "H5S_NULL" - elif len(obj.shape) == 0: - # check to see if this is a null space vs a scalar dataset we'll do - # this by seeing if an exception is raised when reading the dataset - # h5py issue https://github.com/h5py/h5py/issues/279 will provide a - # better way to determine null spaces - # Update 3/10/17: Above issue is closed, but waiting on 2.7 final release - try: - val = obj[...] - if val is None: - self.log.warning("no value returned for scalar dataset") - item["class"] = "H5S_SCALAR" - except IOError: - item["class"] = "H5S_NULL" - else: - item["class"] = "H5S_SIMPLE" - item["dims"] = obj.shape - maxshape = [] - include_maxdims = False - for i in range(len(obj.shape)): - extent = 0 - if len(obj.maxshape) > i: - extent = obj.maxshape[i] - if extent is None: - extent = 0 - if extent > obj.shape[i] or extent == 0: - include_maxdims = True - maxshape.append(extent) - if include_maxdims: - item["maxdims"] = maxshape - return item - - def getShapeItemByAttrObj(self, obj): - item = {} - if obj.shape is None or obj.get_storage_size() == 0: - # If storage size is 0, assume this is a null space obj - # See: h5py issue https://github.com/h5py/h5py/issues/279 - item["class"] = "H5S_NULL" - else: - if obj.shape: - item["class"] = "H5S_SIMPLE" - item["dims"] = obj.shape - else: - item["class"] = "H5S_SCALAR" - return item - - # - # Get dataset creation properties maintained by HDF5 library - # - def getHDF5DatasetCreationProperties(self, obj_uuid, type_class): - dset = self.getDatasetObjByUuid(obj_uuid) - # - # Fill in creation properties - # - creationProps = {} - plist = h5py.h5d.DatasetID.get_create_plist(dset.id) - - # alloc time - nAllocTime = plist.get_alloc_time() - if nAllocTime == h5py.h5d.ALLOC_TIME_DEFAULT: - creationProps["allocTime"] = "H5D_ALLOC_TIME_DEFAULT" - elif nAllocTime == h5py.h5d.ALLOC_TIME_LATE: - creationProps["allocTime"] = "H5D_ALLOC_TIME_LATE" - elif nAllocTime == h5py.h5d.ALLOC_TIME_EARLY: - creationProps["allocTime"] = "H5D_ALLOC_TIME_EARLY" - elif nAllocTime == h5py.h5d.ALLOC_TIME_INCR: - creationProps["allocTime"] = "H5D_ALLOC_TIME_INCR" - else: - self.log.warning("Unknown alloc time value: " + str(nAllocTime)) - - # fill time - nFillTime = plist.get_fill_time() - if nFillTime == h5py.h5d.FILL_TIME_ALLOC: - creationProps["fillTime"] = "H5D_FILL_TIME_ALLOC" - elif nFillTime == h5py.h5d.FILL_TIME_NEVER: - creationProps["fillTime"] = "H5D_FILL_TIME_NEVER" - elif nFillTime == h5py.h5d.FILL_TIME_IFSET: - creationProps["fillTime"] = "H5D_FILL_TIME_IFSET" - else: - self.log.warning("unknown fill time value: " + str(nFillTime)) - - if type_class not in ("H5T_VLEN", "H5T_OPAQUE"): - if plist.fill_value_defined() == h5py.h5d.FILL_VALUE_USER_DEFINED: - creationProps["fillValue"] = self.bytesArrayToList(dset.fillvalue) - - # layout - nLayout = plist.get_layout() - if nLayout == h5py.h5d.COMPACT: - creationProps["layout"] = {"class": "H5D_COMPACT"} - elif nLayout == h5py.h5d.CONTIGUOUS: - creationProps["layout"] = {"class": "H5D_CONTIGUOUS"} - elif nLayout == h5py.h5d.CHUNKED: - creationProps["layout"] = {"class": "H5D_CHUNKED", "dims": dset.chunks} - else: - self.log.warning("Unknown layout value:" + str(nLayout)) - - num_filters = plist.get_nfilters() - filter_props = [] - if num_filters: - for n in range(num_filters): - filter_info = plist.get_filter(n) - opt_values = filter_info[2] - filter_prop = {} - filter_id = filter_info[0] - filter_prop["id"] = filter_id - if filter_info[3]: - filter_prop["name"] = self.bytesArrayToList(filter_info[3]) - if filter_id in _HDF_FILTERS: - hdf_filter = _HDF_FILTERS[filter_id] - filter_prop["class"] = hdf_filter["class"] - if "options" in hdf_filter: - filter_opts = hdf_filter["options"] - for i in range(len(filter_opts)): - if len(opt_values) <= i: - break # end of option values - opt_value = opt_values[i] - opt_value_enum = None - option_name = filter_opts[i] - if option_name in _HDF_FILTER_OPTION_ENUMS: - option_enums = _HDF_FILTER_OPTION_ENUMS[option_name] - if opt_value in option_enums: - opt_value_enum = option_enums[opt_value] - if opt_value_enum: - filter_prop[option_name] = opt_value_enum - else: - filter_prop[option_name] = opt_value - else: - # custom filter - filter_prop["class"] = "H5Z_FILTER_USER" - if opt_values: - filter_prop["parameters"] = opt_values - filter_props.append(filter_prop) - creationProps["filters"] = filter_props - - return creationProps - - # - # Get dataset information - type, shape, num attributes, creation properties - # - def getDatasetItemByUuid(self, obj_uuid): - dset = self.getDatasetObjByUuid(obj_uuid) - if dset is None: - if self.getModifiedTime(obj_uuid, useRoot=False): - msg = "Dataset with uuid: " + obj_uuid + " has been previously deleted" - self.log.info(msg) - raise IOError(errno.ENOENT, msg) + link_tgt = None + self.log.debug(f"link_name: {link_name}") + if not obj_id: + break + if 'links' not in obj_json: + self.log.error(f"expected to find links key in: {obj_json}") + raise KeyError(h5path) + links = obj_json['links'] + self.log.debug(f"links: {links}") + if link_name not in links: + self.log.warning(f"link: {link_name} not found in {obj_id}") + self.log.debug(f"links: {links}") + raise KeyError(h5path) + link_tgt = links[link_name] + self.log.debug(f"link_tgt: {link_tgt}") + link_class = link_tgt['class'] + obj_id = None + obj_json = None + if link_class == 'H5L_TYPE_HARD': + # hard link + obj_id = link_tgt['id'] + if obj_id in searched_ids: + self.log.warning(f"circular reference using path: {h5path}") + raise KeyError(h5path) + obj_json = self.getObjectById(obj_id) + searched_ids.add(obj_id) + elif link_class == 'H5L_TYPE_SOFT': + self.log.warning("getObjectIdByPath can't follow soft links") + elif link_class == 'H5L_TYPE_EXTERNAL': + self.log.warning("getObjectIdByPath can't follow external links") else: - msg = "Dataset with uuid: " + obj_uuid + " was not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - # fill in the item info for the dataset - item = {"id": obj_uuid} - - alias = [] - if dset.name and not dset.name.startswith("/__db__"): - alias.append(dset.name) # just use the default h5py path for now - item["alias"] = alias - - item["attributeCount"] = len(dset.attrs) - - # check if the dataset is using a committed type - typeid = h5py.h5d.DatasetID.get_type(dset.id) - typeItem = None - if h5py.h5t.TypeID.committed(typeid): - type_uuid = None - addr = h5py.h5o.get_info(typeid).addr - type_uuid = self.getUUIDByAddress(addr) - committedType = self.getCommittedTypeItemByUuid(type_uuid) - typeItem = committedType["type"] - typeItem["uuid"] = type_uuid - else: - typeItem = getTypeItem(dset.dtype) + self.log.error(f"link type: {link_class} not supported") - item["type"] = typeItem + if not obj_id: + self.log.warning(f"get_bypath {h5path} not found") + raise KeyError(h5path) + return obj_id - # get shape - item["shape"] = self.getShapeItemByDsetObj(dset) - - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - - creationProps = self.getDatasetCreationProps(obj_uuid) - if creationProps: - # if chunks is not in the db props, add it from the dataset prop - # (so auto-chunk values can be returned) - if dset.chunks and "layout" not in creationProps: - creationProps["layout"] = {"class": "H5D_CHUNKED", "dims": dset.chunks} - else: - # no db-tracked creation properties, pull properties from library - creationProps = self.getHDF5DatasetCreationProperties( - obj_uuid, typeItem["class"] - ) + def getObjectByPath(self, path): + """ Get Object JSON at given path """ + obj_id = self.getObjectIdByPath(path) + obj_json = self.getObjectById(obj_id) + return obj_json - if creationProps: - item["creationProperties"] = creationProps + def getDtype(self, obj_json): + """ Return numpy data type for given object id + """ - return item + if "type" not in obj_json: + # group id? + raise TypeError(f"{obj_json} does not have a datatype") + type_item = obj_json["type"] + if isValidUuid(type_item) and getCollectionForId(type_item) == "datatypes": + ctype_id = "t-" + getUuidFromId(type_item) + ctype_json = self.getObjectById(ctype_id) + if ctype_json is None: + raise KeyError(f"ctype: {ctype_id} not found") - def createTypeFromItem(self, attr_type): - """ - createTypeFromItem - create type given dictionary definition - """ - dt = None - - if isinstance(attr_type, (str, bytes)) and len(attr_type) == UUID_LEN: - # assume attr_type is a uuid of a named datatype - tgt = self.getCommittedTypeObjByUuid(attr_type) - if tgt is None: - msg = ( - "Unable to create attribute, committed type with uuid of: " - + attr_type - + " not found" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - dt = tgt # can use the object as the dt parameter - else: - try: - dt = createDataType(attr_type) - except KeyError as ke: - msg = "Unable to create type: " + str(ke) - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - except TypeError as te: - msg = "Unable to create type: " + str(te) - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if dt is None: - msg = "Unexpected error creating type" - self.log.error(msg) - raise IOError(errno, errno.EIO, msg) - return dt - - def createCommittedType(self, datatype, obj_uuid=None): - """ - createCommittedType - creates new named datatype - Returns item - """ - self.log.info("createCommittedType") - self.initFile() - if self.readonly: - msg = "Can't create committed type (updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - datatypes = self.dbGrp["{datatypes}"] - if not obj_uuid: - obj_uuid = str(uuid.uuid1()) - dt = self.createTypeFromItem(datatype) - - datatypes[obj_uuid] = dt - - if obj_uuid not in datatypes: - msg = "Unexpected failure to create committed datatype" - self.log.error(msg) - raise IOError(errno.EIO, msg) - newType = datatypes[obj_uuid] # this will be a h5py Datatype class - # store reverse map as an attribute - addr = h5py.h5o.get_info(newType.id).addr - addrGrp = self.dbGrp["{addr}"] - addrGrp.attrs[str(addr)] = obj_uuid - # set timestamp - now = time.time() - self.setCreateTime(obj_uuid, timestamp=now) - self.setModifiedTime(obj_uuid, timestamp=now) - item = {"id": obj_uuid} - item["attributeCount"] = len(newType.attrs) - # item['type'] = hdf5dtype.getTypeItem(datatype.dtype) - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - return item - - def getCommittedTypeObjByUuid(self, obj_uuid): - """ - getCommittedTypeObjByUuid - get obj from {datatypes} collection - Returns type obj - """ - self.log.info("getCommittedTypeObjByUuid(" + obj_uuid + ")") - self.initFile() - datatype = None - datatypesGrp = self.dbGrp["{datatypes}"] - if obj_uuid in datatypesGrp.attrs: - typeRef = datatypesGrp.attrs[obj_uuid] - # typeRef could be a reference or (for read-only) a path - datatype = self.f[typeRef] - elif obj_uuid in datatypesGrp: - datatype = datatypesGrp[obj_uuid] # non-linked type + type_json = ctype_json["type"].copy() + type_json["id"] = ctype_id + dtype = createDataType(type_json) else: - msg = "Committed datatype: " + obj_uuid + " not found" - self.log.info(msg) + dtype = createDataType(type_item) - return datatype + return dtype - def getCommittedTypeItemByUuid(self, obj_uuid): - """ - getCommittedTypeItemByUuid - get json from {datatypes} collection - Returns type obj - """ - self.log.info("getCommittedTypeItemByUuid(" + obj_uuid + ")") - self.initFile() - datatype = self.getCommittedTypeObjByUuid(obj_uuid) - - if datatype is None: - if self.getModifiedTime(obj_uuid, useRoot=False): - msg = "Datatype with uuid: " + obj_uuid + " has been previously deleted" - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - else: - msg = "Datatype with uuid: " + obj_uuid + " was not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - item = {"id": obj_uuid} - alias = [] - if datatype.name and not datatype.name.startswith("/__db__"): - alias.append(datatype.name) # just use the default h5py path for now - item["alias"] = alias - item["attributeCount"] = len(datatype.attrs) - item["type"] = getTypeItem(datatype.dtype) - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - - return item - - def getAttributeItemByObj(self, obj, name, includeData=True): + def getAttribute(self, obj_id, name, includeData=True): """ - Get attribute given an object and name + Get attribute given an object id and name returns: JSON object """ - if name not in obj.attrs: - msg = "Attribute: [" + name + "] not found in object: " + obj.name - self.log.info(msg) - return None - # get the attribute! - attrObj = h5py.h5a.open(obj.id, np.bytes_(name)) - attr = None - - item = {"name": name} - - # check if the dataset is using a committed type - typeid = attrObj.get_type() - typeItem = None - if h5py.h5t.TypeID.committed(typeid): - type_uuid = None - addr = h5py.h5o.get_info(typeid).addr - type_uuid = self.getUUIDByAddress(addr) - committedType = self.getCommittedTypeItemByUuid(type_uuid) - typeItem = committedType["type"] - typeItem["uuid"] = type_uuid - else: - typeItem = getTypeItem(attrObj.dtype) - item["type"] = typeItem - # todo - don't include data for OPAQUE until JSON serialization - # issues are addressed + obj_json = self.getObjectById(obj_id) + attrs = obj_json["attributes"] - if isinstance(typeItem, dict) and typeItem["class"] in ("H5T_OPAQUE"): - includeData = False - - shape_json = self.getShapeItemByAttrObj(attrObj) - item["shape"] = shape_json - if shape_json["class"] == "H5S_NULL": - includeData = False - if includeData: - try: - attr = obj.attrs[name] # returns a numpy array - except TypeError: - self.log.warning("type error reading attribute") - - if includeData and attr is not None: - if shape_json["class"] == "H5S_SCALAR": - data = self.getDataValue(typeItem, attr) - else: - dims = shape_json["dims"] - rank = len(dims) - # convert numpy object to python list - # values = self.toList(typeItem, attr) - data = self.toList(rank, typeItem, attr) - # data = self.bytesToString(data) - item["value"] = data - # timestamps will be added by getAttributeItem() - return item - - def getAttributeItems(self, col_type, obj_uuid, marker=None, limit=0): - self.log.info("db.getAttributeItems(" + obj_uuid + ")") - if marker: - self.log.info("...marker: " + marker) - if limit: - self.log.info("...limit: " + str(limit)) - - self.initFile() - obj = self.getObjectByUuid(col_type, obj_uuid) - if obj is None: - msg = "Object: " + obj_uuid + " could not be loaded" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - items = [] - gotMarker = True - if marker is not None: - gotMarker = False - count = 0 - for name in obj.attrs: - if not gotMarker: - if name == marker: - gotMarker = True - continue # start filling in result on next pass - else: - continue # keep going! - item = self.getAttributeItemByObj(obj, name, False) - # mix-in timestamps - if self.update_timestamps: - item["ctime"] = self.getCreateTime( - obj_uuid, objType="attribute", name=name - ) - item["mtime"] = self.getModifiedTime( - obj_uuid, objType="attribute", name=name - ) - - items.append(item) - count += 1 - if limit > 0 and count == limit: - break # return what we got - return items - - def getAttributeItem(self, col_type, obj_uuid, name): - self.log.info( - "getAttributeItemByUuid(" + col_type + ", " + obj_uuid + ", " + name + ")" - ) - self.initFile() - obj = self.getObjectByUuid(col_type, obj_uuid) - if obj is None: - msg = "Parent object: " + obj_uuid + " of attribute not found" + if name not in attrs: + msg = f"Attribute: [{name}] not found in object: {obj_id}" self.log.info(msg) - raise IOError(errno.ENXIO, msg) return None - item = self.getAttributeItemByObj(obj, name) - if item is None: - if self.getModifiedTime( - obj_uuid, objType="attribute", name=name, useRoot=False - ): - # attribute has been removed - msg = ( - "Attribute: [" - + name - + "] of object: " - + obj_uuid - + " has been previously deleted" - ) - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - msg = "Attribute: [" + name + "] of object: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - # mix-in timestamps - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid, objType="attribute", name=name) - item["mtime"] = self.getModifiedTime( - obj_uuid, objType="attribute", name=name - ) - - return item - - def isDimensionList(self, attr_name, attr_type): - """ - isDimensionList - return True if this attribute json looks like a dimension list - """ - if attr_name != "DIMENSION_LIST": - return False - if type(attr_type) is not dict: - return False - if attr_type["class"] != "H5T_VLEN": - return False - base_type = attr_type["base"] - if base_type["class"] != "H5T_REFERENCE": - return False - return True - - def isReferenceList(self, attr_name, attr_type): - """ - isReferenceList - return True if this attribute json looks like a reference list - """ - if attr_name != "REFERENCE_LIST": - return False - if type(attr_type) is not dict: - return False - if attr_type["class"] != "H5T_COMPOUND": - return False - - return True - - def makeDimensionList(self, obj, shape, value): - """ - makeDimensionList - work-around for h5py problems saving dimension list - - types which are vlen's of references are not working directly, so use dim_scale api - Note: this is a work-around for h5py issue: - https://github.com/h5py/h5py/issues/553 - """ - dset_refs = self.listToRef(value) - for i in range(len(dset_refs)): - refs = dset_refs[i] - if type(refs) not in (list, tuple): - msg = "Invalid dimension list value" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - for j in range(len(refs)): - scale_obj = self.f[refs[j]] - if scale_obj is None: - self.log.warning( - "dimension list, missing obj reference: " + value[i] - ) - continue - if "CLASS" not in scale_obj.attrs: - self.log.warning("dimension list, no scale obj") - continue - if scale_obj.attrs["CLASS"] != b"DIMENSION_SCALE": - self.log.warning("dimension list, invalid class for scale obj") - continue + if attrs[name] is None: + msg = f"Attribute: [{name}] has been deleted" + self.log.info(None) + return None - try: - h5py.h5ds.attach_scale(obj.id, scale_obj.id, i) - except RuntimeError: - self.log.error("got runtime error attaching scale") + attr_json = attrs[name] - def writeNdArrayToAttribute(self, attrs, attr_name, npdata, shape, dt): - """ - writeNdArrayToAttribute - create an attribute given numpy array - """ - attrs.create(attr_name, npdata, shape=shape, dtype=dt) + return attr_json - def makeNullTermStringAttribute(self, obj, attr_name, strLength, value): - """ - create a scalar string attribute using nullterm padding - """ - self.log.info( - "make nullterm, length: " + str(strLength) + " value:" + str(value) - ) - value = str(value) - if strLength < len(value): - self.log.warning( - "makeNullTermStringAttribute: value string longer than length" - ) - # value = value[:strLength] # truncate to length - - if isinstance(attr_name, str): - try: - attr_name = attr_name.encode("ascii") - except UnicodeDecodeError: - raise TypeError("non-ascii attribute name not allowed") - - # create the attribute - tid = h5py.h5t.TypeID.copy(h5py.h5t.C_S1) - tid.set_size(strLength) - tid.set_strpad(h5py.h5t.STR_NULLTERM) - sid = h5py.h5s.create(h5py.h5s.SCALAR) - aid = h5py.h5a.create(obj.id, attr_name, tid, sid) - # write the value - dtype_code = "S" + str(strLength) - ndarr = np.array(value, dtype=np.dtype(dtype_code)) - aid.write(ndarr) - - def makeAttribute(self, obj, attr_name, shape, attr_type, value): + def getAttributes(self, obj_id): """ - makeAttribute - create an attribute (except for dimension list - attribute) + Get attributes given an object id and name + returns: JSON object """ - is_committed_type = False - if isinstance(attr_type, str) and len(attr_type) == UUID_LEN: - # assume attr_type is a uuid of a named datatype - is_committed_type = True - dt = self.createTypeFromItem(attr_type) + obj_json = self.getObjectById(obj_id) + attrs = obj_json["attributes"] + names = [] + for name in attrs: + if attrs[name] is not None: + names.append(name) - if shape is None: - self.log.info("shape is null - will create null space attribute") - # create null space attribute - # null space datasets/attributes not supported in h5py yet: - # See: https://github.com/h5py/h5py/issues/279 - # work around this by using low-level interface. - # first create a temp scalar dataset so we can pull out the typeid - tmpGrp = None - if "{tmp}" not in self.dbGrp: - tmpGrp = self.dbGrp.create_group("{tmp}") - else: - tmpGrp = self.dbGrp["{tmp}"] - tmpGrp.attrs.create(attr_name, 0, shape=(), dtype=dt) - b_attr_name = attr_name.encode("utf-8") - tmpAttr = h5py.h5a.open(tmpGrp.id, name=b_attr_name) - if not tmpAttr: - msg = "Unexpected error creating datatype for nullspace attribute" - self.log.error(msg) - raise IOError(errno.EIO, msg) - tid = tmpAttr.get_type() - sid = sid = h5py.h5s.create(h5py.h5s.NULL) - # now create the permanent attribute - if attr_name in obj.attrs: - self.log.info("deleting attribute: " + attr_name) - del obj.attrs[attr_name] - attr_id = h5py.h5a.create(obj.id, b_attr_name, tid, sid) - # delete the temp attribute - del tmpGrp.attrs[attr_name] - if not attr_id: - msg = "Unexpected error creating nullspace attribute" - self.log.error(msg) - raise IOError(errno.EIO, msg) - else: - if type(value) is tuple: - value = list(value) - if type(shape) is list: - shape = tuple(shape) - if not is_committed_type: - # apparently committed types can not be used as reference types - # todo - verify why that is - - rank = len(shape) - # convert python list to numpy object - strPad = None - strLength = 0 - if ( - isinstance(attr_type, dict) - and attr_type["class"] == "H5T_STRING" - and "strPad" in attr_type - ): - strPad = attr_type["strPad"] - strLength = attr_type["length"] - - if ( - rank == 0 - and isinstance(strLength, int) - and strPad == "H5T_STR_NULLTERM" - ): - self.makeNullTermStringAttribute(obj, attr_name, strLength, value) - else: - typeItem = getTypeItem(dt) - value = self.toRef(rank, typeItem, value) - - # create numpy array - npdata = np.zeros(shape, dtype=dt) - - if rank == 0: - npdata[()] = self.toNumPyValue(attr_type, value, npdata[()]) - else: - self.toNumPyArray(rank, attr_type, value, npdata) - - self.writeNdArrayToAttribute( - obj.attrs, attr_name, npdata, shape, dt - ) + return names - """ - createAttribute - create an attribute - """ - - def createAttribute(self, col_name, obj_uuid, attr_name, shape, attr_type, value): - self.log.info("createAttribute: [" + attr_name + "]") - - self.initFile() - if self.readonly: - msg = "Unable to create attribute (updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - obj = self.getObjectByUuid(col_name, obj_uuid) - if not obj: - msg = "Object with uuid: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - if self.isDimensionList(attr_name, attr_type): - self.makeDimensionList(obj, shape, value) - elif self.isReferenceList(attr_name, attr_type): - pass # Skip since reference list will be created by attach scale + def getAttributeValue(self, obj_id, name): + """ Return NDArray of the given attribute value """ + attr_json = self.getAttribute(obj_id, name) + shape_json = attr_json["shape"] + if shape_json["class"] == "H5S_NULL": + # no value for empty shape attributes + return None + elif shape_json["class"] == "H5S_SCALAR": + dims = () else: - self.makeAttribute(obj, attr_name, shape, attr_type, value) - - now = time.time() - self.setCreateTime(obj_uuid, objType="attribute", name=attr_name, timestamp=now) - self.setModifiedTime( - obj_uuid, objType="attribute", name=attr_name, timestamp=now - ) - self.setModifiedTime(obj_uuid, timestamp=now) # owner entity is modified - - def deleteAttribute(self, col_name, obj_uuid, attr_name): - self.initFile() - if self.readonly: - msg = "Unable to delete attribute (updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - obj = self.getObjectByUuid(col_name, obj_uuid) - - if attr_name not in obj.attrs: - msg = ( - "Attribute with name: [" - + attr_name - + "] of object: " - + obj_uuid - + " not found" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) + dims = shape_json["dims"] + dtype = self.getDtype(attr_json) - del obj.attrs[attr_name] - now = time.time() - self.setModifiedTime( - obj_uuid, objType="attribute", name=attr_name, timestamp=now - ) + value = attr_json["value"] + arr = jsonToArray(dims, dtype, value) - return True - - """ - Return a json-serializable representation of the numpy value - """ + return arr - def getDataValue(self, typeItem, value, dimension=0, dims=None): - if dimension > 0: - if type(dims) not in (list, tuple): - msg = "unexpected type for type array dimensions" - self.log.error(msg) - raise IOError(errno.EIO, msg) - out = [] - rank = len(dims) - if dimension > rank: - msg = "unexpected dimension for type array" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nElements = dims[rank - dimension] - for i in range(nElements): - item_value = self.getDataValue( - typeItem, value[i], dimension=(dimension - 1), dims=dims - ) - out.append(item_value) - return out # done for array case - - out = None - typeClass = typeItem["class"] - if isinstance(value, (np.ndarray, np.generic)): - value = value.tolist() # convert numpy object to list - if typeClass == "H5T_COMPOUND": - if type(value) not in (list, tuple): - msg = "Unexpected type for compound value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - fields = typeItem["fields"] - if len(fields) != len(value): - msg = "Number of elements in compound type does not match type" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nFields = len(fields) - out = [] - for i in range(nFields): - field = fields[i] - item_value = self.getDataValue(field["type"], value[i]) - out.append(item_value) - elif typeClass == "H5T_VLEN": - if type(value) not in (list, tuple): - msg = "Unexpected type for vlen value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - baseType = typeItem["base"] - out = [] - nElements = len(value) - for i in range(nElements): - item_value = self.getDataValue(baseType, value[i]) - out.append(item_value) - elif typeClass == "H5T_REFERENCE": - out = self.refToList(value) - elif typeClass == "H5T_OPAQUE": - out = "???" # todo - elif typeClass == "H5T_ARRAY": - type_dims = typeItem["dims"] - if type(type_dims) not in (list, tuple): - msg = "unexpected type for type array dimensions" - self.log.error(msg) - raise IOError(errno.EIO, msg) - rank = len(type_dims) - baseType = typeItem["base"] - out = self.getDataValue(baseType, value, dimension=rank, dims=type_dims) - - elif typeClass in ("H5T_INTEGER", "H5T_FLOAT", "H5T_ENUM"): - out = value # just copy value - elif typeClass == "H5T_STRING": - if "charSet" in typeItem: - charSet = typeItem["charSet"] - else: - charSet = "H5T_CSET_ASCII" - if charSet == "H5T_CSET_ASCII" and isinstance(value, bytes): - out = value.decode("utf-8") - else: - out = value - else: - msg = "Unexpected type class: " + typeClass - self.log.info(msg) - raise IOError(errno.ENINVAL, msg) - return out - - def getRefValue(self, typeItem: dict, value: list): + def createAttribute(self, obj_id, name, value, shape=None, dtype=None): """ - Return a numpy value based on json representation + create an attribute - will override any existing attributes """ - out = None - typeClass = typeItem["class"] - if typeClass == "H5T_COMPOUND": - if not isinstance(value, (list, tuple)): - msg = f"Unexpected type for compound value: {type(value)}" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - fields = typeItem["fields"] - if len(fields) != len(value): - msg = "Number of elements in compound type does not match type" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nFields = len(fields) - out = [] - for i in range(nFields): - field = fields[i] - item_value = self.getRefValue(field["type"], value[i]) - out.append(item_value) - elif typeClass == "H5T_VLEN": - if type(value) not in (list, tuple): - msg = "Unexpected type for vlen value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - baseType = typeItem["base"] - out = [] - nElements = len(value) - for i in range(nElements): - item_value = self.getRefValue(baseType, value[i]) - out.append(item_value) - elif typeClass == "H5T_REFERENCE": - out = self.listToRef(value) - elif typeClass == "H5T_OPAQUE": - out = "???" # todo - elif typeClass == "H5T_ARRAY": - out = self.toRef(len(typeItem["dims"]), typeItem["base"], value) - elif typeClass in ("H5T_INTEGER", "H5T_FLOAT", "H5T_ENUM"): - out = value # just copy value - elif typeClass == "H5T_STRING": - if typeItem["charSet"] == "H5T_CSET_UTF8": - # out = value.encode('utf-8') - out = value - else: - out = value.encode() - else: - msg = "Unexpected type class: " + typeClass - self.log.info(msg) - raise IOError(errno.ENINVAL, msg) - if isinstance(out, list): - out = tuple(out) # convert to tuple - return out + # TBD: if dtype is a committed ref type, fetch it first + # TBD: also, check special case for complex types - """ - Return a numpy value based on json representation - """ + if isinstance(dtype, str) and dtype.startswith("datatypes/"): + ctype_id = dtype[len("datatypes/"):] + if getCollectionForId(ctype_id) != "datatypes": + raise TypeError(f"unexpected dtype value for createAttribute: {dtype}") + if ctype_id not in self.db: + raise KeyError(f"ctype: {ctype_id} not found") + ctype_json = self.getObjectById(ctype_id) + type_json = ctype_json["type"].copy() + type_json["id"] = ctype_id + dtype = createDataType(type_json) - def toNumPyValue(self, typeItem, src, des): - typeClass = "H5T_INTEGER" # default to int type - if type(typeItem) is dict: - typeClass = typeItem["class"] - if typeClass == "H5T_COMPOUND": - fields = typeItem["fields"] - if len(fields) != len(src): - msg = "Number of elements in compound type does not match type" - self.log.error(msg) - raise IOError(errno.EIO, msg) - nFields = len(fields) - - for i in range(nFields): - field = fields[i] - field_name = field["name"] - des[field_name] = src[i] - - elif typeClass == "H5T_VLEN": - if type(src) not in (list, tuple): - msg = "Unexpected type for vlen value" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - baseType = typeItem["base"] - - dt = self.createTypeFromItem(baseType) - des = np.array(src, dtype=dt) - - elif typeClass == "H5T_REFERENCE": - des = src # self.listToRef(src) - - elif typeClass == "H5T_OPAQUE": - des = "???" # todo - elif typeClass == "H5T_ARRAY": - des = src - elif typeClass in ("H5T_INTEGER", "H5T_FLOAT", "H5T_ENUM"): - des = src # just copy value - elif typeClass == "H5T_STRING": - if typeItem["charSet"] == "H5T_CSET_UTF8": - des = src # src.encode('utf-8') + # First, make sure we have a NumPy array + if isinstance(value, Reference) and dtype is None: + dtype = special_dtype(ref=Reference) + if shape == "H5S_NULL": + if value: + raise ValueError("Value can't be set for Null space attributes") + if dtype is None: + raise ValueError("Dtype must be set for Null space attributes") else: - if type(src) is str: - try: - src.encode("ascii") - except UnicodeDecodeError: - raise TypeError( - "non-ascii value not allowed with H5T_CSET_ASCII" - ) - des = src - + dtype = np.dtype(dtype) else: - msg = "Unexpected type class: " + typeClass - self.log.info(msg) - raise IOError(errno.ENINVAL, msg) - return des - - """ - copy src data to numpy array - """ - - def toNumPyArray(self, rank, typeItem, src, des): - if rank == 0: - msg = "unexpected rank value" - self.log.error(msg) - raise IOError(errno.EIO, msg) # shouldn't be called with rank 0 - - for i in range(len(des)): - des_sec = des[i] # numpy slab - - src_sec = src[i] - - if rank > 1: - self.toNumPyArray(rank - 1, typeItem, src_sec, des_sec) + value = np.asarray(value, dtype=dtype, order='C') + if dtype is None: + dtype = value.dtype else: - rv = self.toNumPyValue(typeItem, src_sec, des_sec) - # if the numpy object is writeable, des_sec will be - # already updated. Otherwise, update the des by assignment - if not hasattr(des_sec, "flags") or not des_sec.flags["WRITEABLE"]: - des[i] = rv - - def toRef(self, rank, typeItem, data): - """ - Convert json list to h5py compatible values - """ - out = None - - if isinstance(typeItem, str): - # commited type - get json representation - committed_type_item = self.getCommittedTypeItemByUuid(typeItem) - typeItem = committed_type_item["type"] + dtype = np.dtype(dtype) # In case a string, e.g. 'i8' is passed - typeClass = typeItem["class"] - if typeClass in ("H5T_INTEGER", "H5T_FLOAT"): - out = data # just use as is + # Where a top-level array type is requested, we have to do some + # fiddling around to present the data as a smaller array of + # sub-arrays. + if value is not None: + if dtype.subdtype is not None: + subdtype, subshape = dtype.subdtype - elif rank == 0: - # scalar value - out = self.getRefValue(typeItem, data) - else: - out = [] - for item in data: - if rank > 1: - out_item = self.toRef(rank - 1, typeItem, item) - out.append(out_item) - else: - out_item = self.getRefValue(typeItem, item) - out.append(out_item) + # Make sure the subshape matches the last N axes' sizes. + if shape[-len(subshape):] != subshape: + raise ValueError(f"Array dtype shape {subshape} is incompatible with data shape {shape}") - return out + # New "advertised" shape and dtype + shape = shape[0:len(shape) - len(subshape)] + dtype = subdtype - """ - Convert list to json serializable values. - """ + # Not an array type; make sure to check the number of elements + # is compatible, and reshape if needed. + else: + if isinstance(shape, tuple): + if np.prod(shape) != np.prod(value.shape): + raise ValueError("Shape of new attribute conflicts with shape of data") - def toList(self, rank, typeItem, data): - out = None - typeClass = typeItem["class"] - if typeClass in ("H5T_INTEGER", "H5T_FLOAT"): - out = data.tolist() # just use as is + if shape != value.shape: + value = value.reshape(shape) - elif rank == 0: - # scalar value - out = self.getDataValue(typeItem, data) - else: - out = [] - for item in data: - if rank > 1: - out_item = self.toList(rank - 1, typeItem, item) - out.append(out_item) - else: - out_item = self.getDataValue(typeItem, item) - out.append(out_item) + # We need this to handle special string types. + value = np.asarray(value, dtype=dtype) - return out + value_json = bytesArrayToList(value) - """ - Create ascii representation of vlen data object - """ - - def vlenToList(self, data): - # todo - verify that data is a numpy.ndarray - out = None - if len(data.shape) == 0: - out = [] else: - try: - if data.dtype.kind != "O": - out = data.tolist() - else: - out = [] - for item in data: - out.append(self.vlenToList(item)) # recursive call - except AttributeError: - # looks like this is not a numpy ndarray, just return the value - out = data - return out - - """ - Create ascii representation of ref data object - """ + value_json = None - def refToList(self, data): - # todo - verify that data is a numpy.ndarray - out = None - if type(data) is h5py.h5r.Reference: - if bool(data): - grpref = self.f[data] - addr = h5py.h5o.get_info(grpref.id).addr - uuid = self.getUUIDByAddress(addr) - if self.getGroupObjByUuid(uuid): - out = "groups/" + uuid - elif self.getDatasetObjByUuid(uuid): - out = "datasets/" + uuid - elif self.getCommittedTypeObjByUuid(uuid): - out = "datatypes/" + uuid - else: - self.log.warning("uuid in region ref not found: [" + uuid + "]") - return None - else: - out = "null" - elif type(data) is h5py.h5r.RegionReference: - out = self.getRegionReference(data) + if shape is None: + shape = value.shape + if shape == "H5S_NULL": + shape_json = {"class": "H5S_NULL"} + elif len(shape) == 0: + shape_json = {"class": "H5S_SCALAR"} else: - out = [] - for item in data: - out.append(self.refToList(item)) # recursive call - return out + shape_json = {"class": "H5S_SIMPLE"} + shape_json["dims"] = list(shape) - """ - Convert ascii representation of data references to data ref - """ - - def listToRef(self, data): - out = None - if not data: - # null reference - out = self.getNullReference() - elif isinstance(data, (bytes, str)): - obj_ref = None - # object reference should be in the form: / - for prefix in ("datasets", "groups", "datatypes"): - if data.startswith(prefix): - uuid_ref = data[len(prefix) :] - if len(uuid_ref) == (UUID_LEN + 1) and uuid_ref.startswith("/"): - obj = self.getObjectByUuid(prefix, uuid_ref[1:]) - if obj: - obj_ref = obj.ref - else: - msg = ( - "Invalid object reference value: [" - + uuid_ref - + "] not found" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - break - if not obj_ref: - msg = "Invalid object reference value: [" + data + "]" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - else: - out = obj_ref - - elif isinstance(data, (list, tuple)): - out = [] - for item in data: - out.append(self.listToRef(item)) # recursive call - elif isinstance(data, dict): - # assume region ref - out = self.createRegionReference(data) - else: - msg = "Invalid object reference value type: [" + str(type(data)) + "]" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - return out + obj_json = self.getObjectById(obj_id) + attrs_json = obj_json["attributes"] + type_json = getTypeItem(dtype) + # finally put it all together... + attr_json = {"shape": shape_json, "type": type_json, "value": value_json} + attr_json["created"] = time.time() - def bytesArrayToList(self, data): - """ - Convert list that may contain bytes type elements to list of string elements - """ - if isinstance(data, (bytes, str)): - is_list = False - elif isinstance(data, (np.ndarray, np.generic)): - if len(data.shape) == 0: - is_list = False - data = data.tolist() # tolist will return a scalar in this case - if isinstance(data, (list, tuple)): - is_list = True - else: - is_list = False - else: - is_list = True - elif isinstance(data, (list, tuple)): - is_list = True - else: - is_list = False - - if is_list: - out = [] - for item in data: - out.append(self.bytesArrayToList(item)) # recursive call - elif isinstance(data, bytes): - out = data.decode("utf-8") - else: - out = data + # slot into the obj_json["attrs"] + attrs_json[name] = attr_json - return out + # mark object as dirty + self.make_dirty(obj_id) - def getRegionReference(self, regionRef): - """ - Get item description of region reference value - """ - selectionEnums = { - h5py.h5s.SEL_NONE: "H5S_SEL_NONE", - h5py.h5s.SEL_ALL: "H5S_SEL_ALL", - h5py.h5s.SEL_POINTS: "H5S_SEL_POINTS", - h5py.h5s.SEL_HYPERSLABS: "H5S_SEL_HYPERSLABS", - } - - item = {} - objid = h5py.h5r.dereference(regionRef, self.f.file.file.id) - if objid: - item["id"] = self.getUUIDByAddress(h5py.h5o.get_info(objid).addr) - else: - self.log.info("region reference unable to find item with objid: " + objid) - return item - - sel = h5py.h5r.get_region(regionRef, objid) - select_type = sel.get_select_type() - if select_type not in selectionEnums: - msg = "Unexpected selection type: " + regionRef.typecode - self.log.error(msg) - raise IOError(errno.EIO, msg) - item["select_type"] = selectionEnums[select_type] - pointlist = None - if select_type == h5py.h5s.SEL_POINTS: - # retrieve a numpy array of selection points - points = sel.get_select_elem_pointlist() - pointlist = points.tolist() - elif select_type == h5py.h5s.SEL_HYPERSLABS: - points = sel.get_select_hyper_blocklist() - if points is not None: - pointlist = points[...].tolist() - # bump up the second coordinate by one to match api spec - for point in pointlist: - coord2 = point[1] - for i in range(len(coord2)): - coord2[i] = coord2[i] + 1 - - item["selection"] = pointlist - - return item - - def createRegionReference(self, item): - """ - Create region reference from item description of region reference value - """ - selectionEnums = { - "H5S_SEL_NONE": h5py.h5s.SEL_NONE, - "H5S_SEL_ALL": h5py.h5s.SEL_ALL, - "H5S_SEL_POINTS": h5py.h5s.SEL_POINTS, - "H5S_SEL_HYPERSLABS": h5py.h5s.SEL_HYPERSLABS, - } - region_ref = None - - if "select_type" not in item: - msg = "select_type not provided for region selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - select_type = item["select_type"] - if select_type not in selectionEnums.keys(): - msg = "selection type: [" + select_type + "] is not valid" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - dset = None - if select_type == "H5S_SEL_NONE": - if "id" not in item: - # select none on null dataset, return null ref - out = self.getNullReference() - return out - else: # select_type != 'H5S_SEL_NONE' - if "id" not in item: - msg = "id not provided for region selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - # Otherwise need to provide uuid of dataset - uuid_ref = item["id"] - if len(uuid_ref) != UUID_LEN: - msg = "uuid value: [" + uuid_ref + "] for region reference is not valid" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) + def deleteAttribute(self, obj_id, name): + """ delete the given attribute """ + obj_json = self.getObjectById(obj_id) + attrs_json = obj_json["attributes"] + if name not in attrs_json: + raise KeyError(f"attribute [{name}] not found in {obj_id}") + attrs_json[name] = None # mark key for deletion - obj = self.getObjectByUuid("datasets", uuid_ref) - if obj: - dset = obj - else: - msg = "Invalid region refence value: [" + uuid_ref + "] not found" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if select_type in ("H5S_SEL_POINTS", "H5S_SEL_HYPERSLABS"): - if "selection" not in item: - msg = "selection key not provided for region selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - rank = len(dset.shape) - space_id = h5py.h5d.DatasetID.get_space(dset.id) - h5py.h5s.SpaceID.select_none(space_id) - - if select_type == "H4S_SEL_NONE": - pass # did select_none above - elif select_type == "H5S_SEL_ALL": - h5py.h5s.SpaceID.select_all(space_id) - elif select_type == "H5S_SEL_POINTS": - selection = item["selection"] - for point in selection: - if len(point) != rank: - msg = "point selection number of elements must mach rank of referenced dataset" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - h5py.h5s.SpaceID.select_elements(space_id, selection) - elif select_type == "H5S_SEL_HYPERSLABS": - selection = item["selection"] - - for slab in selection: - # each item should be a two element array defining the hyperslab boundary - if len(slab) != 2: - msg = "selection value not valid (not a 2 element array)" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - start = slab[0] - if isinstance(start, list): - start = tuple(start) - if type(start) is not tuple or len(start) != rank: - msg = "selection value not valid, start element should have number " - msg += "elements equal to rank of referenced dataset" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - stop = slab[1] - if isinstance(stop, list): - stop = tuple(stop) - if type(stop) is not tuple or len(stop) != rank: - msg = "selection value not valid, count element should have number " - msg += "elements equal to rank of referenced dataset" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - count = [] - for i in range(rank): - if start[i] < 0: - msg = "start value for hyperslab selection must be non-negative" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if stop[i] <= start[i]: - msg = "stop value must be greater than start value for hyperslab selection" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - count.append(stop[i] - start[i]) - count = tuple(count) - - h5py.h5s.SpaceID.select_hyperslab( - space_id, start, count, op=h5py.h5s.SELECT_OR - ) - - # now that we've selected the desired region in the space, return a region reference - dset_name = dset.name.encode("utf-8") - region_ref = h5py.h5r.create( - self.f.id, dset_name, h5py.h5r.DATASET_REGION, space_id - ) - - return region_ref - - def toTuple(self, rank, data): - """ - Convert a list to a tuple, recursively. - Example. [[1,2],[3,4]] -> ((1,2),(3,4)) - """ - if isinstance(data, (list, tuple)): - if rank > 0: - return list(self.toTuple(rank - 1, x) for x in data) - else: - return tuple(self.toTuple(rank - 1, x) for x in data) - else: - return data + self.make_dirty(obj_id) - def getDatasetValuesByUuid(self, obj_uuid, slices=Ellipsis, format="json"): + def getDatasetValues(self, dset_id, sel): """ - Get values from dataset identified by obj_uuid. + Get values from dataset identified by obj_id. If a slices list or tuple is provided, it should have the same number of elements as the rank of the dataset. """ - dset = self.getDatasetObjByUuid(obj_uuid) - if format not in ("json", "binary"): - msg = "only json and binary formats are supported" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - values = None - dt = dset.dtype - typeItem = getTypeItem(dt) - itemSize = getItemSize(typeItem) - if itemSize == "H5T_VARIABLE" and format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset.shape is None: - # null space dataset (with h5py 2.6.0) - return None - - rank = len(dset.shape) - - if rank == 0: - # check for null dataspace - try: - val = dset[...] - except IOError: - # assume null dataspace, return none - return None - if val is None: - self.log.warning("no value returned from scalar dataset") - - if not isinstance(slices, (list, tuple)) and slices is not Ellipsis: - msg = "Unexpected error: getDatasetValuesByUuid: bad type for dim parameter" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - if isinstance(slices, (list, tuple)) and len(slices) != rank: - msg = "Unexpected error: getDatasetValuesByUuid: number of dims in selection not same as rank" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - if dt.kind == "O": - if format != "json": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - # numpy object type - could be a vlen string or generic vlen - h5t_check = h5py.h5t.check_dtype(vlen=dt) - if h5t_check is str or h5t_check is bytes: - values = self.bytesArrayToList(dset[slices]) - elif h5t_check is not None: - # other vlen data - values = self.vlenToList(dset[slices]) - else: - # check for reference type - h5t_check = h5py.h5t.check_dtype(ref=dt) - if h5t_check is not None: - # reference type - values = self.refToList(dset[slices]) - else: - msg = "Unexpected error, object type unknown" - self.log.error(msg) - raise IOError(errno.EIO, msg) - elif dt.kind == "V" and len(dt) <= 1 and len(dt.shape) == 0 and not dt.names: - # opaque type - skip for now - self.log.warning("unable to get opaque type values") - values = "????" - elif dt.kind == "S" and format == "json": - values = self.bytesArrayToList(dset[slices]) - elif len(dt) > 1 or dt.names: - # compound type - if format == "json": - values = self.bytesArrayToList(dset[slices]) - else: - values = dset[slices].tobytes() - else: - values = dset[slices] - - # just use tolist to dump - if format == "json": - values = values.tolist() - else: - # values = base64.b64encode(dset[slices].tobytes()) - values = values.tobytes() + self.log.info(f"getDatasetValues dset_id: {dset_id}, sel: {sel}") + dset_json = self.getObjectById(dset_id) + shape_json = dset_json["shape"] + if not isinstance(sel, selections.Selection): + raise TypeError("Expected Selection class") - return values - - """ - doDatasetQueryByUuid: return rows based on query string - Return rows from a dataset that matches query string. - - Note: Only supported for compound_type/one-dimensional datasets - """ - - def doDatasetQueryByUuid( - self, obj_uuid, query, start=0, stop=-1, step=1, limit=None - ): - self.log.info("doQueryByUuid - uuid: " + obj_uuid + " query:" + query) - self.log.info( - "start: " - + str(start) - + " stop: " - + str(stop) - + " step: " - + str(step) - + " limit: " - + str(limit) - ) - dset = self.getDatasetObjByUuid(obj_uuid) - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - values = [] - dt = dset.dtype - typeItem = getTypeItem(dt) - # itemSize = getItemSize(typeItem) - if typeItem["class"] != "H5T_COMPOUND": - msg = "Only compound type datasets can be used as query target" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset.shape is None: - # null space dataset (with h5py 2.6.0) + if shape_json["class"] == "H5S_NULL": return None - rank = len(dset.shape) - if rank != 1: - msg = "One one-dimensional datasets can be used as query target" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - values = [] - indexes = [] - count = 0 - - num_elements = dset.shape[0] - if stop == -1: - stop = num_elements - elif stop > num_elements: - stop = num_elements - block_size = self._getBlockSize(dset) - self.log.info("block_size: " + str(block_size)) - - field_names = list(dset.dtype.fields.keys()) - eval_str = self._getEvalStr(query, field_names) - - while start < stop: - if limit and (count == limit): - break # no more rows for this batch - end = start + block_size - if end > stop: - end = stop - rows = dset[start:end] # read from dataset - where_result = np.where(eval(eval_str)) - index = where_result[0].tolist() - if len(index) > 0: - for i in index: - row = rows[i] - item = self.bytesArrayToList(row) - values.append(item) - indexes.append(start + i) - count += 1 - if limit and (count == limit): - break # no more rows for this batch - - start = end # go to next block - - # values = self.getDataValue(item_type, values, dimension=1, dims=(len(values),)) - - self.log.info("got " + str(count) + " query matches") - return (indexes, values) - - """ - _getBlockSize: Get number of rows to read from disk - - heurestic to get reasonable sized chunk of data to fetch. - make multiple of chunk_size if possible - """ - - def _getBlockSize(self, dset): - target_block_size = 256 * 1000 - if dset.chunks: - chunk_size = dset.chunks[0] - if chunk_size < target_block_size: - block_size = (target_block_size // chunk_size) * chunk_size - else: - block_size = target_block_size - else: - block_size = target_block_size - return block_size - - """ - _getEvalStr: Get eval string for given query - - Gets Eval string to use with numpy where method. - """ - - def _getEvalStr(self, query, field_names): - i = 0 - eval_str = "" - var_name = None - end_quote_char = None - var_count = 0 - paren_count = 0 - black_list = ("import",) # field names that are not allowed - self.log.info("getEvalStr(" + query + ")") - for item in black_list: - if item in field_names: - msg = "invalid field name" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - while i < len(query): - ch = query[i] - if (i + 1) < len(query): - ch_next = query[i + 1] - else: - ch_next = None - if var_name and not ch.isalnum(): - # end of variable - if var_name not in field_names: - # invalid - msg = "unknown field name" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - eval_str += "rows['" + var_name + "']" - var_name = None - var_count += 1 - - if end_quote_char: - if ch == end_quote_char: - # end of literal - end_quote_char = None - eval_str += ch - elif ch in ("'", '"'): - end_quote_char = ch - eval_str += ch - elif ch.isalpha(): - if ch == "b" and ch_next in ("'", '"'): - eval_str += "b" # start of a byte string literal - elif var_name is None: - var_name = ch # start of a variable - else: - var_name += ch - elif ch == "(" and end_quote_char is None: - paren_count += 1 - eval_str += ch - elif ch == ")" and end_quote_char is None: - paren_count -= 1 - if paren_count < 0: - msg = "Mismatched paren" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - eval_str += ch - else: - # just add to eval_str - eval_str += ch - i = i + 1 - if end_quote_char: - msg = "no matching quote character" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - if var_count == 0: - msg = "No field value" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - if paren_count != 0: - msg = "Mismatched paren" - self.log.info("EINVAL: " + msg) - raise IOError(errno.EINVAL, msg) - - return eval_str - - """ - Get values from dataset identified by obj_uuid using the given - point selection. - """ - - def getDatasetPointSelectionByUuid(self, obj_uuid, points): - dset = self.getDatasetObjByUuid(obj_uuid) - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - rank = len(dset.shape) - values = np.zeros(len(points), dtype=dset.dtype) - try: - i = 0 - for point in points: - if rank == 1: - values[i] = dset[[point]] - else: - values[i] = dset[tuple(point)] - i += 1 - except ValueError: - # out of range error - msg = "getDatasetPointSelection, out of range error" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - return values.tolist() - - """ - setDatasetValuesByUuid - update the given dataset values with supplied data - and optionally a hyperslab selection (slices) - """ - - def setDatasetValuesByUuid(self, obj_uuid, data, slices=None, format="json"): - dset = self.getDatasetObjByUuid(obj_uuid) - - if format not in ("json", "binary"): - msg = "only json and binary formats are supported" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if format == "binary" and type(data) is not bytes: - msg = "data must be of type bytes for binary writing" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) + if shape_json["class"] == "H5S_SCALAR": + if sel.select_type != selections.H5S_SELECT_ALL: + # TBD: support other selection types + raise ValueError("Only SELECT_ALL selections are supported for scalar datasets") + if sel.shape != (): + raise ValueError("Selection shape does not match dataset shape") + rank = 0 + else: + dims = tuple(shape_json["dims"]) + if sel.shape != dims: + raise ValueError("Selection shape does not match dataset shape") + rank = len(dims) - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - dt = dset.dtype - typeItem = getTypeItem(dt) - itemSize = getItemSize(typeItem) - rank = len(dset.shape) - arraySize = 1 - for extent in dset.shape: - arraySize *= arraySize - - if itemSize == "H5T_VARIABLE" and format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if slices is None: - slices = [] - # create selection that covers entire dataset - for dim in range(rank): - s = slice(0, dset.shape[dim], 1) - slices.append(s) - slices = tuple(slices) - - if not isinstance(slices, tuple): - msg = "setDatasetValuesByUuid: bad type for dim parameter" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - if len(slices) != rank: - msg = "number of dims in selection not same as rank" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - npoints = 1 - np_shape = [] - for i in range(rank): - s = slices[i] - - if s.start < 0 or s.step <= 0 or s.stop < s.start: - msg = "invalid slice specification" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if s.stop > dset.shape[i]: - msg = "invalid slice specification" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - np_shape.append(s.stop - s.start) - - count = (s.stop - s.start) // s.step - if count <= 0: - msg = "invalid slice specification" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - npoints *= count - - np_shape = tuple(np_shape) # for comparison with ndarray shape - - self.log.info("selection shape:" + str(np_shape)) - - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. - if format != "binary" and dset.dtype.names and isinstance(data, (list, tuple)): - data = self.toTuple(rank, data) - # for i in range(len(data)): - # converted_data.append(self.toTuple(data[i])) - # data = converted_data - else: - h5t_check = h5py.check_dtype(ref=dset.dtype) - if h5t_check in (h5py.Reference, h5py.RegionReference): - # convert data to data refs - if format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - data = self.listToRef(data) - - if format == "binary": - if npoints * itemSize != len(data): - msg = ( - "Expected: " - + str(npoints * itemSize) - + " bytes, but got: " - + str(len(data)) - ) - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if dset.dtype.shape == (): - arr = np.fromstring(data, dtype=dset.dtype) - arr = arr.reshape(np_shape) # conform to selection shape - else: - # tricy array type! - arr = np.empty(np_shape, dtype=dset.dtype) - base_arr = np.fromstring(data, dtype=dset.dtype.base) - base_shape = list(np_shape) - base_shape.extend(dset.dtype.shape) # add on the type dimensions - base_arr = base_arr.reshape(base_shape) - arr[...] = base_arr - else: - # data is json - if npoints == 1 and len(dset.dtype) > 1: - # convert to tuple for compound singleton writes - data = [ - tuple(data), - ] - - arr = np.array(data, dtype=dset.dtype) - # raise an exception of the array shape doesn't match the selection shape - # allow if the array is a scalar and the selection shape is one element, - # numpy is ok with this - np_index = 0 - for dim in range(len(arr.shape)): - data_extent = arr.shape[dim] - selection_extent = 1 - if np_index < len(np_shape): - selection_extent = np_shape[np_index] - if selection_extent == data_extent: - np_index += 1 - continue # good - if data_extent == 1: - continue # skip singleton selection - if selection_extent == 1: - np_index += 1 - continue # skip singleton selection - - # selection/data mismatch! - msg = "data shape doesn't match selection shape" - msg += "--data shape: " + str(arr.shape) - msg += "--selection shape: " + str(np_shape) - - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - # write temp numpy array to dataset - if rank == 1: - s = slices[0] - try: - dset[s] = arr - except TypeError as te: - self.log.info("h5py setitem exception: " + str(te)) - raise IOError(errno.EINVAL, str(te)) + dtype = self.getDtype(dset_json) + if self.reader: + arr = self.reader.getDatasetValues(dset_id, sel, dtype=dtype) else: - try: - dset[slices] = arr - except TypeError as te: - self.log.info("h5py setitem exception: " + str(te)) - raise IOError(errno.EINVAL, str(te)) + # TBD: Initialize with fill value if non-zero + arr = np.zeros(sel.shape, dtype=dtype) - # update modified time - self.setModifiedTime(obj_uuid) - return True - - """ - setDatasetValuesByPointSelection - Update the dataset values using the given - data and point selection - """ - - def setDatasetValuesByPointSelection(self, obj_uuid, data, points, format="json"): - dset = self.getDatasetObjByUuid(obj_uuid) - - if format not in ("json", "binary"): - msg = "only json and binary formats are supported" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if format == "binary" and type(data) is not bytes: - msg = "data must be of type bytes for binary writing" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - if dset is None: - msg = "Dataset: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) + if "updates" in dset_json: + # apply any non-flushed changes that intersect the current selection + updates = dset_json["updates"] + for (update_sel, update_val) in updates: + sel_inter = selections.intersect(sel, update_sel) + if sel_inter.nselect == 0: + continue + # update portion of arr, that intersects update_val + slices = [] + for dim in range(rank): + start = sel_inter.start[dim] - sel.start[dim] + stop = start + sel_inter.count[dim] + slices.append(slice(start, stop, 1)) + slices = tuple(slices) + arr[slices] = update_val + + return arr + + def setDatasetValues(self, dset_id, sel, arr): + """ + Write the given ndarray to the dataset using the selection + """ + dset_json = self.getObjectById(dset_id) + shape_json = dset_json["shape"] + if not isinstance(sel, selections.Selection): + raise TypeError("Expected Selection class") + if sel.select_type not in (selections.H5S_SELECT_HYPERSLABS, selections.H5S_SELECT_ALL): + # TBD: support other selection types + raise ValueError("Only hyperslab selections are currently supported") + if not isinstance(arr, np.ndarray): + raise TypeError("Expected ndarray for data value") + if shape_json["class"] == "H5S_NULL": + raise ValueError("writing to null space dataset not supported") + if shape_json["class"] == "H5S_SCALAR": + if sel.shape != (): + raise ValueError("Selection shape does not match dataset shape") + if len(arr.shape) > 0: + raise TypeError("Expected scalar ndarray for scalar dataset") + else: + dims = tuple(shape_json["dims"]) + if sel.shape != dims: + raise ValueError("Selection shape does not match dataset shape") + if "updates" not in dset_json or sel.select_type == selections.H5S_SELECT_ALL: + # for select all, throw out any existing updates since this will overwrite them + dset_json["updates"] = [] + updates = dset_json["updates"] + updates.append((sel, arr.copy())) + self.make_dirty(dset_id) + + def resizeDataset(self, dset_id, shape): + """ + Resize existing Dataset + """ + self.log.info(f"resizeDataset {dset_id}, {shape}") + + dset_json = self.getObjectById(dset_id) # will throw exception if not found + if resize_dataset(dset_json, shape): + self._dirty_objects.add(dset_id) + + def deleteObject(self, obj_id): + """ Delete the given object """ + self.log.info(f"deleteObject: {obj_id}") + if obj_id not in self.db: + raise KeyError(f"Object {obj_id} not found for deletion") + if obj_id == self.root_id: + raise KeyError("Root group cannot be deleted") + self.db[obj_id] = None + + if obj_id in self._new_objects: + self._new_objects.remove(obj_id) + + if obj_id in self._dirty_objects: + self._dirty_objects.remove(obj_id) + + self._deleted_objects.add(obj_id) + + def getLinks(self, grp_id): + """ Get the links for the given group """ + grp_json = self.getObjectById(grp_id) + if "links" not in grp_json: + raise KeyError(f"No links - {grp_id} not a group?") + links = grp_json["links"] + names = [] + for name in links: + if links[name] is not None: + names.append(name) + return names + + def getLink(self, grp_id, name): + """ Get the given link """ + + obj_json = self.getObjectById(grp_id) + links = obj_json["links"] + if name not in links: + self.log.info(f"Link [{name}] not found in {grp_id}") + return None + if links[name] is None: + self.log.info(f"Link {name} in {grp_id} has been deleted") + return None - dt = dset.dtype - typeItem = getTypeItem(dt) - itemSize = getItemSize(typeItem) - if itemSize == "H5T_VARIABLE" and format == "binary": - msg = "Only JSON is supported for for this data type" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - rank = len(dset.shape) - - # need some special conversion for compound types -- - # each element must be a tuple, but the JSON decoder - # gives us a list instead. - if format == "json" and len(dset.dtype) > 1 and type(data) in (list, tuple): - raise NotImplementedError("need some special conversion for compound types") - # converted_data = self.toTuple(rank, data) - # for i in range(len(data)): - # converted_data.append(self.toTuple(data[i])) - # data = converted_data - - if format == "json": - try: - i = 0 - for point in points: - if rank == 1: - dset[[point]] = data[i] - else: - dset[tuple(point)] = data[i] - i += 1 - except ValueError: - # out of range error - msg = "setDatasetValuesByPointSelection, out of range error" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) + return links[name] + + def _addLink(self, grp_id, name, link_json): + obj_json = self.getObjectById(grp_id) + links = obj_json["links"] + links[name] = link_json + self.make_dirty(grp_id) + + def createHardLink(self, grp_id, name, tgt_id): + """ Create a new hardlink """ + link_json = {"class": "H5L_TYPE_HARD", "id": tgt_id} + link_json["created"] = time.time() + self._addLink(grp_id, name, link_json) + + def createSoftLink(self, grp_id, name, h5path): + """ Create a soft link """ + link_json = {"class": "H5L_TYPE_SOFT", "h5path": h5path} + link_json["created"] = time.time() + self._addLink(grp_id, name, link_json) + + def createCustomLink(self, grp_id, name, link_json): + """ create a custom link """ + if link_json.get("class") != "H5L_TYPE_USER_DEFINED": + link_json["class"] = "H5L_TYPE_USER_DEFINED" + link_json["created"] = time.time() + self._addLink(grp_id, name, link_json) + + def createExternalLink(self, grp_id, name, h5path, filepath): + """ Create a external link link """ + link_json = {"class": "H5L_TYPE_EXTERNAL", "h5path": h5path, "file": filepath} + link_json["created"] = time.time() + self._addLink(grp_id, name, link_json) + + def deleteLink(self, grp_id, name): + """ Delete the given link """ + grp_json = self.getObjectById(grp_id) + if "links" not in grp_json: + raise KeyError(f"No links - {grp_id} not a group?") + links = grp_json["links"] + if name not in links: + raise KeyError(f"Link [{name}] not found in {grp_id}") + links[name] = None # mark for deletion + self.make_dirty(grp_id) + + def createGroup(self, cpl=None): + """ Create a new group """ + if self.closed: + raise ValueError("db is closed") + grp_id = createObjId("groups", root_id=self.root_id) + group_json = {"attributes": {}, "links": {}} + if cpl: + group_json["cpl"] = cpl + else: + group_json["cpl"] = {} + group_json["created"] = time.time() + self.db[grp_id] = group_json + self._new_objects.add(grp_id) + return grp_id + + def createCommittedType(self, datatype, cpl=None): + """ + createCommittedType - creates new named datatype + Returns item + """ + if self.closed: + raise ValueError("db is closed") + self.log.info("createCommittedType") + if cpl is None: + cpl = {} + ctype_id = createObjId(obj_type="datatypes", root_id=self.root_id) + if isinstance(datatype, np.dtype): + dt = datatype else: - # binary - arr = np.fromstring(data, dtype=dset.dtype) - dset[points] = arr # coordinate write + dt = createDataType(datatype) - # update modified time - self.setModifiedTime(obj_uuid) - return True + type_json = getTypeItem(dt) # get canonical json description of datatype - """ - createDataset - creates new dataset given shape and datatype - Returns item - """ + ctype_json = {"type": type_json, "attributes": {}, "cpl": cpl} + ctype_json["created"] = time.time() + self.db[ctype_id] = ctype_json + self._new_objects.add(ctype_id) + return ctype_id def createDataset( - self, datatype, datashape, max_shape=None, creation_props=None, obj_uuid=None + self, + shape=None, + maxdims=None, + dtype=None, + cpl=None, ): - self.initFile() - if self.readonly: - msg = "Unable to create dataset (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - datasets = self.dbGrp["{datasets}"] - if not obj_uuid: - obj_uuid = str(uuid.uuid1()) - dt = None - item = {} - fillvalue = None - - # h5py.createdataset fields - kwargs = {} # key word arguments for h5py dataset creation - - if creation_props is None: - creation_props = {} # create empty list for convience - - if creation_props: - if "fillValue" in creation_props: - fillvalue = creation_props["fillValue"] - if "trackTimes" in creation_props: - kwargs["track_times"] = creation_props["trackTimes"] - if "layout" in creation_props: - layout = creation_props["layout"] - if "dims" in layout: - kwargs["chunks"] = tuple(layout["dims"]) - if "filters" in creation_props: - filter_props = creation_props["filters"] - for filter_prop in filter_props: - if "id" not in filter_prop: - msg = "filter id not provided" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - filter_id = filter_prop["id"] - if filter_id not in _HDF_FILTERS: - self.log.info( - "unknown filter id: " + str(filter_id) + " ignoring" - ) - continue - - hdf_filter = _HDF_FILTERS[filter_id] - - self.log.info("got filter: " + str(filter_id)) - if "alias" not in hdf_filter: - self.log.info( - "unsupported filter id: " + str(filter_id) + " ignoring" - ) - continue - - filter_alias = hdf_filter["alias"] - if not h5py.h5z.filter_avail(filter_id): - self.log.info( - "compression filter not available, filter: " - + filter_alias - + " will be ignored" - ) - continue - if filter_alias in _H5PY_COMPRESSION_FILTERS: - if kwargs.get("compression"): - self.log.info( - "compression filter already set, filter: " - + filter_alias - + " will be ignored" - ) - continue - - kwargs["compression"] = filter_alias - self.log.info( - "setting compression filter to: " + kwargs["compression"] - ) - if filter_alias == "gzip": - # check for an optional compression value - if "level" in filter_prop: - kwargs["compression_opts"] = filter_prop["level"] - elif filter_alias == "szip": - bitsPerPixel = None - coding = "nn" - - if "bitsPerPixel" in filter_prop: - bitsPerPixel = filter_prop["bitsPerPixel"] - if "coding" in filter_prop: - if filter_prop["coding"] == "H5_SZIP_EC_OPTION_MASK": - coding = "ec" - elif filter_prop["coding"] == "H5_SZIP_NN_OPTION_MASK": - coding = "nn" - else: - msg = "invalid szip option: 'coding'" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - # note: pixelsPerBlock, and pixelsPerScanline not supported by h5py, - # so these options will be ignored - if "pixelsPerBlock" in filter_props: - self.log.info("ignoring szip option: 'pixelsPerBlock'") - if "pixelsPerScanline" in filter_props: - self.log.info( - "ignoring szip option: 'pixelsPerScanline'" - ) - if bitsPerPixel: - kwargs["compression_opts"] = (coding, bitsPerPixel) - else: - if filter_alias == "shuffle": - kwargs["shuffle"] = True - elif filter_alias == "fletcher32": - kwargs["fletcher32"] = True - elif filter_alias == "scaleoffset": - if "scaleOffset" not in filter_prop: - msg = "No scale_offset provided for scale offset filter" - self.log(msg) - raise IOError(errno.EINVAL, msg) - kwargs["scaleoffset"] = filter_prop["scaleOffset"] - else: - self.log.info( - "Unexpected filter name: " - + filter_alias - + " , ignoring" - ) - - dt_ref = self.createTypeFromItem(datatype) - if dt_ref is None: - msg = "Unexpected error, no type returned" - self.log.error(msg) - raise IOError(errno.EIO, msg) - - dt = dt_ref - if hasattr(dt_ref, "dtype"): - # dt_ref is actualy a handle to a committed type - # get the dtype prop, but use dt_ref for the actual dataset creation - dt = dt_ref.dtype - - if fillvalue and len(dt) > 1 and type(fillvalue) in (list, tuple): - # for compound types, need to convert from list to dataset compatible element - - if len(dt) != len(fillvalue): - msg = "fillvalue has incorrect number of elements" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - ndscalar = np.zeros((), dtype=dt) - for i in range(len(fillvalue)): - field = dt.names[i] - ndscalar[field] = self.toTuple(0, fillvalue[i]) - fillvalue = ndscalar - - if fillvalue: - kwargs["fillvalue"] = fillvalue - - dataset_id = None - if datashape is None: - # create null space dataset - # null space datasets not supported in h5py yet: - # See: https://github.com/h5py/h5py/issues/279 - # work around this by using low-level interface. - # first create a temp scalar dataset so we can pull out the typeid - tmpGrp = None - if "{tmp}" not in self.dbGrp: - tmpGrp = self.dbGrp.create_group("{tmp}") - else: - tmpGrp = self.dbGrp["{tmp}"] - tmpDataset = tmpGrp.create_dataset(obj_uuid, shape=(1,), dtype=dt_ref) - tid = tmpDataset.id.get_type() - sid = sid = h5py.h5s.create(h5py.h5s.NULL) - # now create the permanent dataset - gid = datasets.id - b_obj_uuid = obj_uuid.encode("utf-8") - dataset_id = h5py.h5d.create(gid, b_obj_uuid, tid, sid) - # delete the temp dataset - del tmpGrp[obj_uuid] - else: - # create the dataset - try: - newDataset = datasets.create_dataset( - obj_uuid, - shape=datashape, - maxshape=max_shape, - dtype=dt_ref, - **kwargs, - ) - except ValueError as ve: - msg = "Unable to create dataset" - try: - msg += ": " + ve.message - except AttributeError: - pass # no message - self.log.info(msg) - raise IOError(errno.EINVAL, msg) # assume this is due to invalid params - - if newDataset: - dataset_id = newDataset.id - - if dataset_id is None: - msg = "Unexpected failure to create dataset" - self.log.error(msg) - raise IOError(errno.EIO, msg) - # store reverse map as an attribute - addr = h5py.h5o.get_info(dataset_id).addr - addrGrp = self.dbGrp["{addr}"] - addrGrp.attrs[str(addr)] = obj_uuid - - # save creation props if any - if creation_props: - self.setDatasetCreationProps(obj_uuid, creation_props) - - # set timestamp - now = time.time() - self.setCreateTime(obj_uuid, timestamp=now) - self.setModifiedTime(obj_uuid, timestamp=now) - - item["id"] = obj_uuid - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - item["attributeCount"] = 0 - return item - - """ - Resize existing Dataset - """ - - def resizeDataset(self, obj_uuid, shape): - self.log.info("resizeDataset(") # + obj_uuid + "): ") # + str(shape)) - self.initFile() - if self.readonly: - msg = "Unable to resize dataset (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EACESS, msg) - dset = self.getDatasetObjByUuid(obj_uuid) # will throw exception if not found - if len(shape) != len(dset.shape): - msg = "Unable to resize dataset, shape has wrong number of dimensions" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - for i in range(len(shape)): - if shape[i] < dset.shape[i]: - msg = "Unable to resize dataset, cannot make extent smaller" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - if dset.maxshape[i] is not None and shape[i] > dset.maxshape[i]: - msg = "Unable to resize dataset, max extent exceeded" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - dset.resize(shape) # resize - - # update modified time - self.setModifiedTime(obj_uuid) - - """ - Check if link points to given target (as a HardLink) - """ - - def isObjectHardLinked(self, parentGroup, targetGroup, linkName): - try: - linkObj = parentGroup.get(linkName, None, False, True) - linkClass = linkObj.__class__.__name__ - except TypeError: - # UDLink? Ignore for now - return False - if linkClass == "SoftLink": - return False - elif linkClass == "ExternalLink": - return False - elif linkClass == "HardLink": - if parentGroup[linkName] == targetGroup: - return True - else: - self.log.warning("unexpected linkclass: " + linkClass) - return False - - """ - Delete Dataset, Group or Datatype by UUID - """ - - def deleteObjectByUuid(self, objtype, obj_uuid): - if objtype not in ("group", "dataset", "datatype"): - msg = "unexpected objtype: " + objtype - self.log.error(msg) - raise IOError(errno.EIO, msg) - self.initFile() - self.log.info("delete uuid: " + obj_uuid) - if self.readonly: - msg = "Unable to delete object (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - if obj_uuid == self.dbGrp.attrs["rootUUID"] and objtype == "group": - # can't delete root group - msg = "Unable to delete group (root group may not be deleted)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - dbCol = None - tgt = None - if objtype == "dataset": - tgt = self.getDatasetObjByUuid(obj_uuid) - dbCol = self.dbGrp["{datasets}"] - elif objtype == "group": - tgt = self.getGroupObjByUuid(obj_uuid) - dbCol = self.dbGrp["{groups}"] - else: # datatype - tgt = self.getCommittedTypeObjByUuid(obj_uuid) - dbCol = self.dbGrp["{datatypes}"] - - if tgt is None: - msg = "Unable to delete " + objtype + ", uuid: " + obj_uuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - # unlink from root (if present) - self.unlinkObject(self.f["/"], tgt) - - groups = self.dbGrp["{groups}"] - # iterate through each group in the file and unlink tgt if it is linked - # by the group. - # We'll store a list of links to be removed as we go, and then actually - # remove the links after the iteration is done (otherwise we can run into issues - # where the key has become invalid) - linkList = [] # this is our list - for uuidName in groups.attrs: - grpRef = groups.attrs[uuidName] - # de-reference handle - grp = self.f[grpRef] - for linkName in grp: - if self.isObjectHardLinked(grp, tgt, linkName): - linkList.append({"group": grp, "link": linkName}) - for item in linkList: - self.unlinkObjectItem(item["group"], tgt, item["link"]) - - addr = h5py.h5o.get_info(tgt.id).addr - addrGrp = self.dbGrp["{addr}"] - del addrGrp.attrs[str(addr)] # remove reverse map - dbRemoved = False - - # finally, remove the dataset from db - if obj_uuid in dbCol: - # should be here (now it is anonymous) - del dbCol[obj_uuid] - dbRemoved = True - - if not dbRemoved: - self.log.warning("did not find: " + obj_uuid + " in anonymous collection") - - if obj_uuid in dbCol.attrs: - self.log.info( - "removing: " + obj_uuid + " from non-anonymous collection" - ) - del dbCol.attrs[obj_uuid] - dbRemoved = True - - if not dbRemoved: - msg = "Unexpected Error, did not find reference to: " + obj_uuid - self.log.error(msg) - raise IOError(errno.EIO, msg) - - # note when the object was deleted - self.setModifiedTime(obj_uuid) - - return True - - def getGroupItemByUuid(self, obj_uuid): - self.initFile() - grp = self.getGroupObjByUuid(obj_uuid) - if grp is None: - if self.getModifiedTime(obj_uuid, useRoot=False): - msg = "Group with uuid: " + obj_uuid + " has been previously deleted" - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - else: - msg = "Group with uuid: " + obj_uuid + " was not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - linkCount = len(grp) - if "__db__" in grp: - linkCount -= 1 # don't include the db group - - item = {"id": obj_uuid} - alias = [] - if grp.name and not grp.name.startswith("/__db__"): - alias.append(grp.name) # just use the default h5py path for now - item["alias"] = alias - item["attributeCount"] = len(grp.attrs) - item["linkCount"] = linkCount - if self.update_timestamps: - item["ctime"] = self.getCreateTime(obj_uuid) - item["mtime"] = self.getModifiedTime(obj_uuid) - - return item - - """ - getLinkItemByObj - return info about a link - parent: reference to group - linkName: name of link - return: item dictionary with link attributes, or None if not found - """ - - def getLinkItemByObj(self, parent, link_name): - if link_name not in parent: - return None - - if link_name == "__db__": - return None # don't provide link to db group - # "http://somefile/#h5path(somepath)") - item = {"title": link_name} - # get the link object, one of HardLink, SoftLink, or ExternalLink - try: - linkObj = parent.get(link_name, None, False, True) - linkClass = linkObj.__class__.__name__ - except TypeError: - # UDLink? set class as 'user' - linkClass = "UDLink" # user defined links - item["class"] = "H5L_TYPE_USER_DEFINED" - if linkClass == "SoftLink": - item["class"] = "H5L_TYPE_SOFT" - item["h5path"] = linkObj.path - item["href"] = "#h5path(" + linkObj.path + ")" - elif linkClass == "ExternalLink": - item["class"] = "H5L_TYPE_EXTERNAL" - item["h5path"] = linkObj.path - item["file"] = linkObj.filename - item["href"] = "#h5path(" + linkObj.path + ")" - elif linkClass == "HardLink": - # Hardlink doesn't have any properties itself, just get the linked - # object - obj = parent[link_name] - addr = h5py.h5o.get_info(obj.id).addr - item["class"] = "H5L_TYPE_HARD" - item["id"] = self.getUUIDByAddress(addr) - class_name = obj.__class__.__name__ - if class_name == "Dataset": - item["href"] = "datasets/" + item["id"] - item["collection"] = "datasets" - elif class_name == "Group": - item["href"] = "groups/" + item["id"] - item["collection"] = "groups" - elif class_name == "Datatype": - item["href"] = "datatypes/" + item["id"] - item["collection"] = "datatypes" - else: - self.log.warning("unexpected object type: " + item["type"]) - - return item - - def getLinkItemByUuid(self, grpUuid, link_name): - self.log.info("db.getLinkItemByUuid(" + grpUuid + ", [" + link_name + "])") - if not link_name: - msg = "link_name not specified" - self.log.info(msg) - raise IOError(errno.EINVAL, msg) - - self.initFile() - parent = self.getGroupObjByUuid(grpUuid) - if parent is None: - msg = "Parent group: " + grpUuid + " of link not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - item = self.getLinkItemByObj(parent, link_name) - # add timestamps - if item: - if self.update_timestamps: - item["ctime"] = self.getCreateTime( - grpUuid, objType="link", name=link_name - ) - item["mtime"] = self.getModifiedTime( - grpUuid, objType="link", name=link_name - ) - else: - self.log.info("link not found") - mtime = self.getModifiedTime( - grpUuid, objType="link", name=link_name, useRoot=False - ) - if mtime: - msg = ( - "Link [" - + link_name - + "] of: " - + grpUuid - + " has been previously deleted" - ) - self.log.info(msg) - raise IOError(errno.ENOENT, msg) - else: - msg = "Link [" + link_name + "] of: " + grpUuid + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - return item - - def getLinkItems(self, grpUuid, marker=None, limit=0): - self.log.info("db.getLinkItems(" + grpUuid + ")") - if marker: - self.log.info("...marker: " + marker) - if limit: - self.log.info("...limit: " + str(limit)) - - self.initFile() - parent = self.getGroupObjByUuid(grpUuid) - if parent is None: - msg = "Parent group: " + grpUuid + " not found, no links returned" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - items = [] - gotMarker = True - if marker is not None: - gotMarker = False - count = 0 - for link_name in parent: - if link_name == "__db__": + """ + createDataset - creates new dataset given shape and datatype + Returns obj_id + """ + if self.closed: + raise ValueError("db is closed") + type_json = getTypeItem(dtype) + if shape == "H5S_NULL": + shape_json = {"class": "H5S_NULL"} + elif shape == (): + shape_json = {"class": "H5S_SCALAR"} + else: + shape_json = {"class": "H5S_SIMPLE"} + shape_json["dims"] = list(shape) + + if maxdims: + if shape_json["class"] != "H5S_SIMPLE": + raise ValueError("only simple shapes can be resizable") + if len(shape) != len(maxdims): + raise ValueError("maxdims length not equal to shape rank") + shape_json["maxdims"] = ["H5S_UNLIMITED" if x is None else x for x in maxdims] + + dset_json = {"shape": shape_json, "type": type_json, "attributes": {}} + if cpl: + dset_json["cpl"] = cpl + else: + dset_json["cpl"] = {} + + dset_id = createObjId("datasets", root_id=self.root_id) + self.db[dset_id] = dset_json + self._new_objects.add(dset_id) + return dset_id + + def getCollection(self, col_type=None): + obj_ids = [] + for obj_id in self.db: + if self.db[obj_id] is None: + # skip deleted objects continue - if not gotMarker: - if link_name == marker: - gotMarker = True - continue # start filling in result on next pass - else: - continue # keep going! - item = self.getLinkItemByObj(parent, link_name) - items.append(item) - - count += 1 - if limit > 0 and count == limit: - break # return what we got - return items - - def unlinkItem(self, grpUuid, link_name): - if self.readonly: - msg = "Unable to unlink item (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - grp = self.getGroupObjByUuid(grpUuid) - if grp is None: - msg = "Parent group: " + grpUuid + " not found, cannot remove link" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - if link_name not in grp: - msg = ( - "Link: [" - + link_name - + "] of group: " - + grpUuid - + " not found, cannot remove link" - ) - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - if link_name == "__db__": - # don't allow db group to be unlinked! - msg = "Unlinking of __db__ group not allowed" - raise IOError(errno.EPERM, msg) - - obj = None - try: - linkObj = grp.get(link_name, None, False, True) - linkClass = linkObj.__class__.__name__ - if linkClass == "HardLink": - # we can safely reference the object - obj = grp[link_name] - except TypeError: - # UDLink? Return false to indicate that we can not delete this - msg = "Unable to unlink user defined link" - self.log.info(msg) - raise IOError(errno.EPERM, msg) + if not col_type or getCollectionForId(obj_id) == col_type: + obj_ids.append(obj_id) + return obj_ids - linkDeleted = False - if obj is not None: - linkDeleted = self.unlinkObjectItem(grp, obj, link_name) - else: - # SoftLink or External Link - we can just remove the key - del grp[link_name] - linkDeleted = True - - if linkDeleted: - # update timestamp - self.setModifiedTime(grpUuid, objType="link", name=link_name) - - return linkDeleted - - def getCollection(self, col_type, marker=None, limit=None): - self.log.info("db.getCollection(" + col_type + ")") - # col_type should be either "datasets", "groups", or "datatypes" - if col_type not in ("datasets", "groups", "datatypes"): - msg = "Unexpected col_type: [" + col_type + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - self.initFile() - col = None # Group, Dataset, or Datatype - if col_type == "datasets": - col = self.dbGrp["{datasets}"] - elif col_type == "groups": - col = self.dbGrp["{groups}"] - else: # col_type == "datatypes" - col = self.dbGrp["{datatypes}"] - - uuids = [] + def __len__(self): + # return the number of objects count = 0 - # gather the non-anonymous ids first - for obj_uuid in col.attrs: - if marker: - if obj_uuid == marker: - marker = None # clear and pick up next item - continue - uuids.append(obj_uuid) - count += 1 - if limit is not None and limit > 0 and count == limit: - break - - if limit == 0 or (limit is not None and count < limit): - # grab any anonymous obj ids next - for obj_uuid in col: - if marker: - if obj_uuid == marker: - marker = None # clear and pick up next item - continue - uuids.append(obj_uuid) + for obj_id in self.db: + # skip deleted objects + if self.db[obj_id] is not None: count += 1 - if limit is not None and limit > 0 and count == limit: - break - - return uuids - - """ - Get the DB Collection names - """ - - def getDBCollections(self): - return ("{groups}", "{datasets}", "{datatypes}") - - """ - Return the db collection the uuid belongs to - """ - - def getDBCollection(self, obj_uuid): - dbCollections = self.getDBCollections() - for dbCollectionName in dbCollections: - col = self.dbGrp[dbCollectionName] - if obj_uuid in col or obj_uuid in col.attrs: - return col - return None - - def unlinkObjectItem(self, parentGrp, tgtObj, link_name): - if self.readonly: - msg = "Unexpected attempt to unlink object" - self.log.error(msg) - raise IOError(errno.EIO, msg) - if link_name not in parentGrp: - msg = "Unexpected: did not find link_name: [" + link_name + "]" - self.log.error(msg) - raise IOError(errno.EIO, msg) - try: - linkObj = parentGrp.get(link_name, None, False, True) - except TypeError: - # user defined link? - msg = "Unable to remove link (user-defined link?)" - self.log.error(msg) - raise IOError(errno.EIO, msg) - linkClass = linkObj.__class__.__name__ - # only deal with HardLinks - linkDeleted = False - if linkClass == "HardLink": - obj = parentGrp[link_name] - if tgtObj is None or obj == tgtObj: - numlinks = self.getNumLinksToObject(obj) - if numlinks == 1: - # last link to this object - convert to anonymous object by - # creating link under {datasets} or {groups} or {datatypes} - # also remove the attribute UUID key - addr = h5py.h5o.get_info(obj.id).addr - obj_uuid = self.getUUIDByAddress(addr) - self.log.info("converting: " + obj_uuid + " to anonymous obj") - dbCol = self.getDBCollection(obj_uuid) - del dbCol.attrs[obj_uuid] # remove the object ref - dbCol[obj_uuid] = obj # add a hardlink - self.log.info( - "deleting link: [" + link_name + "] from: " + parentGrp.name - ) - del parentGrp[link_name] - linkDeleted = True - else: - self.log.info("unlinkObjectItem: link is not a hardlink, ignoring") - return linkDeleted - - def unlinkObject(self, parentGrp, tgtObj): - for name in parentGrp: - self.unlinkObjectItem(parentGrp, tgtObj, name) - return True - - def linkObject(self, parentUUID, childUUID, link_name): - self.initFile() - if self.readonly: - msg = "Unable to create link (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - - parentObj = self.getGroupObjByUuid(parentUUID) - if parentObj is None: - msg = "Unable to create link, parent UUID: " + parentUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - - childObj = self.getDatasetObjByUuid(childUUID) - if childObj is None: - # maybe it's a group... - childObj = self.getGroupObjByUuid(childUUID) - if childObj is None: - # or maybe it's a committed datatype... - childObj = self.getCommittedTypeObjByUuid(childUUID) - if childObj is None: - msg = "Unable to link item, child UUID: " + childUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - if link_name in parentObj: - # link already exists - self.log.info("linkname already exists, deleting") - self.unlinkObjectItem(parentObj, None, link_name) - parentObj[link_name] = childObj - - # convert this from an anonymous object to ref if needed - dbCol = self.getDBCollection(childUUID) - if childUUID in dbCol: - # convert to a ref - del dbCol[childUUID] # remove hardlink - dbCol.attrs[childUUID] = childObj.ref # create a ref - - # set link timestamps - now = time.time() - self.setCreateTime(parentUUID, objType="link", name=link_name, timestamp=now) - self.setModifiedTime(parentUUID, objType="link", name=link_name, timestamp=now) - return True - - def createSoftLink(self, parentUUID, linkPath, link_name): - self.initFile() - if self.readonly: - msg = "Unable to create link (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - parentObj = self.getGroupObjByUuid(parentUUID) - if parentObj is None: - msg = "Unable to create link, parent UUID: " + parentUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - if link_name in parentObj: - # link already exists - self.log.info("linkname already exists, deleting") - del parentObj[link_name] # delete old link - parentObj[link_name] = h5py.SoftLink(linkPath) - - now = time.time() - self.setCreateTime(parentUUID, objType="link", name=link_name, timestamp=now) - self.setModifiedTime(parentUUID, objType="link", name=link_name, timestamp=now) - - return True - - def createExternalLink(self, parentUUID, extPath, linkPath, link_name): - self.initFile() - if self.readonly: - msg = "Unable to create link (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - parentObj = self.getGroupObjByUuid(parentUUID) - if parentObj is None: - msg = "Unable to create link, parent UUID: " + parentUUID + " not found" - self.log.info(msg) - raise IOError(errno.ENXIO, msg) - if link_name in parentObj: - # link already exists - self.log.info("linkname already exists, deleting") - del parentObj[link_name] # delete old link - parentObj[link_name] = h5py.ExternalLink(extPath, linkPath) - - now = time.time() - self.setCreateTime(parentUUID, objType="link", name=link_name, timestamp=now) - self.setModifiedTime(parentUUID, objType="link", name=link_name, timestamp=now) - - return True - - def createGroup(self, obj_uuid=None): - self.initFile() - if self.readonly: - msg = "Unable to create group (Updates are not allowed)" - self.log.info(msg) - raise IOError(errno.EPERM, msg) - groups = self.dbGrp["{groups}"] - if not obj_uuid: - obj_uuid = str(uuid.uuid1()) - newGroup = groups.create_group(obj_uuid) - # store reverse map as an attribute - addr = h5py.h5o.get_info(newGroup.id).addr - addrGrp = self.dbGrp["{addr}"] - addrGrp.attrs[str(addr)] = obj_uuid - - # set timestamps - now = time.time() - self.setCreateTime(obj_uuid, timestamp=now) - self.setModifiedTime(obj_uuid, timestamp=now) - - return obj_uuid - - def getNumberOfGroups(self): - self.initFile() - count = 0 - groups = self.dbGrp["{groups}"] - count += len(groups) # anonymous groups - count += len(groups.attrs) # linked groups - count += 1 # add of for root group - return count - def getNumberOfDatasets(self): - self.initFile() - count = 0 - datasets = self.dbGrp["{datasets}"] - count += len(datasets) # anonymous datasets - count += len(datasets.attrs) # linked datasets - return count + def __iter__(self): + """ Iterate over object ids """ - def getNumberOfDatatypes(self): - self.initFile() - count = 0 - datatypes = self.dbGrp["{datatypes}"] - count += len(datatypes) # anonymous datatypes - count += len(datatypes.attrs) # linked datatypes - return count + for obj_id in self.db: + if self.db[obj_id] is None: + # skip deleted objects + continue + yield obj_id + + def __contains__(self, obj_id): + """ Test if a obj id exists """ + return obj_id in self.db and self.db[obj_id] is not None diff --git a/src/h5json/hdf5dtype.py b/src/h5json/hdf5dtype.py old mode 100755 new mode 100644 index 9f867f2..bbef116 --- a/src/h5json/hdf5dtype.py +++ b/src/h5json/hdf5dtype.py @@ -2,37 +2,257 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -""" -This class is used to map between HDF5 type representations and numpy types - -""" +import weakref import numpy as np -from h5py.h5t import special_dtype -from h5py.h5t import check_dtype -from h5py.h5r import Reference -from h5py.h5r import RegionReference + + +numpy_integer_types = (np.int8, np.uint8, np.int16, np.int16, np.int32, np.uint32, np.int64, np.uint64) +numpy_float_types = (np.float16, np.float32, np.float64) + + +class Reference: + """ + Represents an HDF5 object reference + """ + + @property + def id(self): + """Low-level identifier appropriate for this object""" + return self._id + + @property + def objref(self): + """Weak reference to object""" + return self._objref # return weak ref to ref'd object + + def __init__(self, bind): + """Create a new reference by binding to + a group/dataset/committed type + """ + self._id = bind._id + self._objref = weakref.ref(bind) + + def __repr__(self): + # TBD: this is not consistent with hsds or h5py... + if not isinstance(self._id.id, str): + raise TypeError("Expected string id") + item = None + + collection_type = self._id.collection_type + item = f"{collection_type}/{self._id.id}" + return item + + def tolist(self): + if type(self._id.id) is not str: + raise TypeError("Expected string id") + if self._id.objtype_code == "d": + return [ + ("datasets/" + self._id.id), + ] + elif self._id.objtype_code == "g": + return [ + ("groups/" + self._id.id), + ] + elif self._id.objtype_code == "t": + return [ + ("datatypes/" + self._id.id), + ] + else: + raise TypeError("Unexpected id type") + + +class RegionReference: + """ + Represents an HDF5 region reference + """ + + @property + def id(self): + """Low-level identifier appropriate for this object""" + return self._id + + @property + def objref(self): + """Weak reference to object""" + return self._objref # return weak ref to ref'd object + + def __init__(self, bind): + """Create a new reference by binding to + a group/dataset/committed type + """ + self._id = bind._id + self._objref = weakref.ref(bind) + + def __repr__(self): + return "" + + +def special_dtype(**kwds): + """Create a new h5py "special" type. Only one keyword may be given. + + Legal keywords are: + + vlen = basetype + Base type for HDF5 variable-length datatype. This can be Python + str type or instance of np.dtype. + Example: special_dtype( vlen=str ) + + enum = (basetype, values_dict) + Create a NumPy representation of an HDF5 enumerated type. Provide + a 2-tuple containing an (integer) base dtype and a dict mapping + string names to integer values. + + ref = Reference | RegionReference + Create a NumPy representation of an HDF5 object or region reference + type.""" + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, val = kwds.popitem() + + if name == "vlen": + + return np.dtype("O", metadata={"vlen": val}) + + if name == "enum": + + try: + dt, enum_vals = val + except TypeError: + msg = "Enums must be created from a 2-tuple " + msg += "(basetype, values_dict)" + raise TypeError(msg) + + dt = np.dtype(dt) + if dt.kind not in "iu": + raise TypeError("Only integer types can be used as enums") + + return np.dtype(dt, metadata={"enum": enum_vals}) + + if name == "ref": + dt = None + if val is Reference: + dt = np.dtype("S48", metadata={"ref": Reference}) + elif val is RegionReference: + dt = np.dtype("S48", metadata={"ref": RegionReference}) + else: + raise ValueError("Ref class must be Reference or RegionReference") + + return dt + + raise TypeError(f'Unknown special type "{name}"') + + +def find_item_type(data): + """Find the item type of a simple object or collection of objects. + + E.g. [[['a']]] -> str + + The focus is on collections where all items have the same type; we'll return + None if that's not the case. + + The aim is to treat numpy arrays of Python objects like normal Python + collections, while treating arrays with specific dtypes differently. + We're also only interested in array-like collections - lists and tuples, + possibly nested - not things like sets or dicts. + """ + if isinstance(data, np.ndarray): + if ( + data.dtype.kind == 'O' and not check_dtype(vlen=data.dtype) + ): + item_types = {type(e) for e in data.flat} + else: + return None + elif isinstance(data, (list, tuple)): + item_types = {find_item_type(e) for e in data} + else: + return type(data) + + if len(item_types) != 1: + return None + return item_types.pop() + + +def guess_dtype(data): + """ Attempt to guess an appropriate dtype for the object, returning None + if nothing is appropriate (or if it should be left up the the array + constructor to figure out) + """ + + # todo - handle RegionReference, Reference + item_type = find_item_type(data) + if item_type is bytes: + return special_dtype(vlen=bytes) + if item_type is str: + return special_dtype(vlen=str) + + return None + + +def is_float16_dtype(dt): + if dt is None: + return False + + dt = np.dtype(dt) # normalize strings -> np.dtype objects + return dt.kind == 'f' and dt.itemsize == 2 + + +def check_dtype(**kwds): + """Check a dtype for h5py special type "hint" information. Only one + keyword may be given. + + vlen = dtype + If the dtype represents an HDF5 vlen, returns the Python base class. + Currently only builting string vlens (str) are supported. Returns + None if the dtype does not represent an HDF5 vlen. + + enum = dtype + If the dtype represents an HDF5 enumerated type, returns the dictionary + mapping string names to integer values. Returns None if the dtype does + not represent an HDF5 enumerated type. + + ref = dtype + If the dtype represents an HDF5 reference type, returns the reference + class (either Reference or RegionReference). Returns None if the dtype + does not represent an HDF5 reference type. + """ + + if len(kwds) != 1: + raise TypeError("Exactly one keyword may be provided") + + name, dt = kwds.popitem() + + if name not in ("vlen", "enum", "ref"): + raise TypeError('Unknown special type "%s"' % name) + + try: + return dt.metadata[name] + except TypeError: + return None + except KeyError: + return None def getTypeResponse(typeItem): """ Convert the given type item to a predefined type string for - predefined integer and floating point types ("H5T_STD_I64LE", et. al). - For compound types, recursively iterate through the typeItem and do same - conversion for fields of the compound type. - """ + predefined integer and floating point types ("H5T_STD_I64LE", et. al). + For compound types, recursively iterate through the typeItem and do + same conversion for fields of the compound type.""" response = None if "uuid" in typeItem: # committed type, just return uuid response = "datatypes/" + typeItem["uuid"] - elif typeItem["class"] == "H5T_INTEGER" or typeItem["class"] == "H5T_FLOAT": + elif typeItem["class"] in ("H5T_INTEGER", "H5T_FLOAT"): # just return the class and base for pre-defined types response = {} response["class"] = typeItem["class"] @@ -52,7 +272,7 @@ def getTypeResponse(typeItem): for field in typeItem["fields"]: fieldItem = {} fieldItem["name"] = field["name"] - fieldItem["type"] = getTypeResponse(field["type"]) # recursive call + fieldItem["type"] = getTypeResponse(field["type"]) # recurse call fieldList.append(fieldItem) response["fields"] = fieldList else: @@ -68,112 +288,12 @@ def getTypeResponse(typeItem): return response -def getItemSize(typeItem): - """ - Get size of an item in bytes. - For variable length types (e.g. variable length strings), - return the string "H5T_VARIABLE" +def getTypeItem(dt, metadata=None): """ - # handle the case where we are passed a primitive type first - if isinstance(typeItem, bytes): - typeItem = typeItem.decode("ascii") - if isinstance(typeItem, str): - for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): - if typeItem.startswith(type_prefix): - num_bits = typeItem[len(type_prefix) :] - if num_bits[-2:] in ("LE", "BE"): - num_bits = num_bits[:-2] - try: - return int(num_bits) // 8 - except ValueError: - raise TypeError("Invalid Type") - # none of the expect primative types mathched - raise TypeError("Invalid Type") - if not isinstance(typeItem, dict): - raise TypeError("invalid type") - - item_size = 0 - if "class" not in typeItem: - raise KeyError("'class' not provided") - typeClass = typeItem["class"] - - if typeClass == "H5T_INTEGER": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_FLOAT": - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_STRING": - if "length" not in typeItem: - raise KeyError("'length' not provided") - item_size = typeItem["length"] - - elif typeClass == "H5T_VLEN": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_OPAQUE": - if "size" not in typeItem: - raise KeyError("'size' not provided") - item_size = int(typeItem["size"]) - - elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: - raise KeyError("'dims' must be provided for array types") - if "base" not in typeItem: - raise KeyError("'base' not provided") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_ENUM": - if "base" not in typeItem: - raise KeyError("'base' must be provided for enum types") - item_size = getItemSize(typeItem["base"]) - - elif typeClass == "H5T_REFERENCE": - item_size = "H5T_VARIABLE" - elif typeClass == "H5T_COMPOUND": - if "fields" not in typeItem: - raise KeyError("'fields' not provided for compound type") - fields = typeItem["fields"] - if type(fields) is not list: - raise TypeError("Type Error: expected list type for 'fields'") - if not fields: - raise KeyError("no 'field' elements provided") - # add up the size of each sub-field - for field in fields: - if not isinstance(field, dict): - raise TypeError("Expected dictionary type for field") - if "type" not in field: - raise KeyError("'type' missing from field") - subtype_size = getItemSize(field["type"]) # recursive call - if subtype_size == "H5T_VARIABLE": - item_size = "H5T_VARIABLE" - break # don't need to look at the rest - - item_size += subtype_size - else: - raise TypeError("Invalid type class") - - # calculate array type - if "dims" in typeItem and type(item_size) is int: - dims = typeItem["dims"] - for dim in dims: - item_size *= dim - - return item_size - - -""" Return type info. For primitive types, return string with typename For compound types return array of dictionary items -""" - - -def getTypeItem(dt): - + """ predefined_int_types = { "int8": "H5T_STD_I8", "uint8": "H5T_STD_U8", @@ -184,10 +304,19 @@ def getTypeItem(dt): "int64": "H5T_STD_I64", "uint64": "H5T_STD_U64", } - predefined_float_types = {"float32": "H5T_IEEE_F32", "float64": "H5T_IEEE_F64"} + predefined_float_types = { + "float16": "H5T_IEEE_F16", + "float32": "H5T_IEEE_F32", + "float64": "H5T_IEEE_F64", + } + + dt = np.dtype(dt) # convert 'int32', np.int32, etc. to a dtype + + if not metadata and dt.metadata: + metadata = dt.metadata type_info = {} - if len(dt) > 1 or dt.names: + if len(dt): # compound type names = dt.names type_info["class"] = "H5T_COMPOUND" @@ -204,15 +333,22 @@ def getTypeItem(dt): # array type type_info["dims"] = dt.shape type_info["class"] = "H5T_ARRAY" - type_info["base"] = getTypeItem(dt.base) + type_info["base"] = getTypeItem(dt.base, metadata=metadata) elif dt.kind == "O": # vlen string or data # # check for h5py variable length extension - vlen_check = check_dtype(vlen=dt.base) - if vlen_check is not None and not isinstance(vlen_check, np.dtype): - vlen_check = np.dtype(vlen_check) - ref_check = check_dtype(ref=dt.base) + vlen_check = None + if metadata and "vlen" in metadata: + vlen_check = metadata["vlen"] + if vlen_check is not None and not isinstance(vlen_check, np.dtype): + vlen_check = np.dtype(vlen_check) + + if metadata and "ref" in metadata: + ref_check = metadata["ref"] + else: + ref_check = check_dtype(ref=dt.base) + if vlen_check == bytes: type_info["class"] = "H5T_STRING" type_info["length"] = "H5T_VARIABLE" @@ -229,15 +365,15 @@ def getTypeItem(dt): type_info["size"] = "H5T_VARIABLE" type_info["base"] = getTypeItem(vlen_check) elif vlen_check is not None: - # unknown vlen type + # unknown vlen type raise TypeError("Unknown h5py vlen type: " + str(vlen_check)) elif ref_check is not None: # a reference type type_info["class"] = "H5T_REFERENCE" - if ref_check is Reference: + if ref_check.__name__ == "Reference": type_info["base"] = "H5T_STD_REF_OBJ" # objref - elif ref_check is RegionReference: + elif ref_check.__name__ == "RegionReference": type_info["base"] = "H5T_STD_REF_DSETREG" # region ref else: raise TypeError("unexpected reference type") @@ -249,14 +385,40 @@ def getTypeItem(dt): type_info["size"] = dt.itemsize type_info["tag"] = "" # todo - determine tag elif dt.base.kind == "S": - # Fixed length string type - type_info["class"] = "H5T_STRING" - type_info["charSet"] = "H5T_CSET_ASCII" + # check for object reference + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + # a reference type + type_info["class"] = "H5T_REFERENCE" + + if ref_check is Reference: + type_info["base"] = "H5T_STD_REF_OBJ" # objref + elif ref_check is RegionReference: + type_info["base"] = "H5T_STD_REF_DSETREG" # region ref + else: + raise TypeError("unexpected reference type") + else: + # Fixed length string type + type_info["class"] = "H5T_STRING" type_info["length"] = dt.itemsize + type_info["charSet"] = "H5T_CSET_ASCII" type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.base.kind == "U": # Fixed length unicode type - raise TypeError("Fixed length unicode type is not supported") + ref_check = check_dtype(ref=dt.base) + if ref_check is not None: + raise TypeError("unexpected reference type") + + # Fixed length string type with unicode support + type_info["class"] = "H5T_STRING" + + # this can be problematic if the encoding of the string is not valid, + # or reqires too many bytes. Use variable length strings to handle all + # UTF8 strings correctly + type_info["charSet"] = "H5T_CSET_UTF8" + # convert from UTF32 length to a fixed length + type_info["length"] = dt.itemsize + type_info["strPad"] = "H5T_STR_NULLPAD" elif dt.kind == "b": # boolean type - h5py stores as enum @@ -265,13 +427,14 @@ def getTypeItem(dt): if dt.base.byteorder == ">": byteorder = "BE" # this mapping is an h5py convention for boolean support - members = [{"name": "FALSE", "value": 0}, {"name": "TRUE", "value": 1}] + bool_false = {"name": "FALSE", "value": 0} + bool_true = {"name": "TRUE", "value": 1} + members = [bool_false, bool_true] type_info["class"] = "H5T_ENUM" type_info["members"] = members base_info = {"class": "H5T_INTEGER"} base_info["base"] = "H5T_STD_I8" + byteorder type_info["base"] = base_info - elif dt.kind == "f": # floating point type type_info["class"] = "H5T_FLOAT" @@ -280,7 +443,8 @@ def getTypeItem(dt): byteorder = "BE" if dt.name in predefined_float_types: # maps to one of the HDF5 predefined types - type_info["base"] = predefined_float_types[dt.base.name] + byteorder + float_type = predefined_float_types[dt.base.name] + type_info["base"] = float_type + byteorder else: raise TypeError("Unexpected floating point type: " + dt.name) elif dt.kind == "i" or dt.kind == "u": @@ -291,14 +455,18 @@ def getTypeItem(dt): if dt.base.byteorder == ">": byteorder = "BE" - # numpy integer type - but check to see if this is the h5py + # numpy integer type - but check to see if this is the hypy # enum extension - mapping = check_dtype(enum=dt) - - if mapping: + if metadata and "enum" in metadata: # yes, this is an enum! + mapping = metadata["enum"] type_info["class"] = "H5T_ENUM" - type_info["members"] = [{"name": n, "value": v} for n, v in mapping.items()] + members = [] + for name in mapping: + value = mapping[name] + item = {"name": name, "value": value} + members.append(item) + type_info["members"] = members if dt.name not in predefined_int_types: raise TypeError("Unexpected integer type: " + dt.name) # maps to one of the HDF5 predefined types @@ -316,11 +484,174 @@ def getTypeItem(dt): else: # unexpected kind - raise TypeError("unexpected dtype kind: " + dt.kind) + raise TypeError(f"unexpected dtype kind: {dt.kind}") return type_info +def isVlen(dt): + """ + Return True if the type contains variable length elements + """ + is_vlen = False + if len(dt): + names = dt.names + for name in names: + if isVlen(dt[name]): + is_vlen = True + break + else: + if dt.base.metadata and "vlen" in dt.base.metadata: + is_vlen = True + return is_vlen + + +def isOpaqueDtype(dt): + """ + Return True if this is an opaque dtype + """ + if dt.kind == "V" and len(dt) == 0 and len(dt.shape) == 0 and not dt.names: + return True + if dt.metadata and dt.metadata.get('h5py_opaque'): + return True + return False + + +def getItemSize(typeItem): + """ + Get size of an item in bytes. + For variable length types (e.g. variable length strings), + return the string "H5T_VARIABLE" + """ + # handle the case where we are passed a primitive type first + if isinstance(typeItem, str) or isinstance(typeItem, bytes): + for type_prefix in ("H5T_STD_I", "H5T_STD_U", "H5T_IEEE_F"): + if typeItem.startswith(type_prefix): + nlen = len(type_prefix) + num_bits = typeItem[nlen:] + if num_bits[-2:] in ("LE", "BE"): + num_bits = num_bits[:-2] + try: + return int(num_bits) // 8 + except ValueError: + raise TypeError("Invalid Type") + # none of the expect primative types mathched + raise TypeError("Invalid Type") + if not isinstance(typeItem, dict): + raise TypeError("invalid type") + + item_size = 0 + if "class" not in typeItem: + raise KeyError("'class' not provided") + typeClass = typeItem["class"] + + if typeClass == "H5T_INTEGER": + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_FLOAT": + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_STRING": + if "length" not in typeItem: + raise KeyError("'length' not provided") + item_size = typeItem["length"] + + elif typeClass == "H5T_VLEN": + item_size = "H5T_VARIABLE" + elif typeClass == "H5T_OPAQUE": + if "size" not in typeItem: + raise KeyError("'size' not provided") + item_size = int(typeItem["size"]) + + elif typeClass == "H5T_ARRAY": + if "dims" not in typeItem: + raise KeyError("'dims' must be provided for array types") + if "base" not in typeItem: + raise KeyError("'base' not provided") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_ENUM": + if "base" not in typeItem: + raise KeyError("'base' must be provided for enum types") + item_size = getItemSize(typeItem["base"]) + + elif typeClass == "H5T_REFERENCE": + if "length" in typeItem: + item_size = typeItem["length"] + elif "base" in typeItem and typeItem["base"] == "H5T_STD_REF_OBJ": + # obj ref values are in the form: "groups/" or + # "datasets/" or "datatypes/" + item_size = 48 + else: + item_size = 80 # tb: just take a guess at this for now + elif typeClass == "H5T_COMPOUND": + if "fields" not in typeItem: + raise KeyError("'fields' not provided for compound type") + fields = typeItem["fields"] + if not isinstance(fields, list): + raise TypeError("Type Error: expected list type for 'fields'") + if not fields: + raise KeyError("no 'field' elements provided") + # add up the size of each sub-field + for field in fields: + if not isinstance(field, dict): + raise TypeError("Expected dictionary type for field") + if "type" not in field: + raise KeyError("'type' missing from field") + subtype_size = getItemSize(field["type"]) # recursive call + if subtype_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" + break # don't need to look at the rest + + item_size += subtype_size + else: + raise TypeError("Invalid type class") + + # calculate array type + if "dims" in typeItem and isinstance(item_size, int): + dims = typeItem["dims"] + for dim in dims: + item_size *= dim + + return item_size + + +def getDtypeItemSize(dtype): + """ Return size of dtype in bytes + For variable length types (e.g. variable length strings), + return the string "H5T_VARIABLE + """ + item_size = 0 + if len(dtype): + # compound dtype + for i in range(len(dtype)): + sub_dt = dtype[i] + sub_dt_size = getDtypeItemSize(sub_dt) + if sub_dt_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" # return variable if any component is variable + break + item_size += sub_dt_size + else: + # primitive type + if dtype.shape: + base_size = getDtypeItemSize(dtype.base) + if base_size == "H5T_VARIABLE": + item_size = "H5T_VARIABLE" + else: + nelements = np.prod(dtype.shape) + item_size = base_size * nelements + else: + if dtype.metadata and "vlen" in dtype.metadata: + item_size = "H5T_VARIABLE" + else: + item_size = dtype.itemsize + return item_size + + def getNumpyTypename(hdf5TypeName, typeClass=None): predefined_int_types = { "H5T_STD_I8": "i1", @@ -332,7 +663,11 @@ def getNumpyTypename(hdf5TypeName, typeClass=None): "H5T_STD_I64": "i8", "H5T_STD_U64": "u8", } - predefined_float_types = {"H5T_IEEE_F32": "f4", "H5T_IEEE_F64": "f8"} + predefined_float_types = { + "H5T_IEEE_F16": "f2", + "H5T_IEEE_F32": "f4", + "H5T_IEEE_F64": "f8", + } if len(hdf5TypeName) < 3: raise Exception("Type Error: invalid typename: ") @@ -356,7 +691,6 @@ def getNumpyTypename(hdf5TypeName, typeClass=None): def createBaseDataType(typeItem): - dtRet = None if isinstance(typeItem, str): # should be one of the predefined types @@ -371,20 +705,32 @@ def createBaseDataType(typeItem): raise KeyError("'class' not provided") typeClass = typeItem["class"] + dims = "" + if "dims" in typeItem: + if typeClass != "H5T_ARRAY": + raise TypeError("'dims' only supported for integer types") + + dims = None + if isinstance(typeItem["dims"], int): + dims = typeItem["dims"] # make into a tuple + elif not isinstance(typeItem["dims"], list) and not isinstance( + typeItem["dims"], tuple + ): + raise TypeError("expected list or integer for dims") + else: + dims = typeItem["dims"] + dims = str(tuple(dims)) + if typeClass == "H5T_INTEGER": if "base" not in typeItem: raise KeyError("'base' not provided") - if "dims" in typeItem: - raise TypeError("'dims' not supported for integer types") baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_INTEGER") - dtRet = np.dtype(baseType) + dtRet = np.dtype(dims + baseType) elif typeClass == "H5T_FLOAT": if "base" not in typeItem: raise KeyError("'base' not provided") - if "dims" in typeItem: - raise TypeError("'dims' not supported for floating point types") baseType = getNumpyTypename(typeItem["base"], typeClass="H5T_FLOAT") - dtRet = np.dtype(baseType) + dtRet = np.dtype(dims + baseType) elif typeClass == "H5T_STRING": if "length" not in typeItem: raise KeyError("'length' not provided") @@ -392,8 +738,9 @@ def createBaseDataType(typeItem): raise KeyError("'charSet' not provided") if typeItem["length"] == "H5T_VARIABLE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for variable types") + if dims: + msg = "ArrayType is not supported for variable len types" + raise TypeError(msg) if typeItem["charSet"] == "H5T_CSET_ASCII": dtRet = special_dtype(vlen=bytes) elif typeItem["charSet"] == "H5T_CSET_UTF8": @@ -408,20 +755,25 @@ def createBaseDataType(typeItem): if typeItem["charSet"] == "H5T_CSET_ASCII": type_code = "S" elif typeItem["charSet"] == "H5T_CSET_UTF8": - raise TypeError("fixed-width unicode strings are not supported") + # use the same type_code as ascii strings + # (otherwise, numpy will reserve bytes for UTF32 representation) + type_code = "S" else: raise TypeError("unexpected 'charSet' value") - dtRet = np.dtype(type_code + str(nStrSize)) # fixed size string + # a fixed size string + dtRet = np.dtype(dims + type_code + str(nStrSize)) elif typeClass == "H5T_VLEN": - if "dims" in typeItem: - raise TypeError("'dims' not supported for vlen types") + if dims: + msg = "ArrayType is not supported for variable len types" + raise TypeError(msg) if "base" not in typeItem: raise KeyError("'base' not provided") baseType = createBaseDataType(typeItem["base"]) dtRet = special_dtype(vlen=np.dtype(baseType)) elif typeClass == "H5T_OPAQUE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for opaque types") + if dims: + msg = "Opaque Type is not supported for variable len types" + raise TypeError(msg) if "size" not in typeItem: raise KeyError("'size' not provided") nSize = int(typeItem["size"]) @@ -429,26 +781,19 @@ def createBaseDataType(typeItem): raise TypeError("'size' must be non-negative") dtRet = np.dtype("V" + str(nSize)) elif typeClass == "H5T_ARRAY": - if "dims" not in typeItem: + if not dims: raise KeyError("'dims' must be provided for array types") if "base" not in typeItem: raise KeyError("'base' not provided") arrayBaseType = typeItem["base"] - if type(arrayBaseType) is dict: + if isinstance(arrayBaseType, dict): if "class" not in arrayBaseType: raise KeyError("'class' not provided for array base type") - if arrayBaseType["class"] not in ( - "H5T_INTEGER", - "H5T_FLOAT", - "H5T_STRING", - "H5T_COMPOUND", - ): - raise TypeError( - f"{arrayBaseType['class']}: H5T_ARRAY base type not supported." - ) - - dt_base = createDataType(arrayBaseType) - + type_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_COMPOUND", "H5T_ARRAY") + if arrayBaseType["class"] not in type_classes: + msg = "Array Type base type must be integer, float, string, compound or array" + raise TypeError(msg) + baseType = createDataType(arrayBaseType) if isinstance(typeItem["dims"], int): dims = typeItem["dims"] # make into a tuple elif type(typeItem["dims"]) not in (list, tuple): @@ -457,11 +802,17 @@ def createBaseDataType(typeItem): dims = typeItem["dims"] # create an array type of the base type - dtRet = np.dtype((dt_base, dims)) - + dtRet = np.dtype((baseType, dims)) + """ + metadata = None + if baseType.metadata: + metadata = dict(baseType.metadata) + dtRet = np.dtype(dims + baseType.str, metadata=metadata) + else: + dtRet = np.dtype(dims + baseType.str) + return dtRet # return predefined type + """ elif typeClass == "H5T_REFERENCE": - if "dims" in typeItem: - raise TypeError("'dims' not supported for reference types") if "base" not in typeItem: raise KeyError("'base' not provided") if typeItem["base"] == "H5T_STD_REF_OBJ": @@ -470,6 +821,7 @@ def createBaseDataType(typeItem): dtRet = special_dtype(ref=RegionReference) else: raise TypeError("Invalid base type for reference type") + elif typeClass == "H5T_ENUM": if "base" not in typeItem: raise KeyError("Expected 'base' to be provided for enum type") @@ -477,21 +829,36 @@ def createBaseDataType(typeItem): if "class" not in base_json: raise KeyError("Expected class field in base type") if base_json["class"] != "H5T_INTEGER": - raise TypeError("Only integer base types can be used with enum type") - if "members" not in typeItem: - raise KeyError("'members' not provided for enum type") - members = typeItem["members"] - if len(members) == 0: - raise KeyError("empty enum members") + msg = "Only integer base types can be used with enum type" + raise TypeError(msg) + if "mapping" in typeItem: + mapping = typeItem["mapping"] + elif "members" in typeItem: + mapping = typeItem["members"] # backward-compatibility for hdf5-json + else: + raise KeyError("'mapping' not provided for enum type") + + if len(mapping) == 0: + raise KeyError("empty enum map") dt = createBaseDataType(base_json) - values_dict = dict((m["name"], m["value"]) for m in members) - if ( - dt.kind == "i" - and dt.name == "int8" - and len(members) == 2 - and "TRUE" in values_dict - and "FALSE" in values_dict + if isinstance(mapping, list): + # convert to a dictionary + values_dict = dict((m["name"], m["value"]) for m in mapping) + elif isinstance(mapping, dict): + # just use as is + values_dict = mapping + else: + raise TypeError("Expected dict or list mapping for enum type") + + if all( + ( + dt.kind == "i", + dt.name == "int8", + len(mapping) == 2, + "TRUE" in values_dict, + "FALSE" in values_dict, + ) ): # convert to numpy boolean type dtRet = np.dtype("bool") @@ -505,14 +872,12 @@ def createBaseDataType(typeItem): return dtRet -""" -Create a numpy datatype given a json type -""" - - def createDataType(typeItem): + """ + Create a numpy datatype given a json type + """ dtRet = None - if isinstance(typeItem, (str, bytes)): + if type(typeItem) in (str, bytes): # should be one of the predefined types dtName = getNumpyTypename(typeItem) dtRet = np.dtype(dtName) @@ -543,20 +908,90 @@ def createDataType(typeItem): if "type" not in field: raise KeyError("'type' missing from field") field_name = field["name"] - if isinstance(field_name, str): - # verify the field name is ascii - try: - field_name.encode("ascii") - except UnicodeDecodeError: - raise TypeError("non-ascii field name not allowed") + if not isinstance(field_name, str): + raise TypeError("field names must be strings") + # verify the field name is ascii + try: + field_name.encode("ascii") + except UnicodeEncodeError: + raise TypeError("non-ascii field name not allowed") dt = createDataType(field["type"]) # recursive call if dt is None: raise Exception("unexpected error") - subtypes.append((field_name, dt)) # append tuple + subtypes.append((field["name"], dt)) # append tuple dtRet = np.dtype(subtypes) - else: dtRet = createBaseDataType(typeItem) # create non-compound dt return dtRet + + +def validateTypeItem(typeItem): + """ + Validate a json type - call createDataType and if no exception, + it's valid + """ + createDataType(typeItem) + # throws KeyError, TypeError, or ValueError + + +def getBaseTypeJson(type_name): + """ + Return JSON representation of a predefined type string + """ + predefined_int_types = ( + "H5T_STD_I8", + "H5T_STD_U8", + "H5T_STD_I16", + "H5T_STD_U16", + "H5T_STD_I32", + "H5T_STD_U32", + "H5T_STD_I64", + "H5T_STD_U64", + ) + predefined_float_types = ("H5T_IEEE_F16", "H5T_IEEE_F32", "H5T_IEEE_F64") + type_json = {} + # predefined typenames start with 'H5T' and end with "LE" or "BE" + if all( + ( + type_name.startswith("H5T_"), + type_name[-1] == "E", + type_name[-2] in ("L", "B"), + ) + ): + # trime of the "BE/"LE" + type_prefix = type_name[:-2] + if type_prefix in predefined_int_types: + type_json["class"] = "H5T_INTEGER" + type_json["base"] = type_name + elif type_prefix in predefined_float_types: + type_json["class"] = "H5T_FLOAT" + type_json["base"] = type_name + else: + raise TypeError("Invalid type name") + else: + raise TypeError("Invalid type name") + return type_json + + +def getSubType(dt_parent, fields): + """ Return a dtype that is a compound type composed of + the fields given in the field_names list + """ + if len(dt_parent) == 0: + raise TypeError("getSubType - parent must be compound type") + if not fields: + raise TypeError("null field specification") + if isinstance(fields, str): + fields = [fields,] # convert to a list + + field_names = set(dt_parent.names) + dt_items = [] + for field in fields: + if field not in field_names: + raise TypeError(f"field: {field} is not defined in parent type") + dt_items.append((field, dt_parent[field])) + dt = np.dtype(dt_items) + + return dt diff --git a/src/h5json/hsdsstore/hsds_reader.py b/src/h5json/hsdsstore/hsds_reader.py new file mode 100644 index 0000000..55a8c02 --- /dev/null +++ b/src/h5json/hsdsstore/hsds_reader.py @@ -0,0 +1,312 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import logging + +from ..objid import getCollectionForId, getUuidFromId + +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray, bytesToArray +from .. import selections +from ..h5reader import H5Reader +from .httpconn import HttpConn + + +class HSDSReader(H5Reader): + """ + This class can be used by HDF5DB to read content from an hdf5-json file + """ + + def __init__( + self, + domain_path, + app_logger=None, + endpoint=None, + username=None, + password=None, + bucket=None, + api_key=None, + use_session=True, + expire_time=0, + max_objects=0, + max_age=0, + retries=3, + timeout=30.0, + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + self.log.debug("HSDSReader init(") + + kwargs = {} + self.log.debug(f" domain_path: {domain_path}") + if endpoint: + self.log.debug(f" endpoint: {endpoint}") + kwargs["endpoint"] = endpoint + if username: + self.log.debug(f" username: {username}") + kwargs["username"] = username + if password: + self.log.debug(f" password: {'*' * len(password)}") + kwargs["password"] = password + if bucket: + self.log.debug(f" bucket: {bucket}") + kwargs["bucket"] = bucket + if api_key: + self.log.debug(f" apI_key: {'*' * len(api_key)}") + kwargs["api_key"] = api_key + if use_session: + self.log.debug(f" use_session: {use_session}") + kwargs["user_session"] = use_session + + if expire_time: + self.log.debug(f" expire_time: {expire_time}") + kwargs["expire_time"] = expire_time + if max_objects: + self.log.debug(f" max_objects: {max_objects}") + kwargs["max_objects"] = max_objects + if max_age: + self.log.debug(f" max_age: {max_age}") + kwargs["max_age"] = max_age + if retries: + self.log.debug(f" retries: {retries}") + kwargs["retries"] = retries + if timeout: + self.log.debug(f" timeout: {timeout}") + kwargs["timeout"] = timeout + # save these for when we create the connection + self._http_kwargs = kwargs + self._http_conn = None + + super().__init__(domain_path, app_logger=app_logger) + + def open(self): + if self._http_conn: + return # open already called + + kwargs = self._http_kwargs + http_conn = HttpConn(self.filepath, **kwargs) + + hsds_info = http_conn.serverInfo() + self.log.debug(f"got hsds info: {hsds_info}") + + # try to do a GET from the domain + req = "/" + params = {} + """ + if max_objects is None or max_objects > 0: + # get object meta objects + # TBD: have hsds support a max limit of objects to return + params["getobjs"] = 1 + params["include_attrs"] = 1 + params["include_links"] = 1 + """ + + rsp = http_conn.GET(req, params=params) + + if rsp.status_code != 200: + # file must exist + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + + domain_json = rsp.json() + self.log.debug(f"got domain_json: {domain_json}") + + if "root" not in domain_json: + http_conn.close() + raise IOError(404, "Location is a folder, not a file") + + root_id = domain_json["root"] + self._root_id = root_id + + """ + if "domain_objs" in root_json: + domain_objs = root_json["domain_objs"] + objdb.load(domain_objs) + """ + if "limits" in domain_json: + self._limits = domain_json["limits"] + else: + self._limits = None + if "version" in domain_json: + self._version = domain_json["version"] + else: + self._version = None + + self._http_conn = http_conn + self._domain_json = domain_json + + return self._root_id + + @property + def http_conn(self): + return self._http_conn + + def close(self): + if self._http_conn: + self._http_conn.close() + + def isClosed(self): + if self._http_conn: + return False + else: + return True + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_values=False): + """ return object with given id """ + + collection = getCollectionForId(obj_id) + + req = f"/{collection}/{obj_id}" + self.log.debug("sending req: {req}") + + params = {} + if include_attrs: + params["include_attrs"] = 1 + if include_links: + params["include_links"] = 1 + + rsp = self.http_conn.GET(req, params=params) + + if rsp.status_code != 200: + raise IOError(rsp.status_code, rsp.reason) + + obj_json = rsp.json() + # remove any unneeded keys + redundant_keys = ("hrefs", "root", "domain", "bucket", "linkCount", "attributeCount") + for key in redundant_keys: + if key in obj_json: + del obj_json[key] + + self.log.debug(f"got json for id: {obj_id}: {obj_json}") + return obj_json + + def getAttribute(self, obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + self.log.debug(f"getAttribute({obj_id}), [{name}], include_data={includeData})") + collection = getCollectionForId(obj_id) + req = f"/{collection}/{obj_id}/attributes/{name}" + + params = {} + params["IncludeData"] = 1 if includeData else 0 + + rsp = self.http_conn.GET(req, params=params) + + if rsp.status_code in (404, 410): + self.log.warning(f"attribute {name} not found") + return None + + if rsp.status_code != 200: + self.log.error(f"GET {req} failed with status_code: {rsp.status_code}") + raise IOError(rsp.status_code, rsp.reason) + attr_json = rsp.json() + + if "hrefs" in attr_json: + del attr_json["hrefs"] + + return attr_json + + def getDtype(self, obj_json): + """ Return the dtype for the type given by obj_json """ + if "type" not in obj_json: + raise KeyError("no type item found") + type_item = obj_json["type"] + if isinstance(type_item, str) and type_item.startswith("datatypes/"): + # this is a reference to a committed type + ctype_id = "t-" + getUuidFromId(type_item) + ctype_json = self.getObjectById(ctype_id) + if "type" not in ctype_json: + raise KeyError(f"Unexpected datatype: {ctype_json}") + # Use the ctype's item json + type_item = ctype_json["type"] + dtype = createDataType(type_item) + return dtype + + def getDatasetValues(self, dset_id, sel=None, dtype=None): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + + self.log.debug(f"getDatasetValues({dset_id}), sel={sel}") + collection = getCollectionForId(dset_id) + if collection != "datasets": + msg = f"unexpected id: {dset_id} for getDatasetValues" + self.log.warning(msg) + return ValueError(msg) + + if sel is None or sel.select_type == selections.H5S_SELECT_ALL: + query_param = None # just return the entire array + elif isinstance(sel, (selections.SimpleSelection, selections.FancySelection)): + query_param = sel.getQueryParam() + else: + raise NotImplementedError(f"selection type: {type(sel)} not supported") + + mtype = dtype # TBD - support read time dtype + mshape = sel.mshape + + req = f"/{collection}/{dset_id}/value" + params = {} + + if query_param: + params["select"] = query_param + + if mtype.names != dtype.names: + params["fields"] = ":".join(mtype.names) + + MAX_SELECT_QUERY_LEN = 100 + if len(query_param) > MAX_SELECT_QUERY_LEN: + # use a post method to avoid possible long query strings + try: + rsp = self.http_conn.POST(req, body=params, format="binary") + except IOError as ioe: + self.log.info(f"got IOError: {ioe.errno}") + raise IOError(f"Error retrieving data: {ioe.errno}") + else: + # make a http GET + try: + rsp = self.http_conn.GET(req, params=params, format="binary") + except IOError as ioe: + self.log.info(f"got IOError: {ioe.errno}") + raise IOError(ioe.errno, "Error retrieving data") + + if rsp.status_code != 200: + self.log.info(f"got http error: {rsp.status_code}") + raise IOError(rsp.status_code, "Error retrieving data") + + if rsp.is_binary: + # got binary response + self.log.info(f"binary response, {len(rsp.text)} bytes") + arr = bytesToArray(rsp.text, mtype, mshape) + else: + # got JSON response + # need some special conversion for compound types -- + # each element must be a tuple, but the JSON decoder + # gives us a list instead. + self.log.info("json response") + + data = rsp.json()["value"] + # self.log.debug(data) + + arr = jsonToArray(mshape, mtype, data) + self.log.debug(f"jsonToArray returned: {arr}") + + return arr diff --git a/src/h5json/hsdsstore/hsds_writer.py b/src/h5json/hsdsstore/hsds_writer.py new file mode 100644 index 0000000..c4a7c39 --- /dev/null +++ b/src/h5json/hsdsstore/hsds_writer.py @@ -0,0 +1,322 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import logging +import time + +from ..objid import getCollectionForId, getUuidFromId + +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray, bytesToArray +from .. import selections +from ..h5writer import H5Writer +from .httpconn import HttpConn + + +class HSDSWriter(H5Writer): + """ + This class can be used by HDF5DB to read content from an hdf5-json file + """ + + def __init__( + self, + domain_path, + append=False, + no_data=False, + app_logger=None, + endpoint=None, + username=None, + password=None, + bucket=None, + api_key=None, + use_session=True, + expire_time=0, + max_objects=0, + max_age=0, + retries=3, + timeout=30.0, + track_order=False, + owner=None, + linked_domain=None + + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + if append: + self._init = False + else: + self._init = True + + if no_data: + self._no_data = True + else: + self._no_data = False + + self.log.debug("HSDSWriter init") + + kwargs = {} + self.log.debug(f" domain_path: {domain_path}") + self.log.debug(f" append: {append}") + if endpoint: + self.log.debug(f" endpoint: {endpoint}") + kwargs["endpoint"] = endpoint + if username: + self.log.debug(f" username: {username}") + kwargs["username"] = username + if password: + self.log.debug(f" password: {'*' * len(password)}") + kwargs["password"] = password + if bucket: + self.log.debug(f" bucket: {bucket}") + kwargs["bucket"] = bucket + if api_key: + self.log.debug(f" apI_key: {'*' * len(api_key)}") + kwargs["api_key"] = api_key + if use_session: + self.log.debug(f" use_session: {use_session}") + kwargs["user_session"] = use_session + if expire_time: + self.log.debug(f" expire_time: {expire_time}") + kwargs["expire_time"] = expire_time + if max_objects: + self.log.debug(f" max_objects: {max_objects}") + kwargs["max_objects"] = max_objects + if max_age: + self.log.debug(f" max_age: {max_age}") + kwargs["max_age"] = max_age + if retries: + self.log.debug(f" retries: {retries}") + kwargs["retries"] = retries + if timeout: + self.log.debug(f" timeout: {timeout}") + kwargs["timeout"] = timeout + self._http_kwargs = kwargs # save for when we create the connection + + super().__init__(domain_path, app_logger=app_logger) + + self._http_conn = None + self._root_id = None + self._append = append + self._owner = owner + self._track_order = track_order + self._linked_domain = linked_domain + self._domain_json = None + self._last_flush_time = 0 + + def open(self): + """ setup domain for writing """ + + if self._http_conn: + http_conn = self._http_conn + else: + kwargs = self._http_kwargs + http_conn = HttpConn(self.filepath, **kwargs) + if self._append: + http_conn._mode = "a" + self._http_conn = http_conn + hsds_info = http_conn.serverInfo() + self.log.debug(f"got hsds info: {hsds_info}") + + if not self._domain_json: + # haven't fetched the domain json yet, do it now + + # try to do a GET from the domain + req = "/" + params = {} + """ + if max_objects is None or max_objects > 0: + # get object meta objects + # TBD: have hsds support a max limit of objects to return + params["getobjs"] = 1 + params["include_attrs"] = 1 + params["include_links"] = 1 + """ + + domain_json = None + rsp = http_conn.GET(req, params=params) + + if rsp.status_code not in (200, 404, 410): + msg = f"Got status code: {rsp.status_code} on initial domain get" + self.log.warning(msg) + raise IOError(msg) + + if rsp.status_code == 200: + if self._append: + # domain exists already + domain_json = rsp.json() + if "root" not in domain_json: + # this a folder not a domain + self.log.warning(f"folder: {self.filepath} has no root property") + http_conn.close() + raise IOError(404, "Location is a folder, not a file") + else: + # not append - delete existing domain + self.log.info(f"sending delete request for {self.filepath}") + delete_rsp = http_conn.DELETE(req, params=params) + if delete_rsp.status_code not in (200, 410): + # failed to delete + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + + if not domain_json: + # domain doesn't exist, create it + body = {} + if self.db.root_id: + # initialize domain using the db's root_id + body["root_id"] = self.db.root_id + if self._owner: + body["owner"] = self._owner + if self._linked_domain: + body["linked_domain"] = self._linked_domain + if self._track_order: + create_props = {"CreateOrder": 1} + group_body = {"creationProperties": create_props} + body["group"] = group_body + rsp = http_conn.PUT(req, params=params, body=body) + if rsp.status_code != 201: + http_conn.close() + raise IOError(rsp.status_code, rsp.reason) + domain_json = rsp.json() + self.log.info(f"got rsp on PUT domain: {domain_json}") + if "root" not in domain_json: + http_conn.close() + raise IOError(404, "Unexpected error") + + self.log.debug(f"got domain_json: {domain_json}") + + if "root" not in domain_json: + http_conn.close() + raise IOError(404, "Location is a folder, not a file") + + root_id = domain_json["root"] + + self._root_id = root_id + + if "limits" in domain_json: + self._limits = domain_json["limits"] + else: + self._limits = None + if "version" in domain_json: + self._version = domain_json["version"] + else: + self._version = None + + self._domain_json = domain_json + + return self._root_id + + @property + def http_conn(self): + return self._http_conn + + def createObjects(self, obj_ids): + MAX_OBJECTS_PER_REQUEST = 1 + collections = ("groups", "datasets", "datatypes") + col_items = {} + for collection in collections: + col_items[collection] = [] + + for obj_id in obj_ids: + if obj_id == self._root_id: + continue # this was created when the domain was + collection = getCollectionForId(obj_id) + obj_json = self.db.getObjectById(obj_id) + item = {"id": obj_id} + for key in ("links", "attributes"): + if key in obj_json: + item[key] = obj_json[key] + items = col_items[collection] + items.append(item) + if len(items) == MAX_OBJECTS_PER_REQUEST: + print("items:", items) + post_rsp = self.http_conn.POST("/" + collection, items) + print("post_rsp.status_code:", post_rsp.status_code) + if post_rsp.is_json: + print("post_rsp.json:", post_rsp.json()) + items.clear() + + # handle any remainder items + for collection in collections: + items = col_items[collection] + if items: + self.http_conn.POST("/" + collection, items) + + def updateLinks(self, grp_ids): + """ update any modified links of the given objects """ + + print("updateLinks:", grp_ids) + body = {} # body will hold a map of grp ids to link lists + + for grp_id in grp_ids: + if getCollectionForId(grp_id) != "groups": + continue # ignore datasets and datatypes + grp_json = self.db.getObjectById(grp_id) + grp_links = grp_json["links"] + print(f"grp_id {grp_id} links: {grp_links}") + for link_json in grp_links: + if "created" not in link_json: + self.log.error(f"hsds_writer> expected created timestamp in link: {link_json}") + created = link_json["created"] + if created > self._last_flush_time: + # new link, add to our list + if grp_id not in body: + body[grp_id] = {} + + if body: + print("updateLinks, body:", body) + + def flush(self): + """ Write dirty items """ + + if not self.db: + # no db set yet + return False + self.log.info("hsds_writer.flush()") + self.log.debug(f" new object count: {len(self.db.new_objects)}") + self.log.debug(f" dirty object count: {len(self.db.dirty_objects)}") + self.log.debug(f" deleted object count: {len(self.db.deleted_objects)}") + + if self._init: + # initialize all existing objects + self.log.debug(f"flush -- init is true, self.db: {self.db.db}") + for obj_id in self.db: + self.log.debug(f"init: {obj_id}") + self.createObjects(self.db.db.keys()) + self._init = False + elif self.db.new_objects: + for obj_id in self.db.new_objects: + self.log.debug(f"new obj id: {obj_id}") + self.createObjects(self.db.new_objects) + + for obj_id in self.db.dirty_objects: + self.log.debug(f"dirty object id: {obj_id}") + self.updateLinks(self.db.dirty_objects) + + for obj_id in self.db.deleted_objects: + self.log.debug(f"deleted object: {obj_id}") + + self._last_flush_time = time.time() + return True # all objects written successfully + + def close(self): + # over-ride of H5Writer method + self.flush() + + def isClosed(self): + """ return closed status """ + return False if self._http_conn else True + + def get_root_id(self): + """ Return root id """ + return self._root_id diff --git a/src/h5json/hsdsstore/httpconn.py b/src/h5json/hsdsstore/httpconn.py new file mode 100644 index 0000000..14b3d54 --- /dev/null +++ b/src/h5json/hsdsstore/httpconn.py @@ -0,0 +1,808 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +from __future__ import absolute_import + +import os +import sys +import time +import base64 + +import requests +import requests_unixsocket +from requests import ConnectionError +from requests.adapters import HTTPAdapter, Retry +import json +import logging + +from .. import openid +from .. import config + + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + + +DEFAULT_TIMEOUT = ( + 10, + 1000, +) # #20 # 180 # seconds - allow time for hsds service to bounce + +""" +def verifyCert(self): + # default to validate CERT for https requests, unless + # the H5PYD_VERIFY_CERT environment variable is set and True + # + # TBD: set default to True once the signing authority of data.hdfgroup.org is + # recognized + if "H5PYD_VERIFY_CERT" in os.environ: + verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() + if verify_cert.startswith('F'): + return False + return True +""" + + +def getAzureApiKey(): + """construct API key for Active Directory if configured""" + # TBD: GoogleID? + + api_key = None + + # if Azure AD ids are set, pass them to HttpConn via api_key dict + cfg = config.get_config() # pulls in state from a .hscfg file (if found). + + ad_app_id = None # Azure AD HSDS Server id + if "HS_AD_APP_ID" in os.environ: + ad_app_id = os.environ["HS_AD_APP_ID"] + elif "hs_ad_app_id" in cfg: + ad_app_id = cfg["hs_ad_app_id"] + ad_tenant_id = None # Azure AD tenant id + if "HS_AD_TENANT_ID" in os.environ: + ad_tenant_id = os.environ["HS_AD_TENANT_ID"] + elif "hs_ad_tenant_id" in cfg: + ad_tenant_id = cfg["hs_ad_tenant_id"] + + ad_resource_id = None # Azure AD resource id + if "HS_AD_RESOURCE_ID" in os.environ: + ad_resource_id = os.environ["HS_AD_RESOURCE_ID"] + elif "hs_ad_resource_id" in cfg: + ad_resource_id = cfg["hs_ad_resource_id"] + + ad_client_secret = None # Azure client secret + if "HS_AD_CLIENT_SECRET" in os.environ: + ad_client_secret = os.environ["HS_AD_CLIENT_SECRET"] + elif "hs_ad_client_secret" in cfg: + ad_client_secret = cfg["hs_ad_client_secret"] + + if ad_app_id and ad_tenant_id and ad_resource_id: + # contruct dict to pass to HttpConn + api_key = { + "AD_APP_ID": ad_app_id, + "AD_TENANT_ID": ad_tenant_id, + "AD_RESOURCE_ID": ad_resource_id, + "openid_provider": "azure", + } + # optional config + if ad_client_secret: + api_key["AD_CLIENT_SECRET"] = ad_client_secret + return api_key # None if AAD not configured + + +def getKeycloakApiKey(): + # check for keycloak next + cfg = config.get_config() # pulls in state from a .hscfg file (if found). + api_key = None + # check to see if we are configured for keycloak authentication + if "HS_KEYCLOAK_URI" in os.environ: + keycloak_uri = os.environ["HS_KEYCLOAK_URI"] + elif "hs_keycloak_uri" in cfg: + keycloak_uri = cfg["hs_keycloak_uri"] + else: + keycloak_uri = None + if "HS_KEYCLOAK_CLIENT_ID" in os.environ: + keycloak_client_id = os.environ["HS_KEYCLOAK_CLIENT_ID"] + elif "hs_keycloak_client_id" in cfg: + keycloak_client_id = cfg["hs_keycloak_client_id"] + else: + keycloak_client_id = None + if "HS_KEYCLOAK_REALM" in os.environ: + keycloak_realm = cfg["HS_KEYCLOAK_REALM"] + elif "hs_keycloak_realm" in cfg: + keycloak_realm = cfg["hs_keycloak_realm"] + else: + keycloak_realm = None + + if keycloak_uri and keycloak_client_id and keycloak_uri: + api_key = { + "keycloak_uri": keycloak_uri, + "keycloak_client_id": keycloak_client_id, + "keycloak_realm": keycloak_realm, + "openid_provider": "keycloak", + } + return api_key + + +class HttpResponse: + """ wrapper for http request responses """ + def __init__(self, rsp, logger=None): + self._rsp = rsp + self._logger = logger + if logger is None: + self.log = logging + else: + self.log = logging.getLogger(logger) + self._text = None + + @property + def status_code(self): + """ return response status code """ + return self._rsp.status_code + + @property + def reason(self): + """ return response reason """ + return self._rsp.reason + + @property + def content_type(self): + """ return content type """ + rsp = self._rsp + if 'Content-Type' in rsp.headers: + content_type = rsp.headers['Content-Type'] + else: + content_type = "" + return content_type + + @property + def content_length(self): + """ Return length of response if available """ + if 'Content-Length' in self._rsp.headers: + content_length = self._rsp.headers['Content-Length'] + else: + content_length = None + return content_length + + @property + def is_binary(self): + """ return True if the response indicates binary data """ + + if self.content_type == "application/octet-stream": + return True + else: + return False + + @property + def is_json(self): + """ return true if response indicates json """ + + if self.content_type.startswith("application/json"): + return True + else: + return False + + @property + def text(self): + """ getresponse content as bytes """ + + if not self._text: + rsp = self._rsp + if not self.is_binary: + # hex encoded response? + # this is returned by API Gateway for lambda responses + self._text = bytes.fromhex(rsp.text) + else: + if self.content_length: + self.log.debug(f"got binary response, {self.content_length} bytes") + else: + self.log.debug("got binary response, content_length unknown") + + HTTP_CHUNK_SIZE = 4096 + http_chunks = [] + downloaded_bytes = 0 + for http_chunk in rsp.iter_content(chunk_size=HTTP_CHUNK_SIZE): + if http_chunk: # filter out keep alive chunks + self.log.debug(f"got http_chunk - {len(http_chunk)} bytes") + downloaded_bytes += len(http_chunk) + http_chunks.append(http_chunk) + if len(http_chunks) == 0: + raise IOError("no data returned") + if len(http_chunks) == 1: + # can return first and only chunk as response + self._text = http_chunks[0] + else: + msg = f"retrieved {len(http_chunks)} http_chunks " + msg += f" {downloaded_bytes} total bytes" + self.log.info(msg) + self._text = bytearray(downloaded_bytes) + index = 0 + for http_chunk in http_chunks: + self._text[index:(index + len(http_chunk))] = http_chunk + index += len(http_chunk) + + return self._text + + def json(self): + """ Return json from response""" + + rsp = self._rsp + + if not self.is_json: + raise IOError("response is not json") + + rsp_json = json.loads(rsp.text) + self.log.debug(f"rsp_json - {len(rsp.text)} bytes") + return rsp_json + + +class HttpConn: + """ + Some utility methods based on equivalents in base class. + """ + + def __init__( + self, + domain_name, + endpoint=None, + username=None, + password=None, + bucket=None, + api_key=None, + mode="a", + use_session=True, + expire_time=1.0, + max_objects=None, + max_age=1.0, + logger=None, + retries=3, + timeout=DEFAULT_TIMEOUT, + **kwds, + ): + self._domain = domain_name + self._mode = mode + self._domain_json = None + self._use_session = use_session + self._retries = retries + self._timeout = timeout + self._api_key = api_key + self._s = None # Sessions + self._server_info = None + self._external_refs = [] + + self._logger = logger + if logger is None: + self.log = logging + else: + self.log = logging.getLogger(logger) + msg = f"HttpConn.init(domain: {domain_name} use_session: {use_session} " + msg += f"expire_time: {expire_time:6.2f} sec retries: {retries}" + self.log.debug(msg) + + if self._timeout != DEFAULT_TIMEOUT: + self.log.info(f"HttpConn.init - timeout = {self._timeout}") + if not endpoint: + if "HS_ENDPOINT" in os.environ: + endpoint = os.environ["HS_ENDPOINT"] + + if not endpoint: + msg = "no endpoint set" + raise ValueError(msg) + + self._endpoint = endpoint + + if not username: + if "HS_USERNAME" in os.environ: + username = os.environ["HS_USERNAME"] + if isinstance(username, str) and (not username or username.upper() == "NONE"): + username = None + self._username = username + + if not password: + if "HS_PASSWORD" in os.environ: + password = os.environ["HS_PASSWORD"] + if isinstance(password, str) and (not password or password.upper() == "NONE"): + password = None + self._password = password + + if not bucket: + if "HS_BUCKET" in os.environ: + bucket = os.environ["HS_BUCKET"] + if isinstance(bucket, str) and (not bucket or bucket.upper() == "NONE"): + bucket = None + self._bucket = bucket + + if api_key is None and "HS_API_KEY" in os.environ: + api_key = os.environ["HS_API_KEY"] + if isinstance(api_key, str) and (not api_key or api_key.upper() == "NONE"): + api_key = None + if not api_key: + api_key = getAzureApiKey() + if not api_key: + api_key = getKeycloakApiKey() + + # Convert api_key to OpenIDHandler + if isinstance(api_key, dict): + # Maintain Azure-defualt backwards compatibility, but allow + # both environment variable and kwarg override. + provider = api_key.get("openid_provider", "azure") + if provider == "azure": + self.log.debug("creating OpenIDHandler for Azure") + self._api_key = openid.AzureOpenID(endpoint, api_key) + elif provider == "google": + self.log.debug("creating OpenIDHandler for Google") + + config = api_key.get("client_secret", None) + scopes = api_key.get("scopes", None) + self._api_key = openid.GoogleOpenID( + endpoint, config=config, scopes=scopes + ) + elif provider == "keycloak": + self.log.debug("creating OpenIDHandler for Keycloak") + + # for Keycloak, pass in username and password + self._api_key = openid.KeycloakOpenID( + endpoint, config=api_key, username=username, password=password + ) + else: + self.log.error(f"Unknown openid provider: {provider}") + + def __del__(self): + if self._s: + self.log.debug("close session") + self._s.close() + self._s = None + + def getHeaders(self, username=None, password=None, headers=None): + + if headers is None: + headers = {} + + # This should be the default - but explicitly set anyway + if "Accept-Encoding" not in headers: + headers['Accept-Encoding'] = "deflate, gzip" + + elif "Authorization" in headers: + return headers # already have auth key + if username is None: + username = self._username + if password is None: + password = self._password + + if self._api_key: + self.log.debug("using api key") + # use OpenId handler to get a bearer token + token = "" + + # Get a token, possibly refreshing if needed. + if isinstance(self._api_key, openid.OpenIDHandler): + token = self._api_key.token + + # Token was provided as a string. + elif isinstance(self._api_key, str): + token = self._api_key + + if token: + auth_string = b"Bearer " + token.encode("ascii") + headers["Authorization"] = auth_string + elif username is not None and password is not None: + self.log.debug(f"use basic auth with username: {username}") + auth_string = username + ":" + password + auth_string = auth_string.encode("utf-8") + auth_string = base64.b64encode(auth_string) + auth_string = b"Basic " + auth_string + headers["Authorization"] = auth_string + else: + self.log.debug("no auth header") + # no auth header + pass + + return headers + + def serverInfo(self): + if self._server_info: + return self._server_info + + if self._endpoint is None: + raise IOError("object not initialized") + + # make an about request + rsp = self.GET("/about") + if rsp.status_code != 200: + raise IOError(rsp.status_code, rsp.reason) + server_info = rsp.json() + if server_info: + self._server_info = server_info + return server_info + + def server_version(self): + server_info = self.serverInfo() + if "hsds_version" in server_info: + server_version = server_info["hsds_version"] + else: + # no standard way to get version for other implements... + server_version = None + return server_version + + def verifyCert(self): + # default to validate CERT for https requests, unless + # the H5PYD_VERIFY_CERT environment variable is set and True + # + # TBD: set default to True once the signing authority of data.hdfgroup.org is + # recognized + if "H5PYD_VERIFY_CERT" in os.environ: + verify_cert = os.environ["H5PYD_VERIFY_CERT"].upper() + if verify_cert.startswith("F"): + return False + return True + + def GET(self, req, format="json", params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + # check that domain is defined (except for some specific requests) + if req not in ("/domains", "/about", "/info", "/") and self._domain is None: + raise IOError(f"no domain defined: req: {req}") + + rsp = None + + headers = self.getHeaders(headers=headers) + + if params is None: + params = {} + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key and not isinstance(self._api_key, dict): + params["api_key"] = self._api_key + domain = params["domain"] + self.log.debug(f"GET: {req} [{domain}] bucket: {self._bucket}") + + if format == "binary": + headers["accept"] = "application/octet-stream" + + self.log.info(f"GET: {self._endpoint + req} [{params['domain']}] timeout: {self._timeout}") + + for k in params: + if k != "domain": + v = params[k] + self.log.debug(f"GET params {k}:{v}") + + try: + s = self.session + stream = True # tbd - config for no streaming? + ts = time.time() + rsp = s.get( + self._endpoint + req, + params=params, + headers=headers, + stream=stream, + timeout=self._timeout, + verify=self.verifyCert(), + ) + elapsed = time.time() - ts + self.log.info(f"status: GET {rsp.status_code}, elapsed: {elapsed:.4f}") + except ConnectionError as ce: + self.log.error(f"connection error: {ce}") + raise IOError("Connection Error") + except Exception as e: + self.log.error(f"got {type(e)} exception: {e}") + raise IOError("Unexpected exception") + + if rsp.status_code != 200: + self.log.warning(f"GET {req} returned status: {rsp.status_code}") + + return HttpResponse(rsp) + + def PUT(self, req, body=None, format="json", params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + if self._domain is None: + raise IOError("no domain defined") + + if params: + self.log.info(f"PUT params: {params}") + else: + params = {} + + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key: + params["api_key"] = self._api_key + + # verify the file was open for modification + if self._mode == "r": + raise IOError("Unable to create group (No write intent on file)") + + # try to do a PUT to the domain + + headers = self.getHeaders(headers=headers) + + if format == "binary": + headers["Content-Type"] = "application/octet-stream" + # binary write + data = body + else: + headers["Content-Type"] = "application/json" + data = json.dumps(body) + + self.log.info(f"PUT: {req} format: {format} [{len(data)} bytes]") + + try: + s = self.session + ts = time.time() + rsp = s.put( + self._endpoint + req, + data=data, + headers=headers, + params=params, + verify=self.verifyCert(), + ) + elapsed = time.time() - ts + self.log.info(f"status: PUT {rsp.status_code}, elapsed: {elapsed:.4f}") + except ConnectionError as ce: + self.log.error(f"connection error: {ce}") + raise IOError("Connection Error") + + if rsp.status_code == 201 and req == "/": + self.log.info("clearing domain_json cache") + self._domain_json = None + if rsp.status_code not in (200, 201): + self.log.warning(f"got status code: {rsp.status_code} for PUT {req}") + self.log.info(f"PUT returning: {rsp}") + + return HttpResponse(rsp) + + def POST(self, req, body=None, format="json", params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + if self._domain is None: + raise IOError("no domain defined") + + if params is None: + params = {} + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key: + params["api_key"] = self._api_key + + # verify we have write intent (unless this is a dataset point selection) + if req.startswith("/datasets/") and req.endswith("/value"): + point_sel = True + else: + point_sel = False + if self._mode == "r" and not point_sel: + raise IOError("Unable perform request (No write intent on file)") + + # try to do a POST to the domain + + headers = self.getHeaders(headers=headers) + + if isinstance(body, bytes): + headers["Content-Type"] = "application/octet-stream" + data = body + else: + # assume json + try: + data = json.dumps(body) + except TypeError: + msg = f"Unable to convert {body} to json" + self.log.error(msg) + raise IOError("JSON encoding error") + if format == "binary": + # receive data as binary + headers["accept"] = "application/octet-stream" + + self.log.info("POST: " + req) + + try: + s = self.session + ts = time.time() + rsp = s.post( + self._endpoint + req, + data=data, + headers=headers, + params=params, + verify=self.verifyCert(), + ) + elapsed = time.time() - ts + self.log.info(f"status: POST {rsp.status_code}, elapsed: {elapsed:.4f}") + except ConnectionError as ce: + self.log.warning(f"connection error: {ce}") + raise IOError(str(ce)) + + if rsp.status_code not in (200, 201): + self.log.error(f"got status_code: {rsp.status_code} for DELETE: {req}") + + return HttpResponse(rsp) + + def DELETE(self, req, params=None, headers=None): + if self._endpoint is None: + raise IOError("object not initialized") + + if req not in ("/domains", "/") and self._domain is None: + raise IOError("no domain defined") + if params is None: + params = {} + if "domain" not in params: + params["domain"] = self._domain + if "bucket" not in params and self._bucket: + params["bucket"] = self._bucket + if self._api_key: + params["api_key"] = self._api_key + + # verify we have write intent + if self._mode == "r": + raise IOError("Unable perform request (No write intent on file)") + + # try to do a DELETE of the resource + headers = self.getHeaders(headers=headers) + + self.log.info("DEL: " + req) + try: + s = self.session + ts = time.time() + rsp = s.delete( + self._endpoint + req, + headers=headers, + params=params, + verify=self.verifyCert(), + ) + self.log.info(f"status: {rsp.status_code}") + elapsed = time.time() - ts + self.log.info(f"status: DELETE {rsp.status_code}, elapsed: {elapsed:.4f}") + except ConnectionError as ce: + self.log.error(f"connection error: {ce}") + raise IOError("Connection Error") + + if rsp.status_code == 200 and req == "/": + self.log.info("clearing domain_json cache") + self._domain_json = None + + if rsp.status_code != 200: + self.log.warning(f"got status_code: {rsp.status_code} for DELETE {req}") + + return HttpResponse(rsp) + + @property + def session(self): + # create a session object to re-use http connection when possible + s = requests + retries = self._retries + backoff_factor = 1 + status_forcelist = (500, 502, 503, 504) + + if self._use_session: + if self._s is None: + if self._endpoint.startswith("http+unix://"): + self.log.debug(f"create unixsocket session: {self._endpoint}") + s = requests_unixsocket.Session() + else: + # regular request session + s = requests.Session() + + retry = Retry( + total=retries, + read=retries, + connect=retries, + backoff_factor=backoff_factor, + status_forcelist=status_forcelist, + ) + + s.mount( + "http://", + HTTPAdapter(max_retries=retry, pool_connections=16, pool_maxsize=16), + ) + s.mount( + "https://", + HTTPAdapter(max_retries=retry, pool_connections=16, pool_maxsize=16), + ) + self._s = s + else: + s = self._s + return s + + def add_external_ref(self, fid): + # this is used by the group class to keep references to external links open + if fid.__class__.__name__ != "FileID": + raise TypeError("add_external_ref, expected FileID type") + self._external_refs.append(fid) + + def close(self): + if self._s: + self._s.close() + self._s = None + + @property + def domain(self): + return self._domain + + @property + def username(self): + return self._username + + @property + def endpoint(self): + return self._endpoint + + @property + def password(self): + return self._password + + @property + def mode(self): + return self._mode + + @property + def domain_json(self): + if self._domain_json is None: + rsp = self.GET("/") + if rsp.status_code != 200: + raise IOError(rsp.reason) + # assume JSON + self._domain_json = rsp.json() + return self._domain_json + + @property + def root_uuid(self): + domain_json = self.domain_json + if "root" not in domain_json: + raise IOError("Unexpected response") + root_uuid = domain_json["root"] + return root_uuid + + @property + def compressors(self): + compressors = [] + if "compressors" in self.domain_json: + compressors = self.domain_json["compressors"] + if not compressors: + compressors = [ + "gzip", + ] + return compressors + + @property + def modified(self): + """Last modified time of the domain as a datetime object.""" + domain_json = self.domain_json + if "lastModified" not in domain_json: + raise IOError("Unexpected response") + last_modified = domain_json["lastModified"] + return last_modified + + @property + def created(self): + """Creation time of the domain""" + domain_json = self.domain_json + if "created" not in domain_json: + raise IOError("Unexpected response") + created = domain_json["created"] + return created + + @property + def owner(self): + """username of creator of domain""" + domain_json = self.domain_json + username = None + if "owner" in domain_json: + # currently this is only available for HSDS + username = domain_json["owner"] + return username + + @property + def logging(self): + """return name of logging handler""" + return self.log diff --git a/src/h5json/jsonstore/__init__.py b/src/h5json/jsonstore/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/h5json/jsonstore/h5json_reader.py b/src/h5json/jsonstore/h5json_reader.py new file mode 100644 index 0000000..40f8e5e --- /dev/null +++ b/src/h5json/jsonstore/h5json_reader.py @@ -0,0 +1,217 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import json +import logging + +from ..objid import getCollectionForId, getUuidFromId + +from ..hdf5dtype import createDataType +from ..array_util import jsonToArray +from .. import selections +from ..h5reader import H5Reader + + +class H5JsonReader(H5Reader): + """ + This class can be used by HDF5DB to read content from an hdf5-json file + """ + + def __init__( + self, + filepath, + app_logger=None + ): + if app_logger: + self.log = app_logger + else: + self.log = logging.getLogger() + + super().__init__(filepath, app_logger=app_logger) + self._root_id = None + self._h5json = None + + def open(self): + if self._h5json: + return # already read JSON file + + with open(self.filepath) as f: + text = f.read() + + # parse the json file + h5json = json.loads(text) + + self._h5json = h5json + + if "root" not in h5json: + raise Exception("no root key in input file") + + self._root_id = "g-" + h5json["root"] + if self.db.root_id and self.db.root_id != self._root_id: + self.log.warning("h5json root id doesn't match db root id") + raise IOError("root id mismatch") + + return self._root_id + + def close(self): + pass + + def isClosed(self): + return False if self._h5json else False + + def get_root_id(self): + """ Return root id """ + return self._root_id + + def getObjectById(self, obj_id, include_attrs=True, include_links=True, include_values=False): + """ return object with given id """ + collection = getCollectionForId(obj_id) + if collection not in self._h5json: + self.log.warning(f"getObjectById - collection: {collection} not found") + return None + json_objs = self._h5json[collection] + obj_uuid = getUuidFromId(obj_id) + if obj_uuid not in json_objs: + self.log.warning(f"getObjectById - {obj_id} not found") + return None + json_obj = json_objs[obj_uuid] + + resp = {} + # selectively copy from the db dict + for k in json_obj: + for k in ("shape", "type", "cpl", "dcpl"): + if k in json_obj: + resp[k] = json_obj[k] + if include_attrs and "attributes" in json_obj: + attrs = {} + attr_list = json_obj["attributes"] + for item in attr_list: + if "name" not in item: + self.log.warning(f"expected to find name key for {obj_id} attributes") + continue + name = item["name"] + attr = {} + if "type" not in item: + raise KeyError(f"expected to find type key for attribute {name} of {obj_id}") + attr["type"] = item["type"] + if "shape" not in item: + raise KeyError(f"expected to find shape key for attribute {name} of {obj_id}") + attr["shape"] = item["shape"] + if "value" in item: + attr["value"] = item["value"] + attrs[name] = attr + resp["attributes"] = attrs + + if include_links and "links" in json_obj: + links = {} + link_list = json_obj["links"] + for item in link_list: + if "title" not in item: + self.log.warning(f"expected to find title key for {obj_id} links") + continue + title = item["title"] + link = {} + for k in ("class", "file", "h5path"): + if k in item: + link[k] = item[k] + if "collection" in item: + collection = item["collection"] + if "id" not in item: + self.log.warning(f"expected to find id key for {obj_id} link item") + continue + obj_uuid = item["id"] + if collection == "groups": + obj_id = "g-" + obj_uuid + elif collection == "datasets": + obj_id = "d-" + obj_uuid + elif collection == "datatypes": + obj_id = "t-" + obj_uuid + else: + self.log.warning(f"unexpected collection type: {collection}") + continue + item["id"] = obj_id + links[title] = item + resp["links"] = links + + if include_values and collection == "datasets" and "value" in json_obj: + resp["value"] = json_obj["value"] + + return resp + + def getAttribute(self, obj_id, name, includeData=True): + """ + Get attribute given an object id and name + returns: JSON object + """ + self.log.debug(f"getAttribute({obj_id}), [{name}], include_data={includeData})") + json_obj = self.getObjectById(obj_id) + if json_obj is None: + return None + if "attributes" not in json_obj: + self.log.warning(f"obj: {obj_id} has no attributes collection") + return None + attributes = json_obj["attributes"] + if name not in attributes: + self.log.info(f"attr: [{name}] of {obj_id} not found") + return None + return attributes[name] + + def getDtype(self, obj_json): + """ Return the dtype for the type given by obj_json """ + if "type" not in obj_json: + raise KeyError("no type item found") + type_item = obj_json["type"] + if isinstance(type_item, str) and type_item.startswith("datatypes/"): + # this is a reference to a committed type + ctype_id = "t-" + getUuidFromId(type_item) + ctype_json = self.getObjectById(ctype_id) + if "type" not in ctype_json: + raise KeyError(f"Unexpected datatype: {ctype_json}") + # Use the ctype's item json + type_item = ctype_json["type"] + dtype = createDataType(type_item) + return dtype + + def getDatasetValues(self, obj_id, sel=None, dtype=None): + """ + Get values from dataset identified by obj_id. + If a slices list or tuple is provided, it should have the same + number of elements as the rank of the dataset. + """ + + self.log.debug(f"getDatasetValues({obj_id}), sel={sel}") + json_obj = self.getObjectById(obj_id, include_values=True) + if json_obj is None: + self.log.warning(f"no object found with id; {obj_id}") + return None + + if "value" not in json_obj: + self.log.warning(f"value key not found for {obj_id}") + return None + json_value = json_obj["value"] + shape_json = json_obj["shape"] + if shape_json["class"] == "H5S_NULL": + self.log.warning("getDatasetValues called for null space object: {obj_id}") + return None + elif shape_json["class"] == "H5S_SCALAR": + dims = () + else: + dims = shape_json["dims"] + + arr = jsonToArray(dims, dtype, json_value) + if sel is None or sel.select_type == selections.H5S_SELECT_ALL: + pass # just return the entire array + elif isinstance(sel, selections.SimpleSelection): + arr = arr[sel.slices] + else: + raise NotImplementedError("selection type not supported") + + return arr diff --git a/src/h5json/jsonstore/h5json_writer.py b/src/h5json/jsonstore/h5json_writer.py new file mode 100644 index 0000000..92d3499 --- /dev/null +++ b/src/h5json/jsonstore/h5json_writer.py @@ -0,0 +1,293 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +import json + +from ..h5writer import H5Writer +from ..objid import getUuidFromId, getCollectionForId, createObjId +from ..array_util import bytesArrayToList +from .. import selections + + +class H5JsonWriter(H5Writer): + """ + This abstract class defines properties and methods that the Hdf5db class uses for writing to an HDF5 + compatible storage medium. + """ + + def __init__( + self, + filepath, + append=False, + no_data=False, + app_logger=None + ): + super().__init__(filepath, append=append, no_data=no_data, app_logger=app_logger) + if append: + raise ValueError("H5JsonWriter does not support append mode") + self.alias_db = {} + self.json = {} + self._root_id = None + + def flush(self): + """ Write dirty items """ + # json writer doesn't support incremental updates, so we'll wait + # for close to write out database + if not self._root_id: + msg = "flush called prior to open" + self.log.warning(msg) + raise IOError(msg) + + self.log.info("flush") + return False + + def open(self): + """ file open """ + # no incremental updates with h5json writer, so just fetch the root_id here + if self.db.root_id: + self._root_id = self.db.root_id + else: + self._root_id = createObjId(obj_type="groups") + return self._root_id + + def close(self): + """ close storage handle """ + self.dumpFile() + + def isClosed(self): + """ return closed status """ + return False if self._root_id else True + + def getAliasList(self, obj_id): + """ return list of alias """ + if obj_id not in self.alias_db: + self.alias_db[obj_id] = [] + return self.alias_db[obj_id] + + def updateAliasList(self): + """ update the alias list for each object """ + # clear exiting aliases + obj_ids = self.db.getCollection() + for obj_id in obj_ids: + self.alias_db[obj_id] = [] + + self._setAlias(self._root_uuid, set(), "/") + + def _setAlias(self, obj_id, id_set, h5path): + """ add the given h5path to the object's alias list + If the object is a group, recurse through each hard link """ + obj_json = self.db.getObjectById(obj_id) + alias_list = self.getAliasList(obj_id) + if h5path in alias_list: + return # nothing to do + alias_list.append(h5path) + if getCollectionForId(obj_id) != "groups": + return # done + id_set.add(obj_id) # keep track of objects we've visited to avoid loops + links = obj_json["links"] + if h5path[-1] != '/': + h5path += '/' + + for link_name in links: + link_json = links[link_name] + if link_json["class"] == "H5L_TYPE_HARD": + tgt_id = link_json["id"] + if tgt_id in id_set: + self.log.info("_setAlias - circular loop found") + else: + self._setAlias(tgt_id, id_set, f"{h5path}{link_name}") + id_set.remove(obj_id) + + def dumpAttribute(self, obj_id, attr_name): + self.log.info(f"dumpAttribute: [{attr_name}]") + item = self.db.getAttribute(obj_id, attr_name) + response = {"name": attr_name} + response["type"] = item["type"] + response["shape"] = item["shape"] + + if "value" not in item: + self.log.warning(f"no value key in attribute: {attr_name}") + else: + # dump values unless header -D was passed + response["value"] = item["value"] + return response + + def dumpAttributes(self, obj_id): + attrs = self.db.getAttributes(obj_id) + self.log.info(f"dumpAttributes: {obj_id}") + items = [] + for attr_name in attrs: + item = self.dumpAttribute(obj_id, attr_name) + items.append(item) + + return items + + def dumpLink(self, obj_id, name): + item = self.db.getLink(obj_id, name) + response = {"class": item["class"]} + if "id" in item: + tgt_id = item["id"] + response["collection"] = getCollectionForId(tgt_id) + response["id"] = getUuidFromId(tgt_id) + + for key in item: + if key in ("id", "created", "modified"): + continue + response[key] = item[key] + response["title"] = name + return response + + def dumpLinks(self, obj_id): + links = self.db.getLinks(obj_id) + items = [] + for link_name in links: + item = self.dumpLink(obj_id, link_name) + items.append(item) + return items + + def dumpGroup(self, obj_id): + item = self.db.getObjectById(obj_id) + response = {} + + alias = self.getAliasList(obj_id) + response["alias"] = alias + + if "cpl" in item: + item["creationProperties"] = item["cpl"] + attributes = self.dumpAttributes(obj_id) + if attributes: + response["attributes"] = attributes + links = self.dumpLinks(obj_id) + if links: + response["links"] = links + return response + + def dumpGroups(self): + groups = {} + item = self.dumpGroup(self._root_uuid) + root_uuid = getUuidFromId(self._root_uuid) + groups[root_uuid] = item + obj_ids = self.db.getCollection("groups") + for obj_id in obj_ids: + if obj_id == self._root_uuid: + continue + item = self.dumpGroup(obj_id) + obj_uuid = getUuidFromId(obj_id) + groups[obj_uuid] = item + + self.json["groups"] = groups + + def dumpDataset(self, obj_id): + response = {} + self.log.info("dumpDataset: " + obj_id) + item = self.db.getObjectById(obj_id) + alias = self.getAliasList(obj_id) + response["alias"] = alias + + response["type"] = item["type"] + shapeItem = item["shape"] + shape_rsp = {} + num_elements = 1 + shape_rsp["class"] = shapeItem["class"] + if shapeItem["class"] == "H5S_NULL": + dims = None + num_elements = 0 + elif shapeItem["class"] == "H5S_SCALAR": + dims = () + num_elements = 1 + else: + shape_rsp["dims"] = shapeItem["dims"] + dims = tuple(shapeItem["dims"]) + for extent in dims: + num_elements *= extent + + if "maxdims" in shapeItem: + maxdims = [] + for dim in shapeItem["maxdims"]: + if dim == 0: + maxdims.append("H5S_UNLIMITED") + else: + maxdims.append(dim) + shape_rsp["maxdims"] = maxdims + response["shape"] = shape_rsp + + if "cpl" in item: + response["creationProperties"] = item["cpl"] + + attributes = self.dumpAttributes(obj_id) + if attributes: + response["attributes"] = attributes + + if not self.no_data: + if num_elements > 0: + sel_all = selections.select(dims, ...) + arr = self.db.getDatasetValues(obj_id, sel_all) + response["value"] = bytesArrayToList(arr) # dump values unless header flag was passed + return response + + def dumpDatasets(self): + obj_ids = self.db.getCollection("datasets") + if obj_ids: + datasets = {} + for obj_id in obj_ids: + item = self.dumpDataset(obj_id) + obj_uuid = getUuidFromId(obj_id) + datasets[obj_uuid] = item + + self.json["datasets"] = datasets + + def dumpDatatype(self, obj_id): + response = {} + item = self.db.getObjectById(obj_id) + alias = self.getAliasList(obj_id) + response["alias"] = alias + response["type"] = item["type"] + if "cpl" in item: + response["creationProperties"] = item["cpl"] + attributes = self.dumpAttributes(obj_id) + if attributes: + response["attributes"] = attributes + return response + + def dumpDatatypes(self): + obj_ids = self.db.getCollection("datatypes") + if obj_ids: + datatypes = {} + for obj_id in obj_ids: + item = self.dumpDatatype(obj_id) + obj_uuid = getUuidFromId(obj_id) + datatypes[obj_uuid] = item + + self.json["datatypes"] = datatypes + + def dumpFile(self): + self._root_uuid = self.db.getObjectIdByPath("/") + + db_version_info = self.db.getVersionInfo() + + self.json["apiVersion"] = db_version_info["hdf5-json-version"] + self.json["root"] = getUuidFromId(self._root_uuid) + + self.updateAliasList() # create alias_db with obj_id to alias list dict + + self.dumpGroups() + + self.dumpDatasets() + + self.dumpDatatypes() + indent = 4 + ensure_ascii = True + if self._filepath: + with open('data.json', 'w', encoding='utf-8') as f: + json.dump(self.json, f, ensure_ascii=ensure_ascii, indent=indent) + else: + print(json.dumps(self.json, sort_keys=True, ensure_ascii=ensure_ascii, indent=indent)) diff --git a/src/h5json/jsontoh5/jsontoh5.py b/src/h5json/jsontoh5/jsontoh5.py index c12d037..28f5e00 100755 --- a/src/h5json/jsontoh5/jsontoh5.py +++ b/src/h5json/jsontoh5/jsontoh5.py @@ -9,274 +9,49 @@ # distribution tree. If you do not have access to this file, you may # # request a copy from help@hdfgroup.org. # ############################################################################## -import json -import argparse -import h5py +import sys +import os.path as op import logging -import logging.handlers from h5json import Hdf5db - - -""" -Writeh5 - return json representation of all objects within the given file - h5writer = Writeh5(db, h5json) - h5writer.writeFile() -""" - - -class Writeh5: - def __init__(self, db, json, options=None): - self.options = options - self.db = db - self.json = json - self.root_uuid = None - - # - # Create a hard, soft, or external link - # - def createLink(self, link_obj, parent_uuid): - title = link_obj["title"] - link_class = link_obj["class"] - if link_class == "H5L_TYPE_HARD": - child_uuid = link_obj["id"] - self.db.linkObject(parent_uuid, child_uuid, title) - elif link_class == "H5L_TYPE_SOFT": - h5path = link_obj["h5path"] - self.db.createSoftLink(parent_uuid, h5path, title) - elif link_class == "H5L_TYPE_EXTERNAL": - h5path = link_obj["h5path"] - link_file = link_obj["file"] - self.db.createExternalLink(parent_uuid, link_file, h5path, title) - else: - print("Unable to create link with class:", link_class) - - # - # Create HDF5 dataset object and write data values - # - def createDataset(self, uuid, body): - datatype = body["type"] - if isinstance(datatype, str) and datatype.startswith("datatypes/"): - # committed datatype, just pass in the UUID part - datatype = datatype[len("datatypes/") :] - dims = () # if no space in body, default to scalar - max_shape = None - creation_props = None - if "creationProperties" in body: - creation_props = body["creationProperties"] - if "shape" in body: - shape = body["shape"] - if shape["class"] == "H5S_SIMPLE": - dims = shape["dims"] - if isinstance(dims, int): - # convert int to array - dim1 = shape - dims = [dim1] - if "maxdims" in shape: - max_shape = shape["maxdims"] - if isinstance(max_shape, int): - # convert to array - dim1 = max_shape - max_shape = [dim1] - # convert H5S_UNLIMITED's to None's - for i in range(len(max_shape)): - if max_shape[i] == "H5S_UNLIMITED": - max_shape[i] = None - elif shape["class"] == "H5S_NULL": - dims = None - - self.db.createDataset( - datatype, - dims, - max_shape=max_shape, - creation_props=creation_props, - obj_uuid=uuid, - ) - - if "value" in body: - data = body["value"] - if data: - data = self.db.toRef(len(dims), datatype, data) - self.db.setDatasetValuesByUuid(uuid, data) - - def createAttribute(self, attr_json, col_name, uuid): - attr_name = attr_json["name"] - datatype = attr_json["type"] - if isinstance(datatype, str) and datatype.startswith("datatypes/"): - # committed datatype, just pass in the UUID part - datatype = datatype[len("datatypes/") :] - - attr_value = None - if "value" in attr_json: - attr_value = attr_json["value"] - dims = None - if "shape" in attr_json: - shape = attr_json["shape"] - if shape["class"] == "H5S_SIMPLE": - dims = shape["dims"] - if isinstance(dims, int): - # convert int to array - dim1 = shape - dims = [dim1] - elif shape["class"] == "H5S_SCALAR": - dims = () # empty tuple for scalar - self.db.createAttribute(col_name, uuid, attr_name, dims, datatype, attr_value) - - # - # create committed datatype HDF5 object - # - def createDatatype(self, uuid, body): - datatype = body["type"] - self.db.createCommittedType(datatype, obj_uuid=uuid) - - # - # Create HDF5 group object (links and attributes will be added later) - # - def createGroup(self, uuid, body): - if uuid != self.root_uuid: - self.db.createGroup(obj_uuid=uuid) - - # - # Create all the HDF5 objects defined in the JSON file - # - def createObjects(self): - # create datatypes - if "datatypes" in self.json: - datatypes = self.json["datatypes"] - for uuid in datatypes: - json_obj = datatypes[uuid] - self.createDatatype(uuid, json_obj) - # create groups - if "groups" in self.json: - groups = self.json["groups"] - for uuid in groups: - json_obj = groups[uuid] - self.createGroup(uuid, json_obj) - # create datasets - if "datasets" in self.json: - datasets = self.json["datasets"] - for uuid in datasets: - json_obj = datasets[uuid] - self.createDataset(uuid, json_obj) - - # - # Create all the attributes for HDF5 objects defined in the JSON file - # Note: this needs to be done after createObjects since an attribute - # may use a committed datatype - # - def createAttributes(self): - dimension_list_attrs = [] # track dimension list attributes - # create datatype attributes - if "datatypes" in self.json: - datatypes = self.json["datatypes"] - for uuid in datatypes: - body = datatypes[uuid] - if "attributes" in body: - attributes = body["attributes"] - for attribute in attributes: - self.createAttribute(attribute, "datatypes", uuid) - # create group attributes - if "groups" in self.json: - groups = self.json["groups"] - for uuid in groups: - body = groups[uuid] - if "attributes" in body: - attributes = body["attributes"] - for attribute in attributes: - self.createAttribute(attribute, "groups", uuid) - # create datasets - if "datasets" in self.json: - datasets = self.json["datasets"] - for uuid in datasets: - body = datasets[uuid] - if "attributes" in body: - attributes = body["attributes"] - for attribute in attributes: - if attribute["name"] == "DIMENSION_LIST": - # defer dimension list creation until after we've created all other - # attributes (otherwsie attach_scale may fail) - dimension_list_attrs.append( - {"attribute": attribute, "uuid": uuid} - ) - else: - self.createAttribute(attribute, "datasets", uuid) - - # finally, do dimension_list attributes - for item in dimension_list_attrs: - attribute = item["attribute"] - uuid = item["uuid"] - self.createAttribute(attribute, "datasets", uuid) - - # - # Link all the objects - # Note: this will "de-anonymous-ize" objects defined in the HDF5 file - # Any non-linked objects will be deleted when the __db__ group is deleted - # - def createLinks(self): - if "groups" in self.json: - groups = self.json["groups"] - for uuid in groups: - json_obj = groups[uuid] - if "links" in json_obj: - links = json_obj["links"] - for link in links: - self.createLink(link, uuid) - - def writeFile(self): - - self.root_uuid = self.json["root"] - - self.createObjects() # create datasets, groups, committed datatypes - self.createAttributes() # create attributes for objects - self.createLinks() # link it all together +from h5json.h5pystore.h5py_writer import H5pyWriter +from h5json.jsonstore.h5json_reader import H5JsonReader def main(): - parser = argparse.ArgumentParser(usage="%(prog)s [-h] ") - parser.add_argument( - "in_filename", nargs="+", help="JSon file to be converted to h5" - ) - parser.add_argument("out_filename", nargs="+", help="name of HDF5 output file") - args = parser.parse_args() + if len(sys.argv) < 3 or sys.argv[1] in ("-h", "--help"): + print(f"usage: {sys.argv[0]} [-h] [--nodata] ") + sys.exit(0) + + no_data = False + json_filename = None + hdf5_filename = None + for i in range(1, len(sys.argv)): + if sys.argv[i] == "--nodata": + no_data = True + elif not json_filename: + json_filename = sys.argv[i] + else: + hdf5_filename = sys.argv[i] # create logger - log = logging.getLogger("h5serv") - # log.setLevel(logging.WARN) - log.setLevel(logging.INFO) - # add log handler - handler = logging.FileHandler("./jsontoh5.log") - - # add handler to logger - log.addHandler(handler) - - text = open(args.in_filename[0]).read() - - # parse the json file - h5json = json.loads(text) - - if "root" not in h5json: - raise Exception("no root key in input file") - root_uuid = h5json["root"] - - filename = args.out_filename[0] - - # create the file, will raise IOError if there's a problem - Hdf5db.createHDF5File(filename) - - with Hdf5db( - filename, root_uuid=root_uuid, update_timestamps=False, app_logger=log - ) as db: - h5writer = Writeh5(db, h5json) - h5writer.writeFile() - - # open with h5py and remove the _db_ group - # Note: this will delete any anonymous (un-linked) objects - f = h5py.File(filename, "a") - if "__db__" in f: - del f["__db__"] - f.close() - - print("done!") + logfname = "jsontoh5.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + log = logging.getLogger() + + # check that the input file exists + if not op.isfile(json_filename): + sys.exit(f"Cannot find file: {json_filename}") + + log.info(f"jsontoh5 {json_filename} to {hdf5_filename}") + + db = Hdf5db(app_logger=log) + db.reader = H5JsonReader(json_filename, app_logger=log) + db.writer = H5pyWriter(hdf5_filename, no_data=no_data, app_logger=log) + db.open() # read json data + # close should create everything the json reader read to the output file + db.close() if __name__ == "__main__": diff --git a/src/h5json/objid.py b/src/h5json/objid.py new file mode 100644 index 0000000..57b5316 --- /dev/null +++ b/src/h5json/objid.py @@ -0,0 +1,508 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HDF (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +# +# objID: +# id (uuid) related functions +# + + +import hashlib +import uuid + +S3_URI = "s3://" +FILE_URI = "file://" +AZURE_URI = "blob.core.windows.net/" # preceded with "https://" +UUID_LEN = 36 # length for uuid strings + + +def _getStorageProtocol(uri): + """ returns 's3://', 'file://', or 'https://...net/' prefix if present. + If the prefix is in the form: https://myaccount.blob.core.windows.net/mycontainer + (references Azure blob storage), return: https://myaccount.blob.core.windows.net/ + otherwise None """ + + if not uri: + protocol = None + elif uri.startswith(S3_URI): + protocol = S3_URI + elif uri.startswith(FILE_URI): + protocol = FILE_URI + elif uri.startswith("https://") and uri.find(AZURE_URI) > 0: + n = uri.find(AZURE_URI) + len(AZURE_URI) + protocol = uri[:n] + elif uri.find("://") >= 0: + raise ValueError(f"storage uri: {uri} not supported") + else: + protocol = None + return protocol + + +def _getBaseName(uri): + """ Return the part of the URI after the storage protocol (if any) """ + + protocol = _getStorageProtocol(uri) + if not protocol: + return uri + else: + return uri[len(protocol):] + + +def _getPrefixForCollection(collection): + """ Return prefix character for given collection type """ + collection = collection.lower() + + if collection in ("group", "groups"): + return 'g' + elif collection in ("dataset", "datasets"): + return 'd' + elif collection in ("datatype", "datatypes"): + return 't' + elif collection in ("chunk", "chunks"): + return 'c' + else: + raise ValueError(f"unexpected collection type: {collection}") + + +def getIdHash(id): + """Return md5 prefix based on id value""" + m = hashlib.new("md5") + m.update(id.encode("utf8")) + hexdigest = m.hexdigest() + return hexdigest[:5] + + +def isSchema2Id(id): + """return true if this is a v2 id""" + # v1 ids are in the standard UUID format: 8-4-4-4-12 + # v2 ids are in the non-standard: 8-8-4-6-6 + if not isValidUuid(id): + return False + parts = id.split("-") + if len(parts) != 6: + raise ValueError(f"Unexpected id formation for uuid: {id}") + if len(parts[2]) == 8: + return True + else: + return False + + +def getIdHexChars(id): + """get the hex chars of the given id""" + if id[0] == "c": + # don't include chunk index + index = id.index("_") + parts = id[0:index].split("-") + else: + parts = id.split("-") + if len(parts) != 6: + raise ValueError(f"Unexpected id format for uuid: {id}") + return "".join(parts[1:]) + + +def hexRot(ch): + """rotate hex character by 8""" + return format((int(ch, base=16) + 8) % 16, "x") + + +def getCollectionForId(obj_id): + """return groups/datasets/datatypes based on id""" + if not isinstance(obj_id, str): + raise ValueError("invalid object id") + + collection = None + if obj_id.startswith("g-") or obj_id.startswith("groups/"): + collection = "groups" + elif obj_id.startswith("d-") or obj_id.startswith("datasets/"): + collection = "datasets" + elif obj_id.startswith("t-") or obj_id.startswith("datatypes"): + collection = "datatypes" + else: + raise ValueError(f"{obj_id} not a collection id") + return collection + + +def isRootObjId(id): + """returns true if this is a root id (only for v2 schema)""" + if not isSchema2Id(id): + raise ValueError("isRootObjId can only be used with v2 ids") + validateUuid(id) # will throw ValueError exception if not a objid + try: + if getCollectionForId(id) != "groups": + return False # not a group + except ValueError: + return False + token = getIdHexChars(id) + # root ids will have last 16 chars rotated version of the first 16 + is_root = True + for i in range(16): + if token[i] != hexRot(token[i + 16]): + is_root = False + break + return is_root + + +def getRootObjId(id): + """returns root id for this objid if this is a root id + (only for v2 schema) + """ + if isRootObjId(id): + return id # this is the root id + token = list(getIdHexChars(id)) + # root ids will have last 16 chars rotated version of the first 16 + for i in range(16): + token[i + 16] = hexRot(token[i]) + token = "".join(token) + root_id = "g-" + token[0:8] + "-" + token[8:16] + "-" + token[16:20] + root_id += "-" + token[20:26] + "-" + token[26:32] + + return root_id + + +def createObjId(obj_type=None, root_id=None): + """ create a new objid + + if obj_type is None, return just a bare uuid. + Otherwise a hsds v2 schema obj_id will be created. + In this case obj_type should be one of "groups", + "datasets", "datatypes", "chunks". If rootid is + None, a root group obj_id will be created. Otherwise the + obj_id will be a an id that has root_id as it's root. """ + + prefix = None + if obj_type is None: + # just return a regular uuid + objid = str(uuid.uuid4()) + else: + + prefix = _getPrefixForCollection(obj_type) + # schema v2 + salt = uuid.uuid4().hex + # take a hash to randomize the uuid + token = list(hashlib.sha256(salt.encode()).hexdigest()) + + if root_id: + # replace first 16 chars of token with first 16 chars of root id + root_hex = getIdHexChars(root_id) + token[0:16] = root_hex[0:16] + else: + if obj_type != "groups": + raise ValueError("expected 'groups' obj_type for root group id") + # use only 16 chars, but make it look a 32 char id + for i in range(16): + token[16 + i] = hexRot(token[i]) + # format as a string + token = "".join(token) + objid = prefix + "-" + token[0:8] + "-" + token[8:16] + "-" + objid += token[16:20] + "-" + token[20:26] + "-" + token[26:32] + + return objid + + +def getS3Key(id): + """Return s3 key for given id. + + For schema v1: + A md5 prefix is added to the front of the returned key to better + distribute S3 objects. + For schema v2: + The id is converted to the pattern: "db/{rootid[0:16]}" for rootids and + "db/id[0:16]/{prefix}/id[16-32]" for other ids + Chunk ids have the chunk index added after the slash: + "db/id[0:16]/d/id[16:32]/x_y_z + + For domain id's: + Return a key with the .domain suffix and no preceding slash. + For non-default buckets, use the format: /s3_key + If the id has a storage specifier ("s3://", "file://", etc.) + include that along with the bucket name. e.g.: "s3://mybucket/a_folder/a_file.h5" + """ + + base_id = _getBaseName(id) # strip any s3://, etc. + if base_id.find("/") > 0: + # a domain id + domain_suffix = ".domain.json" + index = base_id.find("/") + 1 + key = base_id[index:] + if not key.endswith(domain_suffix): + if key[-1] != "/": + key += "/" + key += domain_suffix + else: + if isSchema2Id(id): + # schema v2 id + hexid = getIdHexChars(id) + prefix = id[0] # one of g, d, t, c + if prefix not in ("g", "d", "t", "c"): + raise ValueError(f"Unexpected id: {id}") + + if isRootObjId(id): + key = f"db/{hexid[0:8]}-{hexid[8:16]}" + else: + partition = "" + if prefix == "c": + # use 'g' so that chunks will show up under their dataset + s3col = "d" + n = id.find("-") + if n > 1: + # extract the partition index if present + partition = "p" + id[1:n] + else: + s3col = prefix + key = f"db/{hexid[0:8]}-{hexid[8:16]}/{s3col}/{hexid[16:20]}" + key += f"-{hexid[20:26]}-{hexid[26:32]}" + if prefix == "c": + if partition: + key += "/" + key += partition + # add the chunk coordinate + index = id.index("_") # will raise ValueError if not found + n = index + 1 + coord = id[n:] + key += "/" + key += coord + elif prefix == "g": + # add key suffix for group + key += "/.group.json" + elif prefix == "d": + # add key suffix for dataset + key += "/.dataset.json" + else: + # add key suffix for datatype + key += "/.datatype.json" + else: + # v1 id + # schema v1 id + idhash = getIdHash(id) + key = f"{idhash}-{id}" + + return key + + +def getObjId(s3key): + """Return object id given valid s3key""" + if all( + ( + len(s3key) >= 44 and s3key[0:5].isalnum(), + len(s3key) >= 44 and s3key[5] == "-", + len(s3key) >= 44 and s3key[6] in ("g", "d", "c", "t"), + ) + ): + # v1 obj keys + objid = s3key[6:] + elif s3key.endswith("/.domain.json"): + objid = "/" + s3key[: -(len("/.domain.json"))] + elif s3key.startswith("db/"): + # schema v2 object key + parts = s3key.split("/") + chunk_coord = "" # used only for chunk ids + partition = "" # likewise + token = [] + for ch in parts[1]: + if ch != "-": + token.append(ch) + + if len(parts) == 3: + # root id + # last part should be ".group.json" + if parts[2] != ".group.json": + raise ValueError(f"unexpected S3Key: {s3key}") + # add 16 more chars using rotated version of first 16 + for i in range(16): + token.append(hexRot(token[i])) + prefix = "g" + elif len(parts) == 5: + # group, dataset, or datatype or chunk + for ch in parts[3]: + if ch != "-": + token.append(ch) + + if parts[2] == "g" and parts[4] == ".group.json": + prefix = "g" # group json + elif parts[2] == "t" and parts[4] == ".datatype.json": + prefix = "t" # datatype json + elif parts[2] == "d": + if parts[4] == ".dataset.json": + prefix = "d" # dataset json + else: + # chunk object + prefix = "c" + chunk_coord = "_" + parts[4] + else: + raise ValueError(f"unexpected S3Key: {s3key}") + elif len(parts) == 6: + # chunk key with partitioning + for ch in parts[3]: + if ch != "-": + token.append(ch) + if parts[2][0] != "d": + raise ValueError(f"unexpected S3Key: {s3key}") + prefix = "c" + partition = parts[4] + if partition[0] != "p": + raise ValueError(f"unexpected S3Key: {s3key}") + partition = partition[1:] # strip off the p + chunk_coord = "_" + parts[5] + else: + raise ValueError(f"unexpected S3Key: {s3key}") + + token = "".join(token) + objid = prefix + partition + "-" + token[0:8] + "-" + token[8:16] + objid += "-" + token[16:20] + "-" + token[20:26] + "-" + objid += token[26:32] + chunk_coord + else: + msg = f"unexpected S3Key: {s3key}" + raise ValueError(msg) + return objid + + +def isS3ObjKey(s3key): + """ return True if this is a storage key """ + valid = False + try: + objid = getObjId(s3key) + if objid: + valid = True + except KeyError: + pass # ignore + except ValueError: + pass # ignore + return valid + + +def validateUuid(id, obj_class=None): + """ verify the UUID is well-formed + schema can be: + None: expecting ordinary UUID + "v1": expecting HSDS v1 format + "v2": expecting HSDS v2 format + if set obj_class can be one of "groups", "datasets", "datatypes" + """ + if not isinstance(id, str): + raise ValueError("Expected string type") + if len(id) < UUID_LEN: + raise ValueError("id is too short to be an object identifier") + if len(id) == UUID_LEN: + if obj_class: + # expected a prefix + raise ValueError(f"obj_id: {id} not valid for collection: {obj_class}") + else: + # does this have a v1 schema hash tag? + # e.g.: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + if id[:5].isalnum() and id[5] == '-': + id = id[6:] # trim off the hash tag + + # for id's like "datasets/abced...", trim the collection name and add collection + # prefix to the id if not already present + if id.find('/') > 0: + parts = id.split('/') + if len(parts) > 2: + raise ValueError(f"obj_id: {id} not valid (too many slash chars)") + collection = parts[0] + if getCollectionForId(id) != collection: + raise ValueError(f"obj_id: {id} invalid collection") + id = parts[1] + if len(id) == UUID_LEN: + # prefix with the one char collection code + id = _getPrefixForCollection(collection) + '-' + id + + # validate prefix + if id[0] not in ("g", "d", "t", "c"): + raise ValueError("Unexpected prefix") + if id[0] != "c" and id[1] != "-": + # chunk ids may have a partition index following the c + raise ValueError("Unexpected prefix") + if obj_class is not None: + obj_class = obj_class.lower() + if id[0] != _getPrefixForCollection(obj_class): + raise ValueError(f"unexpected object id {id} for collection: {obj_class}") + if id[0] == "c": + # trim the type char and any partition id + n = id.find("-") + if n == -1: + raise ValueError("Invalid chunk id") + + # trim the chunk index for chunk ids + m = id.find("_") + if m == -1: + raise ValueError("Invalid chunk id") + n += 1 + id = "c-" + id[n:m] + id = id[2:] + if len(id) != UUID_LEN: + # id should be 36 now + raise ValueError("Unexpected id length") + + for ch in id: + if ch.isalnum(): + continue + if ch == "-": + continue + raise ValueError(f"Unexpected character in uuid: {ch}") + + +def isValidUuid(id, obj_class=None): + try: + validateUuid(id, obj_class) + return True + except ValueError: + return False + + +def isValidChunkId(id): + if not isValidUuid(id): + return False + if id[0] != "c": + return False + return True + + +def getClassForObjId(id): + """return domains/chunks/groups/datasets/datatypes based on id""" + if not isinstance(id, str): + raise ValueError("Expected string type") + if len(id) == 0: + raise ValueError("Empty string") + if id[0] == "/": + return "domains" + if isValidChunkId(id): + return "chunks" + else: + return getCollectionForId(id) + + +def isObjId(id): + """return true if uuid or domain""" + if not isinstance(id, str) or len(id) == 0: + return False + if id.find("/") > 0: + # domain id is any string in the form / + return True + return isValidUuid(id) + + +def getUuidFromId(id): + """strip off the type prefix ('g-' or 'd-', or 't-') + and return the uuid part """ + if id.find('/') > 0: + # remove a collection name prefix if present + parts = id.split('/') + if len(parts) > 2: + raise ValueError(f"Unexpected obj_id: {id}") + id = parts[1] + if len(id) == UUID_LEN: + # just a uuid + return id + elif len(id) == UUID_LEN + 2: + # 'g-', 'd-', or 't-' prefix + return id[2:] + else: + raise ValueError(f"Unexpected obj_id: {id}") diff --git a/src/h5json/openid.py b/src/h5json/openid.py new file mode 100644 index 0000000..af38d94 --- /dev/null +++ b/src/h5json/openid.py @@ -0,0 +1,437 @@ +import os +import sys +import json +import requests +import time +from abc import ABC, abstractmethod +from datetime import datetime + +from . import config as hsconfig + + +def eprint(*args, **kwargs): + print(*args, file=sys.stderr, **kwargs) + + +# Azure +try: + import adal +except ModuleNotFoundError: + pass # change this to the eprint below to see the import error + # eprint()"Unable to import azure auth packages") + +# Google +try: + from google_auth_oauthlib.flow import InstalledAppFlow as GoogleInstalledAppFlow + from google.auth.transport.requests import Request as GoogleRequest + from google.oauth2.credentials import Credentials as GoogleCredentials + from google.oauth2 import id_token as GoogleIDToken +except ModuleNotFoundError: + pass # change this to the eprint below to see the import error + # eprint("Unable to import google auth packages") + + +class OpenIDHandler(ABC): + + def __init__(self, endpoint, use_token_cache=True, username=None, password=None): + """Initialize the token.""" + + # Location of the token cache. + self._token_cache_file = os.path.expanduser('~/.hstokencfg') + self._endpoint = endpoint + self._username = username + self._password = password + + # The _token attribute should be a dict with at least the following keys: + # + # accessToken - The OpenID token to send. + # refreshToken - The refresh token (optional). + # expiresOn - The unix timestamp when the token expires (optional). + + if not use_token_cache or not os.path.isfile(self._token_cache_file): + self._token = None + else: + if username: + file_key = username + '@' + endpoint + else: + file_key = endpoint + with open(self._token_cache_file, 'r') as token_file: + self._token = json.load(token_file).get(file_key, None) + + @abstractmethod + def acquire(self): + """Acquire a new token from the provider.""" + pass + + @abstractmethod + def refresh(self): + """Refresh an existing token with the provider.""" + pass + + @property + def username(self): + """ Return username if known """ + return self._username + + @property + def expired(self): + """Return if the token is expired.""" + t = self._token + # add some buffer to account for clock skew + return t is not None and 'expiresOn' in t and time.time() + 10.0 >= t['expiresOn'] + + @property + def token(self): + """Return the token if valid, otherwise get a new one.""" + + if self.expired: + self.refresh() + if self._token: + self.write_token_cache() + + if self._token is None: + self.acquire() + self.write_token_cache() + + return self._token['accessToken'] + + def write_token_cache(self): + """Write the token to a file cache.""" + + cache_exists = os.path.isfile(self._token_cache_file) + + if self._username: + file_key = self._username + '@' + self._endpoint + else: + file_key = self._endpoint + + # Create a new cache file. + if not cache_exists and self._token is not None: + with open(self._token_cache_file, 'w') as token_file: + json.dump({file_key: self._token}, token_file) + + # Update an exisiting cache file. + elif cache_exists: + with open(self._token_cache_file, 'r+') as token_file: + cache = json.loads(token_file.read()) + + # Store valid tokens. + if self._token is not None: + cache[file_key] = self._token + + # Delete invalid tokens. + elif file_key in cache: + del cache[file_key] + + token_file.seek(0) + token_file.truncate(0) + json.dump(cache, token_file) + + +class AzureOpenID(OpenIDHandler): + + AUTHORITY_URI = 'https://login.microsoftonline.com' # login endpoint for AD auth + + def __init__(self, endpoint, config=None): + """Store configuration.""" + + # Configuration manager + hs_config = hsconfig.get_config() + + # Config is a dictionary. + if isinstance(config, dict): + self.config = config + + # Maybe client_secrets are in environment variables? + else: + + self.config = { + 'AD_APP_ID': hs_config.get("hs_ad_app_id", None), + 'AD_TENANT_ID': hs_config.get("hs_ad_tenant_id", None), + 'AD_RESOURCE_ID': hs_config.get("hs_ad_resource_id", None), + 'AD_CLIENT_SECRET': hs_config.get("hs_ad_client_secret", None) + } + + if 'AD_CLIENT_SECRET' in self.config and self.config['AD_CLIENT_SECRET']: + use_token_cache = False + else: + use_token_cache = True + + super().__init__(endpoint, use_token_cache=use_token_cache) + + def write_token_cache(self): + if 'AD_CLIENT_SECRET' in self.config and self.config['AD_CLIENT_SECRET']: + pass # don't use token cache for unattended authentication + else: + super().write_token_cache() + + def acquire(self): + """Acquire a new Azure token.""" + + if "adal" not in sys.modules: + msg = "adal module not found, run: pip install -e . '.[azure]'" + raise ModuleNotFoundError(msg) + + app_id = self.config["AD_APP_ID"] + resource_id = self.config["AD_RESOURCE_ID"] + tenant_id = self.config["AD_TENANT_ID"] + client_secret = self.config.get("AD_CLIENT_SECRET", None) + authority_uri = self.AUTHORITY_URI + '/' + tenant_id + + # Try to get a token using different oauth flows. + context = adal.AuthenticationContext(authority_uri, enable_pii=True, api_version=None) + + try: + if client_secret is not None: + code = context.acquire_token_with_client_credentials(resource_id, app_id, client_secret) + else: + code = context.acquire_user_code(resource_id, app_id) + + except Exception as e: + eprint(f"unable to process AD token: {e}") + self._token = None + self.write_token_cache() + raise + + if "message" in code: + eprint(code["message"]) + mgmt_token = context.acquire_token_with_device_code(resource_id, code, app_id) + + elif "accessToken" in code: + mgmt_token = code + + else: + eprint("Could not authenticate with AD") + + # Only store some fields. + self._token = { + 'accessToken': mgmt_token['accessToken'], + 'refreshToken': mgmt_token.get('refreshToken', None), + 'tenantId': mgmt_token.get('tenantId', tenant_id), + 'clientId': mgmt_token.get('_clientId', app_id), + 'resource': mgmt_token.get('resource', resource_id) + } + + # Parse time to timestamp. + if 'expiresOn' in mgmt_token: + expire_dt = datetime.strptime(mgmt_token['expiresOn'], '%Y-%m-%d %H:%M:%S.%f') + self._token['expiresOn'] = expire_dt.timestamp() + + def refresh(self): + """Try to renew an Azure token.""" + + try: + + # This will work for device code flow, but not with client + # credentials. If we have the secret, we can just request a new + # token anyways. + + authority_uri = self.AUTHORITY_URI + '/' + self._token['tenantId'] + context = adal.AuthenticationContext(authority_uri, api_version=None) + mgmt_token = context.acquire_token_with_refresh_token(self._token['refreshToken'], + self._token['clientId'], + self._token['resource'], + None) + + # New token does not have all the metadata. + self._token['accessToken'] = mgmt_token['accessToken'] + self._token['refreshToken'] = mgmt_token['refreshToken'] + + # Parse time to timestamp. + if 'expiresOn' in mgmt_token: + expire_dt = datetime.strptime(mgmt_token['expiresOn'], '%Y-%m-%d %H:%M:%S.%f') + self._token['expiresOn'] = expire_dt.timestamp() + + except Exception: + self._token = None + + +class GoogleOpenID(OpenIDHandler): + + def __init__(self, endpoint, config=None, scopes=None): + """Store configuration.""" + + if "google.oauth2" not in sys.modules: + msg = "google.oauth2 module not found, run: pip install -e . '.[google]'" + raise ModuleNotFoundError(msg) + + # Configuration manager + hs_config = hsconfig.get_config() + + if scopes is None: + scopes = hs_config.get('hs_google_scopes', 'openid').split() + self.scopes = scopes + + # Config is a client_secrets dictionary. + if isinstance(config, dict): + self.config = config + + # Config points to a client_secrets.json file. + elif isinstance(config, str) and os.path.isfile(config): + with open(config, 'r') as f: + self.config = json.loads(f.read()) + + # Maybe client_secrets are in environment variables? + else: + self.config = { + 'installed': { + 'project_id': hs_config.get('hs_google_project_id', None), + 'client_id': hs_config.get('hs_google_client_id', None), + 'client_secret': hs_config.get('hs_google_client_secret', None), + 'auth_uri': 'https://accounts.google.com/o/oauth2/auth', + 'token_uri': 'https://oauth2.googleapis.com/token', + 'auth_provider_x509_cert_url': 'https://www.googleapis.com/oauth2/v1/certs', + 'redirect_uris': ['urn:ietf:wg:oauth:2.0:oob', 'http://localhost'] + } + } + + super().__init__(endpoint) + + def _parse(self, creds): + """Parse credentials.""" + + # NOTE: In Google OpenID, if a client is set up for InstalledAppFlow + # then the client_secret is not actually treated as a secret. Acquire + # will ALWAYS prompt for user input before granting a token. + + token = { + 'accessToken': creds.id_token, + 'refreshToken': creds.refresh_token, + 'tokenUri': creds.token_uri, + 'clientId': creds.client_id, + 'clientSecret': creds.client_secret, + 'scopes': creds.scopes + } + + # The expiry field that is in creds is for the OAuth token, not the + # OpenID token. We need to validate the OpenID tokenn to get the exp. + idinfo = GoogleIDToken.verify_oauth2_token(creds.id_token, GoogleRequest()) + if 'exp' in idinfo: + token['expiresOn'] = idinfo['exp'] + + return token + + def acquire(self): + """Acquire a new Google token.""" + + flow = GoogleInstalledAppFlow.from_client_config(self.config, + scopes=self.scopes) + creds = flow.run_console() + self._token = self._parse(creds) + + def refresh(self): + """Try to renew a token.""" + + try: + + token = self._token + creds = GoogleCredentials(token=None, + refresh_token=token['refreshToken'], + scopes=token['scopes'], + token_uri=token['tokenUri'], + client_id=token['clientId'], + client_secret=token['clientSecret']) + + creds.refresh(GoogleRequest()) + self._token = self._parse(creds) + + except Exception: + self._token = None + + +class KeycloakOpenID(OpenIDHandler): + + def __init__(self, endpoint, config=None, scopes=None, username=None, password=None): + """Store configuration.""" + + # Configuration manager + hs_config = hsconfig.get_config() + + if scopes is None: + scopes = hs_config.get('hs_keycloak_scopes', 'openid').split() + self.scopes = scopes + + # Config is a client_secrets dictionary. + if isinstance(config, dict): + self.config = config + + # Config points to a client_secrets.json file. + elif isinstance(config, str) and os.path.isfile(config): + with open(config, 'r') as f: + self.config = json.loads(f.read()) + + # Maybe configs are in environment variables? + else: + self.config = { + 'keycloak_client_id': hs_config.get('hs_keycloak_client_id', None), + 'keycloak_client_secret': hs_config.get('hs_keycloak_client_secret', None), + 'keycloak_realm': hs_config.get('hs_keycloak_realm', None), + 'keycloak_uri': hs_config.get('hs_keycloak_uri', None) + } + + super().__init__(endpoint, username=username, password=password) + + def _getKeycloakUrl(self): + if not self.config['keycloak_uri']: + raise KeyError("keycloak_uri not set") + if not self.config['keycloak_realm']: + raise KeyError("Keycloak realm not set") + if not self.config['keycloak_client_id']: + raise KeyError("keycloak client_id not set") + + url = self.config['keycloak_uri'] + url += "/realms/" + url += self.config['keycloak_realm'] + url += "/protocol/openid-connect/token" + + return url + + def _parse(self, creds): + """Parse credentials.""" + + # validate json returned by keycloak + if "token_type" not in creds: + raise IOError("Unexpected Keycloak JWT, no token_type") + if creds["token_type"].lower() != "bearer": + raise IOError("Unexpected Keycloak JWT, expected Bearer token") + + token = {} + if "access_token" not in creds: + raise IOError("Unexpected Keycloak JWT, no access_token") + token["accessToken"] = creds["access_token"] + if "refesh_token" in creds: + token["refreshToken"] = creds["refresh_token"] + if "expires_in" in creds: + now = time.time() + token['expiresOn'] = now + creds["expires_in"] + + # TBD: client_secret + # TBD: scopes + # TBD: client_id + + return token + + def acquire(self): + """Acquire a new Keycloak token.""" + keycloak_url = self._getKeycloakUrl() + + headers = {"Content-Type": "application/x-www-form-urlencoded"} + body = {} + body["username"] = self._username + body["password"] = self._password + body["grant_type"] = "password" + body["client_id"] = self.config.get("keycloak_client_id") + rsp = requests.post(keycloak_url, data=body, headers=headers) + + if rsp.status_code not in (200, 201): + print(f"POST error: {rsp.status_code}") + raise IOError(f"Keycloak response: {rsp.status_code}") + + creds = rsp.json() # TBD: catch json format errors? + self._token = self._parse(creds) + + def refresh(self): + """Try to renew a token.""" + # TBD + # unclear if refresh is supported without a client secret + self._token = None diff --git a/src/h5json/selections.py b/src/h5json/selections.py new file mode 100644 index 0000000..3a94b09 --- /dev/null +++ b/src/h5json/selections.py @@ -0,0 +1,847 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## + +# We use __getitem__ side effects, which pylint doesn't like. +# pylint: disable=pointless-statement + +""" + High-level access to HDF5 dataspace selections +""" + +from __future__ import absolute_import + +import numpy as np + +H5S_SEL_POINTS = 0 +H5S_SELECT_SET = 1 +H5S_SELECT_APPEND = 2 +H5S_SELECT_PREPEND = 3 +H5S_SELECT_OR = 4 +H5S_SELECT_NONE = 5 +H5S_SELECT_ALL = 6 +H5S_SELECT_HYPERSLABS = 7 +H5S_SELECT_NOTB = 8 +H5S_SELLECT_FANCY = 9 + + +def select(obj, args): + """ High-level routine to generate a selection from arbitrary arguments + to __getitem__. The arguments should be the following: + + obj + Datatset object + + args + Either a single argument or a tuple of arguments. See below for + supported classes of argument. + + Argument classes: + + Single Selection instance + Returns the argument. + + numpy.ndarray + Must be a boolean mask. Returns a PointSelection instance. + + RegionReference + Returns a Selection instance. + + Indices, slices, ellipses only + Returns a SimpleSelection instance + + Indices, slices, ellipses, lists or boolean index arrays + Returns a FancySelection instance. + """ + if not isinstance(args, tuple): + args = (args,) + + if hasattr(obj, "shape") and obj.shape == (): + # scalar object + sel = ScalarSelection(obj.shape, args) + return sel + + # "Special" indexing objects + if len(args) == 1: + + arg = args[0] + + if isinstance(arg, Selection): + if arg.shape != obj.shape: + raise TypeError("Mismatched selection shape") + return arg + + elif isinstance(arg, np.ndarray) or isinstance(arg, list): + sel = PointSelection(obj.shape) + sel[arg] + return sel + """ + #todo - RegionReference + elif isinstance(arg, h5r.RegionReference): + sid = h5r.get_region(arg, dsid) + if shape != sid.shape: + raise TypeError("Reference shape does not match dataset shape") + + return Selection(shape, spaceid=sid) + """ + + for a in args: + use_fancy = False + if isinstance(a, np.ndarray): + use_fancy = True + elif a is []: + use_fancy = True + elif not isinstance(a, slice) and a is not Ellipsis: + try: + int(a) + except Exception: + use_fancy = True + if use_fancy and hasattr(obj, "shape"): + sel = FancySelection(obj.shape) + sel[args] + return sel + if hasattr(obj, "shape"): + sel = SimpleSelection(obj.shape) + else: + sel = SimpleSelection(obj) + sel[args] + return sel + + +def intersect(s1, s2): + """ Return the intersection of two selections """ + # TBD: this is currently only working for simple selections with stride 1 + valid_select_types = (H5S_SELECT_HYPERSLABS, H5S_SELECT_ALL) + if not isinstance(s1, Selection): + raise TypeError("Expected selection type for first arg") + if not isinstance(s2, Selection): + raise TypeError("Expected selection type for second arg") + if s1.select_type not in valid_select_types: + raise TypeError("Expected hyperslab selection for first arg") + if s2.select_type not in valid_select_types: + raise TypeError("Expected hyperslab selection for second arg") + if s1.shape != s2.shape: + raise ValueError("selections have incompatible shapes") + + slices = [] + rank = len(s1.shape) + for dim in range(rank): + start = max(s1.start[dim], s2.start[dim]) + stop = min(s1.start[dim] + s1.count[dim], s2.start[dim] + s2.count[dim]) + msg = "stepped slices not currently supported" + if s1.step[dim] > 1: + raise ValueError(msg) + if s2.step[dim] > 1: + raise ValueError("stepped slices not currently supported") + if start > stop: + stop = start + slices.append(slice(start, stop, 1)) + slices = tuple(slices) + + return select(s1.shape, slices) + + +class Selection(object): + + """ + Base class for HDF5 dataspace selections. Subclasses support the + "selection protocol", which means they have at least the following + members: + + __init__(shape) => Create a new selection on "shape"-tuple + __getitem__(args) => Perform a selection with the range specified. + What args are allowed depends on the + particular subclass in use. + + id (read-only) => h5py.h5s.SpaceID instance + shape (read-only) => The shape of the dataspace. + mshape (read-only) => The shape of the selection region. + Not guaranteed to fit within "shape", although + the total number of points is less than + product(shape). + nselect (read-only) => Number of selected points. Always equal to + product(mshape). + + broadcast(target_shape) => Return an iterable which yields dataspaces + for read, based on target_shape. + + The base class represents "unshaped" selections (1-D). + """ + + def __init__(self, shape, *args, **kwds): + """ Create a selection. """ + + shape = tuple(shape) + self._shape = shape + + self._select_type = H5S_SELECT_ALL + + @property + def select_type(self): + """ SpaceID instance """ + return self._select_type + + @property + def shape(self): + """ Shape of whole dataspace """ + return self._shape + + @property + def nselect(self): + """ Number of elements currently selected """ + + return self.getSelectNpoints() + + @property + def mshape(self): + """ Shape of selection (always 1-D for this class) """ + return (self.nselect,) + + def getSelectNpoints(self): + npoints = None + if self._select_type == H5S_SELECT_NONE: + npoints = 0 + elif self._select_type == H5S_SELECT_ALL: + dims = self._shape + npoints = 1 + for nextent in dims: + npoints *= nextent + else: + raise IOError("Unsupported select type") + return npoints + + def broadcast(self, target_shape): + """ Get an iterable for broadcasting """ + if np.product(target_shape) != self.nselect: + raise TypeError("Broadcasting is not supported for point-wise selections") + yield self._id + + def __getitem__(self, args): + raise NotImplementedError("This class does not support indexing") + + def __repr__(self): + return f"Selection(shape:{self._shape})" + + +class PointSelection(Selection): + + """ + Represents a point-wise selection. You can supply sequences of + points to the three methods append(), prepend() and set(), or a + single boolean array to __getitem__. + """ + def __init__(self, shape, *args, **kwds): + """ Create a Point selection. """ + Selection.__init__(self, shape, *args, **kwds) + self._points = [] + + @property + def points(self): + """ selection points """ + return self._points + + def getSelectNpoints(self): + npoints = None + if self._select_type == H5S_SELECT_NONE: + npoints = 0 + elif self._select_type == H5S_SELECT_ALL: + dims = self._shape + npoints = 1 + for nextent in dims: + npoints *= nextent + elif self._select_type == H5S_SEL_POINTS: + dims = self._shape + rank = len(dims) + if len(self._points) == rank and not type(self._points[0]) in (list, tuple, np.ndarray): + npoints = 1 + else: + npoints = len(self._points) + else: + raise IOError("Unsupported select type") + return npoints + + def _perform_selection(self, points, op): + """ Internal method which actually performs the selection """ + if isinstance(points, np.ndarray) or True: + points = np.asarray(points, order='C', dtype='u8') + if len(points.shape) == 1: + # points.shape = (1,points.shape[0]) + pass + + if self._select_type != H5S_SEL_POINTS: + op = H5S_SELECT_SET + self._select_type = H5S_SEL_POINTS + + if op == H5S_SELECT_SET: + self._points = points + elif op == H5S_SELECT_APPEND: + self._points.extent(points) + elif op == H5S_SELECT_PREPEND: + tmp = self._points + self._points = points + self._points.extend(tmp) + else: + raise ValueError("Unsupported operation") + + # def _perform_list_selection(points, H5S_SELECT_SET): + + def __getitem__(self, arg): + """ Perform point-wise selection from a NumPy boolean array """ + if isinstance(arg, list): + points = arg + else: + if not (isinstance(arg, np.ndarray) and arg.dtype.kind == 'b'): + raise TypeError("PointSelection __getitem__ only works with bool arrays") + if not arg.shape == self._shape: + raise TypeError("Boolean indexing array has incompatible shape") + + points = np.transpose(arg.nonzero()) + self.set(points) + return self + + def append(self, points): + """ Add the sequence of points to the end of the current selection """ + self._perform_selection(points, H5S_SELECT_APPEND) + + def prepend(self, points): + """ Add the sequence of points to the beginning of the current selection """ + self._perform_selection(points, H5S_SELECT_PREPEND) + + def set(self, points): + """ Replace the current selection with the given sequence of points""" + """ + if isinstance(points, list): + # selection with list of points + self._perform_list_selection(points, H5S_SELECT_SET) + + else: + # selection with boolean ndarray + """ + self._perform_selection(points, H5S_SELECT_SET) + + def __repr__(self): + return f"PointSelection(shape:{self._shape}, {len(self._points)} points)" + + +class SimpleSelection(Selection): + + """ A single "rectangular" (regular) selection composed of only slices + and integer arguments. Can participate in broadcasting. + """ + + @property + def mshape(self): + """ Shape of current selection """ + return self._mshape + + @property + def start(self): + return self._sel[0] + + @property + def count(self): + return self._sel[1] + + @property + def step(self): + return self._sel[2] + + def __init__(self, shape, *args, **kwds): + Selection.__init__(self, shape, *args, **kwds) + rank = len(self._shape) + self._sel = ((0,) * rank, self._shape, (1,) * rank, (False,) * rank) + self._mshape = self._shape + self._select_type = H5S_SELECT_ALL + + def __getitem__(self, args): + + if not isinstance(args, tuple): + args = (args,) + + if self._shape == (): + if len(args) > 0 and args[0] not in (Ellipsis, ()): + raise TypeError("Invalid index for scalar dataset (only ..., () allowed)") + self._select_type = H5S_SELECT_ALL + return self + + start, count, step, scalar = _handle_simple(self._shape, args) + self._sel = (start, count, step, scalar) + + # self._id.select_hyperslab(start, count, step) + self._select_type = H5S_SELECT_HYPERSLABS + + self._mshape = tuple(x for x, y in zip(count, scalar) if not y) + + return self + + def getSelectNpoints(self): + """Return number of elements in current selection + """ + npoints = None + if self._select_type == H5S_SELECT_NONE: + npoints = 0 + elif self._select_type == H5S_SELECT_ALL: + dims = self._shape + npoints = 1 + for nextent in dims: + npoints *= nextent + elif self._select_type == H5S_SELECT_HYPERSLABS: + dims = self._shape + npoints = 1 + rank = len(dims) + for i in range(rank): + npoints *= self.count[i] + else: + raise IOError("Unsupported select type") + return npoints + + def getQueryParam(self): + """ Get select param for use with HDF Rest API""" + param = '' + rank = len(self._shape) + if rank == 0: + return None + + param += "[" + for i in range(rank): + start = self.start[i] + stop = start + (self.count[i] * self.step[i]) + if stop > self._shape[i]: + stop = self._shape[i] + dim_sel = str(start) + ':' + str(stop) + if self.step[i] != 1: + dim_sel += ':' + str(self.step[i]) + if i != rank - 1: + dim_sel += ',' + param += dim_sel + param += ']' + return param + + def broadcast(self, target_shape): + """ Return an iterator over target dataspaces for broadcasting. + + Follows the standard NumPy broadcasting rules against the current + selection shape (self._mshape). + """ + if self._shape == (): + if np.product(target_shape) != 1: + raise TypeError(f"Can't broadcast {target_shape} to scalar") + self._id.select_all() + yield self._id + return + + start, count, step, scalar = self._sel + + rank = len(count) + target = list(target_shape) + + tshape = [] + for idx in range(1, rank + 1): + if len(target) == 0 or scalar[-idx]: # Skip scalar axes + tshape.append(1) + else: + t = target.pop() + if t == 1 or count[-idx] == t: + tshape.append(t) + else: + raise TypeError(f"Can't broadcast {target_shape} -> {count}") + tshape.reverse() + tshape = tuple(tshape) + + chunks = tuple(x // y for x, y in zip(count, tshape)) + nchunks = int(np.product(chunks)) + + if nchunks == 1: + yield self._id + else: + sid = self._id.copy() + sid.select_hyperslab((0,) * rank, tshape, step) + for idx in range(nchunks): + offset = tuple(x * y * z + s for x, y, z, s in zip(np.unravel_index(idx, chunks), tshape, step, start)) + sid.offset_simple(offset) + yield sid + + @property + def slices(self): + """ return tuple of slices for this selection """ + rank = len(self.shape) + slices = [] + for dim in range(rank): + start = self.start[dim] + stop = start + self.count[dim] + step = self.step[dim] + slices.append(slice(start, stop, step)) + return tuple(slices) + + def __repr__(self): + s = f"SimpleSelection(shape:{self._shape}, start: {self._sel[0]}," + s += f" count: {self._sel[1]}, step: {self._sel[2]}" + return s + + +class FancySelection(Selection): + + """ + Implements advanced NumPy-style selection operations in addition to + the standard slice-and-int behavior. + + Indexing arguments may be ints, slices, lists of indicies, or + per-axis (1D) boolean arrays. + + Broadcasting is not supported for these selections. + """ + + @property + def slices(self): + return self._slices + + @property + def mshape(self): + """ Shape of current selection """ + return self._mshape + + def __init__(self, shape, *args, **kwds): + Selection.__init__(self, shape, *args, **kwds) + self._slices = [] + + def __getitem__(self, args): + + if not isinstance(args, tuple): + args = (args,) + + args = _expand_ellipsis(args, len(self._shape)) + select_type = H5S_SELECT_HYPERSLABS # will adjust if we have a coord + + # Create list of slices and/or coordinates + slices = [] + mshape = [] + num_coordinates = None + for idx, arg in enumerate(args): + length = self._shape[idx] + if isinstance(arg, slice): + _, count, _ = _translate_slice(arg, length) # raise exception for invalid slice + if arg.start is None: + start = 0 + else: + start = arg.start + if arg.stop is None: + stop = length + else: + stop = arg.stop + if arg.step is None: + step = 1 + else: + step = arg.step + slices.append(slice(start, stop, step)) + mshape.append(count) + + elif hasattr(arg, 'dtype') and arg.dtype == np.dtype('bool'): + if len(arg.shape) != 1: + raise TypeError("Boolean indexing arrays must be 1-D") + arg = arg.nonzero()[0] + try: + slices.append(list(arg)) + except TypeError: + pass + else: + if sorted(arg) != list(arg): + raise TypeError("Indexing elements must be in increasing order") + mshape.append(len(arg)) + select_type = H5S_SELLECT_FANCY + elif isinstance(arg, list) or hasattr(arg, 'dtype'): + # coordinate selection + slices.append(arg) + for x in arg: + if x < 0 or x >= length: + raise IndexError(f"Index ({arg}) out of range (0-{length - 1})") + if num_coordinates is None: + num_coordinates = len(arg) + elif num_coordinates == len(arg): + # second set of coordinates doesn't effect mshape + continue + else: + # this shouldn't happen since HSDS would have thrown an error + raise ValueError("coordinate num element missmatch") + mshape.append(len(arg)) + select_type = H5S_SELLECT_FANCY + elif isinstance(arg, int): + if arg < 0 or arg >= length: + raise IndexError(f"Index ({arg}) out of range (0-{length - 1})") + slices.append(arg) + elif isinstance(arg, type(Ellipsis)): + slices.append(slice(0, length, 1)) + else: + raise TypeError(f"Unexpected arg type: {arg} - {type(arg)}") + self._slices = slices + self._select_type = select_type + self._mshape = tuple(mshape) + + def getSelectNpoints(self): + """Return number of elements in current selection + """ + npoints = 1 + for idx, s in enumerate(self._slices): + if isinstance(s, slice): + length = self._shape[idx] + _, count, _ = _translate_slice(s, length) + elif isinstance(s, list): + count = len(s) + else: + # scalar selection + count = 1 + npoints *= count + + return npoints + + def getQueryParam(self): + """ Get select param for use with HDF Rest API""" + query = [] + query.append('[') + rank = len(self._slices) + for dim, s in enumerate(self._slices): + if isinstance(s, slice): + if s.start is None and s.stop is None: + query.append(':') + elif s.stop is None: + query.append(f"{s.start}:") + else: + query.append(f"{s.start}:{s.stop}") + if s.step and s.step != 1: + query.append(f":{s.step}") + elif isinstance(s, list) or hasattr(s, 'dtype'): + query.append('[') + for idx, n in enumerate(s): + query.append(str(n)) + if idx + 1 < len(s): + query.append(',') + query.append(']') + else: + # scalar selection + query.append(str(s)) + if dim + 1 < rank: + query.append(',') + query.append(']') + return "".join(query) + + def broadcast(self, target_shape): + raise TypeError("Broadcasting is not supported for complex selections") + + def __repr__(self): + return f"FancySelection(shape:{self._shape}, slices: {self._slices})" + + +def _expand_ellipsis(args, rank): + """ Expand ellipsis objects and fill in missing axes. + """ + n_el = sum(1 for arg in args if arg is Ellipsis) + if n_el > 1: + raise ValueError("Only one ellipsis may be used.") + elif n_el == 0 and len(args) != rank: + args = args + (Ellipsis,) + + final_args = [] + n_args = len(args) + for arg in args: + + if arg is Ellipsis: + final_args.extend((slice(None, None, None),) * (rank - n_args + 1)) + else: + final_args.append(arg) + + if len(final_args) > rank: + raise TypeError("Argument sequence too long") + + return final_args + + +def _handle_simple(shape, args): + """ Process a "simple" selection tuple, containing only slices and + integer objects. Return is a 4-tuple with tuples for start, + count, step, and a flag which tells if the axis is a "scalar" + selection (indexed by an integer). + + If "args" is shorter than "shape", the remaining axes are fully + selected. + """ + args = _expand_ellipsis(args, len(shape)) + + start = [] + count = [] + step = [] + scalar = [] + + for arg, length in zip(args, shape): + if isinstance(arg, slice): + x, y, z = _translate_slice(arg, length) + s = False + else: + try: + x, y, z = _translate_int(int(arg), length) + s = True + except TypeError: + raise TypeError(f'Illegal index "{arg}" (must be a slice or number)') + start.append(x) + count.append(y) + step.append(z) + scalar.append(s) + + return tuple(start), tuple(count), tuple(step), tuple(scalar) + + +def _translate_int(exp, length): + """ Given an integer index, return a 3-tuple + (start, count, step) + for hyperslab selection + """ + if exp < 0: + exp = length + exp + + if not 0 <= exp < length: + raise IndexError(f"Index ({exp}) out of range (0-{length - 1})") + + return exp, 1, 1 + + +def _translate_slice(exp, length): + """ Given a slice object, return a 3-tuple + (start, count, step) + for use with the hyperslab selection routines + """ + start, stop, step = exp.indices(length) + # Now if step > 0, then start and stop are in [0, length]; + # if step < 0, they are in [-1, length - 1] (Python 2.6b2 and later; + # Python issue 3004). + + if step < 1: + raise ValueError("Step must be >= 1 (got %d)" % step) + if stop < start: + stop = start + + count = 1 + (stop - start - 1) // step + + return start, count, step + + +def guess_shape(sid): + """ Given a dataspace, try to deduce the shape of the selection. + + Returns one of: + * A tuple with the selection shape, same length as the dataspace + * A 1D selection shape for point-based and multiple-hyperslab selections + * None, for unselected scalars and for NULL dataspaces + """ + + sel_class = sid.get_simple_extent_type() # Dataspace class + sel_type = sid.get_select_type() # Flavor of selection in use + + if sel_class == 'H5S_NULL': + # NULL dataspaces don't support selections + return None + + elif sel_class == 'H5S_SCALAR': + # NumPy has no way of expressing empty 0-rank selections, so we use None + if sel_type == H5S_SELECT_NONE: + return None + if sel_type == H5S_SELECT_ALL: + return tuple() + + elif sel_class != 'H5S_SIMPLE': + raise TypeError(f"Unrecognized dataspace class {sel_class}") + + # We have a "simple" (rank >= 1) dataspace + + N = sid.get_select_npoints() + rank = len(sid.shape) + + if sel_type == H5S_SELECT_NONE: + return (0,) * rank + + elif sel_type == H5S_SELECT_ALL: + return sid.shape + + elif sel_type == H5S_SEL_POINTS: + # Like NumPy, point-based selections yield 1D arrays regardless of + # the dataspace rank + return (N,) + + elif sel_type != H5S_SELECT_HYPERSLABS: + raise TypeError(f"Unrecognized selection method {sel_type}") + + # We have a hyperslab-based selection + + if N == 0: + return (0,) * rank + + bottomcorner, topcorner = (np.array(x) for x in sid.get_select_bounds()) + + # Shape of full selection box + boxshape = topcorner - bottomcorner + np.ones((rank,)) + + def get_n_axis(sid, axis): + """ Determine the number of elements selected along a particular axis. + + To do this, we "mask off" the axis by making a hyperslab selection + which leaves only the first point along the axis. For a 2D dataset + with selection box shape (X, Y), for axis 1, this would leave a + selection of shape (X, 1). We count the number of points N_leftover + remaining in the selection and compute the axis selection length by + N_axis = N/N_leftover. + """ + + if (boxshape[axis]) == 1: + return 1 + + start = bottomcorner.copy() + start[axis] += 1 + count = boxshape.copy() + count[axis] -= 1 + + # Throw away all points along this axis + masked_sid = sid.copy() + masked_sid.select_hyperslab(tuple(start), tuple(count), op=H5S_SELECT_NOTB) + + N_leftover = masked_sid.get_select_npoints() + + return N // N_leftover + + shape = tuple(get_n_axis(sid, x) for x in range(rank)) + + if np.product(shape) != N: + # This means multiple hyperslab selections are in effect, + # so we fall back to a 1D shape + return (N,) + + return shape + + +class ScalarSelection(Selection): + + """ + Implements slicing for scalar datasets. + """ + + @property + def mshape(self): + return self._mshape + + def __init__(self, shape, *args, **kwds): + Selection.__init__(self, shape, *args, **kwds) + arg = None + if len(args) > 0: + arg = args[0] + if arg == (): + self._mshape = None + self._select_type = H5S_SELECT_ALL + elif arg == (Ellipsis,): + self._mshape = () + self._select_type = H5S_SELECT_ALL + else: + raise ValueError("Illegal slicing argument for scalar dataspace") diff --git a/src/h5json/writer/__init__.py b/src/h5json/writer/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/test/integ/h5tojson_test.py b/test/integ/h5tojson_test.py index 68b0464..5be40c8 100644 --- a/test/integ/h5tojson_test.py +++ b/test/integ/h5tojson_test.py @@ -35,7 +35,7 @@ "compound.h5", "compound_array.h5", "compound_array_attr.h5", - # "compound_array_vlen_string.h5", # crashes python w/ Linux! + "compound_array_vlen_string.h5", # crashes python w/ Linux? "compound_array_dset.h5", "compound_attr.h5", "compound_committed.h5", @@ -47,8 +47,8 @@ "enum_attr.h5", "enum_dset.h5", "fillvalue.h5", - "fixed_string_attr.h5", # temp for trying travis - "fixed_string_dset.h5", # temp for trying travis + "fixed_string_attr.h5", + "fixed_string_dset.h5", "h5ex_d_alloc.h5", "h5ex_d_checksum.h5", "h5ex_d_chunk.h5", diff --git a/test/integ/jsontoh5_test.py b/test/integ/jsontoh5_test.py index dad5648..3be3a3b 100644 --- a/test/integ/jsontoh5_test.py +++ b/test/integ/jsontoh5_test.py @@ -36,7 +36,7 @@ # "compound_array.json", # "compound_array_attr.json", # "compound_array_dset.json", - "compound_array_vlen_string.json", + # "compound_array_vlen_string.json", # regression "compound_attr.json", "compound_committed.json", "dim_scale.json", @@ -95,7 +95,7 @@ "regionref_attr.json", # "regionref_dset.json", "scalar_attr.json", - "vlen_attr.json", + # "vlen_attr.json", #regression "vlen_dset.json", "vlen_string_attr.json", "vlen_string_dset.json", diff --git a/test/unit/array_util_test.py b/test/unit/array_util_test.py new file mode 100644 index 0000000..1ede343 --- /dev/null +++ b/test/unit/array_util_test.py @@ -0,0 +1,1269 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import json +import numpy as np + +import base64 + +from h5json.array_util import bytesArrayToList +from h5json.array_util import toTuple +from h5json.array_util import getNumElements +from h5json.array_util import jsonToArray +from h5json.array_util import arrayToBytes +from h5json.array_util import bytesToArray +from h5json.array_util import getByteArraySize +from h5json.array_util import IndexIterator +from h5json.array_util import ndarray_compare +from h5json.array_util import getNumpyValue +from h5json.array_util import getBroadcastShape +from h5json.array_util import isVlen + +from h5json.hdf5dtype import special_dtype +from h5json.hdf5dtype import check_dtype +from h5json.hdf5dtype import createDataType + + +class ArrayUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(ArrayUtilTest, self).__init__(*args, **kwargs) + # main + + def testByteArrayToList(self): + data_items = ( + 42, + "foo", + b"foo", + [1, 2, 3], + (1, 2, 3), + ["A", "B", "C"], + [b"A", b"B", b"C"], + [["A", "B"], [b"a", b"b", b"c"]], + ) + for data in data_items: + json_data = bytesArrayToList(data) + # will throw TypeError if not able to convert + json.dumps(json_data) + + def testToTuple(self): + data0d = 42 # scalar + data1d1 = [1] # one dimensional, one element list + data1d = [1, 2, 3, 4, 5] # list + data2d1 = [ + [1, 2], + ] # two dimensional, one element + data2d = [[1, 0.1], [2, 0.2], [3, 0.3], [4, 0.4]] # list of two-element lists + data3d = [[[0, 0.0], [1, 0.1]], [[2, 0.2], [3, 0.3]]] # list of list of lists + out = toTuple(0, data0d) + self.assertEqual(data0d, out) + out = toTuple(1, data1d1) + self.assertEqual(data1d1, out) + out = toTuple(1, data1d) + self.assertEqual(data1d, out) + out = toTuple(2, data2d) + self.assertEqual(data2d, out) + out = toTuple(1, data2d1) + self.assertEqual([(1, 2)], out) + out = toTuple(3, data3d) + self.assertEqual(data3d, out) + out = toTuple(1, data2d) # treat input as 1d array of two-field compound types + self.assertEqual([(1, 0.1), (2, 0.2), (3, 0.3), (4, 0.4)], out) + out = toTuple(2, data3d) # treat input as 2d array of two-field compound types + self.assertEqual([[(0, 0.0), (1, 0.1)], [(2, 0.2), (3, 0.3)]], out) + out = toTuple(1, data3d) # treat input a 1d array of compound type of compound types + self.assertEqual([((0, 0.0), (1, 0.1)), ((2, 0.2), (3, 0.3))], out) + + def testToTupleStrData(self): + data = "a string!" + out = toTuple(0, data) + self.assertEqual(data, out) + + data = ["a string!"] + out = toTuple(1, data) + self.assertEqual(data, out) + + data = ["a string2"] + out = toTuple(1, data) + self.assertEqual(data, out) + + data = [["partA", "partB", "partC"],] + out = toTuple(1, data) + self.assertEqual([("partA", "partB", "partC"), ], out) + + data = [[[4, 8, 12], "four"], [[5, 10, 15], "five"]] + out = toTuple(1, data) + self.assertEqual([((4, 8, 12), 'four'), ((5, 10, 15), 'five')], out) + + def testGetNumElements(self): + shape = (4,) + nelements = getNumElements(shape) + self.assertEqual(nelements, 4) + + shape = [10,] + nelements = getNumElements(shape) + self.assertEqual(nelements, 10) + + shape = (10, 8) + nelements = getNumElements(shape) + self.assertEqual(nelements, 80) + + def testJsonToArray(self): + + # simple integer + dt = np.dtype("i4") + shape = [4, ] + data = [0, 2, 4, 6] + out = jsonToArray(shape, dt, data) + + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (4,)) + for i in range(4): + self.assertEqual(out[i], i * 2) + + shape = () # scalar + data = 42 + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + self.assertEqual(out[()], 42) + + shape = (1, ) # one element + data = 42 + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1, )) + self.assertEqual(out[0], 42) + + shape = (10, ) # multi-1D + data = list(range(10)) + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (10, )) + self.assertEqual(out[5], 5) + + shape = (5, 4) # multi-2D + data = [] + for i in range(5): + data.append([42, ] * 4) + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, 4)) + self.assertEqual(out[2, 3], 42) + + shape = (5, 4) # multi-2D, reshape input data + data = [42, ] * 20 + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, 4)) + self.assertEqual(out[2, 3], 42) + + dt = np.dtype("S10") # fixed size string + shape = [5, ] + data = ["parting", "is", "such", "sweet", "sorrow"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, )) + self.assertEqual(out[4], b'sorrow') + + shape = () # scalar + data = "a string" + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + self.assertEqual(out[()], b'a string') + + # VLEN Scalar str + dt = special_dtype(vlen=str) + data = "I'm a string!" + shape = [] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + val = out[()] + self.assertEqual(val, data) + + # VLEN one element str + dt = special_dtype(vlen=str) + data = "I'm a string!" + shape = [1,] + out = jsonToArray(shape, dt, [data,]) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + val = out[0] + self.assertEqual(val, data) + + # VLEN multi element + shape = [5, ] + data = ["parting", "is", "such", "sweet", "sorrow"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5, )) + self.assertEqual(out[4], 'sorrow') + + # VLEN ascii + dt = special_dtype(vlen=bytes) + data = [b"one", b"two", b"three", b"four", b"five"] + shape = [5, ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (5,)) + self.assertTrue("vlen" in out.dtype.metadata) + self.assertEqual(out.dtype.metadata["vlen"], bytes) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(out.shape, (5,)) + # TBD: code does not actually enforce use of bytes vs. str, + # probably not worth the effort to fix + self.assertEqual(out[2], b"three") + self.assertEqual(out[3], b"four") + + # VLEN unicode + dt = special_dtype(vlen=bytes) + data = ["one", "two", "three", "four", "five"] + shape = [5, ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertTrue("vlen" in out.dtype.metadata) + self.assertEqual(out.dtype.metadata["vlen"], bytes) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(out[2], "three") + + # test utf8 strings + dt = np.dtype("S26") + shape = [] + data = "eight: \u516b" + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data.encode("utf8")) + + dt = special_dtype(vlen=str) + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data) + + data = ["I'm an UTF-8 null terminated string",] + shape = [1,] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[0], data[0]) + + dt = np.dtype("S12") + data = "eight: \u516b" + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data.encode("utf8")) + + # UTF8 encode the data first + out = jsonToArray(shape, dt, data.encode('utf8')) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[()], data.encode('utf8')) + + # one-element array + shape = [1,] + dt = np.dtype("S12") + data = "eight: \u516b" + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out[0], b'eight: \xe5\x85\xab') + + # VLEN data + shape = [] + dt = special_dtype(vlen=np.dtype("S10")) + data = ["foo", "bar"] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + + dt = special_dtype(vlen=np.dtype("int32")) + shape = [4, ] + data = [ + [1,], + [1, 2], + [1, 2, 3], + [1, 2, 3, 4], + ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + + self.assertEqual(out.shape, (4,)) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + for i in range(4): + e = out[i] # .tolist() + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, tuple(range(1, i + 2))) + + # VLEN 2D data + dt = special_dtype(vlen=np.dtype("int32")) + shape = [2, 2] + data = [ + [ + [0,], + [1, 2], + ], + [ + [1,], + [2, 3], + ], + ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + + self.assertEqual(out.shape, (2, 2)) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("int32")) + for i in range(2): + for j in range(2): + e = out[i, j] # .tolist() + self.assertTrue(isinstance(e, tuple)) + + # create VLEN of obj ref's + ref_type = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} + vlen_type = {"class": "H5T_VLEN", "base": ref_type} + dt = createDataType(vlen_type) # np datatype + + id0 = b"g-a4f455b2-c8cf-11e7-8b73-0242ac110009" + id1 = b"g-a50af844-c8cf-11e7-8b73-0242ac110009" + id2 = b"g-a5236276-c8cf-11e7-8b73-0242ac110009" + + data = [ + [id0, ], + [id0, id1], + [id0, id1, id2], + ] + shape = [3, ] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + base_type = check_dtype(vlen=out.dtype) + self.assertEqual(base_type.kind, "S") + self.assertEqual(base_type.itemsize, 48) + + self.assertEqual(out.shape, (3,)) + self.assertEqual(out.dtype.kind, "O") + self.assertEqual(check_dtype(vlen=out.dtype), np.dtype("S48")) + + e = out[0] + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, (id0,)) + e = out[1] + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, (id0, id1)) + e = out[2] + self.assertTrue(isinstance(e, tuple)) + self.assertEqual(e, (id0, id1, id2)) + + # compound type + dt = np.dtype([("a", "i4"), ("b", "S5")]) + shape = [2, ] + data = [[4, "four"], [5, "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + + self.assertEqual(out.shape, (2,)) + self.assertTrue(isinstance(out[0], np.void)) + e0 = out[0].tolist() + self.assertEqual(e0, (4, b"four")) + self.assertTrue(isinstance(out[1], np.void)) + e1 = out[1].tolist() + self.assertEqual(e1, (5, b"five")) + + data = [[6, "six"],] + shape = [1,] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + self.assertTrue(isinstance(out[0], np.void)) + e1 = out[0].tolist() + self.assertEqual(e1, (6, b"six")) + + data = [7, "seven"] + shape = [] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + self.assertTrue(isinstance(out[()], np.void)) + e1 = out[()].tolist() + self.assertEqual(e1, (7, b"seven")) + + data = [8, "eight"] + shape = [1,] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + self.assertTrue(isinstance(out[0], np.void)) + e1 = out[0].tolist() + self.assertEqual(e1, (8, b"eight")) + + dt = np.dtype([("a", "i4"), ("b", "f4")]) + shape = [1, ] + data = [42, 0.42] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1, )) + e1 = out[0] + self.assertEqual(e1[0], 42) + + # compound with VLEN element + + dt_str = special_dtype(vlen=str) + dt = np.dtype([("a", "i4"), ("b", dt_str)]) + shape = [2, ] + data = [[4, "four"], [5, "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (2,)) + e0 = out[0].tolist() + self.assertEqual(e0, (4, "four")) + + shape = [1, ] + data = [[6, "six"],] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, (1,)) + e0 = out[0].tolist() + self.assertEqual(e0, (6, "six")) + + shape = [] + data = [7, "seven",] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + self.assertEqual(out.shape, ()) + e0 = out[()] + self.assertEqual(len(e0), 2) + self.assertEqual(e0[0], 7) + self.assertEqual(e0[1], "seven") + + # compound type with array field + dt = np.dtype([("a", ("i4", 3)), ("b", "S5")]) + shape = [2, ] + data = [[[4, 8, 12], "four"], [[5, 10, 15], "five"]] + out = jsonToArray(shape, dt, data) + self.assertTrue(isinstance(out, np.ndarray)) + + self.assertEqual(out.shape, (2,)) + self.assertTrue(isinstance(out[0], np.void)) + e0 = out[0] + self.assertEqual(len(e0), 2) + e0a = e0[0] + self.assertTrue(isinstance(e0a, np.ndarray)) + self.assertEqual(e0a[0], 4) + self.assertEqual(e0a[1], 8) + self.assertEqual(e0a[2], 12) + e0b = e0[1] + self.assertEqual(e0b, b"four") + self.assertTrue(isinstance(out[1], np.void)) + e1 = out[1] + self.assertEqual(len(e1), 2) + e1a = e1[0] + self.assertTrue(isinstance(e1a, np.ndarray)) + self.assertEqual(e1a[0], 5) + self.assertEqual(e1a[1], 10) + self.assertEqual(e1a[2], 15) + e1b = e1[1] + self.assertEqual(e1b, b"five") + + def testToBytes(self): + # Simple array + dt = np.dtype("u8") + arr = np.asarray((1, 2, 3, 4), dtype=dt) + buffer = arrayToBytes(arr) + self.assertEqual(buffer, arr.tobytes()) + + # fixed length string + dt = np.dtype("S8") + arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt) + buffer = arrayToBytes(arr) + self.assertEqual(buffer, arr.tobytes()) + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (3,)) + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # fixed length UTF8 string + dt = np.dtype("S10") + arr = np.asarray(b'eight: \xe5\x85\xab', dtype=dt) + buffer = arrayToBytes(arr) + + # convert back to array + arr_copy = bytesToArray(buffer, dt, ()) + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # invalid UTF string + dt = np.dtype("S2") + arr = np.asarray(b'\xff\xfe', dtype=dt) + buffer = arrayToBytes(arr) + + # convert back to array + arr_copy = bytesToArray(buffer, dt, ()) + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # invalid UTF string with base64 encoding + dt = np.dtype("S2") + arr = np.asarray(b'\xff\xfe', dtype=dt) + buffer = b'//4=' # this is the base64 encoding of b'\xff\xfe' + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # Compound non-vlen + dt = np.dtype([("x", "f8"), ("y", "i4")]) + arr = np.zeros((4,), dtype=dt) + arr[0] = (3.12, 42) + arr[3] = (1.28, 69) + buffer = arrayToBytes(arr) + self.assertEqual(buffer, arr.tobytes()) + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,)) + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # VLEN of int32's + dt = special_dtype(vlen=np.dtype(" expected_num_bytes) + + # convert buffer back to arr + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(np.array_equal(arr, arr_copy)) + + # fixed length string + dt = np.dtype("S8") + arr = np.asarray(("abcdefgh", "ABCDEFGH", "12345678"), dtype=dt) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (3,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # Compound non-vlen + dt = np.dtype([("x", "f8"), ("y", "i4")]) + arr = np.zeros((4,), dtype=dt) + arr[0] = (3.12, 42) + arr[3] = (1.28, 69) + buffer = arrayToBytes(arr, encoding="base64") + + # convert back to array + arr_copy = bytesToArray(buffer, dt, (4,), encoding="base64") + self.assertTrue(ndarray_compare(arr, arr_copy)) + + # VLEN of int32's + dt = special_dtype(vlen=np.dtype(" 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.INFO) + handler = logging.FileHandler("./h5json_reader_test.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + + def testSimple(self): + filepath = "data/json/tall.json" + kwargs = {"app_logger": self.log} + with Hdf5db(**kwargs) as db: + h5_reader = H5JsonReader(filepath, **kwargs) + db.reader = h5_reader + root_id = db.getObjectIdByPath("/") + root_json = db.getObjectById(root_id) + + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) + sel_all = selections.select((10, 10), ...) + arr = db.getDatasetValues(dset111_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (10, 10)) + for i in range(10): + for j in range(10): + v = arr[i, j] + self.assertEqual(v, i * j) + + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + + db.close() + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/h5json_writer_test.py b/test/unit/h5json_writer_test.py new file mode 100644 index 0000000..0f1fb59 --- /dev/null +++ b/test/unit/h5json_writer_test.py @@ -0,0 +1,345 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import time +import logging +import numpy as np +from h5json import Hdf5db +from h5json.jsonstore.h5json_writer import H5JsonWriter +from h5json.hdf5dtype import special_dtype, Reference +from h5json import selections + + +class H5JsonWriterTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5JsonWriterTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.DEBUG) + # create logger + + handler = logging.FileHandler("./hdf5dbtest.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + # self.log.propagate = False # prevent log out going to stdout + self.log.info("init!") + + def testSimple(self): + + filepath = "test/unit/out/h5json_writer_testSimple.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) + db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.flush() + + def testNullSpaceAttribute(self): + + filepath = "test/unit/out/h5json_writer_testNullSpaceAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + + def testScalarAttribute(self): + filepath = "test/unit/out/h5json_writer_testScalarAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + + def testFixedStringAttribute(self): + filepath = "test/unit/out/h5json_writer_testFixedStringAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + value = "Hello, world!" + db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + ret_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(ret_value, b'Hello, world!') + + def testVlenAsciiAttribute(self): + filepath = "test/unit/out/h5json_writer_testVlenAsciiAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" + dt = special_dtype(vlen=bytes) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + def testVlenUtf8Attribute(self): + filepath = "test/unit/out/h5json_writer_testVlenutf8Attribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" + dt = special_dtype(vlen=str) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + def testIntAttribute(self): + filepath = "test/unit/out/h5json_writer_testIntAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + value = [2, 3, 5, 7, 11] + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") + + def testCreateReferenceAttribute(self): + filepath = "test/unit/out/h5json_writer_testCreateReferenceAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + + dt = special_dtype(ref=Reference) + + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = item["value"] + self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref) + + def testCreateVlenReferenceAttribute(self): + filepath = "test/unit/out/h5json_writer_testVlenReferenceAttribute.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + + def testCommittedType(self): + filepath = "test/unit/out/h5json_writer_testCommittedType.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + dt = np.dtype("S15") + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) + + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + + def testCommittedCompoundType(self): + filepath = "test/unit/out/h5json_writer_testCommittedCompoundType.h5" + + with Hdf5db(app_logger=self.log) as db: + db.writer = H5JsonWriter(filepath, app_logger=self.log) + root_id = db.getObjectIdByPath("/") + + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + + item_type = item["type"] + + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/h5py_reader_test.py b/test/unit/h5py_reader_test.py new file mode 100644 index 0000000..7c11c4f --- /dev/null +++ b/test/unit/h5py_reader_test.py @@ -0,0 +1,89 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest + +import logging +from h5json import Hdf5db +from h5json.h5pystore.h5py_reader import H5pyReader + + +class H5pyReaderTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5pyReaderTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.INFO) + handler = logging.FileHandler("./hdf5dbtest.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + + def testSimple(self): + filepath = "data/hdf5/tall.h5" + kwargs = {"app_logger": self.log} + with Hdf5db(h5_reader=H5pyReader(filepath, **kwargs), **kwargs) as db: + root_id = db.getObjectIdByPath("/") + print("got root_id:", root_id) + root_json = db.getObjectById(root_id) + + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) + + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + + db.close() + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/h5py_writer_test.py b/test/unit/h5py_writer_test.py new file mode 100644 index 0000000..3ff91be --- /dev/null +++ b/test/unit/h5py_writer_test.py @@ -0,0 +1,590 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import time +import logging +import os + +import h5py +import numpy as np +from h5json import Hdf5db +from h5json.jsonstore.h5json_reader import H5JsonReader +from h5json.h5pystore.h5py_writer import H5pyWriter +from h5json.hdf5dtype import special_dtype, Reference +from h5json import selections + + +class H5pyWriterTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(H5pyWriterTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.DEBUG) + # create logger + + handler = logging.FileHandler("./h5pywriterbtest.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + # self.log.propagate = False # prevent log out going to stdout + self.log.info("init!") + + def testSimple(self): + + filepath = "test/unit/out/h5py_writer_test_testSimple.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + self.assertEqual(db.getObjectIdByPath("/"), root_id) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + db.createAttribute(g1_id, "a1", "hello") + db.close() + + # open file with h5py and verify changes + with h5py.File(filepath) as f: + self.assertTrue("attr1", f.attrs) + self.assertTrue("attr2", f.attrs) + self.assertEqual(len(f), 1) + self.assertTrue("g1" in f) + g1 = f["g1"] + self.assertTrue("a1" in g1.attrs) + self.assertEqual(len(g1), 0) + + db.open() + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) + db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.close() + + # open file with h5py and verify changes + with h5py.File(filepath) as f: + self.assertTrue("attr1", f.attrs) + self.assertTrue("attr2", f.attrs) + self.assertTrue("g1" in f) + g1 = f["g1"] + self.assertTrue("a1" in g1.attrs) + self.assertTrue("g1.1" in g1) + g11 = g1["g1.1"] + self.assertTrue("dset1.1.1" in g11) + dset = g11["dset1.1.1"] + self.assertEqual(dset.shape, (10, 10)) + for i in range(10): + for j in range(10): + self.assertEqual(dset[i, j], i * j) + self.assertTrue("g2" in f) + g2 = f["g2"] + self.assertTrue("extlink" in g2) + self.assertTrue("slink" in g2) + + db.open() + db.createAttribute(g1_id, "a2", "bye-bye") + db.close() + + with h5py.File(filepath) as f: + g1 = f["g1"] + self.assertEqual(len(g1.attrs), 2) + self.assertTrue("a1" in g1.attrs) + self.assertTrue("a2" in g1.attrs) + + db.open() + g21 = db.createGroup() + db.createHardLink(g2_id, "g2.1", g21) + db.close() + + with h5py.File(filepath) as f: + g2 = f["g2"] + self.assertTrue("g2.1" in g2) + + db.open() + sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + db.setDatasetValues(dset_111_id, sel, arr) + db.close() + + with h5py.File(filepath) as f: + dset = f["/g1/g1.1/dset1.1.1"] + for i in range(10): + for j in range(10): + if i == 4 and j == 4: + # this is the one element that was updated + expected = 42 + else: + expected = i * j + self.assertEqual(dset[i, j], expected) + + def testNullSpaceAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testNullSpaceAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in item) + shape_item = item["shape"] + self.assertTrue("class" in shape_item) + self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + self.assertEqual(f.attrs["A1"], h5py.Empty(dtype=np.int32)) + + def testScalarAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testNullScalarAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dims = () + value = 42 + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned + self.assertEqual(item["value"], 42) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + shape = item["shape"] + self.assertEqual(shape["class"], "H5S_SCALAR") + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, np.int32)) + self.assertEqual(a1, 42) + + def testFixedStringAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testFixedStringAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + value = "Hello, world!" + db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, bytes)) + self.assertEqual(a1, b'Hello, world!') + + def testVlenAsciiAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testVlenAsciiAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + value = b"Hello, world!" + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dt = special_dtype(vlen=bytes) + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, str)) + self.assertEqual(a1, value.decode("ascii")) + + def testVlenUtf8Attribute(self): + + filepath = "test/unit/out/h5py_writer_test_testVlenUtf8Attribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + value = "one: \u4e00" + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dt = special_dtype(vlen=str) + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") + self.assertEqual(item["value"], value) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, str)) + self.assertEqual(a1, value) + + def testIntAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testIntAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + value = [2, 3, 5, 7, 11] + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") + self.assertEqual(item["value"], [2, 3, 5, 7, 11]) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I16LE") + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertTrue(isinstance(a1, np.ndarray)) + self.assertEqual(a1.shape, (5,)) + for i in range(5): + self.assertEqual(a1[i], value[i]) + + def testCreateReferenceAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testCreateReferenceAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + dt = special_dtype(ref=Reference) + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_REFERENCE") + self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") + attr_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(len(attr_value), 1) + self.assertEqual(attr_value[0], ds1_ref.encode('ascii')) + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + obj_ref = a1[0] + obj = f[obj_ref] + self.assertEqual(obj.name, "/DS1") + + def testCreateVlenReferenceAttribute(self): + + filepath = "test/unit/out/h5py_writer_test_testVlenReferenceAttribute.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("DS1" in f) + ds1 = f["DS1"] + self.assertTrue(ds1) + self.assertTrue("G1" in f) + g1 = f["G1"] + self.assertTrue(g1) + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + ref_obj = f[a1[0]] + self.assertEqual(ref_obj.name, "/DS1") + + def testCommittedType(self): + + filepath = "test/unit/out/h5py_writer_test_testCommittedType.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + dt = np.dtype("S15") + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.createHardLink(root_id, "T1", ctype_id) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_STRING") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item_type["length"], 15) + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("T1" in f) + t1 = f["T1"] + self.assertTrue(isinstance(t1, h5py.Datatype)) + self.assertEqual(t1.dtype, dt) + + self.assertTrue("A1" in f.attrs) + a1 = f.attrs["A1"] + self.assertEqual(a1, b"hello world!") + + def testCommittedCompoundType(self): + + filepath = "test/unit/out/h5py_writer_test_testCommittedCompoundType.h5" + if os.path.isfile(filepath): + os.remove(filepath) # cleanup any previous run + + db = Hdf5db(app_logger=self.log) + db.writer = H5pyWriter(filepath, no_data=False) + root_id = db.open() + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", np.dtype(">f8"))) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + db.createHardLink(root_id, "T1", ctype_id) + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_COMPOUND") + fields = item_type["fields"] + self.assertEqual(len(fields), 4) + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + arr = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(arr, np.ndarray)) + db.close() + + with h5py.File(filepath) as f: + self.assertTrue("T1" in f) + t1 = f["T1"] + self.assertTrue(isinstance(t1, h5py.Datatype)) + self.assertEqual(len(t1.dtype), 4) + sub_dt = t1.dtype["field_1"] + self.assertEqual(sub_dt, np.dtype(">i8")) + sub_dt = t1.dtype["field_2"] + self.assertEqual(sub_dt, np.dtype(">f8")) + sub_dt = t1.dtype["field_3"] + self.assertEqual(sub_dt, np.dtype("S15")) + sub_dt = t1.dtype["field_4"] + self.assertEqual(sub_dt, h5py.special_dtype(vlen=str)) + + def testReaderWithUpdate(self): + + file_in = "data/json/tall.json" + file_out = "test/unit/out/h5py_writer_test_testReaderWithUpdate.h5" + if os.path.isfile(file_out): + os.remove(file_out) # cleanup any previous run + + db = Hdf5db(app_logger=self.log) + db.reader = H5JsonReader(file_in) + db.writer = H5pyWriter(file_out) + db.open() + # close should create everything the json reader read to the output file + db.close() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 2) + + db.open() + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + db.createAttribute(dset111_id, "attr3", "hello") + db.close() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 3) + self.assertEqual(dset111.attrs["attr3"], b"hello") + + db.open() + db.createAttribute(dset111_id, "attr3", "bye-bye") + db.close() + + with h5py.File(file_out) as f: + self.assertTrue("/g1/g1.1/dset1.1.1" in f) + dset111 = f["/g1/g1.1/dset1.1.1"] + self.assertEqual(len(dset111.attrs), 3) + self.assertEqual(dset111.attrs["attr3"], b"bye-bye") + g1 = f["g1"] + + db.open() + # create a new group + g13_id = db.createGroup() + g1_id = db.getObjectIdByPath("/g1") + db.createHardLink(g1_id, "g1.3", g13_id) + db.close() + + with h5py.File(file_out) as f: + g1 = f["g1"] + self.assertEqual(len(g1), 3) + self.assertTrue("g1.3" in g1) + + db.open() + # create a new dataset + dset_id = db.createDataset(shape=(10, 10), dtype=np.int32) + db.createHardLink(g1_id, "DS1", dset_id) + db.close() + + with h5py.File(file_out) as f: + g1 = f["g1"] + self.assertTrue("DS1" in g1) + ds1 = g1["DS1"] + self.assertEqual(ds1.shape, (10, 10)) + + db.open() + arr = np.asarray(range(10), dtype=np.int32) + sel = selections.select((10, 10), (slice(5, 6), slice(0, 10))) + db.setDatasetValues(dset_id, sel, arr) + db.close() + + with h5py.File(file_out) as f: + ds1 = f["/g1/DS1"] + data = ds1[:, :] + for i in range(10): + for j in range(10): + if i == 5: + self.assertEqual(data[i, j], j) + else: + self.assertEqual(data[i, j], 0) + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/hdf5db_test.py b/test/unit/hdf5db_test.py index 6a310c6..cbd7c87 100755 --- a/test/unit/hdf5db_test.py +++ b/test/unit/hdf5db_test.py @@ -10,42 +10,13 @@ # request a copy from help@hdfgroup.org. # ############################################################################## import unittest -import os import time -import errno -import os.path as op -import stat import logging -import shutil +import numpy as np from h5json import Hdf5db - - -UUID_LEN = 36 # length for uuid strings - - -def getFile(name, tgt, ro=False): - src = "data/hdf5/" + name - logging.info("copying file to this directory: " + src) - - filepath = "./out/" + tgt - - if op.isfile(filepath): - # make sure it's writable, before we copy over it - os.chmod(filepath, stat.S_IWRITE | stat.S_IREAD) - shutil.copyfile(src, filepath) - if ro: - logging.info("make read-only") - os.chmod(filepath, stat.S_IREAD) - return filepath - - -def removeFile(name): - try: - os.stat(name) - except OSError: - return - # file does not exist - os.remove(name) +from h5json import selections +from h5json.objid import isRootObjId, isValidUuid, isSchema2Id +from h5json.hdf5dtype import special_dtype, Reference class Hdf5dbTest(unittest.TestCase): @@ -59,7 +30,7 @@ def __init__(self, *args, **kwargs): else: lhStdout = None - self.log.setLevel(logging.INFO) + self.log.setLevel(logging.DEBUG) # create logger handler = logging.FileHandler("./hdf5dbtest.log") @@ -71,778 +42,218 @@ def __init__(self, *args, **kwargs): # self.log.propagate = False # prevent log out going to stdout self.log.info("init!") - # create directory for test output files - if not os.path.exists("./out"): - os.makedirs("./out") - - def testInvalidPath(self): - filepath = "/tmp/thisisnotafile.h5" - try: - with Hdf5db(filepath, app_logger=self.log) as db: - self.log.error(f"Unexpected Hdf5db ref: {db}") - self.assertTrue(False) # shouldn't get here - except IOError as e: - self.assertEqual(e.errno, errno.ENXIO) - self.assertEqual(e.strerror, "file not found") - - def testInvalidFile(self): - filepath = getFile("notahdf5file.h5", "notahdf5file.h5") - try: - with Hdf5db(filepath, app_logger=self.log) as db: - self.log.error(f"Unexpected Hdf5db ref: {db}") - self.assertTrue(False) # shouldn't get here - except IOError as e: - self.assertEqual(e.errno, errno.EINVAL) - self.assertEqual(e.strerror, "not an HDF5 file") - - def testGetUUIDByPath(self): - # get test file - g1Uuid = None - filepath = getFile("tall.h5", "getuuidbypath.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - g1Uuid = db.getUUIDByPath("/g1") - self.assertEqual(len(g1Uuid), UUID_LEN) - obj = db.getObjByPath("/g1") - self.assertEqual(obj.name, "/g1") - for name in obj: - g = obj[name] - self.log.debug(f"got obj: {g}") - g1links = db.getLinkItems(g1Uuid) - self.assertEqual(len(g1links), 2) - for item in g1links: - self.assertEqual(len(item["id"]), UUID_LEN) - - # end of with will close file - # open again and verify we can get obj by name - with Hdf5db(filepath, app_logger=self.log) as db: - obj = db.getGroupObjByUuid(g1Uuid) - g1 = db.getObjByPath("/g1") - self.assertEqual(obj, g1) - - def testGetCounts(self): - filepath = getFile("tall.h5", "testgetcounts_tall.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - cnt = db.getNumberOfGroups() - self.assertEqual(cnt, 6) - cnt = db.getNumberOfDatasets() - self.assertEqual(cnt, 4) - cnt = db.getNumberOfDatatypes() - self.assertEqual(cnt, 0) - - filepath = getFile("empty.h5", "testgetcounts_empty.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - cnt = db.getNumberOfGroups() - self.assertEqual(cnt, 1) - cnt = db.getNumberOfDatasets() - self.assertEqual(cnt, 0) - cnt = db.getNumberOfDatatypes() - self.assertEqual(cnt, 0) - - def testGroupOperations(self): - # get test file - filepath = getFile("tall.h5", "tall_del_g11.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - rootuuid = db.getUUIDByPath("/") - root = db.getGroupObjByUuid(rootuuid) - self.assertEqual("/", root.name) - rootLinks = db.getLinkItems(rootuuid) - self.assertEqual(len(rootLinks), 2) - g1uuid = db.getUUIDByPath("/g1") - self.assertEqual(len(g1uuid), UUID_LEN) - g1Links = db.getLinkItems(g1uuid) - self.assertEqual(len(g1Links), 2) - g11uuid = db.getUUIDByPath("/g1/g1.1") - db.deleteObjectByUuid("group", g11uuid) - - def testCreateGroup(self): - # get test file - filepath = getFile("tall.h5", "tall_newgrp.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - numRootChildren = len(db.getLinkItems(rootUuid)) - self.assertEqual(numRootChildren, 2) - newGrpUuid = db.createGroup() - newGrp = db.getGroupObjByUuid(newGrpUuid) - self.assertNotEqual(newGrp, None) - db.linkObject(rootUuid, newGrpUuid, "g3") - numRootChildren = len(db.getLinkItems(rootUuid)) - self.assertEqual(numRootChildren, 3) - # verify linkObject can be called idempotent-ly - db.linkObject(rootUuid, newGrpUuid, "g3") - - def testGetLinkItemsBatch(self): - # get test file - filepath = getFile("group100.h5", "getlinkitemsbatch.h5") - marker = None - count = 0 - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - while True: - # get items 13 at a time - batch = db.getLinkItems(rootUuid, marker=marker, limit=13) - if len(batch) == 0: - break # done! - count += len(batch) - lastItem = batch[len(batch) - 1] - marker = lastItem["title"] - self.assertEqual(count, 100) - - def testGetItemHardLink(self): - filepath = getFile("tall.h5", "getitemhardlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.1") - item = db.getLinkItemByUuid(grpUuid, "dset1.1.1") - self.assertTrue("id" in item) - self.assertEqual(item["title"], "dset1.1.1") - self.assertEqual(item["class"], "H5L_TYPE_HARD") - self.assertEqual(item["collection"], "datasets") - self.assertTrue("target" not in item) - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetItemSoftLink(self): - filepath = getFile("tall.h5", "getitemsoftlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.2/g1.2.1") - item = db.getLinkItemByUuid(grpUuid, "slink") - self.assertTrue("id" not in item) - self.assertEqual(item["title"], "slink") - self.assertEqual(item["class"], "H5L_TYPE_SOFT") - self.assertEqual(item["h5path"], "somevalue") - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetItemExternalLink(self): - filepath = getFile("tall_with_udlink.h5", "getitemexternallink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.2") - item = db.getLinkItemByUuid(grpUuid, "extlink") - self.assertTrue("uuid" not in item) - self.assertEqual(item["title"], "extlink") - self.assertEqual(item["class"], "H5L_TYPE_EXTERNAL") - self.assertEqual(item["h5path"], "somepath") - self.assertEqual(item["file"], "somefile") - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetItemUDLink(self): - filepath = getFile("tall_with_udlink.h5", "getitemudlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g2") - item = db.getLinkItemByUuid(grpUuid, "udlink") - self.assertTrue("uuid" not in item) - self.assertEqual(item["title"], "udlink") - self.assertEqual(item["class"], "H5L_TYPE_USER_DEFINED") - self.assertTrue("h5path" not in item) - self.assertTrue("file" not in item) - self.assertTrue("mtime" in item) - self.assertTrue("ctime" in item) - - def testGetNumLinks(self): - filepath = getFile("tall.h5", "getnumlinks.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - g1 = db.getObjByPath("/g1") - numLinks = db.getNumLinksToObject(g1) - self.assertEqual(numLinks, 1) - - def testGetLinks(self): - g12_links = ("extlink", "g1.2.1") - hardLink = None - externalLink = None - filepath = getFile("tall_with_udlink.h5", "getlinks.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - grpUuid = db.getUUIDByPath("/g1/g1.2") - items = db.getLinkItems(grpUuid) - self.assertEqual(len(items), 2) - for item in items: - self.assertTrue(item["title"] in g12_links) - if item["class"] == "H5L_TYPE_HARD": - hardLink = item - elif item["class"] == "H5L_TYPE_EXTERNAL": - externalLink = item - self.assertEqual(hardLink["collection"], "groups") - self.assertTrue("id" in hardLink) - self.assertTrue("id" not in externalLink) - self.assertEqual(externalLink["h5path"], "somepath") - self.assertEqual(externalLink["file"], "somefile") - - def testDeleteLink(self): - # get test file - filepath = getFile("tall.h5", "deletelink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - numRootChildren = len(db.getLinkItems(rootUuid)) - self.assertEqual(numRootChildren, 2) - db.unlinkItem(rootUuid, "g2") - numRootChildren = len(db.getLinkItems(rootUuid)) - self.assertEqual(numRootChildren, 1) - - def testDeleteUDLink(self): - # get test file - filepath = getFile("tall_with_udlink.h5", "deleteudlink.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - g2Uuid = db.getUUIDByPath("/g2") - numG2Children = len(db.getLinkItems(g2Uuid)) - self.assertEqual(numG2Children, 3) - got_exception = False - try: - db.unlinkItem(g2Uuid, "udlink") - except IOError as ioe: - got_exception = True - self.assertEqual(ioe.errno, errno.EPERM) - self.assertTrue(got_exception) - numG2Children = len(db.getLinkItems(g2Uuid)) - self.assertEqual(numG2Children, 3) - - def testReadOnlyGetUUID(self): - # get test file - filepath = getFile("tall.h5", "readonlygetuuid.h5", ro=True) - # remove db file! - removeFile("./out/." + "readonlygetuuid.h5") - g1Uuid = None - with Hdf5db(filepath, app_logger=self.log) as db: - g1Uuid = db.getUUIDByPath("/g1") - self.assertEqual(len(g1Uuid), UUID_LEN) - obj = db.getObjByPath("/g1") - self.assertEqual(obj.name, "/g1") - - # end of with will close file - # open again and verify we can get obj by name - with Hdf5db(filepath, app_logger=self.log) as db: - obj = db.getGroupObjByUuid(g1Uuid) - g1 = db.getObjByPath("/g1") - self.assertEqual(obj, g1) - g1links = db.getLinkItems(g1Uuid) - self.assertEqual(len(g1links), 2) - for item in g1links: - self.assertEqual(len(item["id"]), UUID_LEN) - - def testReadDataset(self): - filepath = getFile("tall.h5", "readdataset.h5") - d111_values = None - d112_values = None - with Hdf5db(filepath, app_logger=self.log) as db: - d111Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - self.assertEqual(len(d111Uuid), UUID_LEN) - d111_values = db.getDatasetValuesByUuid(d111Uuid) - self.assertTrue(type(d111_values) is list) - self.assertEqual(len(d111_values), 10) - for i in range(10): - arr = d111_values[i] - self.assertEqual(len(arr), 10) - for j in range(10): - self.assertEqual(arr[j], i * j) - - d112Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.2") - self.assertEqual(len(d112Uuid), UUID_LEN) - d112_values = db.getDatasetValuesByUuid(d112Uuid) - self.assertTrue(type(d112_values) is list) - self.assertEqual(len(d112_values), 20) - for i in range(20): - self.assertEqual(d112_values[i], i) - - def testReadDatasetBinary(self): - filepath = getFile("tall.h5", "readdatasetbinary.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - d111Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - self.assertEqual(len(d111Uuid), UUID_LEN) - d111_data = db.getDatasetValuesByUuid(d111Uuid, format="binary") - self.assertTrue(type(d111_data) is bytes) - self.assertEqual(len(d111_data), 400) # 10x10x(4 byte type) - - d112Uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.2") - self.assertEqual(len(d112Uuid), UUID_LEN) - d112_data = db.getDatasetValuesByUuid(d112Uuid, format="binary") - self.assertEqual(len(d112_data), 80) # 20x(4 byte type) - - def testReadCompoundDataset(self): - filepath = getFile("compound.h5", "readcompound.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/dset") - self.assertEqual(len(dset_uuid), UUID_LEN) - dset_values = db.getDatasetValuesByUuid(dset_uuid) - - self.assertEqual(len(dset_values), 72) - elem = dset_values[0] - self.assertEqual(elem[0], 24) - self.assertEqual(elem[1], "13:53") - self.assertEqual(elem[2], 63) - self.assertEqual(elem[3], 29.88) - self.assertEqual(elem[4], "SE 10") - - def testReadDatasetCreationProp(self): - filepath = getFile("compound.h5", "readdatasetcreationprop.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/dset") - self.assertEqual(len(dset_uuid), UUID_LEN) - dset_item = db.getDatasetItemByUuid(dset_uuid) - self.assertTrue("creationProperties" in dset_item) - creationProp = dset_item["creationProperties"] - self.assertTrue("fillValue" in creationProp) - fillValue = creationProp["fillValue"] - - self.assertEqual(fillValue[0], 999) - self.assertEqual(fillValue[1], "99:90") - self.assertEqual(fillValue[2], 999) - self.assertEqual(fillValue[3], 999.0) - self.assertEqual(fillValue[4], "N") - - def testCreateScalarDataset(self): - creation_props = { - "allocTime": "H5D_ALLOC_TIME_LATE", - "fillTime": "H5D_FILL_TIME_IFSET", - "fillValue": "", - "layout": {"class": "H5D_CONTIGUOUS"}, - } - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": 1, - "strPad": "H5T_STR_NULLPAD", - } - filepath = getFile("empty.h5", "createscalardataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dims = () # if no space in body, default to scalar - max_shape = None - - db.createDataset( - datatype, dims, max_shape=max_shape, creation_props=creation_props - ) - - def testCreate1dDataset(self): - datatype = "H5T_STD_I64LE" - dims = (10,) - filepath = getFile("empty.h5", "create1ddataset.h5") - dset_uuid = None - with Hdf5db(filepath, app_logger=self.log) as db: - rsp = db.createDataset(datatype, dims) - - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - self.assertEqual(item["attributeCount"], 0) - type_item = item["type"] - self.assertEqual(type_item["class"], "H5T_INTEGER") - self.assertEqual(type_item["base"], "H5T_STD_I64LE") - shape_item = item["shape"] - self.assertEqual(shape_item["class"], "H5S_SIMPLE") - self.assertEqual(shape_item["dims"], (10,)) - - def testCreate2dExtendableDataset(self): - datatype = "H5T_STD_I64LE" - dims = (10, 10) - max_shape = (None, 10) - filepath = getFile("empty.h5", "create2dextendabledataset.h5") - dset_uuid = None - with Hdf5db(filepath, app_logger=self.log) as db: - rsp = db.createDataset(datatype, dims, max_shape=max_shape) - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - self.assertEqual(item["attributeCount"], 0) - type_item = item["type"] - self.assertEqual(type_item["class"], "H5T_INTEGER") - self.assertEqual(type_item["base"], "H5T_STD_I64LE") - shape_item = item["shape"] - self.assertEqual(shape_item["class"], "H5S_SIMPLE") - self.assertEqual(shape_item["dims"], (10, 10)) - self.assertTrue("maxdims" in shape_item) - self.assertEqual(shape_item["maxdims"], [0, 10]) - - def testCreateCommittedTypeDataset(self): - filepath = getFile("empty.h5", "createcommittedtypedataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 15, - } - item = db.createCommittedType(datatype) - type_uuid = item["id"] - - dims = () # if no space in body, default to scalar - rsp = db.createDataset(type_uuid, dims, max_shape=None, creation_props=None) - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - type_item = item["type"] - self.assertTrue("uuid" in type_item) - self.assertEqual(type_item["uuid"], type_uuid) - - def testCreateCommittedCompoundTypeDataset(self): - filepath = getFile("empty.h5", "createcommittedcompoundtypedataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - - datatype = {"class": "H5T_COMPOUND", "fields": []} - - type_fields = [] - type_fields.append({"name": "field_1", "type": "H5T_STD_I64BE"}) - type_fields.append({"name": "field_2", "type": "H5T_IEEE_F64BE"}) - - datatype["fields"] = type_fields - - creation_props = {"fillValue": [0, 0.0]} - - item = db.createCommittedType(datatype) - type_uuid = item["id"] - - dims = () # if no space in body, default to scalar - rsp = db.createDataset( - type_uuid, dims, max_shape=None, creation_props=creation_props - ) - dset_uuid = rsp["id"] - item = db.getDatasetItemByUuid(dset_uuid) - type_item = item["type"] - self.assertTrue("uuid" in type_item) - self.assertEqual(type_item["uuid"], type_uuid) - - def testReadZeroDimDataset(self): - filepath = getFile("zerodim.h5", "readzerodeimdataset.h5") - - with Hdf5db(filepath, app_logger=self.log) as db: - dsetUuid = db.getUUIDByPath("/dset") - self.assertEqual(len(dsetUuid), UUID_LEN) - dset_value = db.getDatasetValuesByUuid(dsetUuid) - self.assertEqual(dset_value, 42) - - def testReadNullSpaceDataset(self): - filepath = getFile("null_space_dset.h5", "readnullspacedataset.h5") - - with Hdf5db(filepath, app_logger=self.log) as db: - dsetUuid = db.getUUIDByPath("/DS1") - self.assertEqual(len(dsetUuid), UUID_LEN) - obj = db.getDatasetObjByUuid(dsetUuid) - shape_item = db.getShapeItemByDsetObj(obj) - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_NULL") - - def testReadScalarSpaceArrayDataset(self): - filepath = getFile("scalar_array_dset.h5", "readscalarspacearraydataset.h5") - - with Hdf5db(filepath, app_logger=self.log) as db: - dsetUuid = db.getUUIDByPath("/DS1") - self.assertEqual(len(dsetUuid), UUID_LEN) - obj = db.getDatasetObjByUuid(dsetUuid) - shape_item = db.getShapeItemByDsetObj(obj) - self.assertTrue("class" in shape_item) - self.assertEqual(shape_item["class"], "H5S_SCALAR") - - def testReadNullSpaceAttribute(self): - filepath = getFile("null_space_attr.h5", "readnullspaceattr.h5") + def testGroup(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isRootObjId(root_id)) + + g1_id = db.createGroup() + self.assertTrue(isSchema2Id(g1_id)) + self.assertFalse(isRootObjId(g1_id)) + self.assertTrue(isValidUuid(g1_id, obj_class="groups")) + db.createHardLink(root_id, "g1", g1_id) + + g2_id = db.createGroup() + self.assertTrue(isSchema2Id(g2_id)) + self.assertFalse(isRootObjId(g2_id)) + self.assertTrue(isValidUuid(g2_id, obj_class="groups")) + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + self.assertTrue(isSchema2Id(g1_1_id)) + self.assertFalse(isRootObjId(g1_1_id)) + self.assertTrue(isValidUuid(g1_1_id, obj_class="groups")) + db.createHardLink(g1_id, "g1.1", g1_1_id) + + self.assertEqual(db.getObjectIdByPath("g1"), g1_id) + self.assertEqual(db.getObjectIdByPath("/g1"), g1_id) + self.assertEqual(db.getObjectIdByPath("g1/"), g1_id) + + self.assertEqual(db.getObjectIdByPath("g1/g1.1"), g1_1_id) + self.assertEqual(db.getObjectIdByPath("/g1/g1.1"), g1_1_id) + self.assertEqual(db.getObjectIdByPath("g1/g1.1/"), g1_1_id) + + grp1_json = db.getObjectById(g1_id) + self.assertTrue("links" in grp1_json) + g1_links = grp1_json["links"] + self.assertTrue("g1.1" in g1_links) + g1_1_link = db.getLink(g1_id, "g1.1") + self.assertEqual(g1_1_link["class"], "H5L_TYPE_HARD") + self.assertEqual(g1_1_link["id"], g1_1_id) + self.assertTrue(g1_1_link["created"] > time.time() - 1.0) + + db.createSoftLink(g2_id, "slink", "somewhere") + soft_link = db.getLink(g2_id, "slink") + self.assertEqual(soft_link["class"], "H5L_TYPE_SOFT") + self.assertEqual(soft_link["h5path"], "somewhere") + self.assertTrue(soft_link["created"] > time.time() - 1.0) + + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + ext_link = db.getLink(g2_id, "extlink") + self.assertEqual(ext_link["class"], "H5L_TYPE_EXTERNAL") + self.assertEqual(ext_link["h5path"], "somewhere") + self.assertEqual(ext_link["file"], "someplace") + self.assertTrue(ext_link["created"] > time.time() - 1.0) + + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + cust_link = db.getLink(g2_id, "cust") + self.assertEqual(cust_link["class"], "H5L_TYPE_USER_DEFINED") + self.assertEqual(cust_link["foo"], "bar") + self.assertTrue(cust_link["created"] > time.time() - 1.0) + + links = db.getLinks(g2_id) + self.assertEqual(len(links), 3) + for title in "slink", "extlink", "cust": + self.assertTrue(title in links) + + db.deleteLink(g2_id, "cust") + links = db.getLinks(g2_id) + self.assertEqual(len(links), 2) + for title in "slink", "extlink": + self.assertTrue(title in links) - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - self.assertEqual(len(rootUuid), UUID_LEN) - item = db.getAttributeItem("groups", rootUuid, "attr1") + try: + db.getObjectIdByPath("/g1/foo") + self.assertTrue(False) + except KeyError: + pass # expected + + ret = db.getLink(g2_id, "not_a_link") + self.assertTrue(ret is None) + + def testNullSpaceAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + db.createAttribute(root_id, "A1", None, shape="H5S_NULL", dtype=np.int32) + item = db.getAttribute(root_id, "A1") self.assertTrue("shape" in item) shape_item = item["shape"] self.assertTrue("class" in shape_item) self.assertEqual(shape_item["class"], "H5S_NULL") + self.assertTrue(item["created"] > time.time() - 1.0) + value = db.getAttributeValue(root_id, "A1") + self.assertEqual(value, None) - def testReadAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("tall.h5", "readattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - rootUuid = db.getUUIDByPath("/") - self.assertEqual(len(rootUuid), UUID_LEN) - item = db.getAttributeItem("groups", rootUuid, "attr1") - self.assertTrue(item is not None) - - def testWriteScalarAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writescalarattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") + def testScalarAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") dims = () - datatype = "H5T_STD_I32LE" value = 42 - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") + db.createAttribute(root_id, "A1", value, shape=dims, dtype=np.int32) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") + self.assertEqual(len(shape_json.keys()), 1) # just one key should be returned + item_type = item["type"] + self.assertEqual(item_type["class"], "H5T_INTEGER") + self.assertEqual(item_type["base"], "H5T_STD_I32LE") + self.assertEqual(len(item_type.keys()), 2) # just two keys should be returned self.assertEqual(item["value"], 42) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) + self.assertTrue(item["created"] > now - 1) shape = item["shape"] self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I32LE") - self.assertEqual( - len(item_type.keys()), 2 - ) # just two keys should be returned - - def testWriteFixedStringAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writefixedstringattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLPAD", - "length": 13, - } + + def testFixedStringAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") value = "Hello, world!" - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") - self.assertEqual(item["value"], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") + db.createAttribute(root_id, "A1", value, dtype=np.dtype("S13")) # dims, datatype, value) + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") item_type = item["type"] - self.assertEqual(item_type["length"], 13) self.assertEqual(item_type["class"], "H5T_STRING") self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["length"], 13) self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) + ret_value = db.getAttributeValue(root_id, "A1") + self.assertEqual(ret_value, value.encode("ascii")) + + def testVlenAsciiAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") - def testWriteFixedNullTermStringAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writefixednulltermstringattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 13, - } value = b"Hello, world!" + dt = special_dtype(vlen=bytes) # write the attribute - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) + db.createAttribute(root_id, "A1", value, dtype=dt) # read it back - item = db.getAttributeItem("groups", root_uuid, "A1") - - self.assertEqual(item["name"], "A1") - # the following compare fails - see issue #34 - # self.assertEqual(item['value'], "Hello, world!") - now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") item_type = item["type"] - self.assertEqual(item_type["length"], 13) self.assertEqual(item_type["class"], "H5T_STRING") - # NULLTERM get's converted to NULLPAD since the numpy dtype does not - # support other padding conventions. - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") + self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") + self.assertEqual(item_type["length"], "H5T_VARIABLE") self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - - def testWriteVlenStringAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writevlenstringattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - } - - # value = np.string_("Hello, world!") - value = "Hello, world!" - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") self.assertEqual(item["value"], "Hello, world!") now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertTrue(item["created"] > now - 1) - def testReadVlenStringDataset(self): - item = None - filepath = getFile("vlen_string_dset.h5", "vlen_string_dset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - # actual padding is SPACEPAD - See issue #32 - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], "H5T_VARIABLE") - row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),)) - self.assertEqual(row, ["Parting"]) - - def testReadVlenStringDataset_utc(self): - item = None - filepath = getFile("vlen_string_dset_utc.h5", "vlen_string_dset_utc.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/ds1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 2293) + def testVlenUtf8Attribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + value = b"Hello, world!" + dt = special_dtype(vlen=str) + + # write the attribute + db.createAttribute(root_id, "A1", value, dtype=dt) + # read it back + item = db.getAttribute(root_id, "A1") + shape_json = item["shape"] + self.assertEqual(shape_json["class"], "H5S_SCALAR") item_type = item["type"] self.assertEqual(item_type["class"], "H5T_STRING") self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") self.assertEqual(item_type["length"], "H5T_VARIABLE") - # next line throws conversion error - see issue #19 - # row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),)) - - def testReadFixedStringDataset(self): - item = None - filepath = getFile("fixed_string_dset.h5", "fixed_string_dset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 7) - row = db.getDatasetValuesByUuid(dset_uuid) - self.assertEqual(row, ["Parting", "is such", "sweet", "sorrow."]) - row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),)) - self.assertEqual( - row, - [ - "Parting", - ], - ) - row = db.getDatasetValuesByUuid(dset_uuid, (slice(2, 3),)) - self.assertEqual( - row, - [ - "sweet", - ], - ) - - def testReadFixedStringDatasetBinary(self): - item = None - filepath = getFile("fixed_string_dset.h5", "fixed_string_dset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - dset_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(dset_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLPAD") - self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") - self.assertEqual(item_type["length"], 7) - row = db.getDatasetValuesByUuid(dset_uuid, format="binary") - self.assertEqual(row, b"Partingis suchsweet\x00\x00sorrow.") - row = db.getDatasetValuesByUuid(dset_uuid, (slice(0, 1),), format="binary") - self.assertEqual(row, b"Parting") - row = db.getDatasetValuesByUuid(dset_uuid, (slice(2, 3),), format="binary") - self.assertEqual(row, b"sweet\x00\x00") - - def testWriteVlenUnicodeAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writevlenunicodeattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = () - datatype = { - "charSet": "H5T_CSET_UTF8", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - } - value = "\u6b22\u8fce\u63d0\u4ea4\u5fae\u535a\u641c\u7d22\u4f7f\u7528\u53cd\u9988\uff0c\u8bf7\u76f4\u63a5" - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - - self.assertEqual(item["name"], "A1") - self.assertEqual(item["value"], value) - now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertEqual(item_type["class"], "H5T_STRING") - self.assertEqual(item_type["strPad"], "H5T_STR_NULLTERM") self.assertEqual(item_type["charSet"], "H5T_CSET_UTF8") - self.assertEqual(item_type["length"], "H5T_VARIABLE") + self.assertEqual(item["value"], "Hello, world!") + now = int(time.time()) + self.assertTrue(item["created"] > now - 1) - def testWriteIntAttribute(self): - # getAttributeItemByUuid - item = None - filepath = getFile("empty.h5", "writeintattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - dims = (5,) - datatype = "H5T_STD_I16LE" + def testIntAttribute(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") value = [2, 3, 5, 7, 11] - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - self.assertEqual(item["name"], "A1") + db.createAttribute(root_id, "A1", value, dtype=np.int16) + item = db.getAttribute(root_id, "A1") self.assertEqual(item["value"], [2, 3, 5, 7, 11]) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") + self.assertTrue(item["created"] > now - 1) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SIMPLE") + self.assertEqual(item_shape["dims"], [5,]) item_type = item["type"] self.assertEqual(item_type["class"], "H5T_INTEGER") self.assertEqual(item_type["base"], "H5T_STD_I16LE") def testCreateReferenceAttribute(self): - filepath = getFile("empty.h5", "createreferencedataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - - dims = () # if no space in body, default to scalar - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - dset_uuid = rsp["id"] - db.linkObject(root_uuid, dset_uuid, "DS1") - - dims = (1,) - datatype = {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"} - ds1_ref = "datasets/" + dset_uuid - value = [ - ds1_ref, - ] - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - - attr_type = item["type"] + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + + dt = special_dtype(ref=Reference) + + ds1_ref = "datasets/" + dset_id + value = [ds1_ref,] + db.createAttribute(root_id, "A1", value, dtype=dt) + item = db.getAttribute(root_id, "A1") + attr = db.getAttribute(root_id, "A1") + self.assertTrue("shape" in attr) + + attr_type = attr["type"] self.assertEqual(attr_type["class"], "H5T_REFERENCE") self.assertEqual(attr_type["base"], "H5T_STD_REF_OBJ") attr_value = item["value"] @@ -850,149 +261,47 @@ def testCreateReferenceAttribute(self): self.assertEqual(attr_value[0], ds1_ref) def testCreateVlenReferenceAttribute(self): - filepath = getFile("empty.h5", "createreferenceattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - - dims = () # if no space in body, default to scalar - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - dset_uuid = rsp["id"] - db.linkObject(root_uuid, dset_uuid, "DS1") - - dims = (1,) - datatype = { - "class": "H5T_VLEN", - "base": {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"}, - } - ds1_ref = "datasets/" + dset_uuid - value = [ - [ - ds1_ref, - ], - ] - db.createAttribute("groups", root_uuid, "A1", dims, datatype, value) - item = db.getAttributeItem("groups", root_uuid, "A1") - - attr_type = item["type"] - self.assertEqual(attr_type["class"], "H5T_VLEN") - base_type = attr_type["base"] - # todo - this should be H5T_REFERENCE, not H5T_OPAQUE - # See h5py issue: https://github.com/h5py/h5py/issues/553 - import h5py - - # test based on h5py version until we change install requirements - if h5py.version.version_tuple >= (2, 6, 0): - self.assertEqual(base_type["class"], "H5T_REFERENCE") - else: - self.assertEqual(base_type["class"], "H5T_OPAQUE") - - def testCreateReferenceListAttribute(self): - filepath = getFile("empty.h5", "createreferencelistattribute.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - - dims = (10,) - - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - dset_uuid = rsp["id"] - db.linkObject(root_uuid, dset_uuid, "dset") - - rsp = db.createDataset( - "H5T_STD_I64LE", dims, max_shape=None, creation_props=None - ) - xscale_uuid = rsp["id"] - nullterm_string_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": 16, - "strPad": "H5T_STR_NULLTERM", - } - scalar_dims = () - db.createAttribute( - "datasets", - xscale_uuid, - "CLASS", - scalar_dims, - nullterm_string_type, - "DIMENSION_SCALE", - ) - db.linkObject(root_uuid, xscale_uuid, "xscale") - - ref_dims = (1,) - datatype = { - "class": "H5T_VLEN", - "base": {"class": "H5T_REFERENCE", "base": "H5T_STD_REF_OBJ"}, - } - xscale_ref = "datasets/" + xscale_uuid - value = [ - (xscale_ref,), - ] - db.createAttribute( - "datasets", dset_uuid, "DIMENSION_LIST", ref_dims, datatype, value - ) - item = db.getAttributeItem("datasets", dset_uuid, "DIMENSION_LIST") - - attr_type = item["type"] - self.assertEqual(attr_type["class"], "H5T_VLEN") - base_type = attr_type["base"] - # todo - this should be H5T_REFERENCE, not H5T_OPAQUE - self.assertEqual(base_type["class"], "H5T_REFERENCE") + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape=(), dtype=np.int32) + db.createHardLink(root_id, "DS1", dset_id) + grp_id = db.createGroup() + db.createHardLink(root_id, "G1", grp_id) + + dt_base = special_dtype(ref=Reference) + dt = special_dtype(vlen=dt_base) + + ds1_ref = "datasets/" + dset_id + grp_ref = "groups/" + grp_id + ref_arr = np.zeros((2,), dtype=dt_base) + ref_arr[0] = ds1_ref + ref_arr[1] = grp_ref + vlen_arr = np.zeros((), dtype=dt) + vlen_arr[()] = ref_arr + + db.createAttribute(root_id, "A1", vlen_arr) + item = db.getAttribute(root_id, "A1") - def testReadCommittedType(self): - filepath = getFile("committed_type.h5", "readcommitted_type.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - type_uuid = db.getUUIDByPath("/Sensor_Type") - item = db.getCommittedTypeItemByUuid(type_uuid) - self.assertTrue("type" in item) item_type = item["type"] - self.assertTrue(item_type["class"], "H5T_COMPOUND") - ds1_uuid = db.getUUIDByPath("/DS1") - item = db.getDatasetItemByUuid(ds1_uuid) - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SIMPLE") - dims = shape["dims"] - self.assertEqual(len(dims), 1) - self.assertEqual(dims[0], 4) - item_type = item["type"] - self.assertTrue("class" in item_type) - self.assertEqual(item_type["class"], "H5T_COMPOUND") - self.assertTrue("uuid" in item_type) - self.assertEqual(item_type["uuid"], type_uuid) + self.assertEqual(item_type["class"], "H5T_VLEN") + self.assertEqual(item_type["size"], "H5T_VARIABLE") + base_type = item_type["base"] + self.assertEqual(base_type["class"], "H5T_REFERENCE") + self.assertEqual(base_type["base"], "H5T_STD_REF_OBJ") - item = db.getAttributeItem("groups", root_uuid, "attr1") - shape = item["shape"] - self.assertEqual(shape["class"], "H5S_SCALAR") - item_type = item["type"] - self.assertTrue("class" in item_type) - self.assertEqual(item_type["class"], "H5T_COMPOUND") - self.assertTrue("uuid" in item_type) - self.assertEqual(item_type["uuid"], type_uuid) - - def testWriteCommittedType(self): - filepath = getFile("empty.h5", "writecommittedtype.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - datatype = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 15, - } - item = db.createCommittedType(datatype) - type_uuid = item["id"] - item = db.getCommittedTypeItemByUuid(type_uuid) - self.assertEqual(item["id"], type_uuid) - self.assertEqual(item["attributeCount"], 0) + item_shape = item["shape"] + self.assertEqual(item_shape["class"], "H5S_SCALAR") + + def testCommittedType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dt = np.dtype("S15") + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - self.assertEqual(len(item["alias"]), 0) # anonymous, so no alias + self.assertTrue(item["created"] > now - 1) item_type = item["type"] @@ -1001,318 +310,123 @@ def testWriteCommittedType(self): self.assertEqual(item_type["charSet"], "H5T_CSET_ASCII") self.assertEqual(item_type["length"], 15) - def testWriteCommittedCompoundType(self): - filepath = getFile("empty.h5", "writecommittedcompoundtype.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - self.assertTrue(len(root_uuid) >= 36) - - datatype = {"class": "H5T_COMPOUND", "fields": []} - - fixed_str_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": 15, - } - - var_str_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": "H5T_VARIABLE", - "strPad": "H5T_STR_NULLTERM", - } - type_fields = [] - type_fields.append({"name": "field_1", "type": "H5T_STD_I64BE"}) - type_fields.append({"name": "field_2", "type": "H5T_IEEE_F64BE"}) - type_fields.append({"name": "field_3", "type": fixed_str_type}) - type_fields.append({"name": "field_4", "type": var_str_type}) - datatype["fields"] = type_fields - - item = db.createCommittedType(datatype) - type_uuid = item["id"] - item = db.getCommittedTypeItemByUuid(type_uuid) - self.assertEqual(item["id"], type_uuid) - self.assertEqual(item["attributeCount"], 0) + # create an attribute using the committed type + db.createAttribute(root_id, "A1", "hello world!", dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], "hello world!") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_STRING") + self.assertEqual(attr_type["length"], 15) + self.assertEqual(attr_type["charSet"], "H5T_CSET_ASCII") + + def testCommittedCompoundType(self): + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + + dt_str = special_dtype(vlen=str) + fields = [] + fields.append(("field_1", np.dtype(">i8"))) + fields.append(("field_2", ">f8")) + fields.append(("field_3", np.dtype("S15"))) + fields.append(("field_4", dt_str)) + dt = np.dtype(fields) + + ctype_id = db.createCommittedType(dt) + db.createHardLink(root_id, "ctype", ctype_id) + item = db.getObjectById(ctype_id) now = int(time.time()) - self.assertTrue(item["ctime"] > now - 5) - self.assertTrue(item["mtime"] > now - 5) - self.assertEqual(len(item["alias"]), 0) # anonymous, so no alias + self.assertTrue(item["created"] > now - 1) item_type = item["type"] self.assertEqual(item_type["class"], "H5T_COMPOUND") fields = item_type["fields"] self.assertEqual(len(fields), 4) - # todo - the last field class should be H5T_STRING, but it is getting - # saved to HDF5 as Opaque - see: https://github.com/h5py/h5py/issues/613 - # this is fixed in h5py v. 2.6.0 - check the version until 2.6.0 becomes - # available via pip and anaconda. - import h5py - - if h5py.version.version_tuple >= (2, 6, 0): - field_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_STRING") - else: - field_classes = ("H5T_INTEGER", "H5T_FLOAT", "H5T_STRING", "H5T_OPAQUE") - for i in range(4): - field = fields[i] - self.assertEqual(field["name"], "field_" + str(i + 1)) - field_type = field["type"] - self.assertEqual(field_type["class"], field_classes[i]) - - def testToRef(self): - - filepath = getFile("empty.h5", "toref.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - type_item = { - "order": "H5T_ORDER_LE", - "base_size": 1, - "class": "H5T_INTEGER", - "base": "H5T_STD_I8LE", - "size": 1, - } - data_list = [2, 3, 5, 7, 11] - ref_value = db.toRef(1, type_item, data_list) - self.assertEqual(ref_value, data_list) - - type_item = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "length": 8, - "strPad": "H5T_STR_NULLPAD", - } - data_list = ["Hypertext", "as", "engine", "of", "state"] - ref_value = db.toRef(1, type_item, data_list) - - def testToTuple(self): - filepath = getFile("empty.h5", "totuple.h5") - data1d = [1, 2, 3] - data2d = [[1, 2], [3, 4]] - data3d = [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] - with Hdf5db(filepath, app_logger=self.log) as db: - self.assertEqual(db.toTuple(1, data1d), [1, 2, 3]) - self.assertEqual(db.toTuple(2, data2d), [[1, 2], [3, 4]]) - self.assertEqual(db.toTuple(1, data2d), [(1, 2), (3, 4)]) - self.assertEqual( - db.toTuple(3, data3d), [[[1, 2], [3, 4]], [[5, 6], [7, 8]]] - ) - self.assertEqual( - db.toTuple(2, data3d), [[(1, 2), (3, 4)], [(5, 6), (7, 8)]] - ) - self.assertEqual( - db.toTuple(1, data3d), [((1, 2), (3, 4)), ((5, 6), (7, 8))] - ) - - def testBytesArrayToList(self): - filepath = getFile("empty.h5", "bytestostring.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - - val = db.bytesArrayToList(b"Hello") - self.assertTrue(type(val) is str) - val = db.bytesArrayToList( - [ - b"Hello", - ] - ) - self.assertEqual(len(val), 1) - self.assertTrue(type(val[0]) is str) - self.assertEqual(val[0], "Hello") - - import numpy as np - - data = np.array([b"Hello"]) - val = db.bytesArrayToList(data) - self.assertEqual(len(val), 1) - self.assertTrue(type(val[0]) is str) - self.assertEqual(val[0], "Hello") - - def testGetDataValue(self): - # typeItem, value, dimension=0, dims=None): - filepath = getFile("empty.h5", "bytestostring.h5") - string_type = { - "charSet": "H5T_CSET_ASCII", - "class": "H5T_STRING", - "strPad": "H5T_STR_NULLTERM", - "length": "H5T_VARIABLE", - } - - with Hdf5db(filepath, app_logger=self.log) as db: - - import numpy as np - - data = np.array([b"Hello"]) - val = db.getDataValue(string_type, data, dimension=1, dims=(1,)) - self.assertTrue(type(val[0]) is str) - - def testGetAclDataset(self): - filepath = getFile("tall.h5", "getacldataset.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - d111_uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - acl_dset = db.getAclDataset(d111_uuid, create=True) - self.assertTrue(acl_dset.name.endswith(d111_uuid)) - self.assertEqual(len(acl_dset.dtype), 7) - self.assertEqual(len(acl_dset.shape), 1) - self.assertEqual(acl_dset.shape[0], 0) - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - def testSetAcl(self): - filepath = getFile("tall.h5", "setacl.h5") - user1 = 123 - user2 = 456 - with Hdf5db(filepath, app_logger=self.log) as db: - d111_uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - # add read/write acl for user1 - acl_user1 = db.getAcl(d111_uuid, user1) - - self.assertEqual(acl_user1["userid"], 0) - acl_user1["userid"] = user1 - acl_user1["readACL"] = 0 - acl_user1["updateACL"] = 0 - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - db.setAcl(d111_uuid, acl_user1) - acl = db.getAcl(d111_uuid, user1) - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 1) - - # add read-only acl for user2 - acl_user2 = db.getAcl(d111_uuid, user2) - self.assertEqual(acl_user2["userid"], 0) - acl_user2["userid"] = user2 - acl_user2["create"] = 0 - acl_user2["read"] = 1 - acl_user2["update"] = 0 - acl_user2["delete"] = 0 - acl_user2["readACL"] = 0 - acl_user2["updateACL"] = 0 - db.setAcl(d111_uuid, acl_user2) - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 2) - - # fetch and verify acls - acl = db.getAcl(d111_uuid, user1) - self.assertEqual(acl["userid"], user1) - self.assertEqual(acl["create"], 1) - self.assertEqual(acl["read"], 1) - self.assertEqual(acl["update"], 1) - self.assertEqual(acl["delete"], 1) - self.assertEqual(acl["readACL"], 0) - self.assertEqual(acl["updateACL"], 0) - - acl = db.getAcl(d111_uuid, user2) - self.assertEqual(acl["userid"], user2) - self.assertEqual(acl["create"], 0) - self.assertEqual(acl["read"], 1) - self.assertEqual(acl["update"], 0) - self.assertEqual(acl["delete"], 0) - self.assertEqual(acl["readACL"], 0) - self.assertEqual(acl["updateACL"], 0) - - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 2) - - # get acl data_list - acls = db.getAcls(d111_uuid) - self.assertEqual(len(acls), 2) - - def testRootAcl(self): - filepath = getFile("tall.h5", "rootacl.h5") - user1 = 123 - with Hdf5db(filepath, app_logger=self.log) as db: - root_uuid = db.getUUIDByPath("/") - d111_uuid = db.getUUIDByPath("/g1/g1.1/dset1.1.1") - num_acls = db.getNumAcls(d111_uuid) - self.assertEqual(num_acls, 0) - - # add read/write acl for user1 at root - acl_root = db.getAcl(root_uuid, 0) - self.assertEqual(acl_root["userid"], 0) - acl_root["create"] = 0 - acl_root["read"] = 1 - acl_root["update"] = 0 - acl_root["delete"] = 0 - acl_root["readACL"] = 0 - acl_root["updateACL"] = 0 - num_acls = db.getNumAcls(root_uuid) - self.assertEqual(num_acls, 0) - - db.setAcl(root_uuid, acl_root) - num_acls = db.getNumAcls(root_uuid) - self.assertEqual(num_acls, 1) - - acl = db.getAcl(d111_uuid, user1) - num_acls = db.getNumAcls(d111_uuid) # this will fetch the root acl - self.assertEqual(num_acls, 0) - self.assertEqual(acl["userid"], 0) - self.assertEqual(acl["create"], 0) - self.assertEqual(acl["read"], 1) - self.assertEqual(acl["update"], 0) - self.assertEqual(acl["delete"], 0) - self.assertEqual(acl["readACL"], 0) - self.assertEqual(acl["updateACL"], 0) - - def testGetEvalStr(self): - queries = { - "date == 23": "rows['date'] == 23", - "wind == b'W 5'": "rows['wind'] == b'W 5'", - "temp > 61": "rows['temp'] > 61", - "(date >=22) & (date <= 24)": "(rows['date'] >=22) & (rows['date'] <= 24)", - "(date == 21) & (temp > 70)": "(rows['date'] == 21) & (rows['temp'] > 70)", - "(wind == b'E 7') | (wind == b'S 7')": "(rows['wind'] == b'E 7') | (rows['wind'] == b'S 7')", - } - - fields = ["date", "wind", "temp"] - filepath = getFile("empty.h5", "getevalstring.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - - for query in queries.keys(): - eval_str = db._getEvalStr(query, fields) - self.assertEqual(eval_str, queries[query]) - # print(query, "->", eval_str) - - def testBadQuery(self): - queries = ( - "foobar", # no variable used - "wind = b'abc", # non-closed literal - "(wind = b'N') & (temp = 32", # missing paren - "foobar > 42", # invalid field name - "import subprocess; subprocess.call(['ls', '/'])", - ) # injection attack - - fields = ("date", "wind", "temp") - filepath = getFile("empty.h5", "badquery.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - for query in queries: - try: - eval_str = db._getEvalStr(query, fields) - self.log.error(f"got eval_str: {eval_str}") - self.assertTrue(False) # shouldn't get here - except IOError: - pass # ok - - def testInjectionBlock(self): - queries = ( - "import subprocess; subprocess.call(['ls', '/'])", - ) # injection attack - - fields = ("import", "subprocess", "call") - filepath = getFile("empty.h5", "injectionblock.h5") - with Hdf5db(filepath, app_logger=self.log) as db: - - for query in queries: - try: - eval_str = db._getEvalStr(query, fields) - self.log.error(f"got eval_str: {eval_str}") - self.assertTrue(False) # shouldn't get here - except IOError: - pass # ok + + # create an attribute using the committed type + attr_value = (42, 3.14, "circle", "area = R^2 * PI") + db.createAttribute(root_id, "A1", attr_value, dtype=f"datatypes/{ctype_id}") + attr = db.getAttribute(root_id, "A1") + self.assertEqual(attr["value"], list(attr_value)) + attr_shape = attr["shape"] + self.assertEqual(attr_shape["class"], "H5S_SCALAR") + + attr_type = attr["type"] + self.assertEqual(attr_type["class"], "H5T_COMPOUND") + + value = db.getAttributeValue(root_id, "A1") + self.assertTrue(isinstance(value, np.ndarray)) + + def testSimpleDataset(self): + with Hdf5db(app_logger=self.log) as db: + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select(shape, ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, shape) + self.assertEqual(arr.min(), 0) + self.assertEqual(arr.max(), 0) + row = np.zeros((ncols,), dtype=dtype) + for i in range(nrows): + row[:] = list(range(i * 10, (i + 1) * 10)) + row_sel = selections.select(shape, (slice(i, i + 1), slice(0, ncols))) + db.setDatasetValues(dset_id, row_sel, row) + arr = db.getDatasetValues(dset_id, sel_all) + for i in range(nrows): + row = np.array(list(range(i * 10, (i + 1) * 10)), dtype=dtype) + np.testing.assert_array_equal(arr[i, :], row) + + def testScalarDataset(self): + dtype = np.int32 + with Hdf5db(app_logger=self.log) as db: + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset((), dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + sel_all = selections.select((), ...) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr[()], 0) + db.setDatasetValues(dset_id, sel_all, np.array(42, dtype=dtype)) + arr = db.getDatasetValues(dset_id, sel_all) + self.assertEqual(arr.dtype, dtype) + self.assertEqual(arr.shape, ()) + self.assertEqual(arr.min(), 42) + self.assertEqual(arr.max(), 42) + + def testResizableDataset(self): + with Hdf5db(app_logger=self.log) as db: + nrows = 8 + ncols = 10 + shape = (nrows, ncols) + dtype = np.int32 + maxdims = (None, ncols * 2) + root_id = db.getObjectIdByPath("/") + dset_id = db.createDataset(shape, maxdims=maxdims, dtype=dtype) + db.createHardLink(root_id, "dset", dset_id) + db.createAttribute(dset_id, "a1", "Hello, world") + + # resize limited dimension + db.resizeDataset(dset_id, (nrows, ncols * 2)) + + # try to go beyond max extent + try: + db.resizeDataset(dset_id, (nrows, ncols * 3)) + self.assertTrue(False) + except ValueError: + pass # expected + + # resize unlimited dimension + db.resizeDataset(dset_id, (nrows * 10, ncols)) if __name__ == "__main__": diff --git a/test/unit/hdf5dtype_test.py b/test/unit/hdf5dtype_test.py index 0f67d7b..fc0ffb4 100755 --- a/test/unit/hdf5dtype_test.py +++ b/test/unit/hdf5dtype_test.py @@ -2,8 +2,8 @@ # Copyright by The HDF Group. # # All rights reserved. # # # -# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # -# Utilities. The full HDF5 REST Server copyright notice, including # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # # terms governing use, modification, and redistribution, is contained in # # the file COPYING, which can be found at the root of the source code # # distribution tree. If you do not have access to this file, you may # @@ -12,11 +12,14 @@ import unittest import logging import numpy as np -from h5py import special_dtype -from h5py import check_dtype -from h5py import Reference -from h5py import RegionReference + from h5json import hdf5dtype +from h5json.hdf5dtype import special_dtype +from h5json.hdf5dtype import check_dtype +from h5json.hdf5dtype import Reference +from h5json.hdf5dtype import RegionReference +from h5json.hdf5dtype import isOpaqueDtype +from h5json.hdf5dtype import isVlen class Hdf5dtypeTest(unittest.TestCase): @@ -26,6 +29,31 @@ def __init__(self, *args, **kwargs): self.logger = logging.getLogger() self.logger.setLevel(logging.INFO) + def testGetBaseTypeJson(self): + type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F64LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_IEEE_F64LE") + + type_json = hdf5dtype.getBaseTypeJson("H5T_IEEE_F16LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_FLOAT") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_IEEE_F16LE") + + type_json = hdf5dtype.getBaseTypeJson("H5T_STD_I32LE") + self.assertTrue("class" in type_json) + self.assertEqual(type_json["class"], "H5T_INTEGER") + self.assertTrue("base" in type_json) + self.assertEqual(type_json["base"], "H5T_STD_I32LE") + + try: + hdf5dtype.getBaseTypeJson("foobar") + self.assertTrue(False) + except TypeError: + pass # expected + def testBaseIntegerTypeItem(self): dt = np.dtype("") self.assertEqual(dt.kind, "u") + self.assertFalse(isVlen(dt)) dt = hdf5dtype.createDataType("H5T_STD_I16LE") self.assertEqual(dt.name, "int16") @@ -317,10 +430,12 @@ def testCreateBaseType(self): dt = hdf5dtype.createDataType("H5T_IEEE_F64LE") self.assertEqual(dt.name, "float64") self.assertEqual(dt.kind, "f") + self.assertFalse(isVlen(dt)) dt = hdf5dtype.createDataType("H5T_IEEE_F32LE") self.assertEqual(dt.name, "float32") self.assertEqual(dt.kind, "f") + self.assertFalse(isVlen(dt)) typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I32BE"} typeSize = hdf5dtype.getItemSize(typeItem) @@ -328,6 +443,7 @@ def testCreateBaseType(self): self.assertEqual(dt.name, "int32") self.assertEqual(dt.kind, "i") self.assertEqual(typeSize, 4) + self.assertFalse(isVlen(dt)) def testCreateBaseStringType(self): typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_ASCII", "length": 6} @@ -336,15 +452,18 @@ def testCreateBaseStringType(self): self.assertEqual(dt.name, "bytes48") self.assertEqual(dt.kind, "S") self.assertEqual(typeSize, 6) + self.assertFalse(isVlen(dt)) def testCreateBaseUnicodeType(self): - typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 32} - try: - # dt = hdf5dtype.createDataType(typeItem) - hdf5dtype.createDataType(typeItem) - self.assertTrue(False) # expected exception - except TypeError: - pass + typeItem = {"class": "H5T_STRING", "charSet": "H5T_CSET_UTF8", "length": 6} + + dt = hdf5dtype.createDataType(typeItem) + typeSize = hdf5dtype.getItemSize(typeItem) + self.assertTrue(dt is not None) + self.assertEqual(dt.name, "bytes48") + self.assertEqual(dt.kind, "S") # uses byte + self.assertEqual(typeSize, 6) + self.assertFalse(isVlen(dt)) def testCreateNullTermStringType(self): typeItem = { @@ -355,9 +474,11 @@ def testCreateNullTermStringType(self): } typeSize = hdf5dtype.getItemSize(typeItem) dt = hdf5dtype.createDataType(typeItem) + self.assertEqual(dt.name, "bytes48") self.assertEqual(dt.kind, "S") self.assertEqual(typeSize, 6) + self.assertFalse(isVlen(dt)) def testCreateVLenStringType(self): typeItem = { @@ -371,6 +492,28 @@ def testCreateVLenStringType(self): self.assertEqual(dt.kind, "O") self.assertEqual(check_dtype(vlen=dt), bytes) self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertTrue(isVlen(dt)) + + def testCreateVLenStringArrayType(self): + typeItem = { + "class": "H5T_ARRAY", + "dims": (2, 2), + "base": { + "class": "H5T_STRING", + "charSet": "H5T_CSET_ASCII", + "length": "H5T_VARIABLE", + } + } + typeSize = hdf5dtype.getItemSize(typeItem) + dt = hdf5dtype.createDataType(typeItem) + self.assertEqual(dt.name, "void256") # assuming 8-byte pointers + self.assertEqual(dt.kind, "V") + self.assertEqual(dt.shape, (2, 2)) + self.assertEqual(check_dtype(vlen=dt), None) + self.assertEqual(check_dtype(vlen=dt.base), bytes) + self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertEqual(dt.base.kind, 'O') + self.assertTrue(isVlen(dt)) def testCreateVLenUTF8Type(self): typeItem = { @@ -384,14 +527,16 @@ def testCreateVLenUTF8Type(self): self.assertEqual(dt.kind, "O") self.assertEqual(check_dtype(vlen=dt), str) self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertTrue(isVlen(dt)) def testCreateVLenDataType(self): typeItem = {"class": "H5T_VLEN", "base": "H5T_STD_I32BE"} typeSize = hdf5dtype.getItemSize(typeItem) + self.assertEqual(typeSize, "H5T_VARIABLE") dt = hdf5dtype.createDataType(typeItem) self.assertEqual(dt.name, "object") self.assertEqual(dt.kind, "O") - self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertTrue(isVlen(dt)) def testCreateOpaqueType(self): typeItem = {"class": "H5T_OPAQUE", "size": 200} @@ -400,17 +545,13 @@ def testCreateOpaqueType(self): self.assertEqual(dt.name, "void1600") self.assertEqual(dt.kind, "V") self.assertEqual(typeSize, 200) + self.assertFalse(isVlen(dt)) def testCreateEnumType(self): typeItem = { "class": "H5T_ENUM", "base": {"base": "H5T_STD_I16LE", "class": "H5T_INTEGER"}, - "members": [ - {"name": "GAS", "value": 2}, - {"name": "LIQUID", "value": 1}, - {"name": "PLASMA", "value": 3}, - {"name": "SOLID", "value": 0}, - ], + "mapping": {"GAS": 2, "LIQUID": 1, "PLASMA": 3, "SOLID": 0}, } typeSize = hdf5dtype.getItemSize(typeItem) @@ -424,12 +565,13 @@ def testCreateEnumType(self): self.assertEqual(mapping["LIQUID"], 1) self.assertEqual(mapping["GAS"], 2) self.assertEqual(mapping["PLASMA"], 3) + self.assertFalse(isVlen(dt)) def testCreateBoolType(self): typeItem = { "class": "H5T_ENUM", "base": {"base": "H5T_STD_I8LE", "class": "H5T_INTEGER"}, - "members": [{"name": "TRUE", "value": 1}, {"name": "FALSE", "value": 0}], + "mapping": {"TRUE": 1, "FALSE": 0}, } typeSize = hdf5dtype.getItemSize(typeItem) @@ -437,6 +579,38 @@ def testCreateBoolType(self): dt = hdf5dtype.createDataType(typeItem) self.assertEqual(dt.name, "bool") self.assertEqual(dt.kind, "b") + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) + + def testCreateReferenceType(self): + typeItem = { + "class": "H5T_REFERENCE", + "base": "H5T_STD_REF_OBJ", + "length": 48, + "charSet": "H5T_CSET_ASCII", + "strPad": "H5T_STR_NULLPAD" + } + typeSize = hdf5dtype.getItemSize(typeItem) + self.assertEqual(typeSize, 48) + dt = hdf5dtype.createDataType(typeItem) + self.assertEqual(dt.kind, "S") + self.assertTrue(dt.metadata['ref'] is Reference) + self.assertEqual(check_dtype(ref=dt), Reference) + self.assertFalse(isVlen(dt)) + + def testCreateVlenReferenceType(self): + typeItem = { + 'class': 'H5T_VLEN', + 'base': {'class': 'H5T_REFERENCE', 'base': 'H5T_STD_REF_OBJ'} + } + typeSize = hdf5dtype.getItemSize(typeItem) + self.assertEqual(typeSize, 'H5T_VARIABLE') + dt = hdf5dtype.createDataType(typeItem) + self.assertEqual(dt.kind, "O") + base = dt.metadata['vlen'] + self.assertTrue(base.metadata['ref'] is Reference) + self.assertEqual(check_dtype(ref=base), Reference) + self.assertTrue(isVlen(dt)) def testCreateCompoundType(self): typeItem = { @@ -461,11 +635,35 @@ def testCreateCompoundType(self): self.assertEqual(dt.name, "void144") self.assertEqual(dt.kind, "V") self.assertEqual(len(dt.fields), 4) + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertTrue(isVlen(dt)) + dtLocation = dt[2] self.assertEqual(dtLocation.name, "object") self.assertEqual(dtLocation.kind, "O") self.assertEqual(check_dtype(vlen=dtLocation), bytes) self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dtLocation)) + + def testCreateCompoundInvalidFieldName(self): + typeItem = { + "class": "H5T_COMPOUND", + "fields": [ + { + "name": "\u03b1", + "type": {"base": "H5T_STD_I32LE", "class": "H5T_INTEGER"}, + }, + { + "name": "\u03c9", + "type": {"base": "H5T_STD_I32LE", "class": "H5T_INTEGER"}, + }, + ], + } + try: + hdf5dtype.createDataType(typeItem) + self.assertTrue(False) + except TypeError: + pass # expected def testCreateCompoundOfCompoundType(self): typeItem = { @@ -528,6 +726,7 @@ def testCreateCompoundOfCompoundType(self): self.assertEqual(dt.name, "void160") self.assertEqual(dt.kind, "V") self.assertEqual(len(dt.fields), 2) + self.assertFalse(isVlen(dt)) dt_field1 = dt[0] self.assertEqual(dt_field1.name, "void64") self.assertEqual(dt_field1.kind, "V") @@ -552,6 +751,8 @@ def testCreateCompoundTypeUnicodeFields(self): self.assertEqual(dt.kind, "V") self.assertEqual(len(dt.fields), 3) self.assertEqual(typeSize, 10) + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) def testCreateArrayType(self): typeItem = {"class": "H5T_ARRAY", "base": "H5T_STD_I64LE", "dims": (3, 5)} @@ -559,16 +760,46 @@ def testCreateArrayType(self): dt = hdf5dtype.createDataType(typeItem) self.assertEqual(dt.name, "void960") self.assertEqual(dt.kind, "V") + self.assertEqual(dt.base.kind, "i") self.assertEqual(typeSize, 120) + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) + + def testCreateCompoundArrayVlenType(self): + typeItem = { + "fields": [ + {"type": {"class": "H5T_INTEGER", "base": "H5T_STD_U64BE"}, "name": "VALUE"}, + {"type": {"class": "H5T_FLOAT", "base": "H5T_IEEE_F64BE"}, "name": "VALUE2"}, + {"type": {"class": "H5T_ARRAY", "dims": [8], + "base": { + "class": "H5T_STRING", + "charSet": "H5T_CSET_ASCII", + "strPad": "H5T_STR_NULLTERM", + "length": "H5T_VARIABLE" + } # noqa: E126 + }, + "name": "VALUE3"} + ], # noqa: E123 + "class": "H5T_COMPOUND" + } + typeSize = hdf5dtype.getItemSize(typeItem) + dt = hdf5dtype.createDataType(typeItem) + self.assertEqual(dt.name, "void640") + self.assertEqual(dt.kind, "V") + self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertTrue(isVlen(dt)) + dt_arr = dt["VALUE3"] + self.assertEqual(dt_arr.kind, "V") + self.assertEqual(dt_arr.shape, (8,)) + self.assertEqual(dt_arr.metadata, None) def testCreateArrayIntegerType(self): typeItem = {"class": "H5T_INTEGER", "base": "H5T_STD_I64LE", "dims": (3, 5)} try: hdf5dtype.createDataType(typeItem) - self.assertTrue( - False - ) # expected exception - dims used with none array type + self.assertTrue(False) # expected exception - dims used with non-array type except TypeError: pass # should get exception @@ -581,6 +812,7 @@ def testCreateVlenObjRefType(self): self.assertEqual(dt.name, "object") self.assertEqual(dt.kind, "O") self.assertTrue(check_dtype(ref=dt) is None) + self.assertTrue(isVlen(dt)) dt_base = check_dtype(vlen=dt) self.assertTrue(dt_base is not None) self.assertTrue(check_dtype(ref=dt_base) is Reference) @@ -611,6 +843,45 @@ def testCreateCompoundArrayType(self): self.assertTrue("a" in dt.fields.keys()) self.assertTrue("b" in dt.fields.keys()) self.assertEqual(typeSize, 11) + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) + self.assertFalse(isVlen(dt)) + + def testCompoundArrayType(self): + typeItem = { + "class": "H5T_COMPOUND", + "fields": [ + { + "type": {"class": "H5T_INTEGER", "base": "H5T_STD_U64BE"}, + "name": "VALUE1", + }, + { + "type": {"class": "H5T_FLOAT", "base": "H5T_IEEE_F64BE"}, + "name": "VALUE2", + }, + { + "type": { + "class": "H5T_ARRAY", + "dims": [2], + "base": { + "class": "H5T_STRING", + "charSet": "H5T_CSET_ASCII", + "strPad": "H5T_STR_NULLTERM", + "length": "H5T_VARIABLE", + }, + }, + "name": "VALUE3", + }, + ], + } + dt = hdf5dtype.createDataType(typeItem) + typeSize = hdf5dtype.getItemSize(typeItem) + self.assertEqual(typeSize, "H5T_VARIABLE") + self.assertTrue(isVlen(dt)) + self.assertEqual(len(dt), 3) + self.assertTrue("VALUE1" in dt.fields.keys()) + self.assertTrue("VALUE2" in dt.fields.keys()) + self.assertTrue("VALUE3" in dt.fields.keys()) + self.assertEqual(typeSize, hdf5dtype.getDtypeItemSize(dt)) if __name__ == "__main__": diff --git a/test/unit/hsds_reader_test.py b/test/unit/hsds_reader_test.py new file mode 100644 index 0000000..72cf601 --- /dev/null +++ b/test/unit/hsds_reader_test.py @@ -0,0 +1,109 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import logging +import numpy as np +from h5json import Hdf5db +from h5json.hsdsstore.hsds_reader import HSDSReader +from h5json import selections + + +class HSDSReaderTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(HSDSReaderTest, self).__init__(*args, **kwargs) + # main + + self.log = logging.getLogger() + if len(self.log.handlers) > 0: + lhStdout = self.log.handlers[0] # stdout is the only handler initially + else: + lhStdout = None + + self.log.setLevel(logging.DEBUG) + handler = logging.FileHandler("./hsds_reader_test.log") + # add handler to logger + self.log.addHandler(handler) + + if lhStdout is not None: + self.log.removeHandler(lhStdout) + + def testSimple(self): + filepath = "/home/test_user1/test/tall.h5" + kwargs = {"app_logger": self.log} + with Hdf5db(**kwargs) as db: + hsds_reader = HSDSReader(filepath, **kwargs) + db.reader = hsds_reader + root_id = db.getObjectIdByPath("/") + root_json = db.getObjectById(root_id) + + root_attrs = root_json["attributes"] + self.assertEqual(len(root_attrs), 2) + self.assertEqual(list(root_attrs.keys()), ["attr1", "attr2"]) + root_links = root_json["links"] + self.assertEqual(len(root_links), 2) + self.assertEqual(list(root_links.keys()), ["g1", "g2"]) + g1_link = root_links["g1"] + self.assertEqual(g1_link["class"], "H5L_TYPE_HARD") + g1_id = g1_link["id"] + self.assertEqual(g1_id, db.getObjectIdByPath("/g1/")) + dset111_id = db.getObjectIdByPath("/g1/g1.1/dset1.1.1") + dset_json = db.getObjectById(dset111_id) + dset_type = dset_json["type"] + self.assertEqual(dset_type["class"], "H5T_INTEGER") + self.assertEqual(dset_type["base"], "H5T_STD_I32BE") + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 2) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2"]) + dset_shape = dset_json["shape"] + self.assertEqual(dset_shape["class"], "H5S_SIMPLE") + self.assertEqual(dset_shape["dims"], [10, 10]) + + # got the 5th row of the dataset + sel_row = selections.select((10, 10), (5, slice(0, 10))) + row = db.getDatasetValues(dset111_id, sel_row) + self.assertTrue(isinstance(row, np.ndarray)) + self.assertEqual(row.shape, (10,)) + for i in range(10): + v = row[i] + self.assertEqual(v, i * 5) + + sel_all = selections.select((10, 10), ...) + arr = db.getDatasetValues(dset111_id, sel_all) + self.assertTrue(isinstance(arr, np.ndarray)) + self.assertEqual(arr.shape, (10, 10)) + for i in range(10): + for j in range(10): + v = arr[i, j] + self.assertEqual(v, i * j) + + # try adding an attribute + db.createAttribute(dset111_id, "attr3", value=42) + dset_json = db.getObjectById(dset111_id) + dset_attrs = dset_json["attributes"] + self.assertEqual(len(dset_attrs), 3) + self.assertEqual(list(dset_attrs.keys()), ["attr1", "attr2", "attr3"]) + attr3_json = dset_attrs["attr3"] + attr3_shape = attr3_json["shape"] + self.assertEqual(attr3_shape["class"], "H5S_SCALAR") + attr3_type = attr3_json["type"] + self.assertEqual(attr3_type["class"], "H5T_INTEGER") + self.assertEqual(attr3_type["base"], "H5T_STD_I64LE") + attr3_value = attr3_json["value"] + self.assertEqual(attr3_value, 42) + + db.close() + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/hsds_writer_test.py b/test/unit/hsds_writer_test.py new file mode 100644 index 0000000..a3ba9be --- /dev/null +++ b/test/unit/hsds_writer_test.py @@ -0,0 +1,82 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of H5Serv (HDF5 REST Server) Service, Libraries and # +# Utilities. The full HDF5 REST Server copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest +import time +import logging +import h5py +import numpy as np +from h5json import Hdf5db +from h5json.hsdsstore.hsds_writer import HSDSWriter +from h5json.hdf5dtype import special_dtype, Reference +from h5json import selections + + +class HSDSWriterTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(HSDSWriterTest, self).__init__(*args, **kwargs) + # main + + # create logger + logfname = "hsds_writer_test.log" + loglevel = logging.DEBUG + logging.basicConfig(filename=logfname, format='%(levelname)s %(asctime)s %(message)s', level=loglevel) + self.log = logging.getLogger() + self.log.info("init!") + + def testSimple(self): + + filepath = "/home/test_user1/writer_test.h5" + db = Hdf5db(app_logger=self.log) + db.writer = HSDSWriter(filepath) + root_id = db.open() + print("root_id:", root_id) + db.createAttribute(root_id, "attr1", value=[1, 2, 3, 4]) + db.createAttribute(root_id, "attr2", 42) + g1_id = db.createGroup() + db.createHardLink(root_id, "g1", g1_id) + db.createAttribute(g1_id, "a1", "hello") + g2_id = db.createGroup() + db.createHardLink(root_id, "g2", g2_id) + + g1_1_id = db.createGroup() + db.createHardLink(g1_id, "g1.1", g1_1_id) + dset_111_id = db.createDataset(shape=(10, 10), dtype=np.int32) + arr = np.zeros((10, 10), dtype=np.int32) + for i in range(10): + for j in range(10): + arr[i, j] = i * j + sel_all = selections.select((10, 10), ...) + db.setDatasetValues(dset_111_id, sel_all, arr) + db.createHardLink(g1_1_id, "dset1.1.1", dset_111_id) + db.createSoftLink(g2_id, "slink", "somewhere") + db.createExternalLink(g2_id, "extlink", "somewhere", "someplace") + db.createCustomLink(g2_id, "cust", {"foo": "bar"}) + db.flush() + + db.createAttribute(g1_id, "a2", "bye-bye") + db.flush() + + g21 = db.createGroup() + db.createHardLink(g2_id, "g2.1", g21) + db.flush() + + sel = selections.select((10, 10), (slice(4, 5), slice(4, 5))) + arr = np.zeros((), dtype=np.int32) + arr[()] = 42 + db.setDatasetValues(dset_111_id, sel, arr) + db.close() + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/test/unit/objid_test.py b/test/unit/objid_test.py new file mode 100755 index 0000000..d74ec10 --- /dev/null +++ b/test/unit/objid_test.py @@ -0,0 +1,211 @@ +############################################################################## +# Copyright by The HDF Group. # +# All rights reserved. # +# # +# This file is part of HSDS (HDF5 Scalable Data Service), Libraries and # +# Utilities. The full HSDS copyright notice, including # +# terms governing use, modification, and redistribution, is contained in # +# the file COPYING, which can be found at the root of the source code # +# distribution tree. If you do not have access to this file, you may # +# request a copy from help@hdfgroup.org. # +############################################################################## +import unittest + +from h5json.objid import isRootObjId, isValidUuid, validateUuid +from h5json.objid import createObjId, getCollectionForId, getUuidFromId +from h5json.objid import isObjId, isS3ObjKey, getS3Key, getObjId, isSchema2Id + + +class IdUtilTest(unittest.TestCase): + def __init__(self, *args, **kwargs): + super(IdUtilTest, self).__init__(*args, **kwargs) + # main + + def testCreateObjId(self): + id_len = 38 # 36 for uuid plus two for prefix ("g-", "d-") + ids = set() # we'll use this to verify we always get a unique id + # create just a plain uuid... + id = createObjId() + self.assertEqual(len(id) + 2, id_len) + # create a v2 root_id + root_id = createObjId(obj_type="groups") + self.assertEqual(len(root_id), id_len) + for obj_type in ("groups", "datasets", "datatypes", "chunks"): + for i in range(100): + id = createObjId(obj_type=obj_type, root_id=root_id) + self.assertEqual(len(id), id_len) + self.assertTrue(id[0] in ("g", "d", "t", "c")) + self.assertEqual(id[1], "-") + ids.add(id) + + self.assertEqual(len(ids), 400) + try: + createObjId(obj_type="bad_class") + self.assertTrue(False) # should throw exception + except ValueError: + pass # expected + + def testIsValidUuid(self): + group1_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" # orig schema + group2_id = "g-314d61b8-995411e6-a733-3c15c2-da029e" + root_id = "g-f9aaa28e-d42e10e5-7122-2a065c-a6986d" + dataset1_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" # orig schema + dataset2_id = "d-4c48f3ae-995411e6-a3cd-3c15c2-da029e" + ctype1_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" # orig schema + ctype2_id = "t-8c785f1c-995311e6-9bc2-0242ac-110005" + chunk1_id = "c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2" # orig schema + chunk2_id = "c-8c785f1c-995311e6-9bc2-0242ac-110005_7_2" + domain_id = "mybucket/bob/mydata.h5" + s3_domain_id = "s3://mybucket/bob/mydata.h5" + file_domain_id = "file://mybucket/bob/mydata.h5" + azure_domain_id = "https://myaccount.blob.core.windows.net/mybucket/bob/mydata.h5" + valid_id_map = { + group1_id: "a49be-g-314d61b8-9954-11e6-a733-3c15c2da029e", + group2_id: "db/314d61b8-995411e6/g/a733-3c15c2-da029e/.group.json", + dataset1_id: "26928-d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e", + dataset2_id: "db/4c48f3ae-995411e6/d/a3cd-3c15c2-da029e/.dataset.json", + ctype1_id: "5a9cf-t-8c785f1c-9953-11e6-9bc2-0242ac110005", + ctype2_id: "db/8c785f1c-995311e6/t/9bc2-0242ac-110005/.datatype.json", + chunk1_id: "dc4ce-c-8c785f1c-9953-11e6-9bc2-0242ac110005_7_2", + chunk2_id: "db/8c785f1c-995311e6/d/9bc2-0242ac-110005/7_2", + domain_id: "bob/mydata.h5/.domain.json", + s3_domain_id: "bob/mydata.h5/.domain.json", + file_domain_id: "bob/mydata.h5/.domain.json", + azure_domain_id: "bob/mydata.h5/.domain.json", } + + bad_ids = ("g-1e76d862", "/bob/mydata.h5") + + self.assertTrue(isValidUuid(group1_id)) + self.assertFalse(isSchema2Id(group1_id)) + self.assertTrue(isValidUuid(group1_id, obj_class="Group")) + self.assertTrue(isValidUuid(group1_id, obj_class="group")) + self.assertTrue(isValidUuid(group1_id, obj_class="groups")) + self.assertTrue(isSchema2Id(root_id)) + self.assertTrue(isValidUuid(root_id, obj_class="Group")) + self.assertTrue(isValidUuid(root_id, obj_class="group")) + self.assertTrue(isValidUuid(root_id, obj_class="groups")) + self.assertTrue(isRootObjId(root_id)) + self.assertTrue(isValidUuid(dataset1_id, obj_class="datasets")) + self.assertFalse(isSchema2Id(dataset1_id)) + self.assertTrue(isValidUuid(ctype1_id, obj_class="datatypes")) + self.assertFalse(isSchema2Id(ctype1_id)) + self.assertTrue(isValidUuid(chunk1_id, obj_class="chunks")) + self.assertFalse(isSchema2Id(chunk1_id)) + self.assertTrue(isValidUuid(group2_id)) + self.assertTrue(isSchema2Id(group2_id)) + self.assertTrue(isValidUuid(group2_id, obj_class="Group")) + self.assertTrue(isValidUuid(group2_id, obj_class="group")) + self.assertTrue(isValidUuid(group2_id, obj_class="groups")) + self.assertFalse(isRootObjId(group2_id)) + self.assertTrue(isValidUuid(dataset2_id, obj_class="datasets")) + self.assertTrue(isSchema2Id(dataset2_id)) + self.assertTrue(isValidUuid(ctype2_id, obj_class="datatypes")) + self.assertTrue(isSchema2Id(ctype2_id)) + self.assertTrue(isValidUuid(chunk2_id, obj_class="chunks")) + self.assertTrue(isSchema2Id(chunk2_id)) + validateUuid(group1_id) + try: + isRootObjId(group1_id) + self.assertTrue(False) + except ValueError: + # only works for v2 schema + pass # expected + + for item in valid_id_map: + self.assertTrue(isObjId(item)) + s3key = getS3Key(item) + self.assertTrue(s3key[0] != "/") + self.assertTrue(isS3ObjKey(s3key)) + expected = valid_id_map[item] + self.assertEqual(s3key, expected) + if item.find("/") > 0: + continue # bucket name gets lost when domain ids get converted to s3keys + objid = getObjId(s3key) + self.assertEqual(objid, item) + for item in bad_ids: + self.assertFalse(isValidUuid(item)) + self.assertFalse(isObjId(item)) + + def testGetCollection(self): + group_id = "g-314d61b8-9954-11e6-a733-3c15c2da029e" + dataset_id = "d-4c48f3ae-9954-11e6-a3cd-3c15c2da029e" + ctype_id = "t-8c785f1c-9953-11e6-9bc2-0242ac110005" + bad_id = "x-59647858-9954-11e6-95d2-3c15c2da029e" + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + self.assertEqual(getUuidFromId(group_id), "314d61b8-9954-11e6-a733-3c15c2da029e") + try: + getCollectionForId(bad_id) + self.assertTrue(False) + except ValueError: + pass # expected + try: + getCollectionForId(None) + self.assertTrue(False) + except ValueError: + pass # expected + + def testSchema2Id(self): + root_id = createObjId("groups") + group_id = createObjId("groups", root_id=root_id) + dataset_id = createObjId("datasets", root_id=root_id) + ctype_id = createObjId("datatypes", root_id=root_id) + + self.assertEqual(getCollectionForId(root_id), "groups") + self.assertEqual(getCollectionForId(group_id), "groups") + self.assertEqual(getCollectionForId(dataset_id), "datasets") + self.assertEqual(getCollectionForId(ctype_id), "datatypes") + chunk_id = "c" + dataset_id[1:] + "_1_2" + chunk_partition_id = "c42-" + dataset_id[2:] + "_1_2" + + for id in (chunk_id, chunk_partition_id): + try: + getCollectionForId(id) + self.assertTrue(False) + except ValueError: + pass # expected + valid_ids = ( + group_id, + dataset_id, + ctype_id, + chunk_id, + chunk_partition_id, + root_id, + ) + s3prefix = getS3Key(root_id) + self.assertTrue(s3prefix.endswith("/.group.json")) + s3prefix = s3prefix[: -(len(".group.json"))] + for oid in valid_ids: + self.assertTrue(len(oid) >= 38) + parts = oid.split("-") + self.assertEqual(len(parts), 6) + self.assertTrue(oid[0] in ("g", "d", "t", "c")) + self.assertTrue(isSchema2Id(oid)) + if oid == root_id: + self.assertTrue(isRootObjId(oid)) + else: + self.assertFalse(isRootObjId(oid)) + + s3key = getS3Key(oid) + self.assertTrue(s3key.startswith(s3prefix)) + self.assertEqual(getObjId(s3key), oid) + self.assertTrue(isS3ObjKey(s3key)) + + def testGetDataTypeId(self): + test_uuid = "9b652223-83f8-11e5-b028-3c15c2da029e" + test_ids = ( + "datatypes/9b652223-83f8-11e5-b028-3c15c2da029e", + "datatypes/t-9b652223-83f8-11e5-b028-3c15c2da029e", + "t-9b652223-83f8-11e5-b028-3c15c2da029e" + ) + for test_id in test_ids: + self.assertTrue(isValidUuid(test_id)) + self.assertEqual(getCollectionForId(test_id), "datatypes") + self.assertEqual(getUuidFromId(test_id), test_uuid) + + +if __name__ == "__main__": + # setup test files + + unittest.main() diff --git a/testall.py b/testall.py index 8e5d041..45e0610 100755 --- a/testall.py +++ b/testall.py @@ -15,7 +15,28 @@ import shutil import h5py -unit_tests = ("hdf5dtype_test", "hdf5db_test") +unit_tests = [ + "array_util_test", + "objid_test", + "hdf5dtype_test", + "hdf5db_test", + "h5json_reader_test", + "h5json_writer_test", + "h5py_reader_test", + "h5py_writer_test", +] + +use_hsds = True +for key in ("HS_ENDPOINT", "HS_USERNAME", "HS_PASSWORD"): + if key not in os.environ: + use_hsds = False + print(f"not including HSDS tests, no {key} environment set") + break + +if use_hsds: + unit_tests.append("hsds_reader_test") +unit_tests = tuple(unit_tests) + integ_tests = ("h5tojson_test", "jsontoh5_test") # verify the hdf5 lib version is recent @@ -28,6 +49,9 @@ print(h5py.version.info) sys.exit("Need h5py version 3.0 or later") +if not os.path.isdir("./test/unit/out"): + os.makedirs("test/unit/out") + # Run all hdf5-json tests # Run this script before running any integ tests for file_name in unit_tests: @@ -39,6 +63,13 @@ os.remove("hdf5dbtest.log") os.chdir("test/integ") + +if not os.path.isdir("./h5_out"): + os.makedirs("h5_out") + +if not os.path.isdir("./json_out"): + os.makedirs("json_out") + for file_name in integ_tests: print(file_name) rc = os.system("python " + file_name + ".py")