Source code for at_py.readwrite.mat_bundle

"""MATLAB ``.mat`` normalization (documented subset) for Acoustics Toolbox readers.

This module loads **classic** MAT files (v4 / v6 / v7–v7.2) via ``scipy.io.loadmat`` and
**v7.3** (HDF5-based) via ``h5py``, then returns a single Python representation.

**Layout / memory order**

MATLAB stores matrices in **column-major** order. SciPy and NumPy return arrays in the
usual NumPy memory layout (typically C-contiguous) that reflects how values were stored in
the file; **at-py does not implicitly transpose** loaded arrays. Downstream AT mappers
(``read_*_from_mat``) apply the same explicit reshapes/transposes documented in the
corresponding Matlab ``read_*.m`` files (e.g. ``RTS = RTS.'`` in ``read_ts.m``).

**Supported value types** (anything else raises ``TypeError`` or ``ValueError`` naming the
key and offending type):

- Real or complex ``numpy.ndarray`` of any shape (numeric dtypes).
- **Scalars** (0-d arrays, Python scalars, NumPy scalar types) promoted to 0-d arrays.
- **Structs** (classic ``mat_struct`` or v7.3 struct groups) normalized to ``dict[str, Any]``
  recursively (max depth **16**).
- **Cell arrays** (object arrays or HDF5 cell layout we recognize) normalized to ``list``.

**Explicitly unsupported** (loud failure; message suggests remediation where practical):

- **Classic** MAT sparse matrices from SciPy ``loadmat`` (install extras and use dense export
  in MATLAB, or convert offline).
- **Exception — supported:** **v7.3** MATLAB sparse **groups** (CSC layout with
  ``MATLAB_sparse`` / ``data`` / ``ir`` / ``jc``) are *not* rejected; they are converted to
  **dense** ``numpy.ndarray`` via SciPy when ``[mat]`` is installed (real or complex ``data``).
- MATLAB ``table`` / ``timetable`` / ``datetime`` / ``duration`` / ``categorical`` / ``string``
  (string array) / ``function_handle`` / generic ``object``, or SciPy ``MatlabOpaque`` /
  ``MatlabObject`` payloads from classic MAT.
- v7.3 **opaque or unsupported** reference targets (e.g. user classes) and layouts we do
  not normalize (same spirit as other unsupported types).

**Top-level variables**

For **v7.3 (HDF5)**, internal groups whose names start with ``#`` (for example
``#refs#``, ``#subsystem#``) are not exposed as user variables—they are skipped when
building ``MatBundle.variables``.

User-facing keys are a ``dict[str, Any]``. For classic MAT, SciPy metadata keys
``__header__``, ``__globals__``, ``__version__`` are **not** included in
``MatBundle.variables``; they appear in ``MatBundle.scipy_meta`` when present.

**Variable names**

Keys preserve MATLAB variable names as stored in the file (including unusual characters).
Callers should use the **same field names** as Acoustics Toolbox ``read_shd.m`` /
``read_modes.m`` / ``read_ts.m`` expect on the ``.mat`` branches.
"""

from __future__ import annotations

import io
import re
from dataclasses import dataclass
from os import PathLike
from typing import Any, Literal

import numpy as np

# Install hint appended to every ImportError built by ``_import_error_mat``.
_MAT_EXTRA_HINT = "Install optional dependencies with: pip install 'oalib-at-py[mat]'"

# Leading bytes of an HDF5 file; ``_is_hdf5_mat_prefix`` uses this to route a payload to the
# v7.3 (h5py) loader instead of the classic (SciPy) loader.
HDF5_MAT_SIGNATURE = b"\x89HDF\r\n\x1a\n"

# Upper bound on struct/cell nesting enforced by ``_check_depth``.
_MAX_NEST_DEPTH = 16

# Keys SciPy ``loadmat`` adds to its result dict; they are split out of the user variables
# and reported separately as ``MatBundle.scipy_meta``.
_SCIPY_META_KEYS = frozenset({"__header__", "__globals__", "__version__"})

# MATLAB high-level types not described in the Acoustics Toolbox ``read_*.m`` ``.mat`` paths
# in https://github.com/jgebbie/at — reject rather than mis-parse.
_UNSUPPORTED_H5_MATLAB_CLASS = frozenset(
    {
        "table",
        "timetable",
        "datetime",
        "duration",
        "categorical",
        "string",
        "function_handle",
        "object",
    }
)


def _import_error_mat(extra: str) -> ImportError:
    """Return (not raise) an :exc:`ImportError` for a missing optional ``[mat]`` dependency.

    The message is ``extra`` followed by a single space and the shared install hint, so the
    caller decides how to chain the exception (``raise ... from e``).
    """
    message = " ".join((extra, _MAT_EXTRA_HINT))
    return ImportError(message)


def _require_scipy_loadmat() -> Any:
    """Return the ``scipy.io`` module, importing it lazily.

    Raises:
        ImportError: if SciPy is not installed; the message carries the ``[mat]`` install hint
            and the original import failure is chained as the cause.
    """
    try:
        import scipy.io  # noqa: PLC0415
    except ImportError as exc:
        reason = "scipy is required for classic MAT (v4/v6/v7–v7.2)."
        raise _import_error_mat(reason) from exc
    else:
        return scipy.io


def _require_h5py() -> Any:
    """Return the ``h5py`` module, importing it lazily.

    Raises:
        ImportError: if h5py is not installed; the message carries the ``[mat]`` install hint
            and the original import failure is chained as the cause.
    """
    try:
        import h5py  # noqa: PLC0415
    except ImportError as exc:
        reason = "h5py is required for MAT v7.3 (HDF5)."
        raise _import_error_mat(reason) from exc
    else:
        return h5py


def _is_hdf5_mat_prefix(data: bytes) -> bool:
    """Report whether ``data`` opens with the HDF5 file signature (i.e. is MAT v7.3)."""
    signature = HDF5_MAT_SIGNATURE
    # Compare only the leading bytes; a payload shorter than the signature can never match.
    return data[: len(signature)] == signature


@dataclass(frozen=True)
class MatBundle:
    """Normalized top-level MATLAB variables.

    Instances are produced by the loaders in this module. The dataclass is frozen, so
    attributes cannot be rebound after construction (the ``variables`` dict itself is a
    regular mutable dict).
    """

    # Normalized user variables keyed by the variable name stored in the file.
    variables: dict[str, Any]
    # Which container format the payload was detected as.
    source_format: Literal["mat_classic", "mat_v7_3"]
    # Name of the backend call that parsed the payload (the loaders here pass
    # "scipy.io.loadmat" or "h5py.File").
    raw_backend: str
    # SciPy metadata entries (``__header__`` etc.) for classic MAT when present; ``None``
    # otherwise, and always ``None`` for v7.3.
    scipy_meta: dict[str, Any] | None = None
def load_mat_normalized(data: bytes) -> MatBundle:
    """Load ``.mat`` bytes and return a normalized :class:`MatBundle`.

    The format is detected from the payload itself: data that starts with the HDF5 signature
    is handled as MAT v7.3, everything else is handed to the classic (SciPy) loader.

    Args:
        data: Complete contents of a ``.mat`` file.

    Returns:
        The normalized bundle of top-level variables.

    Raises:
        ValueError: if ``data`` is empty.
        ImportError: with install instructions if optional dependencies for the detected
            format are missing.
    """
    if not data:
        raise ValueError("empty MAT payload")
    if _is_hdf5_mat_prefix(data):
        return _load_mat_v73_normalized(data)
    return _load_mat_classic_normalized(data)
def load_mat_normalized_path(path: str | bytes | PathLike[str]) -> MatBundle:
    """Load a ``.mat`` file from disk (convenience; reads entire file into memory).

    Args:
        path: Location of the file as ``str``, ``bytes`` or an ``os.PathLike`` object.

    Returns:
        The result of :func:`load_mat_normalized` on the file contents.

    Raises:
        OSError: if the file cannot be read (e.g. ``FileNotFoundError``).
        ValueError: if the file is empty.
        ImportError: if the optional dependency for the detected format is missing.
    """
    import os  # noqa: PLC0415
    from pathlib import Path  # noqa: PLC0415

    # ``os.fsdecode`` accepts str, bytes and path-like objects alike and decodes bytes with
    # the filesystem encoding and its error handler, instead of assuming UTF-8.
    p = Path(os.fsdecode(path))
    # Read first so I/O errors surface before any parsing is attempted.
    payload = p.read_bytes()
    return load_mat_normalized(payload)
def _load_mat_classic_normalized(data: bytes) -> MatBundle:
    """Normalize classic MAT (v4–v7.2) via SciPy ``loadmat``.

    Args:
        data: Complete contents of a classic ``.mat`` file.

    Returns:
        A bundle with ``source_format="mat_classic"``; SciPy's ``__header__`` /
        ``__globals__`` / ``__version__`` entries go to ``scipy_meta`` (``None`` if absent).

    Raises:
        ImportError: if SciPy (or ``scipy.io.matlab``) cannot be imported.
        TypeError, ValueError: if a variable is outside the documented subset.
    """
    sio = _require_scipy_loadmat()
    try:
        from scipy.io.matlab import mat_struct  # noqa: PLC0415
    except ImportError as e:
        raise _import_error_mat("scipy.io.matlab is required for classic MAT.") from e

    # Structs arrive as ``mat_struct`` objects and chars as per-character arrays; both are
    # interpreted by the normalizers below rather than by SciPy.
    raw: dict[str, Any] = sio.loadmat(
        io.BytesIO(data),
        struct_as_record=False,
        squeeze_me=False,
        chars_as_strings=False,
    )
    scipy_meta = {k: raw[k] for k in _SCIPY_META_KEYS if k in raw}
    variables_out: dict[str, Any] = {
        key: _normalize_value_scipy(key, val, depth=0, mat_struct_type=mat_struct)
        for key, val in raw.items()
        if key not in _SCIPY_META_KEYS
    }
    return MatBundle(
        variables=variables_out,
        source_format="mat_classic",
        raw_backend="scipy.io.loadmat",
        scipy_meta=scipy_meta or None,
    )


def _load_mat_v73_normalized(data: bytes) -> MatBundle:
    """Normalize MAT v7.3 (HDF5) top-level variables.

    Args:
        data: Complete contents of a v7.3 ``.mat`` file.

    Returns:
        A bundle with ``source_format="mat_v7_3"`` and ``scipy_meta=None``.

    Raises:
        ImportError: if h5py is not installed.
        TypeError, ValueError: if a variable is outside the documented subset.
    """
    h5py = _require_h5py()
    variables_out: dict[str, Any] = {}
    with h5py.File(io.BytesIO(data), "r") as f:
        for name in f:
            # Skip MATLAB/HDF5 internal groups (#refs#, #subsystem#, …), not user variables.
            if str(name).startswith("#"):
                continue
            variables_out[name] = _h5_normalize_item(f, name, f[name], depth=0)
    return MatBundle(
        variables=variables_out,
        source_format="mat_v7_3",
        raw_backend="h5py.File",
        scipy_meta=None,
    )


def _check_depth(key: str, depth: int) -> None:
    """Raise ``ValueError`` if struct/cell nesting exceeds :data:`_MAX_NEST_DEPTH`."""
    if depth > _MAX_NEST_DEPTH:
        raise ValueError(f"MAT structure nested too deeply (>{_MAX_NEST_DEPTH}) at {key!r}")


def _normalize_value_scipy(key: str, val: Any, depth: int, mat_struct_type: type) -> Any:
    """Recursively normalize one SciPy ``loadmat`` value to Python/NumPy types.

    Args:
        key: Dotted/indexed path of the value, used only in error messages.
        val: Value as returned by ``loadmat`` (or a nested field/element of one).
        depth: Current nesting depth, checked against the module limit.
        mat_struct_type: SciPy's ``mat_struct`` class, passed in by the loader.

    Returns:
        ``dict`` for structs, ``list`` for multi-element object/struct arrays, ``str`` for
        char data and byte strings, otherwise a numeric ``numpy.ndarray`` (0-d for scalars).

    Raises:
        ValueError: if nesting is too deep or the value is a SciPy sparse matrix.
        TypeError: for MATLAB class/opaque objects and any other unsupported type.
    """
    _check_depth(key, depth)

    # Reject class/opaque payloads first: they must not fall into the generic ndarray branch.
    # The import is best-effort; if these names are unavailable the check is simply skipped.
    _scipy_matlab_runtime_types: tuple[type, ...] = ()
    try:
        from scipy.io.matlab import MatlabObject, MatlabOpaque  # noqa: PLC0415

        _scipy_matlab_runtime_types = (MatlabOpaque, MatlabObject)
    except ImportError:
        pass
    if _scipy_matlab_runtime_types and isinstance(val, _scipy_matlab_runtime_types):
        raise TypeError(
            f"{key!r}: SciPy {type(val).__name__!r} (MATLAB class object / opaque) is not "
            "supported in the documented MAT subset (e.g. table, timetable, datetime); "
            "export numeric arrays or structs in MATLAB."
        )

    try:
        import scipy.sparse as sp  # noqa: PLC0415
    except ImportError:
        sp = None  # type: ignore[assignment]
    if sp is not None and sp.issparse(val):
        raise ValueError(
            f"{key!r}: sparse matrices are not supported in the documented MAT subset "
            f"(got {type(val).__name__}); convert to dense in MATLAB or offline."
        )

    if isinstance(val, mat_struct_type):
        return _mat_struct_to_dict(key, val, depth, mat_struct_type)
    if isinstance(val, np.ndarray):
        if val.dtype.kind == "O":
            return _normalize_object_array_scipy(key, val, depth, mat_struct_type)
        if val.dtype.names:
            return _structured_ndarray_to_dict_scipy(key, val, depth, mat_struct_type)
        if val.dtype.kind in "SU":
            return _char_array_to_str(key, val)
        return _as_numeric_array(key, val)
    if isinstance(val, (bytes, bytearray, memoryview)):
        return bytes(val).decode("utf-8", errors="replace")
    if isinstance(val, str):
        return val
    # Python and NumPy scalars (including complex) are promoted to 0-d arrays.
    if isinstance(
        val,
        (float, int, bool, complex, np.bool_, np.integer, np.floating, np.complexfloating),
    ):
        return np.asarray(val)
    raise TypeError(
        f"{key!r}: unsupported MAT value type {type(val).__name__!r} in documented subset"
    )


def _mat_struct_to_dict(key: str, obj: Any, depth: int, mat_struct_type: type) -> dict[str, Any]:
    """Convert ``mat_struct`` to a plain ``dict`` of normalized fields.

    Each field is normalized one nesting level deeper than ``depth`` and reported in error
    messages as ``"<key>.<field>"``.
    """
    out: dict[str, Any] = {}
    for name in obj._fieldnames:  # noqa: SLF001
        child = getattr(obj, name)
        child_key = f"{key}.{name}"
        out[str(name)] = _normalize_value_scipy(child_key, child, depth + 1, mat_struct_type)
    return out


def _structured_ndarray_to_dict_scipy(
    key: str, arr: np.ndarray, depth: int, mat_struct_type: type
) -> Any:
    """Single-row structured array → dict; multiple rows → list of dicts.

    The array is flattened first, so "row" means one record regardless of the input shape.
    Fields of a single record are normalized at ``depth + 1``; fields of records inside a
    list at ``depth + 2`` (list level plus struct level).
    """
    flat = arr.reshape(-1)
    if flat.size == 1:
        row = flat[0]
        return {
            str(n): _normalize_value_scipy(f"{key}.{n}", row[n], depth + 1, mat_struct_type)
            for n in row.dtype.names  # type: ignore[union-attr]
        }
    return [
        {
            str(n): _normalize_value_scipy(f"{key}[{i}].{n}", row[n], depth + 2, mat_struct_type)
            for n in row.dtype.names  # type: ignore[union-attr]
        }
        for i, row in enumerate(flat)
    ]


def _normalize_object_array_scipy(
    key: str, arr: np.ndarray, depth: int, mat_struct_type: type
) -> Any:
    """Normalize dtype ``object`` ndarray (cells or nested arrays).

    A single-element array is unwrapped and its element normalized directly; any other size
    (including empty) becomes a flat ``list`` of normalized elements at ``depth + 1``.
    """
    flat = arr.reshape(-1)
    if flat.size == 1:
        only = flat[0]
        if isinstance(only, mat_struct_type):
            return _mat_struct_to_dict(key, only, depth, mat_struct_type)
        if isinstance(only, np.ndarray) and only.dtype == object:
            # Count each unwrapped 1x1 wrapper as a nesting level so chains of single-element
            # cells are bounded by the same limit as every other nested value.
            _check_depth(key, depth + 1)
            return _normalize_object_array_scipy(key, only, depth + 1, mat_struct_type)
        return _normalize_value_scipy(key, only, depth, mat_struct_type)
    return [
        _normalize_value_scipy(f"{key}[{i}]", item, depth + 1, mat_struct_type)
        for i, item in enumerate(flat)
    ]


def _char_array_to_str(key: str, arr: np.ndarray) -> str:
    """Decode classic MAT char array to Python ``str``.

    Handles ``U`` arrays (joined, trailing spaces/NULs stripped), ``S`` arrays (joined, cut at
    the first NUL, decoded as latin-1), ``uint16`` code units, and other integer arrays whose
    values are all 7-bit ASCII.

    Raises:
        TypeError: for any other dtype, or integer data with values above 127.
    """
    a = np.asarray(arr)
    if a.dtype.kind == "U":
        # ``chars_as_strings=False`` yields per-character ``U1`` arrays (e.g. shape (1, N)).
        return "".join(str(x) for x in a.reshape(-1)).rstrip(" \x00") if a.size else ""
    if a.dtype.kind == "S":
        flat = np.ravel(a)
        raw = b"".join(np.bytes_(x).tobytes() for x in flat).split(b"\x00", 1)[0]
        return raw.decode("latin-1", errors="replace")
    if a.dtype == np.uint16:
        return "".join(chr(int(x)) for x in a.astype(np.uint32).reshape(-1)).rstrip(" \x00")
    if a.dtype == np.uint8 or a.dtype.kind in "iu":
        flat = a.reshape(-1)
        if flat.size == 0:
            # An empty char array is an empty string, not an unsupported dtype.
            return ""
        if int(flat.max()) <= 127:
            return bytes(int(x) for x in flat).decode("ascii", errors="replace").strip("\x00")
    raise TypeError(f"{key!r}: unsupported character array dtype {a.dtype!r}")


def _as_numeric_array(key: str, arr: np.ndarray) -> np.ndarray:
    """Return ``arr`` as ndarray, rejecting non-numeric dtypes.

    Raises:
        TypeError: if the dtype kind is not bool, signed/unsigned int, float or complex.
    """
    out = np.asarray(arr)
    if out.dtype.kind not in "biufc":
        raise TypeError(f"{key!r}: ndarray has unsupported dtype {out.dtype!r}")
    return out


def _h5_matlab_char_to_str(key: str, arr: np.ndarray) -> str | list[str]:
    """Decode MATLAB v7.3 ``char`` stored as ``uint16`` (UTF-16 code units per character).

    Returns:
        A single ``str`` for 0-d, 1-D and 2-D arrays with one axis of length 1; for other 2-D
        arrays a list with one string per index of the second axis, each built by walking the
        first axis.

    Raises:
        TypeError: for float/complex storage or rank above 2.
    """
    a = np.asarray(arr)
    if a.dtype.kind in "fc":
        raise TypeError(
            f"{key!r}: MATLAB char array has non-integer dtype {a.dtype!r}; "
            "expected uint16-style storage"
        )
    a = a.astype(np.uint32)
    if a.ndim == 0:
        return chr(int(a))
    if a.ndim == 1:
        # NOTE(review): only NULs are stripped here while the 2-D branches also strip
        # trailing spaces; kept as-is, confirm whether the difference is intended.
        return "".join(chr(int(x)) for x in a.flat).rstrip("\x00")
    if a.ndim == 2:
        if a.shape[0] == 1:
            return "".join(chr(int(x)) for x in a[0]).rstrip("\x00 ")
        if a.shape[1] == 1:
            return "".join(chr(int(x)) for x in a[:, 0]).rstrip("\x00 ")
        return [
            "".join(chr(int(a[i, j])) for i in range(a.shape[0])).rstrip("\x00 ")
            for j in range(a.shape[1])
        ]
    raise TypeError(f"{key!r}: MATLAB char array rank {a.ndim} is not supported")


def _h5_sparse_group_to_dense(key: str, grp: Any) -> np.ndarray:
    """MATLAB v7.3 CSC sparse group → dense array (SciPy).

    The row count is read from the ``MATLAB_sparse`` attribute and the column count is
    ``len(jc) - 1``.

    Returns:
        ``complex128`` array if the densified data is complex, otherwise ``float64``.

    Raises:
        ImportError: if SciPy is not installed.
        TypeError: if ``grp`` is not an HDF5 group.
        ValueError: if ``data``/``ir``/``jc`` are missing or ``jc`` is empty.
    """
    try:
        from scipy.sparse import csc_matrix  # noqa: PLC0415
    except ImportError as e:
        raise ImportError(
            f"{key!r}: MAT v7.3 sparse groups require scipy (install oalib-at-py[mat])."
        ) from e
    import h5py  # noqa: PLC0415

    if not isinstance(grp, h5py.Group):
        raise TypeError(f"{key!r}: expected HDF5 group for sparse matrix")
    if "data" not in grp or "ir" not in grp or "jc" not in grp:
        raise ValueError(
            f"{key!r}: MAT v7.3 sparse group missing data/ir/jc datasets (got {list(grp.keys())})"
        )
    # h5py stubs disagree with runtime Group indexing; keep dynamic lookups on ``Any``.
    g: Any = grp
    nrows = int(np.asarray(g.attrs["MATLAB_sparse"]).reshape(-1)[0])
    data = np.asarray(g["data"][:])
    ir = np.asarray(g["ir"][:], dtype=np.int64).reshape(-1)
    jc = np.asarray(g["jc"][:], dtype=np.int64).reshape(-1)
    ncols = int(jc.size) - 1
    if ncols < 0:
        raise ValueError(f"{key!r}: invalid sparse jc length {jc.size}")
    mat = csc_matrix((data, ir, jc), shape=(nrows, ncols))
    dense = mat.toarray()
    if np.iscomplexobj(dense):
        return np.asarray(dense, dtype=np.complex128)
    return np.asarray(dense, dtype=np.float64)


def _h5_matlab_class(obj: Any) -> str | None:
    """``MATLAB_class`` attribute as ``str``, or ``None`` if the attribute is absent."""
    cls = obj.attrs.get("MATLAB_class")
    if cls is None:
        return None
    if isinstance(cls, bytes):
        return cls.decode("ascii", errors="replace")
    if isinstance(cls, (np.bytes_, np.ndarray)):
        return np.asarray(cls).tobytes().decode("ascii", errors="replace").strip("\x00")
    return str(cls)


def _h5_cell_from_ref_array(f: Any, key: str, arr: np.ndarray, depth: int) -> Any:
    """MATLAB ``cell`` stored as a dataset of HDF5 object references (see mat73-style layout).

    Returns:
        The single normalized target for a 0-d array; a flat ``list`` for 1-D arrays and for
        2-D arrays with one axis of length 1; otherwise a list of lists indexed
        ``[second_axis][first_axis]`` (i.e. the stored array transposed).

    Raises:
        TypeError: if ``arr`` is not a reference array or has rank above 2.
    """
    import h5py  # noqa: PLC0415

    if h5py.check_dtype(ref=arr.dtype) is None:
        raise TypeError(f"{key!r}: expected HDF5 reference array for MATLAB cell")

    def _deref(ref: Any, child_key: str) -> Any:
        """Follow one object reference and normalize its target one level deeper."""
        return _h5_normalize_item(f, child_key, f[ref], depth + 1)

    if arr.ndim == 0:
        return _deref(arr.item(), key)
    if arr.ndim == 1:
        return [_deref(arr[i], f"{key}[{i}]") for i in range(int(arr.shape[0]))]
    if arr.ndim != 2:
        # Fail with a clear message instead of an opaque h5py lookup error.
        raise TypeError(f"{key!r}: MATLAB cell rank {arr.ndim} is not supported")

    n0, n1 = int(arr.shape[0]), int(arr.shape[1])
    # Row vector (1, N) or column (N, 1): flatten to a Python list (common 1-D cell).
    if n0 == 1:
        return [_deref(arr[0, j], f"{key}[{j}]") for j in range(n1)]
    if n1 == 1:
        return [_deref(arr[i, 0], f"{key}[{i}]") for i in range(n0)]
    rows = [[_deref(arr[i, j], f"{key}[{i},{j}]") for j in range(n1)] for i in range(n0)]
    # Swap the two axes of the nested list.
    return [list(col) for col in zip(*rows, strict=True)]


def _h5_deref_ref_array_generic(f: Any, key: str, arr: np.ndarray, depth: int) -> Any:
    """Dereference a non-cell dataset of HDF5 object references (e.g. struct fields).

    Returns:
        The single normalized target when the array holds exactly one reference; a flat
        ``list`` for 1-D arrays and for arrays whose first or second axis has length 1;
        otherwise the nested-list form of the dereferenced array.

    Raises:
        TypeError: if ``arr`` is not a reference array.
    """
    import h5py  # noqa: PLC0415

    if h5py.check_dtype(ref=arr.dtype) is None:
        raise TypeError(f"{key!r}: expected HDF5 reference array")
    if arr.ndim == 0:
        return _h5_normalize_item(f, key, f[arr.item()], depth + 1)
    if arr.ndim == 1:
        return [
            _h5_normalize_item(f, f"{key}[{i}]", f[arr[i]], depth + 1)
            for i in range(int(arr.shape[0]))
        ]
    # From here on the rank is at least 2.
    out = np.empty(arr.shape, dtype=object)
    for idx in np.ndindex(arr.shape):
        out[idx] = _h5_normalize_item(f, f"{key}{list(idx)}", f[arr[idx]], depth + 1)
    if out.size == 1:
        return out.flat[0]
    n0, n1 = int(out.shape[0]), int(out.shape[1])
    if n0 == 1:
        return [out[0, j] for j in range(n1)]
    if n1 == 1:
        return [out[i, 0] for i in range(n0)]
    return out.tolist()


def _h5_cell_group_to_list(f: Any, key: str, grp: Any, depth: int) -> list[Any]:
    """MATLAB ``cell`` stored as a group with numbered children ``1``, ``2``, ….

    Children whose names start with ``#`` are ignored. Numeric names are ordered by integer
    value and come before any non-numeric names, which are ordered lexically.
    """
    names = [n for n in grp.keys() if not str(n).startswith("#")]

    def _sort_key(n: str) -> tuple[int, int | str]:
        """Order cell indices ``1``, ``2``, … before non-numeric group names."""
        try:
            return (0, int(n))
        except ValueError:
            return (1, n)

    ordered = sorted(names, key=_sort_key)
    return [_h5_normalize_item(f, f"{key}.{n}", grp[n], depth + 1) for n in ordered]


def _h5_compound_to_dict(key: str, arr: np.ndarray, depth: int) -> Any:
    """Normalize HDF5 compound arrays without scipy mat_struct.

    A single record becomes a ``dict`` (fields at ``depth + 1``); several records become a
    list of dicts (fields at ``depth + 2``).

    Raises:
        ValueError: if nesting is too deep.
        TypeError: if a field holds an unsupported type.
    """
    _check_depth(key, depth)
    flat = arr.reshape(-1)
    if flat.size == 1:
        row = flat[0]
        return {
            str(n): _h5_field_value(f"{key}.{n}", row[n], depth + 1)
            for n in row.dtype.names  # type: ignore[union-attr]
        }
    return [
        {
            str(n): _h5_field_value(f"{key}[{i}].{n}", row[n], depth + 2)
            for n in row.dtype.names  # type: ignore[union-attr]
        }
        for i, row in enumerate(flat)
    ]


def _h5_field_value(key: str, val: Any, depth: int) -> Any:
    """Normalize one field value from a compound HDF5 row.

    Nested compound arrays recurse, ``S``/``U`` arrays are decoded to ``str``, other arrays
    must be numeric, and Python/NumPy scalars are promoted to 0-d arrays.

    Raises:
        ValueError: if nesting is too deep.
        TypeError: for any other value type.
    """
    _check_depth(key, depth)
    if isinstance(val, np.ndarray):
        if val.dtype.names:
            return _h5_compound_to_dict(key, val, depth)
        if val.dtype.kind in "SU":
            return _char_array_to_str(key, val)
        return _as_numeric_array(key, val)
    # Same scalar set as the classic (SciPy) path.
    if isinstance(
        val,
        (float, int, bool, complex, np.bool_, np.integer, np.floating, np.complexfloating),
    ):
        return np.asarray(val)
    raise TypeError(f"{key!r}: unsupported compound field type {type(val).__name__!r}")


def _h5_dataset_to_numpy(f: Any, key: str, ds: Any, depth: int) -> Any:
    """Load one HDF5 dataset: numeric, char, cell refs, compound, etc.

    Dispatch order: unsupported ``MATLAB_class`` → compound dtype → reference dtype (cell or
    generic) → ``char`` → ``logical`` → ``S``/``U`` strings → numeric.

    Raises:
        TypeError: for unsupported classes, object-dtype datasets, or non-numeric data.
        ValueError: if nesting is too deep.
    """
    import h5py  # noqa: PLC0415

    mclass = _h5_matlab_class(ds)
    if mclass in _UNSUPPORTED_H5_MATLAB_CLASS:
        raise TypeError(
            f"{key!r}: MATLAB_class={mclass!r} is not supported in the documented MAT subset"
        )
    if ds.dtype.names:
        # NOTE(review): MATLAB v7.3 is understood to store complex data as a compound with
        # ``real``/``imag`` fields, which this branch would return as dict(s) rather than a
        # complex ndarray — confirm against real files before changing.
        raw = ds[()]
        arr = np.asarray(raw, dtype=ds.dtype)
        # Carry the caller's depth so the nesting limit also applies below compound data.
        return _h5_compound_to_dict(key, arr, depth)
    if h5py.check_dtype(ref=ds.dtype) is not None:
        raw = np.asarray(ds[()], dtype=ds.dtype)
        if mclass == "cell":
            return _h5_cell_from_ref_array(f, key, raw, depth)
        return _h5_deref_ref_array_generic(f, key, raw, depth)
    arr = np.asarray(ds[()])
    if mclass == "char":
        return _h5_matlab_char_to_str(key, arr)
    if mclass == "logical":
        # Any non-zero stored value counts as True.
        return np.not_equal(arr, 0)
    if arr.dtype.kind in "SU":
        return _char_array_to_str(key, arr)
    if arr.dtype.kind == "O":
        raise TypeError(f"{key!r}: v7.3 object dataset {ds.name!r} is not supported")
    return _as_numeric_array(key, arr)


def _h5_group_as_struct(f: Any, key: str, grp: Any, depth: int) -> dict[str, Any]:
    """MATLAB struct as HDF5 group → nested dict.

    Children whose names start with ``#`` are skipped; every other child is normalized one
    level deeper and stored under its name.
    """
    _check_depth(key, depth)
    out: dict[str, Any] = {}
    for child_name in grp:
        if str(child_name).startswith("#"):
            continue
        child = grp[child_name]
        out[str(child_name)] = _h5_normalize_item(f, f"{key}.{child_name}", child, depth + 1)
    return out


def _h5_group_resembles_matlab_table_layout(grp: Any) -> bool:
    """Heuristic for v7.3 ``table``/``timetable`` groups when ``MATLAB_class`` is missing.

    True only for an HDF5 group that has both a ``Properties`` and a ``Variables`` child.
    """
    import h5py  # noqa: PLC0415

    if not isinstance(grp, h5py.Group):
        return False
    names = {str(n) for n in grp.keys() if not str(n).startswith("#")}
    # Typical MATLAB R2013b+ HDF5 table layout (see MathWorks MAT-file v7.3 spec examples).
    return {"Properties", "Variables"}.issubset(names)


def _h5_normalize_item(f: Any, key: str, obj: Any, depth: int) -> Any:
    """Entry point for v7.3: dispatch dataset vs group (struct/cell/sparse).

    Args:
        f: Open ``h5py.File`` used to resolve object references.
        key: Dotted/indexed path of the item, used only in error messages.
        obj: HDF5 object to normalize.
        depth: Current nesting depth, checked against the module limit.

    Returns:
        The normalized value. A group with no ``MATLAB_class`` (and no table-like layout), or
        with one of the numeric class names accepted below, is read as a struct.

    Raises:
        ValueError: if nesting is too deep.
        TypeError: for unsupported classes, table-like groups, or non-dataset/non-group
            objects.
    """
    import h5py  # noqa: PLC0415

    _check_depth(key, depth)
    mclass = _h5_matlab_class(obj)
    if isinstance(obj, h5py.Dataset):
        return _h5_dataset_to_numpy(f, key, obj, depth)
    if isinstance(obj, h5py.Group):
        # Sparse groups are recognised by attribute, independent of ``MATLAB_class``.
        if "MATLAB_sparse" in obj.attrs:
            return _h5_sparse_group_to_dense(key, obj)
        if mclass == "struct":
            return _h5_group_as_struct(f, key, obj, depth)
        if mclass == "cell":
            return _h5_cell_group_to_list(f, key, obj, depth)
        if mclass in _UNSUPPORTED_H5_MATLAB_CLASS:
            raise TypeError(
                f"{key!r}: MATLAB_class={mclass!r} is not supported in the documented MAT subset"
            )
        if mclass:
            safe = re.fullmatch(
                r"(double|single|int8|int16|int32|int64|uint8|uint16|uint32|uint64|canonical)",
                mclass,
                flags=re.I,
            )
            if not safe:
                raise TypeError(
                    f"{key!r}: MATLAB_class={mclass!r} is not supported in the documented subset"
                )
        elif _h5_group_resembles_matlab_table_layout(obj):
            raise TypeError(
                f"{key!r}: HDF5 group looks like a MATLAB table/timetable layout "
                "(Properties/Variables); not supported in the documented MAT subset"
            )
        return _h5_group_as_struct(f, key, obj, depth)
    raise TypeError(f"{key!r}: unsupported HDF5 object type {type(obj).__name__}")