"""MATLAB ``.mat`` normalization (documented subset) for Acoustics Toolbox readers.
This module loads **classic** MAT files (v4 / v6 / v7–v7.2) via ``scipy.io.loadmat`` and
**v7.3** (HDF5-based) via ``h5py``, then returns a single Python representation.
**Layout / memory order**
MATLAB stores matrices in **column-major** order. SciPy and NumPy return arrays in the
usual NumPy memory layout (typically C-contiguous) that reflects how values were stored in
the file; **at-py does not implicitly transpose** loaded arrays. Downstream AT mappers
(``read_*_from_mat``) apply the same explicit reshapes/transposes documented in the
corresponding Matlab ``read_*.m`` files (e.g. ``RTS = RTS.'`` in ``read_ts.m``).
**Supported value types** (anything else raises ``TypeError`` or ``ValueError`` naming the
key and offending type):
- Real or complex ``numpy.ndarray`` of any shape (numeric dtypes).
- **Scalars** (0-d arrays, Python scalars, NumPy scalar types) promoted to 0-d arrays.
- **Structs** (classic ``mat_struct`` or v7.3 struct groups) normalized to ``dict[str, Any]``
recursively (max depth **16**).
- **Cell arrays** (object arrays or HDF5 cell layout we recognize) normalized to ``list``.
**Explicitly unsupported** (loud failure; message suggests remediation where practical):
- **Classic** MAT sparse matrices from SciPy ``loadmat`` (install extras and use dense export
in MATLAB, or convert offline).
- **v7.3** MATLAB sparse **groups** (CSC layout with ``MATLAB_sparse`` / ``data`` / ``ir`` /
``jc``) are the exception in this list: they are *not* rejected, but converted to **dense**
``numpy.ndarray`` via SciPy when ``[mat]`` is installed (real or complex ``data``).
- MATLAB ``table`` / ``timetable`` / ``datetime`` / ``duration`` / ``categorical`` / ``string``
(string array) / ``function_handle`` / generic ``object``, or SciPy ``MatlabOpaque`` /
``MatlabObject`` payloads from classic MAT.
- v7.3 **opaque or unsupported** reference targets (e.g. user classes) and layouts we do
not normalize (same spirit as other unsupported types).
**Top-level variables**
For **v7.3 (HDF5)**, internal groups whose names start with ``#`` (for example
``#refs#``, ``#subsystem#``) are not exposed as user variables—they are skipped when
building ``MatBundle.variables``.
User-facing keys are a ``dict[str, Any]``. For classic MAT, SciPy metadata keys
``__header__``, ``__globals__``, ``__version__`` are **not** included in
``MatBundle.variables``; they appear in ``MatBundle.scipy_meta`` when present.
**Variable names**
Keys preserve MATLAB variable names as stored in the file (including unusual characters).
Callers should use the **same field names** as Acoustics Toolbox ``read_shd.m`` /
``read_modes.m`` / ``read_ts.m`` expect on the ``.mat`` branches.
"""
from __future__ import annotations
import io
import re
from dataclasses import dataclass
from os import PathLike
from typing import Any, Literal
import numpy as np
# Appended to ImportError messages so users see how to install the optional MAT readers.
_MAT_EXTRA_HINT = "Install optional dependencies with: pip install 'oalib-at-py[mat]'"
# HDF5 signature bytes; a payload starting with them is treated as MAT v7.3.
HDF5_MAT_SIGNATURE = b"\x89HDF\r\n\x1a\n"
# Deepest struct/cell nesting accepted before normalization raises ValueError.
_MAX_NEST_DEPTH = 16
# Keys that SciPy ``loadmat`` adds next to user variables; they are routed to
# ``MatBundle.scipy_meta`` instead of ``MatBundle.variables``.
_SCIPY_META_KEYS = frozenset({"__header__", "__globals__", "__version__"})
# MATLAB high-level types not described in the Acoustics Toolbox ``read_*.m`` ``.mat`` paths
# in https://github.com/jgebbie/at — reject rather than mis-parse.
# Compared against the ``MATLAB_class`` attribute of v7.3 datasets and groups.
_UNSUPPORTED_H5_MATLAB_CLASS = frozenset(
    {
        "table",
        "timetable",
        "datetime",
        "duration",
        "categorical",
        "string",
        "function_handle",
        "object",
    }
)
def _import_error_mat(extra: str) -> ImportError:
    """Return (not raise) an :exc:`ImportError` whose text is ``extra`` plus the ``[mat]`` hint."""
    message = f"{extra} {_MAT_EXTRA_HINT}"
    return ImportError(message)
def _require_scipy_loadmat() -> Any:
"""Import ``scipy.io`` or raise with install hint."""
try:
import scipy.io as sio # noqa: PLC0415
except ImportError as e:
raise _import_error_mat("scipy is required for classic MAT (v4/v6/v7–v7.2).") from e
return sio
def _require_h5py() -> Any:
    """Return the ``h5py`` module, or raise :exc:`ImportError` carrying the install hint."""
    try:
        import h5py as h5py_module  # noqa: PLC0415
    except ImportError as exc:
        reason = "h5py is required for MAT v7.3 (HDF5)."
        raise _import_error_mat(reason) from exc
    else:
        return h5py_module
def _is_hdf5_mat_prefix(data: bytes) -> bool:
    """Report whether ``data`` starts with :data:`HDF5_MAT_SIGNATURE` (i.e. is MAT v7.3)."""
    has_signature = data.startswith(HDF5_MAT_SIGNATURE)
    return has_signature
[docs]
@dataclass(frozen=True)
class MatBundle:
    """Normalized top-level MATLAB variables.

    Immutable result object built by the classic (SciPy) and v7.3 (h5py) loaders
    in this module.
    """

    # User variables keyed by MATLAB variable name, already normalized to
    # ndarray / dict / list / str values.
    variables: dict[str, Any]
    # Which on-disk flavour was detected for the payload.
    source_format: Literal["mat_classic", "mat_v7_3"]
    # Name of the reader that produced the data; the loaders in this module set
    # "scipy.io.loadmat" or "h5py.File".
    raw_backend: str
    # SciPy metadata entries (``__header__`` etc.) for classic MAT; ``None`` for
    # v7.3 or when SciPy returned none of them.
    scipy_meta: dict[str, Any] | None = None
[docs]
def load_mat_normalized(data: bytes) -> MatBundle:
    """Normalize an in-memory ``.mat`` payload into a :class:`MatBundle`.

    The payload is dispatched on its leading bytes: an HDF5 signature selects the
    v7.3 reader, anything else goes to the classic (SciPy) reader.

    Raises ``ValueError`` for an empty payload and ``ImportError`` (with install
    instructions) if the optional dependency for the detected format is missing.
    """
    if not data:
        raise ValueError("empty MAT payload")
    loader = _load_mat_v73_normalized if _is_hdf5_mat_prefix(data) else _load_mat_classic_normalized
    return loader(data)
[docs]
def load_mat_normalized_path(
    path: str | bytes | PathLike[str] | PathLike[bytes],
) -> MatBundle:
    """Load a ``.mat`` file from disk (convenience; reads entire file into memory).

    ``path`` may be text, bytes, or any ``os.PathLike``. Bytes paths are decoded
    with :func:`os.fsdecode`, i.e. with the filesystem encoding and its error
    handler, rather than with a hard-coded UTF-8 decode.

    Raises whatever ``Path.read_bytes`` raises for unreadable files (for example
    ``FileNotFoundError``), plus everything :func:`load_mat_normalized` raises.
    """
    from os import fsdecode  # noqa: PLC0415
    from pathlib import Path  # noqa: PLC0415

    # Read first so I/O errors surface before any format detection happens.
    payload = Path(fsdecode(path)).read_bytes()
    return load_mat_normalized(payload)
def _load_mat_classic_normalized(data: bytes) -> MatBundle:
    """Read a classic MAT (v4–v7.2) payload with SciPy and normalize every user variable."""
    sio = _require_scipy_loadmat()
    try:
        from scipy.io.matlab import mat_struct  # noqa: PLC0415
    except ImportError as e:
        raise _import_error_mat("scipy.io.matlab is required for classic MAT.") from e

    # Keep structs as ``mat_struct`` objects and leave shapes/chars untouched so the
    # normalizer sees the data exactly as stored.
    loaded: dict[str, Any] = sio.loadmat(
        io.BytesIO(data),
        struct_as_record=False,
        squeeze_me=False,
        chars_as_strings=False,
    )

    meta: dict[str, Any] = {}
    for meta_key in _SCIPY_META_KEYS:
        if meta_key in loaded:
            meta[meta_key] = loaded[meta_key]

    normalized: dict[str, Any] = {
        name: _normalize_value_scipy(name, value, depth=0, mat_struct_type=mat_struct)
        for name, value in loaded.items()
        if name not in _SCIPY_META_KEYS
    }
    return MatBundle(
        variables=normalized,
        source_format="mat_classic",
        raw_backend="scipy.io.loadmat",
        scipy_meta=meta if meta else None,
    )
def _load_mat_v73_normalized(data: bytes) -> MatBundle:
    """Open a MAT v7.3 (HDF5) payload with h5py and normalize its top-level variables."""
    h5py = _require_h5py()
    with h5py.File(io.BytesIO(data), "r") as f:
        # Names starting with "#" (#refs#, #subsystem#, …) are MATLAB/HDF5 internals,
        # not user variables, so they are filtered out.
        normalized: dict[str, Any] = {
            name: _h5_normalize_item(f, name, f[name], depth=0)
            for name in f
            if not str(name).startswith("#")
        }
    return MatBundle(
        variables=normalized,
        source_format="mat_v7_3",
        raw_backend="h5py.File",
        scipy_meta=None,
    )
def _check_depth(key: str, depth: int) -> None:
    """Guard against runaway nesting: raise ``ValueError`` once ``depth`` exceeds the limit."""
    if depth <= _MAX_NEST_DEPTH:
        return
    raise ValueError(f"MAT structure nested too deeply (>{_MAX_NEST_DEPTH}) at {key!r}")
def _normalize_value_scipy(key: str, val: Any, depth: int, mat_struct_type: type) -> Any:
    """Recursively normalize one SciPy ``loadmat`` value to Python/NumPy types.

    ``key`` is the dotted/indexed path used in error messages, ``depth`` the current
    nesting level, and ``mat_struct_type`` SciPy's ``mat_struct`` class (passed in so
    this module does not import SciPy at module level).

    Returns a ``dict`` for structs, a ``list`` for multi-element object arrays, ``str``
    for character data, and an ``ndarray`` for numeric data and scalars.

    Raises ``ValueError`` for excessive nesting or sparse input, and ``TypeError`` for
    MATLAB class/opaque objects and any other unsupported type.
    """
    _check_depth(key, depth)

    # MatlabObject/MatlabOpaque must be rejected before the generic ndarray branch.
    # If the names cannot be imported, the explicit rejection is simply skipped.
    try:
        from scipy.io.matlab import MatlabObject, MatlabOpaque  # noqa: PLC0415
    except ImportError:
        opaque_types: tuple[type, ...] = ()
    else:
        opaque_types = (MatlabOpaque, MatlabObject)
    if opaque_types and isinstance(val, opaque_types):
        raise TypeError(
            f"{key!r}: SciPy {type(val).__name__!r} (MATLAB class object / opaque) is not "
            "supported in the documented MAT subset (e.g. table, timetable, datetime); "
            "export numeric arrays or structs in MATLAB."
        )

    try:
        import scipy.sparse as sp  # noqa: PLC0415
    except ImportError:
        sp = None  # type: ignore[assignment]
    if sp is not None and sp.issparse(val):
        raise ValueError(
            f"{key!r}: sparse matrices are not supported in the documented MAT subset "
            f"(got {type(val).__name__}); convert to dense in MATLAB or offline."
        )

    if isinstance(val, mat_struct_type):
        return _mat_struct_to_dict(key, val, depth, mat_struct_type)

    if isinstance(val, np.ndarray):
        # Order matters: object arrays (cells / struct wrappers), then record arrays,
        # then character data, and finally plain numeric arrays.
        if val.dtype.kind == "O":
            return _normalize_object_array_scipy(key, val, depth, mat_struct_type)
        if val.dtype.names:
            return _structured_ndarray_to_dict_scipy(key, val, depth, mat_struct_type)
        if val.dtype.kind in "SU":
            return _char_array_to_str(key, val)
        return _as_numeric_array(key, val)

    if isinstance(val, (bytes, bytearray, memoryview)):
        return bytes(val).decode("utf-8", errors="replace")
    if isinstance(val, str):
        return val

    # Python ``complex`` is listed next to its NumPy counterpart so that every Python
    # scalar type is promoted to a 0-d array, as the module documentation promises.
    scalar_types = (
        float,
        int,
        bool,
        complex,
        np.bool_,
        np.integer,
        np.floating,
        np.complexfloating,
    )
    if isinstance(val, scalar_types):
        return np.asarray(val)

    raise TypeError(
        f"{key!r}: unsupported MAT value type {type(val).__name__!r} in documented subset"
    )
def _mat_struct_to_dict(key: str, obj: Any, depth: int, mat_struct_type: type) -> dict[str, Any]:
"""Convert ``mat_struct`` to a plain ``dict`` of normalized fields."""
out: dict[str, Any] = {}
for name in obj._fieldnames: # noqa: SLF001
child = getattr(obj, name)
child_key = f"{key}.{name}"
out[str(name)] = _normalize_value_scipy(child_key, child, depth + 1, mat_struct_type)
return out
def _structured_ndarray_to_dict_scipy(
key: str, arr: np.ndarray, depth: int, mat_struct_type: type
) -> Any:
"""Single-row structured array → dict; multiple rows → list of dicts."""
flat = arr.reshape(-1)
if flat.size == 1:
row = flat[0]
return {
str(n): _normalize_value_scipy(f"{key}.{n}", row[n], depth + 1, mat_struct_type)
for n in row.dtype.names # type: ignore[union-attr]
}
out_list = []
for i in range(flat.shape[0]):
row = flat[i]
names = row.dtype.names # type: ignore[union-attr]
out_list.append(
{
str(n): _normalize_value_scipy(
f"{key}[{i}].{n}", row[n], depth + 2, mat_struct_type
)
for n in names
}
)
return out_list
def _normalize_object_array_scipy(
    key: str, arr: np.ndarray, depth: int, mat_struct_type: type
) -> Any:
    """Normalize dtype ``object`` ndarray (cells or nested arrays).

    A single-element array is unwrapped and its content normalized directly (this is
    how ``loadmat`` wraps structs when ``squeeze_me=False``); any other size becomes a
    ``list`` of normalized elements in flattened order.

    Raises ``ValueError`` when nesting exceeds the module limit. Singleton object
    arrays that wrap further object arrays count as one nesting level each, so a chain
    of 1x1 cells cannot bypass the depth guard.
    """
    _check_depth(key, depth)
    flat = arr.reshape(-1)
    if flat.size != 1:
        return [
            _normalize_value_scipy(f"{key}[{i}]", item, depth + 1, mat_struct_type)
            for i, item in enumerate(flat)
        ]

    only = flat[0]
    if isinstance(only, mat_struct_type):
        return _mat_struct_to_dict(key, only, depth, mat_struct_type)
    if isinstance(only, np.ndarray) and only.dtype == object:
        # Count the wrapper as a level; without this the recursion is unbounded.
        return _normalize_object_array_scipy(key, only, depth + 1, mat_struct_type)
    return _normalize_value_scipy(key, only, depth, mat_struct_type)
def _char_array_to_str(key: str, arr: np.ndarray) -> str:
"""Decode classic MAT char array to Python ``str``."""
_ = key
a = np.asarray(arr)
if a.dtype.kind == "U":
# ``chars_as_strings=False`` yields per-character ``U1`` arrays (e.g. shape (1, N)).
return "".join(str(x) for x in a.reshape(-1)).rstrip(" \x00") if a.size else ""
if a.dtype.kind == "S":
flat = np.ravel(a)
raw = b"".join(np.bytes_(x).tobytes() for x in flat).split(b"\x00", 1)[0]
return raw.decode("latin-1", errors="replace")
if a.dtype == np.uint16:
return "".join(chr(int(x)) for x in a.astype(np.uint32).reshape(-1)).rstrip(" \x00")
if a.dtype == np.uint8 or a.dtype.kind in "iu":
flat = a.reshape(-1)
if flat.size and int(flat.max()) <= 127:
return bytes(int(x) for x in flat).decode("ascii", errors="replace").strip("\x00")
raise TypeError(f"{key!r}: unsupported character array dtype {a.dtype!r}")
def _as_numeric_array(key: str, arr: np.ndarray) -> np.ndarray:
"""Return ``arr`` as ndarray, rejecting non-numeric dtypes."""
_ = key
out = np.asarray(arr)
if out.dtype.kind not in "biufc":
raise TypeError(f"{key!r}: ndarray has unsupported dtype {out.dtype!r}")
return out
def _h5_matlab_char_to_str(key: str, arr: np.ndarray) -> str | list[str]:
"""Decode MATLAB v7.3 ``char`` stored as ``uint16`` (UTF-16 code units per character)."""
a = np.asarray(arr)
if a.dtype.kind not in "iu":
if a.dtype.kind in "fc":
raise TypeError(
f"{key!r}: MATLAB char array has non-integer dtype {a.dtype!r}; "
"expected uint16-style storage"
)
a = a.astype(np.uint32)
else:
a = a.astype(np.uint32)
if a.ndim == 0:
return chr(int(a)) if a.size else ""
if a.ndim == 1:
return "".join(chr(int(x)) for x in a.flat).rstrip("\x00")
if a.ndim == 2:
if a.shape[0] == 1:
return "".join(chr(int(x)) for x in a[0]).rstrip("\x00 ")
if a.shape[1] == 1:
return "".join(chr(int(x)) for x in a[:, 0]).rstrip("\x00 ")
return [
"".join(chr(int(a[i, j])) for i in range(a.shape[0])).rstrip("\x00 ")
for j in range(a.shape[1])
]
raise TypeError(f"{key!r}: MATLAB char array rank {a.ndim} is not supported")
def _h5_sparse_group_to_dense(key: str, grp: Any) -> np.ndarray:
    """MATLAB v7.3 CSC sparse group → dense ``float64`` or ``complex128`` array (SciPy).

    The group must carry the ``MATLAB_sparse`` attribute (used as the row count) and the
    datasets ``data`` (values), ``ir`` (row indices) and ``jc`` (column pointers); the
    column count is ``len(jc) - 1``. Complex values stored as a compound dataset with
    ``real``/``imag`` fields are combined into a complex array before densifying.

    Raises ``ImportError`` if SciPy is missing, ``TypeError`` if ``grp`` is not an HDF5
    group or ``data`` has an unexpected compound layout, and ``ValueError`` if required
    datasets are missing or ``jc`` is empty.
    """
    try:
        from scipy.sparse import csc_matrix  # noqa: PLC0415
    except ImportError as e:
        raise ImportError(
            f"{key!r}: MAT v7.3 sparse groups require scipy (install oalib-at-py[mat])."
        ) from e
    import h5py  # noqa: PLC0415

    if not isinstance(grp, h5py.Group):
        raise TypeError(f"{key!r}: expected HDF5 group for sparse matrix")
    if "data" not in grp or "ir" not in grp or "jc" not in grp:
        raise ValueError(
            f"{key!r}: MAT v7.3 sparse group missing data/ir/jc datasets (got {list(grp.keys())})"
        )
    # h5py stubs disagree with runtime Group indexing; keep dynamic lookups on ``Any``.
    g: Any = grp
    nrows = int(np.asarray(g.attrs["MATLAB_sparse"]).reshape(-1)[0])

    data = np.asarray(g["data"][:])
    if data.dtype.names:
        # A compound real/imag dataset is read by h5py as a structured array, which
        # ``csc_matrix`` cannot consume directly.
        if set(data.dtype.names) != {"real", "imag"}:
            raise TypeError(
                f"{key!r}: MAT v7.3 sparse data has unsupported compound dtype {data.dtype!r}"
            )
        data = data["real"].astype(np.float64) + 1j * data["imag"].astype(np.float64)
    data = data.reshape(-1)

    ir = np.asarray(g["ir"][:], dtype=np.int64).reshape(-1)
    jc = np.asarray(g["jc"][:], dtype=np.int64).reshape(-1)
    ncols = int(jc.size) - 1
    if ncols < 0:
        raise ValueError(f"{key!r}: invalid sparse jc length {jc.size}")

    dense = csc_matrix((data, ir, jc), shape=(nrows, ncols)).toarray()
    target_dtype = np.complex128 if np.iscomplexobj(dense) else np.float64
    return np.asarray(dense, dtype=target_dtype)
def _h5_matlab_class(obj: Any) -> str | None:
"""``MATLAB_class`` attribute as ``str``, or ``None``."""
cls = obj.attrs.get("MATLAB_class")
if cls is None:
return None
if isinstance(cls, bytes):
return cls.decode("ascii", errors="replace")
if isinstance(cls, (np.bytes_, np.ndarray)):
return np.asarray(cls).tobytes().decode("ascii", errors="replace").strip("\x00")
return str(cls)
def _h5_cell_from_ref_array(f: Any, key: str, arr: np.ndarray, depth: int) -> Any:
    """MATLAB ``cell`` stored as a dataset of HDF5 object references (see mat73-style layout).

    ``f`` is the open HDF5 file used to dereference each entry of ``arr``; every target
    is normalized at ``depth + 1``.

    * 0-d → the single normalized target.
    * 1-d, ``(1, N)`` or ``(N, 1)`` → flat ``list``.
    * General 2-d → nested ``list`` indexed ``[second axis][first axis]``, i.e. the
      transpose of the stored layout.

    Raises ``TypeError`` if ``arr`` is not a reference array or has rank above 2
    (rank above 2 previously fell through and returned ``None``).
    """
    import h5py  # noqa: PLC0415

    if h5py.check_dtype(ref=arr.dtype) is None:
        raise TypeError(f"{key!r}: expected HDF5 reference array for MATLAB cell")
    if arr.ndim == 0:
        return _h5_normalize_item(f, key, f[arr.item()], depth + 1)
    if arr.ndim == 1:
        return [
            _h5_normalize_item(f, f"{key}[{i}]", f[arr[i]], depth + 1)
            for i in range(int(arr.shape[0]))
        ]
    if arr.ndim != 2:
        raise TypeError(f"{key!r}: MATLAB cell array rank {arr.ndim} is not supported")

    n0, n1 = int(arr.shape[0]), int(arr.shape[1])
    # Row vector (1, N) or column (N, 1): flatten to a Python list (common 1-D cell).
    if n0 == 1:
        return [_h5_normalize_item(f, f"{key}[{j}]", f[arr[0, j]], depth + 1) for j in range(n1)]
    if n1 == 1:
        return [_h5_normalize_item(f, f"{key}[{i}]", f[arr[i, 0]], depth + 1) for i in range(n0)]

    rows = [
        [_h5_normalize_item(f, f"{key}[{i},{j}]", f[arr[i, j]], depth + 1) for j in range(n1)]
        for i in range(n0)
    ]
    # Swap the two axes of the nested list; an empty axis yields an empty list.
    return list(map(list, zip(*rows, strict=True)))
def _h5_deref_ref_array_generic(f: Any, key: str, arr: np.ndarray, depth: int) -> Any:
    """Resolve a reference dataset that is not a MATLAB cell (e.g. struct-array fields).

    Every reference is looked up in ``f`` and normalized at ``depth + 1``. A single
    target is returned bare, 1-d input and arrays whose first or second axis has length
    one become a ``list``, and anything else the nested-list form of the result array.
    Raises ``TypeError`` if ``arr`` does not hold HDF5 references.
    """
    import h5py  # noqa: PLC0415

    if h5py.check_dtype(ref=arr.dtype) is None:
        raise TypeError(f"{key!r}: expected HDF5 reference array")

    def resolve(ref: Any, label: str) -> Any:
        """Dereference ``ref`` and normalize its target under the path ``label``."""
        return _h5_normalize_item(f, label, f[ref], depth + 1)

    if arr.ndim == 0:
        return resolve(arr.item(), key)
    if arr.ndim == 1:
        return [resolve(arr[i], f"{key}[{i}]") for i in range(int(arr.shape[0]))]

    resolved = np.empty(arr.shape, dtype=object)
    for idx in np.ndindex(arr.shape):
        resolved[idx] = resolve(arr[idx], f"{key}{list(idx)}")

    if resolved.size == 1:
        return resolved.flat[0]
    rows, cols = int(resolved.shape[0]), int(resolved.shape[1])
    if rows == 1:
        return [resolved[0, c] for c in range(cols)]
    if cols == 1:
        return [resolved[r, 0] for r in range(rows)]
    return resolved.tolist()
def _h5_cell_group_to_list(f: Any, key: str, grp: Any, depth: int) -> list[Any]:
"""MATLAB ``cell`` stored as a group with numbered children ``1``, ``2``, …."""
names = [n for n in grp.keys() if not str(n).startswith("#")]
def _sort_key(n: str) -> tuple[int, int | str]:
"""Order cell indices ``1``, ``2``, … before non-numeric group names."""
try:
return (0, int(n))
except ValueError:
return (1, n)
ordered = sorted(names, key=_sort_key)
return [_h5_normalize_item(f, f"{key}.{n}", grp[n], depth + 1) for n in ordered]
def _h5_compound_to_dict(key: str, arr: np.ndarray, depth: int) -> Any:
    """Convert an HDF5 compound (structured) array: one record → ``dict``, else ``list`` of dicts.

    Fields of a single record are normalized at ``depth + 1``; fields of records in a
    multi-record array at ``depth + 2``. Raises ``ValueError`` when nesting is too deep.
    """
    _check_depth(key, depth)
    records = arr.reshape(-1)

    def record_to_dict(record: Any, prefix: str, child_depth: int) -> dict[str, Any]:
        """Normalize every named field of one record under the path ``prefix``."""
        return {
            str(field): _h5_field_value(f"{prefix}.{field}", record[field], child_depth)
            for field in record.dtype.names
        }

    if records.size == 1:
        return record_to_dict(records[0], key, depth + 1)
    return [
        record_to_dict(records[index], f"{key}[{index}]", depth + 2)
        for index in range(records.shape[0])
    ]
def _h5_field_value(key: str, val: Any, depth: int) -> Any:
    """Normalize one field value from a compound HDF5 row.

    Indexing a record by field name yields NumPy *scalars* for scalar fields, so the
    scalar forms are handled alongside the array forms:

    * nested compound scalar (``numpy.void`` with named fields) or structured ndarray →
      ``dict`` / ``list`` of dicts;
    * ``S``/``U`` ndarray or scalar ``bytes``/``str`` (including ``numpy.bytes_`` and
      ``numpy.str_``) → ``str``;
    * numeric ndarray → ndarray; numeric or boolean scalar → 0-d ndarray.

    Raises ``ValueError`` when nesting is too deep and ``TypeError`` for anything else.
    """
    _check_depth(key, depth)
    if isinstance(val, np.void) and val.dtype.names:
        # Nested compound member: treat it as a one-record structured array.
        return _h5_compound_to_dict(key, np.asarray(val), depth)
    if isinstance(val, np.ndarray):
        if val.dtype.names:
            return _h5_compound_to_dict(key, val, depth)
        if val.dtype.kind in "SU":
            return _char_array_to_str(key, val)
        return _as_numeric_array(key, val)
    if isinstance(val, (bytes, str)):
        # Scalar string fields take the same decoding path as string arrays.
        return _char_array_to_str(key, np.asarray(val))
    scalar_types = (
        float,
        int,
        bool,
        complex,
        np.bool_,
        np.integer,
        np.floating,
        np.complexfloating,
    )
    if isinstance(val, scalar_types):
        return np.asarray(val)
    raise TypeError(f"{key!r}: unsupported compound field type {type(val).__name__!r}")
# NumPy dtypes used to materialize empty MATLAB numeric arrays by class name.
_H5_MATLAB_NUMERIC_DTYPES: dict[str, Any] = {
    "double": np.float64,
    "single": np.float32,
    "int8": np.int8,
    "int16": np.int16,
    "int32": np.int32,
    "int64": np.int64,
    "uint8": np.uint8,
    "uint16": np.uint16,
    "uint32": np.uint32,
    "uint64": np.uint64,
}


def _h5_dataset_is_matlab_empty(ds: Any) -> bool:
    """True if the dataset carries a non-zero ``MATLAB_empty`` attribute."""
    flag = ds.attrs.get("MATLAB_empty")
    if flag is None:
        return False
    values = np.asarray(flag).reshape(-1)
    return bool(values.size) and int(values[0]) != 0


def _h5_empty_matlab_value(key: str, ds: Any, mclass: str | None) -> Any:
    """Build the empty value for a ``MATLAB_empty`` dataset, whose payload is its dimensions."""
    dims = tuple(int(d) for d in np.asarray(ds[()]).reshape(-1))
    if 0 not in dims:
        raise ValueError(
            f"{key!r}: dataset is flagged MATLAB_empty but stores non-empty dimensions {dims}"
        )
    if mclass == "char":
        return ""
    if mclass in ("cell", "struct"):
        return []
    # NOTE(review): dimensions are assumed to be stored in MATLAB order; they are
    # reversed to match how h5py presents non-empty MATLAB arrays — confirm.
    shape = dims[::-1]
    if mclass == "logical":
        return np.zeros(shape, dtype=bool)
    dtype = _H5_MATLAB_NUMERIC_DTYPES.get(mclass or "")
    if dtype is None:
        raise TypeError(
            f"{key!r}: empty dataset with MATLAB_class={mclass!r} is not supported "
            "in the documented MAT subset"
        )
    return np.zeros(shape, dtype=dtype)


def _h5_dataset_to_numpy(f: Any, key: str, ds: Any, depth: int) -> Any:
    """Load one HDF5 dataset: numeric, char, cell refs, compound, etc.

    Dispatch order:

    1. Unsupported ``MATLAB_class`` → ``TypeError``.
    2. ``MATLAB_empty`` datasets (the payload holds the dimensions, not data) → empty
       value of the matching kind.
    3. Compound dtype: ``real``/``imag`` pairs (MATLAB complex storage) → complex
       ndarray; any other compound → ``dict`` / ``list`` of dicts.
    4. Reference dtype → cell (``MATLAB_class == "cell"``) or generic dereference.
    5. ``char`` → ``str``; ``logical`` → boolean ndarray; ``S``/``U`` → ``str``;
       object dtype → ``TypeError``; everything else must be numeric.

    ``f`` is the open file (needed to dereference), ``key`` the path used in error
    messages, ``depth`` the current nesting level.
    """
    import h5py  # noqa: PLC0415

    mclass = _h5_matlab_class(ds)
    if mclass in _UNSUPPORTED_H5_MATLAB_CLASS:
        raise TypeError(
            f"{key!r}: MATLAB_class={mclass!r} is not supported in the documented MAT subset"
        )
    if _h5_dataset_is_matlab_empty(ds):
        return _h5_empty_matlab_value(key, ds, mclass)

    field_names = ds.dtype.names
    if field_names:
        arr = np.asarray(ds[()], dtype=ds.dtype)
        if set(field_names) == {"real", "imag"}:
            # h5py only auto-converts compounds named ("r", "i") to complex, so
            # MATLAB's real/imag pairs have to be combined by hand.
            return arr["real"] + 1j * arr["imag"]
        # Pass the current depth through so the nesting limit keeps counting.
        return _h5_compound_to_dict(key, arr, depth)

    if h5py.check_dtype(ref=ds.dtype) is not None:
        raw = np.asarray(ds[()], dtype=ds.dtype)
        if mclass == "cell":
            return _h5_cell_from_ref_array(f, key, raw, depth)
        return _h5_deref_ref_array_generic(f, key, raw, depth)

    arr = np.asarray(ds[()])
    if mclass == "char":
        return _h5_matlab_char_to_str(key, arr)
    if mclass == "logical":
        return np.not_equal(arr, 0)
    if arr.dtype.kind in "SU":
        return _char_array_to_str(key, arr)
    if arr.dtype.kind == "O":
        raise TypeError(f"{key!r}: v7.3 object dataset {ds.name!r} is not supported")
    return _as_numeric_array(key, arr)
def _h5_group_as_struct(f: Any, key: str, grp: Any, depth: int) -> dict[str, Any]:
    """Read an HDF5 group as a MATLAB struct: one dict entry per child not named ``#…``.

    Children are normalized at ``depth + 1``; raises ``ValueError`` when nesting is
    too deep.
    """
    _check_depth(key, depth)
    return {
        str(field): _h5_normalize_item(f, f"{key}.{field}", grp[field], depth + 1)
        for field in grp
        if not field.startswith("#")
    }
def _h5_group_resembles_matlab_table_layout(grp: Any) -> bool:
    """Guess whether an unlabeled v7.3 group is a ``table``/``timetable``.

    Used when ``MATLAB_class`` is missing: the group qualifies if it has both a
    ``Properties`` and a ``Variables`` child (names starting with ``#`` are ignored).
    """
    import h5py  # noqa: PLC0415

    if not isinstance(grp, h5py.Group):
        return False
    visible = {str(n) for n in grp.keys() if not str(n).startswith("#")}
    # Typical MATLAB R2013b+ HDF5 table layout (see MathWorks MAT-file v7.3 spec examples).
    return {"Properties", "Variables"} <= visible
def _h5_normalize_item(f: Any, key: str, obj: Any, depth: int) -> Any:
    """Normalize one v7.3 object, dispatching on dataset vs. group and on ``MATLAB_class``.

    Datasets go to the dataset loader. Groups are tried in order as sparse matrix
    (``MATLAB_sparse`` attribute), ``struct``, ``cell``, then rejected if the class is
    unsupported or unrecognized or the group looks like a table; remaining groups are
    read as structs. Raises ``ValueError`` for excessive nesting and ``TypeError`` for
    unsupported classes or object types.
    """
    import h5py  # noqa: PLC0415

    _check_depth(key, depth)
    mclass = _h5_matlab_class(obj)

    if isinstance(obj, h5py.Dataset):
        return _h5_dataset_to_numpy(f, key, obj, depth)
    if not isinstance(obj, h5py.Group):
        raise TypeError(f"{key!r}: unsupported HDF5 object type {type(obj).__name__}")

    if "MATLAB_sparse" in obj.attrs:
        return _h5_sparse_group_to_dense(key, obj)
    if mclass == "struct":
        return _h5_group_as_struct(f, key, obj, depth)
    if mclass == "cell":
        return _h5_cell_group_to_list(f, key, obj, depth)
    if mclass in _UNSUPPORTED_H5_MATLAB_CLASS:
        raise TypeError(
            f"{key!r}: MATLAB_class={mclass!r} is not supported in the documented MAT subset"
        )

    if not mclass:
        # No class label at all: fall back to a structural heuristic for tables.
        if _h5_group_resembles_matlab_table_layout(obj):
            raise TypeError(
                f"{key!r}: HDF5 group looks like a MATLAB table/timetable layout "
                "(Properties/Variables); not supported in the documented MAT subset"
            )
    else:
        recognized = re.fullmatch(
            r"(double|single|int8|int16|int32|int64|uint8|uint16|uint32|uint64|canonical)",
            mclass,
            flags=re.I,
        )
        if recognized is None:
            raise TypeError(
                f"{key!r}: MATLAB_class={mclass!r} is not supported in the documented subset"
            )
    return _h5_group_as_struct(f, key, obj, depth)