# Source code for stable_datasets.backends.protocol
"""Read-side storage backend protocol.
Defines :class:`StorageBackend`, the interface :class:`StableDataset`
depends on for row access, iteration, and materialization. Concrete
backends (e.g. :class:`ArrowBackend`) conform structurally.
Arrow types (:class:`pa.Table`, :class:`pa.RecordBatch`,
:class:`pa.Schema`) are the boundary types. Members not declared on the
protocol are backend-private.
"""
from __future__ import annotations
from collections.abc import Iterator
from typing import Protocol, runtime_checkable
import numpy as np
import pyarrow as pa
@runtime_checkable
class StorageBackend(Protocol):
    """Read-side storage interface consumed by ``StableDataset``.

    Conformance is structural: a backend does not need to inherit from this
    class, it only has to provide the members declared here. Because the
    protocol is ``runtime_checkable``, ``isinstance(obj, StorageBackend)``
    verifies that every member is present; it does not validate signatures
    or return types.
    """

    # -- Shape ----------------------------------------------------------------

    @property
    def num_rows(self) -> int:
        """Number of rows, as an ``int``."""
        ...

    @property
    def num_shards(self) -> int:
        """Number of shards, as an ``int``."""
        ...

    @property
    def is_file_backed(self) -> bool:
        """Whether the backend is file-backed.

        NOTE(review): presumably distinguishes on-disk storage from purely
        in-memory data -- confirm against the concrete backends.
        """
        ...

    @property
    def schema(self) -> pa.Schema:
        """Arrow schema (``pa.Schema``) of the stored data."""
        ...

    # -- Materialization ------------------------------------------------------

    @property
    def table(self) -> pa.Table:
        """Full materialization as a single ``pa.Table``.

        Expensive for multi-shard datasets. Hot paths should prefer
        ``get_row``, ``take``, ``slice``, or ``iter_batches``.
        """
        ...

    # -- Random access --------------------------------------------------------

    def get_row(self, idx: int) -> dict:
        """Return the row selected by ``idx`` as a ``dict``.

        NOTE(review): the dict is presumably keyed by column name, and
        out-of-range / negative index handling is left to the backend --
        confirm.
        """
        ...

    def take(self, indices: np.ndarray | list[int]) -> pa.Table:
        """Return the rows selected by ``indices`` as a ``pa.Table``.

        Args:
            indices: Row indices, given as a NumPy array or a list of ints.
        """
        ...

    def slice(self, start: int, length: int) -> pa.Table:
        """Return a ``pa.Table`` for the range described by ``start``/``length``.

        NOTE(review): presumably ``length`` consecutive rows beginning at row
        ``start`` -- confirm against a concrete backend.
        """
        ...

    # -- Sequential iteration -------------------------------------------------

    def iter_batches(
        self,
        shard_indices: list[int] | None = None,
        shuffle: bool = False,
        seed: int | None = None,
    ) -> Iterator[pa.RecordBatch]:
        """Iterate over the data as ``pa.RecordBatch`` objects.

        Args:
            shard_indices: Optional list of shard indices. Presumably limits
                iteration to those shards, with ``None`` meaning every
                shard -- confirm against a concrete backend.
            shuffle: Shuffle flag, off by default. The granularity of the
                shuffle (shards, batches or rows) is not fixed by this
                protocol -- confirm.
            seed: Optional integer seed, presumably consumed by the shuffle.

        Returns:
            An iterator of ``pa.RecordBatch``.
        """
        ...

    # -- Pickle contract ------------------------------------------------------
    # Backends must survive DataLoader worker forks. State should reference
    # files by path (not hold open handles/mmaps) so workers can re-open
    # after fork.

    def __getstate__(self) -> dict:
        """Return the backend state to pickle, as a ``dict``."""
        ...

    def __setstate__(self, state: dict) -> None:
        """Restore the backend from a ``dict`` produced by ``__getstate__``."""
        ...