
afnio.utils.data.dataloader

afnio.utils.data.dataloader.DataLoader

Bases: Generic[T_co]

The data loader combines a dataset and a sampler, and provides an iterable over the given dataset.

The DataLoader supports both map-style and iterable-style datasets with single-process loading, customizable loading order, and optional automatic batching (collation) and memory pinning.

See afnio.utils.data documentation page for more details.
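The core batching rule can be sketched in plain Python. This is a simplified standalone mirror of the semantics, not the afnio implementation; `batches` is a hypothetical helper that groups sampler indices the way the loader does:

```python
def batches(indices, batch_size, drop_last=False):
    """Group sampler indices into fixed-size batches."""
    batch = []
    for i in indices:
        batch.append(i)
        if len(batch) == batch_size:
            yield batch
            batch = []
    # A trailing, smaller batch is kept unless drop_last is set
    if batch and not drop_last:
        yield batch

print(list(batches(range(5), 2)))                  # [[0, 1], [2, 3], [4]]
print(list(batches(range(5), 2, drop_last=True)))  # [[0, 1], [2, 3]]
```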

Source code in afnio/utils/data/dataloader.py
class DataLoader(Generic[T_co]):
    """
    Data loader combines a dataset and a sampler, and provides an iterable over the
    given dataset.

    The [`DataLoader`][afnio.utils.data.DataLoader] supports both map-style and
    iterable-style datasets with single-process loading, customizable loading order,
    and optional automatic batching (collation) and memory pinning.

    See [`afnio.utils.data`][afnio.utils.data] documentation page for more details.
    """

    dataset: Dataset[T_co]
    batch_size: Optional[int]
    drop_last: bool
    sampler: Union[Sampler, Iterable]
    __initialized = False

    def __init__(
        self,
        dataset: Dataset[T_co],
        batch_size: Optional[int] = 1,
        shuffle: Optional[bool] = False,
        sampler: Union[Sampler, Iterable, None] = None,
        drop_last: bool = False,
        seed: Optional[int] = None,
    ):
        """Initializes the `DataLoader` with the given dataset and options.

        Args:
            dataset: Dataset from which to load the data.
            batch_size: How many samples per batch to load.
            shuffle: Set to `True` to have the data reshuffled at every epoch.
            sampler: Defines the strategy to draw samples from the dataset. Can be any
                `Iterable` with `__len__` implemented. If specified, `shuffle`
                must not be specified.
            drop_last: Set to `True` to drop the last incomplete batch, if the dataset
                size is not divisible by the batch size. If `False` and the size of
                dataset is not divisible by the batch size, then the last batch
                will be smaller.
            seed: If not `None`, this seed will be used by
                [`RandomSampler`][afnio.utils.data.RandomSampler]
                to generate random indexes.
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.drop_last = drop_last

        if shuffle not in {True, False}:
            raise ValueError(
                f"DataLoader: expected a boolean shuffle option, "
                f"but got shuffle={shuffle}"
            )

        if sampler is not None and shuffle:
            raise ValueError("sampler option is mutually exclusive with shuffle")

        if sampler is None:
            if shuffle:
                sampler = RandomSampler(dataset, seed=seed)
            else:
                sampler = SequentialSampler(dataset)

        self.index_sampler = sampler
        self._sampler_iter = iter(self.index_sampler)
        self.__initialized = True

    def __iter__(self) -> Iterable[Any]:
        self._sampler_iter = iter(self.index_sampler)  # Ensure new iterator every time
        return self

    def _next_index(self):
        return next(self._sampler_iter)

    def __next__(self) -> Any:
        """Returns the next batch from the dataset, collated according to the structure
        of the dataset's `__getitem__` output.

        **Batching logic:**

        - If the dataset returns a dictionary, this method aggregates each key across
          the batch into a list of values. For example, if each sample is
          `{'a': 'foo', 'b': 'bar'}`, the batch will be `{'a': [...], 'b': [...]}`.
        - If the dataset returns a tuple (e.g., `(X, y)`), this method recursively
          collates each position in the tuple using
          [`collate_tuple()`][afnio.utils.data.dataloader.collate_tuple], preserving
          nested tuple structure and batching [`Variables`][afnio.Variable]
          as described below.
        - If the dataset returns [`Variables`][afnio.Variable] directly, this method
          batches them into a single Variable whose [`data`][afnio.Variable.data] is a
          list of the original [`data`][afnio.Variable.data] fields, and whose
          [`role`][afnio.Variable.role] and
          [`requires_grad`][afnio.Variable.requires_grad] are taken
          from the first [`Variable`][afnio.Variable].
        - Otherwise, returns the batch as a `list`.
        """
        # Suppress notifications for individual Variables
        with suppress_variable_notifications():
            batch = []
            for _ in range(self.batch_size):
                try:
                    index = self._next_index()
                    batch.append(self.dataset[index])
                except StopIteration:
                    if not batch or self.drop_last:
                        raise
                    break

        # If dataset returns a dictionary, we aggregate each key across the batch
        if (
            batch
            and isinstance(batch[0], dict)  # noqa: W503
            and all(isinstance(item, dict) for item in batch)  # noqa: W503
        ):
            keys = batch[0].keys()
            collated = {}
            for key in keys:
                values = [item[key] for item in batch]
                collated[key] = values
            return collated
        # If dataset returns a tuple, we recursively collate each position in the tuple
        if (
            batch
            and isinstance(batch[0], tuple)  # noqa: W503
            and all(isinstance(item, tuple) for item in batch)  # noqa: W503
        ):
            return collate_tuple(batch)

        # If dataset returns Variables, we batch them into a single Variable
        if (
            batch
            and isinstance(batch[0], Variable)  # noqa: W503
            and all(isinstance(item, Variable) for item in batch)  # noqa: W503
        ):
            first = batch[0]
            return Variable(
                data=[item.data for item in batch],
                role=first.role,
                requires_grad=first.requires_grad,
            )

        return batch

    def __len__(self) -> int:
        length = len(self.dataset)
        if self.batch_size is not None:
            from math import ceil

            if self.drop_last:
                length = length // self.batch_size
            else:
                length = ceil(length / self.batch_size)
        return length

__init__(dataset, batch_size=1, shuffle=False, sampler=None, drop_last=False, seed=None)

Initializes the DataLoader with the given dataset and options.

Parameters:

  dataset (Dataset[T_co], required):
      Dataset from which to load the data.
  batch_size (int | None, default 1):
      How many samples per batch to load.
  shuffle (bool | None, default False):
      Set to True to have the data reshuffled at every epoch.
  sampler (Sampler | Iterable | None, default None):
      Defines the strategy to draw samples from the dataset. Can be any Iterable with __len__ implemented. If specified, shuffle must not be specified.
  drop_last (bool, default False):
      Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch size. If False and the size of the dataset is not divisible by the batch size, then the last batch will be smaller.
  seed (int | None, default None):
      If not None, this seed will be used by RandomSampler to generate random indexes.
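The interaction of `batch_size` and `drop_last` determines `len(loader)`: floor division when incomplete batches are dropped, ceiling division otherwise. A quick check of that arithmetic:

```python
from math import ceil

n_samples, batch_size = 10, 3

# drop_last=True: the trailing partial batch of 1 sample is discarded
assert n_samples // batch_size == 3

# drop_last=False: the partial batch still counts as a batch
assert ceil(n_samples / batch_size) == 4
```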
Source code in afnio/utils/data/dataloader.py
def __init__(
    self,
    dataset: Dataset[T_co],
    batch_size: Optional[int] = 1,
    shuffle: Optional[bool] = False,
    sampler: Union[Sampler, Iterable, None] = None,
    drop_last: bool = False,
    seed: Optional[int] = None,
):
    """Initializes the `DataLoader` with the given dataset and options.

    Args:
        dataset: Dataset from which to load the data.
        batch_size: How many samples per batch to load.
        shuffle: Set to `True` to have the data reshuffled at every epoch.
        sampler: Defines the strategy to draw samples from the dataset. Can be any
            `Iterable` with `__len__` implemented. If specified, `shuffle`
            must not be specified.
        drop_last: Set to `True` to drop the last incomplete batch, if the dataset
            size is not divisible by the batch size. If `False` and the size of
            dataset is not divisible by the batch size, then the last batch
            will be smaller.
        seed: If not `None`, this seed will be used by
            [`RandomSampler`][afnio.utils.data.RandomSampler]
            to generate random indexes.
    """
    self.dataset = dataset
    self.batch_size = batch_size
    self.shuffle = shuffle
    self.drop_last = drop_last

    if shuffle not in {True, False}:
        raise ValueError(
            f"DataLoader: expected a boolean shuffle option, "
            f"but got shuffle={shuffle}"
        )

    if sampler is not None and shuffle:
        raise ValueError("sampler option is mutually exclusive with shuffle")

    if sampler is None:
        if shuffle:
            sampler = RandomSampler(dataset, seed=seed)
        else:
            sampler = SequentialSampler(dataset)

    self.index_sampler = sampler
    self._sampler_iter = iter(self.index_sampler)
    self.__initialized = True

__next__()

Returns the next batch from the dataset, collated according to the structure of the dataset's __getitem__ output.

Batching logic:

  • If the dataset returns a dictionary, this method aggregates each key across the batch into a list of values. For example, if each sample is {'a': 'foo', 'b': 'bar'}, the batch will be {'a': [...], 'b': [...]}.
  • If the dataset returns a tuple (e.g., (X, y)), this method recursively collates each position in the tuple using collate_tuple(), preserving nested tuple structure and batching Variables as described below.
  • If the dataset returns Variables directly, this method batches them into a single Variable whose data is a list of the original data fields, and whose role and requires_grad are taken from the first Variable.
  • Otherwise, returns the batch as a list.
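The dictionary rule above can be illustrated with plain Python. This is a standalone sketch of the collation behavior, not the library code; `collate_dicts` is a hypothetical helper:

```python
def collate_dicts(batch):
    # Aggregate each key across the batch into a list of values,
    # taking the key set from the first sample
    return {key: [item[key] for item in batch] for key in batch[0]}

samples = [{'a': 'foo', 'b': 'bar'}, {'a': 'baz', 'b': 'qux'}]
print(collate_dicts(samples))  # {'a': ['foo', 'baz'], 'b': ['bar', 'qux']}
```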
Source code in afnio/utils/data/dataloader.py
def __next__(self) -> Any:
    """Returns the next batch from the dataset, collated according to the structure
    of the dataset's `__getitem__` output.

    **Batching logic:**

    - If the dataset returns a dictionary, this method aggregates each key across
      the batch into a list of values. For example, if each sample is
      `{'a': 'foo', 'b': 'bar'}`, the batch will be `{'a': [...], 'b': [...]}`.
    - If the dataset returns a tuple (e.g., `(X, y)`), this method recursively
      collates each position in the tuple using
      [`collate_tuple()`][afnio.utils.data.dataloader.collate_tuple], preserving
      nested tuple structure and batching [`Variables`][afnio.Variable]
      as described below.
    - If the dataset returns [`Variables`][afnio.Variable] directly, this method
      batches them into a single Variable whose [`data`][afnio.Variable.data] is a
      list of the original [`data`][afnio.Variable.data] fields, and whose
      [`role`][afnio.Variable.role] and
      [`requires_grad`][afnio.Variable.requires_grad] are taken
      from the first [`Variable`][afnio.Variable].
    - Otherwise, returns the batch as a `list`.
    """
    # Suppress notifications for individual Variables
    with suppress_variable_notifications():
        batch = []
        for _ in range(self.batch_size):
            try:
                index = self._next_index()
                batch.append(self.dataset[index])
            except StopIteration:
                if not batch or self.drop_last:
                    raise
                break

    # If dataset returns a dictionary, we aggregate each key across the batch
    if (
        batch
        and isinstance(batch[0], dict)  # noqa: W503
        and all(isinstance(item, dict) for item in batch)  # noqa: W503
    ):
        keys = batch[0].keys()
        collated = {}
        for key in keys:
            values = [item[key] for item in batch]
            collated[key] = values
        return collated
    # If dataset returns a tuple, we recursively collate each position in the tuple
    if (
        batch
        and isinstance(batch[0], tuple)  # noqa: W503
        and all(isinstance(item, tuple) for item in batch)  # noqa: W503
    ):
        return collate_tuple(batch)

    # If dataset returns Variables, we batch them into a single Variable
    if (
        batch
        and isinstance(batch[0], Variable)  # noqa: W503
        and all(isinstance(item, Variable) for item in batch)  # noqa: W503
    ):
        first = batch[0]
        return Variable(
            data=[item.data for item in batch],
            role=first.role,
            requires_grad=first.requires_grad,
        )

    return batch

afnio.utils.data.dataloader.collate_tuple(items)

Recursively collates a batch of tuples, preserving nested structure.

This function should only be called when processing batches where each element is a tuple (i.e., when the dataset's __getitem__ returns tuples).

The function first transposes the batch, so that each position in the tuple is grouped together. For each group:

  • If all elements are Variables, returns a single Variable whose data is a list of the original data fields, and whose role and requires_grad are taken from the first Variable.
  • If all elements are tuples, recursively collates them to preserve nested structure.
  • If some elements are tuples and some are not, recursively collates the tuples and leaves other elements as is, preserving their position.
  • Otherwise, returns a list of the grouped items.

This enables flexible batching for datasets that return tuples of Variables, nested tuples, or mixed structures.

Parameters:

  items (Iterable[tuple], required):
      An iterable of tuples, where each tuple is a sample from the dataset.

Returns:

  tuple:
      A single tuple representing the collated batch, with structure determined by the rules above.

Source code in afnio/utils/data/dataloader.py
def collate_tuple(items: Iterable[tuple]) -> tuple:
    """Recursively collates a batch of tuples, preserving nested structure.

    This function should only be called when processing batches where each element
    is a tuple (i.e., when the dataset's `__getitem__` returns tuples).

    The function first transposes the batch, so that each position in the tuple is
    grouped together. For each group:

    - If all elements are [`Variable`][afnio.Variable]s, returns a single `Variable`
        whose [`data`][afnio.Variable.data] is a list of the original
        [`data`][afnio.Variable.data] fields, and whose [`role`][afnio.Variable.role]
        and [`requires_grad`][afnio.Variable.requires_grad] are taken
        from the first [`Variable`][afnio.Variable].
    - If all elements are tuples, recursively collates them to preserve nested
        structure.
    - If some elements are tuples and some are not, recursively collates the tuples and
        leaves other elements as is, preserving their position.
    - Otherwise, returns a `list` of the grouped items.

    This enables flexible batching for datasets that return tuples of
    [`Variable`][afnio.Variable]s, nested tuples, or mixed structures.

    Args:
        items: An iterable of tuples, where each tuple is a sample from the dataset.

    Returns:
        A single tuple representing the collated batch, with structure determined \
        by the rules above.
    """
    transposed = list(zip(*items))
    collated = []
    for group in transposed:
        # If all are Variables, batch as Variable
        if all(isinstance(x, Variable) for x in group):
            first = group[0]
            collated.append(
                Variable(
                    data=[x.data for x in group],
                    role=first.role,
                    requires_grad=first.requires_grad,
                )
            )
        # If all are tuples, recurse
        elif all(isinstance(x, tuple) for x in group):
            collated.append(collate_tuple(group))
        # If some are tuples and some are not, handle each element
        else:
            collated.append(
                [collate_tuple([x]) if isinstance(x, tuple) else x for x in group]
            )
    return tuple(collated)
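The transpose-and-recurse behavior can be sketched without Variables, using only the tuple and fallback rules. This is a simplified standalone mirror of the function above (the Variable and mixed-element branches are omitted), not the library code itself:

```python
def collate_tuple(items):
    """Transpose a batch of tuples, recursing into nested tuples."""
    collated = []
    for group in zip(*items):
        if all(isinstance(x, tuple) for x in group):
            # Nested tuples: recurse to preserve structure
            collated.append(collate_tuple(group))
        else:
            # Fallback: group the items at this position into a list
            collated.append(list(group))
    return tuple(collated)

batch = [((1, 2), 'a'), ((3, 4), 'b')]
print(collate_tuple(batch))  # (([1, 3], [2, 4]), ['a', 'b'])
```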