Skip to content

afnio.utils.data.sampler

afnio.utils.data.sampler.Sampler

Bases: Generic[T_co]

Base class for all Samplers.

Every Sampler subclass has to provide an __iter__() method, providing a way to iterate over indices or lists of indices (batches) of dataset elements, and may provide a __len__() method that returns the length of the returned iterators.

Examples:

>>> class AccedingSequenceLengthSampler(Sampler[int]):
>>>     def __init__(self, data: List[str]) -> None:
>>>         self.data = data
>>>
>>>     def __len__(self) -> int:
>>>         return len(self.data)
>>>
>>>     def __iter__(self) -> Iterator[int]:
>>>         sizes = [len(x) for x in self.data]
>>>         yield from sorted(range(len(sizes)), key=sizes.__getitem__)
>>>
>>> class AccedingSequenceLengthBatchSampler(Sampler[List[int]]):
>>>     def __init__(self, data: List[str], batch_size: int) -> None:
>>>         self.data = data
>>>         self.batch_size = batch_size
>>>
>>>     def __len__(self) -> int:
>>>         return (len(self.data) + self.batch_size - 1) // self.batch_size
>>>
>>>     def __iter__(self) -> Iterator[List[int]]:
>>>         sizes = [len(x) for x in self.data]
>>>         sorted_indices = sorted(range(len(sizes)), key=sizes.__getitem__)
>>>         for start in range(0, len(sorted_indices), self.batch_size):
>>>             yield sorted_indices[start : start + self.batch_size]
Source code in afnio/utils/data/sampler.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class Sampler(Generic[T_co]):
    """Abstract base from which every sampler derives.

    A concrete sampler must override `__iter__()` so that iterating over it
    produces dataset indices, or lists of indices (batches). It may additionally
    define `__len__()` to report how many items one such iteration produces.

    Both `__init__()` and `__iter__()` of this base class raise
    `NotImplementedError`, so a subclass has to define its own constructor
    instead of calling `super().__init__()`.

    Examples:
        >>> class AscendingSequenceLengthSampler(Sampler[int]):
        >>>     def __init__(self, data: List[str]) -> None:
        >>>         self.data = data
        >>>
        >>>     def __len__(self) -> int:
        >>>         return len(self.data)
        >>>
        >>>     def __iter__(self) -> Iterator[int]:
        >>>         sizes = [len(x) for x in self.data]
        >>>         yield from sorted(range(len(sizes)), key=sizes.__getitem__)
        >>>
        >>> class AscendingSequenceLengthBatchSampler(Sampler[List[int]]):
        >>>     def __init__(self, data: List[str], batch_size: int) -> None:
        >>>         self.data = data
        >>>         self.batch_size = batch_size
        >>>
        >>>     def __len__(self) -> int:
        >>>         return (len(self.data) + self.batch_size - 1) // self.batch_size
        >>>
        >>>     def __iter__(self) -> Iterator[List[int]]:
        >>>         sizes = [len(x) for x in self.data]
        >>>         sorted_indices = sorted(range(len(sizes)), key=sizes.__getitem__)
        >>>         for start in range(0, len(sorted_indices), self.batch_size):
        >>>             yield sorted_indices[start : start + self.batch_size]
    """

    # Iteration is the one capability every sampler must supply.
    def __iter__(self) -> Iterator[T_co]:
        raise NotImplementedError

    # The base class itself cannot be instantiated.
    def __init__(self) -> None:
        raise NotImplementedError

afnio.utils.data.sampler.SequentialSampler

Bases: Sampler[int]

Samples elements sequentially, always in the same order.

Source code in afnio/utils/data/sampler.py
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
class SequentialSampler(Sampler[int]):
    """Yields the indices `0, 1, ..., len(data_source) - 1` in increasing order."""

    data_source: Sized

    def __init__(self, data_source: Sized) -> None:
        """Creates a `SequentialSampler` bound to `data_source`.

        Args:
            data_source: Dataset to sample from. Only its length is consulted.
        """
        self.data_source = data_source

    def __len__(self) -> int:
        return len(self.data_source)

    def __iter__(self) -> Iterator[int]:
        # The length is read when iteration is requested, not at construction.
        size = len(self.data_source)
        return iter(range(size))

__init__(data_source)

Initializes a SequentialSampler.

Parameters:

Name Type Description Default
data_source Sized

Dataset to sample from.

required
Source code in afnio/utils/data/sampler.py
54
55
56
57
58
59
60
def __init__(self, data_source: Sized) -> None:
    """Creates a `SequentialSampler` bound to `data_source`.

    Args:
        data_source: Dataset to sample from. It is stored on the instance
            as `self.data_source`.
    """
    self.data_source = data_source

afnio.utils.data.sampler.RandomSampler

Bases: Sampler[int]

Samples elements randomly. If without replacement, then sample from a shuffled dataset.

If with replacement, then user can specify num_samples to draw.

Source code in afnio/utils/data/sampler.py
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
class RandomSampler(Sampler[int]):
    """Samples elements randomly.

    Without replacement, indices come from shuffled passes over the dataset
    (whole passes are repeated, followed by a partial one, when `num_samples`
    exceeds the dataset size). With replacement, indices are drawn independently
    and the user can specify `num_samples` to draw.

    Every call to `__iter__()` draws from a private random generator seeded
    with `seed`, so the process-wide `random` module state is left untouched.
    With a fixed `seed`, each iteration therefore yields the same sequence.
    """

    data_source: Sized
    replacement: bool

    def __init__(
        self,
        data_source: Sized,
        replacement: bool = False,
        num_samples: Optional[int] = None,
        seed: Optional[int] = None,
    ) -> None:
        """
        Initializes a `RandomSampler`.

        Args:
            data_source: Dataset to sample from.
            replacement: Samples are drawn on-demand with replacement if `True`.
            num_samples: Number of samples to draw. Defaults to
                `len(data_source)`.
            seed: A number to set the seed for the random draws.

        Raises:
            TypeError: If `replacement` is not a `bool`.
            ValueError: If the effective `num_samples` is not a positive integer
                (this includes an empty `data_source` with `num_samples=None`).
        """
        self.data_source = data_source
        self.replacement = replacement
        self._num_samples = num_samples
        self.seed = seed

        if not isinstance(self.replacement, bool):
            raise TypeError(
                f"replacement should be a boolean value, "
                f"but got replacement={self.replacement}"
            )

        if not isinstance(self.num_samples, int) or self.num_samples <= 0:
            raise ValueError(
                f"num_samples should be a positive integer value, "
                f"but got num_samples={self.num_samples}"
            )

    @property
    def num_samples(self) -> int:
        """Number of indices produced per iteration."""
        # dataset size might change at runtime
        if self._num_samples is None:
            return len(self.data_source)
        return self._num_samples

    def _is_valid_random_state(self, state) -> bool:
        """Returns `True` when `state` is a non-empty tuple."""
        return isinstance(state, tuple) and len(state) > 0

    def __iter__(self) -> Iterator[int]:
        """Yields `num_samples` indices in `[0, len(data_source))`."""
        n = len(self.data_source)
        # A dedicated generator produces the same draws as seeding the
        # module-level one, but it does not reset the global `random` state for
        # the rest of the program, and unrelated `random` calls made between
        # yielded chunks cannot perturb a seeded sequence.
        rng = random.Random(self.seed)

        if self.replacement:
            # Draw in chunks of 32 so indices are produced on demand.
            for _ in range(self.num_samples // 32):
                yield from rng.choices(range(n), k=32)
            yield from rng.choices(range(n), k=self.num_samples % 32)
        else:
            # Whole shuffled passes first, then a partial pass for the rest.
            for _ in range(self.num_samples // n):
                yield from rng.sample(range(n), n)
            yield from rng.sample(range(n), self.num_samples % n)

    def __len__(self) -> int:
        return self.num_samples

__init__(data_source, replacement=False, num_samples=None, seed=None)

Initializes a RandomSampler.

Parameters:

Name Type Description Default
data_source Sized

Dataset to sample from.

required
replacement bool

Samples are drawn on-demand with replacement if True.

False
num_samples int | None

Number of samples to draw. Defaults to len(data_source).

None
seed int | None

A number to set the seed for the random draws.

None
Source code in afnio/utils/data/sampler.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def __init__(
    self,
    data_source: Sized,
    replacement: bool = False,
    num_samples: Optional[int] = None,
    seed: Optional[int] = None,
) -> None:
    """
    Creates a `RandomSampler`.

    Args:
        data_source: Dataset to sample from.
        replacement: If `True`, samples are drawn on-demand with replacement.
        num_samples: Number of samples to draw. Stored as `_num_samples`.
        seed: A number to set the seed for the random draws.

    Raises:
        TypeError: If `replacement` is not a `bool`.
        ValueError: If `self.num_samples` is not a positive integer.
    """
    # Store every argument first: the checks below read them back through
    # the instance (`self.replacement`, `self.num_samples`).
    self.seed = seed
    self._num_samples = num_samples
    self.replacement = replacement
    self.data_source = data_source

    replacement_is_bool = isinstance(self.replacement, bool)
    if not replacement_is_bool:
        raise TypeError(
            f"replacement should be a boolean value, "
            f"but got replacement={self.replacement}"
        )

    count_is_positive_int = isinstance(self.num_samples, int) and self.num_samples > 0
    if not count_is_positive_int:
        raise ValueError(
            f"num_samples should be a positive integer value, "
            f"but got num_samples={self.num_samples}"
        )

afnio.utils.data.sampler.WeightedRandomSampler

Bases: Sampler[int]

Samples elements from [0,..,len(weights)-1] with given probabilities (weights).

Examples:

>>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True))
[4, 4, 1, 4, 5]
>>> list(WeightedRandomSampler([0.9, 0.4, 0.05, 0.2, 0.3, 0.1], 5, replacement=False))
[0, 1, 4, 3, 2]
Source code in afnio/utils/data/sampler.py
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
class WeightedRandomSampler(Sampler[int]):
    """Samples elements from `[0,..,len(weights)-1]` with given probabilities (weights).

    The weights are honoured both with and without replacement. Every call to
    `__iter__()` draws from a private random generator seeded with `seed`, so
    the process-wide `random` module state is never modified.

    Examples:
        >>> list(WeightedRandomSampler([0.1, 0.9, 0.4, 0.7, 3.0, 0.6], 5, replacement=True))
        [4, 4, 1, 4, 5]
        >>> list(WeightedRandomSampler([0.9, 0.4, 0.05, 0.2, 0.3, 0.1], 5, replacement=False))
        [0, 1, 4, 3, 2]
    """  # noqa: E501

    weights: Sequence[float]
    num_samples: int
    replacement: bool

    def __init__(
        self,
        weights: Sequence[float],
        num_samples: int,
        replacement: bool = True,
        seed: Optional[int] = None,
    ) -> None:
        """Initializes a `WeightedRandomSampler`.

        Args:
            weights: A sequence of weights, not necessarily summing to one.
            num_samples: Number of samples to draw.
            replacement: If `True`, samples are drawn with replacement.
                If not, they are drawn without replacement, which means that
                once a sample index has been drawn it cannot be drawn again.
            seed: A number to set the seed for the random draws.

        Raises:
            ValueError: If `num_samples` is not a positive (non-bool) integer,
                if `replacement` is not a `bool`, if `weights` is empty or
                contains non-numeric entries, or if `num_samples` exceeds
                `len(weights)` while `replacement` is `False`.
        """
        if (
            not isinstance(num_samples, int)
            or isinstance(num_samples, bool)
            or num_samples <= 0
        ):
            raise ValueError(
                f"num_samples should be a positive integer value, "
                f"but got num_samples={num_samples}"
            )
        if not isinstance(replacement, bool):
            raise ValueError(
                f"replacement should be a boolean value, "
                f"but got replacement={replacement}"
            )

        if len(weights) == 0 or not all(isinstance(w, (float, int)) for w in weights):
            raise ValueError("Weights must be a non-empty sequence of numbers.")

        if not replacement and num_samples > len(weights):
            raise ValueError(
                f"num_samples ({num_samples}) cannot be greater than "
                f"the population size ({len(weights)}) when replacement is False."
            )

        self.weights = weights
        self.num_samples = num_samples
        self.replacement = replacement
        self.seed = seed

    def __iter__(self) -> Iterator[int]:
        """Yields `num_samples` indices drawn according to `weights`.

        This is a generator, so the checks below run when the first index is
        requested rather than when `iter()` is called.

        Raises:
            ValueError: If any weight is negative, if the weights do not sum to
                a positive value, or if `replacement` is `False` and
                `num_samples` exceeds the number of non-zero weights.
        """
        # A dedicated generator produces the same draws as seeding the
        # module-level one, without resetting the global `random` state.
        rng = random.Random(self.seed)

        weights = list(self.weights)
        if any(w < 0 for w in weights):
            raise ValueError("Weights must be non-negative.")
        total_weight = sum(weights)
        if total_weight <= 0:
            # Previously surfaced as a bare ZeroDivisionError.
            raise ValueError("At least one weight must be greater than zero.")

        if self.replacement:
            probabilities = [w / total_weight for w in weights]
            yield from rng.choices(
                population=range(len(weights)),
                weights=probabilities,
                k=self.num_samples,
            )
            return

        # Without replacement: draw one index at a time, proportionally to the
        # weights still in play, and retire it so it cannot be drawn again.
        # Zero-weight indices are never eligible.
        candidates = [index for index, w in enumerate(weights) if w > 0]
        if self.num_samples > len(candidates):
            raise ValueError(
                f"num_samples ({self.num_samples}) cannot be greater than the "
                f"number of non-zero weights ({len(candidates)}) "
                f"when replacement is False."
            )
        remaining = [weights[index] for index in candidates]
        for _ in range(self.num_samples):
            slot = rng.choices(range(len(candidates)), weights=remaining, k=1)[0]
            remaining.pop(slot)
            yield candidates.pop(slot)

    def __len__(self) -> int:
        return self.num_samples

__init__(weights, num_samples, replacement=True, seed=None)

Initializes a WeightedRandomSampler.

Parameters:

Name Type Description Default
weights Sequence[float]

A sequence of weights, not necessarily summing to one.

required
num_samples int

Number of samples to draw

required
replacement bool

If True, samples are drawn with replacement. If not, they are drawn without replacement, which means that once a sample index has been drawn it cannot be drawn again.

True
seed int | None

A number to set the seed for the random draws.

None
Source code in afnio/utils/data/sampler.py
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
def __init__(
    self,
    weights: Sequence[float],
    num_samples: int,
    replacement: bool = True,
    seed: Optional[int] = None,
) -> None:
    """Creates a `WeightedRandomSampler`.

    Arguments are validated in the order listed under "Raises"; nothing is
    stored on the instance unless every check passes.

    Args:
        weights: A sequence of weights, not necessarily summing to one.
        num_samples: Number of samples to draw.
        replacement: If `True`, samples are drawn with replacement.
            Otherwise they are drawn without replacement.
        seed: A number to set the seed for the random draws.

    Raises:
        ValueError: If `num_samples` is not a positive (non-bool) integer,
            if `replacement` is not a `bool`, if `weights` is empty or holds
            a non-numeric entry, or if `num_samples` exceeds `len(weights)`
            while `replacement` is `False`.
    """
    # `bool` is a subclass of `int`, so it has to be excluded explicitly.
    count_is_valid = (
        isinstance(num_samples, int)
        and not isinstance(num_samples, bool)
        and num_samples > 0
    )
    if not count_is_valid:
        raise ValueError(
            f"num_samples should be a positive integer value, "
            f"but got num_samples={num_samples}"
        )

    if not isinstance(replacement, bool):
        raise ValueError(
            f"replacement should be a boolean value, "
            f"but got replacement={replacement}"
        )

    weights_are_numeric = len(weights) > 0 and all(
        isinstance(w, (float, int)) for w in weights
    )
    if not weights_are_numeric:
        raise ValueError("Weights must be a non-empty sequence of numbers.")

    if not replacement:
        if num_samples > len(weights):
            raise ValueError(
                f"num_samples ({num_samples}) cannot be greater than "
                f"the population size ({len(weights)}) when replacement is False."
            )

    self.seed = seed
    self.replacement = replacement
    self.num_samples = num_samples
    self.weights = weights