Skip to content

afnio.utils.datasets

afnio.utils.datasets.FacilitySupport

Bases: Dataset

The Meta Facility Support Analyzer dataset consists of 200 real-world emails or messages sent in enterprise settings related to facility maintenance or support requests. Each example is annotated with:

  • urgency (low, medium, high)
  • sentiment (negative, neutral, positive)
  • relevant service request categories (e.g., cleaning, IT support, maintenance)

The dataset is split into train, validation, and test sets with a 33%/33%/34% ratio. The split is deterministic, ensuring reproducibility across different runs.

References: Meta Facility Support Analyzer Dataset — https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer

Source code in afnio/utils/datasets/facility_support.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
class FacilitySupport(Dataset):
    """The Meta Facility Support Analyzer dataset consists of 200 real-world emails or
    messages sent in enterprise settings related to facility maintenance or support
    requests. Each example is annotated with:

      - urgency (low, medium, high)
      - sentiment (negative, neutral, positive)
      - relevant service request categories (e.g., cleaning, IT support, maintenance)

    The dataset is split into train, validation, and test sets with a 33%/33%/34%
    ratio. The split is deterministic, ensuring reproducibility across different runs.

    **References:**

    - *Meta Facility Support Analyzer Dataset*
        [https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer](https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer)
    """

    # Base URLs tried in order until one download succeeds.
    mirrors = [
        "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/"  # noqa: E501
    ]

    # (filename, expected md5) pairs for the raw files.
    resources = [
        ("dataset.json", "530dc66b1b07c9b15b19f08891e9bfa0"),
    ]

    _repr_indent = 4

    def __init__(self, split: str, root: Union[str, Path] = None) -> None:
        """
        Initializes the `FacilitySupport` dataset.

        Args:
            split: The dataset split to load. Must be either `"train"`, `"val"`,
                or `"test"`.
            root: The root directory where JSON files are stored.

        Raises:
            ValueError: If `split` is not one of `"train"`, `"val"`, `"test"`.
        """
        if split not in {"train", "val", "test"}:
            raise ValueError(
                f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
                f"but got split={split}"
            )

        if isinstance(root, str):
            root = os.path.expanduser(root)

        self.split = split
        self.root = root

        self._download()

        # Load dataset from JSON
        file_path = os.path.join(self.raw_folder, self.resources[0][0])
        with open(file_path, "r", encoding="utf-8") as f:
            dataset: List[Dict] = json.load(f)

        # Shuffle with a fixed seed so the split is deterministic across runs.
        random.Random(0).shuffle(dataset)

        # 33% train / 33% val / remainder (~34%) test.
        n = len(dataset)
        n_train = int(n * 0.33)
        n_val = int(n * 0.33)

        if split == "train":
            self.data = dataset[:n_train]
        elif split == "val":
            self.data = dataset[n_train : n_train + n_val]  # noqa: E203
        else:  # test
            self.data = dataset[n_train + n_val :]  # noqa: E203

    def __getitem__(
        self, index: int
    ) -> tuple[Variable, tuple[Variable, Variable, Variable]]:
        """Fetches a data sample `(message, (urgency, sentiment, categories))`
        for a given `index`.

        Args:
            index: The index of the data sample to fetch.

        Returns:
            A tuple containing input `message` and a tuple of output variables \
            `(urgency, sentiment, categories)` (see below note for more details).

        Raises:
            IndexError: If `index` is outside `[0, len(self))`.

        Note:
            The return value is a tuple of the form
            `(message, (urgency, sentiment, categories))` where:

            - `message` is a [`Variable`][afnio.Variable] containing the input
                email or message text.
            - `urgency` is a [`Variable`][afnio.Variable] containing the urgency
                label (low, medium, high).
            - `sentiment` is a [`Variable`][afnio.Variable] containing the
                sentiment label (negative, neutral, positive).
            - `categories` is a [`Variable`][afnio.Variable] containing a JSON
                string of the relevant service request categories (e.g., cleaning,
                IT support, maintenance).
        """
        if not (0 <= index < len(self.data)):
            raise IndexError("Index out of range.")

        item = self.data[index]

        # The annotation payload is stored as a JSON string inside the record.
        answer: dict = json.loads(item["answer"])
        urgency = answer.get("urgency", None)
        sentiment = answer.get("sentiment", None)
        categories = answer.get("categories", None)

        message = Variable(
            data=item["fields"]["input"],
            role="input email or message",
        )
        urgency = Variable(data=urgency, role="output urgency")
        sentiment = Variable(data=sentiment, role="output sentiment")
        # Categories stay JSON-encoded (a list/dict in the raw data).
        categories = Variable(data=json.dumps(categories), role="output categories")
        return message, (urgency, sentiment, categories)

    def __len__(self) -> int:
        """Returns the number of samples in the selected split."""
        return len(self.data)

    def extra_repr(self) -> str:
        """Returns a human-readable description of the selected split."""
        split_map = {"train": "Train", "val": "Validation", "test": "Test"}

        try:
            split = split_map[self.split]
        except KeyError:
            raise ValueError(
                f"Invalid split value: {self.split}. "
                f"Expected one of ['train', 'val', 'test']."
            )

        return f"Split: {split}"

    def __repr__(self) -> str:
        head = "Dataset " + self.__class__.__name__
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    @property
    def raw_folder(self) -> str:
        """Directory where the raw downloaded files are stored."""
        return os.path.join(self.root, self.__class__.__name__, "raw")

    def _check_exists(self) -> bool:
        """Returns True if every raw resource file is already present.

        Fixed: the original checked the extension-stripped basename
        ("dataset" instead of "dataset.json"), which never exists because
        `download` saves the file under its full name — so the dataset was
        re-downloaded on every instantiation.
        """
        return all(
            check_integrity(os.path.join(self.raw_folder, filename))
            for filename, _ in self.resources
        )

    def _download(self) -> None:
        """Download the Facility Support data if it doesn't exist already."""

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # download files
        for filename, md5 in self.resources:
            for mirror in self.mirrors:
                # Fixed: the URL must append the resource filename to the mirror.
                url = f"{mirror}{filename}"
                try:
                    download(
                        url, download_root=self.raw_folder, filename=filename, md5=md5
                    )
                except URLError as error:
                    print(f"Failed to download (trying next):\n{error}")
                    continue
                finally:
                    print()
                break
            else:
                raise RuntimeError(f"Error downloading {filename}")

__init__(split, root=None)

Initializes the FacilitySupport dataset.

Parameters:

Name Type Description Default
split str

The dataset split to load. Must be either "train", "val", or "test".

required
root str | Path

The root directory where JSON files are stored.

None
Source code in afnio/utils/datasets/facility_support.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def __init__(self, split: str, root: Union[str, Path] = None) -> None:
    """
    Builds the `FacilitySupport` dataset for one split.

    Args:
        split: Which portion of the data to expose; one of `"train"`,
            `"val"`, or `"test"`.
        root: Directory under which the raw JSON files live.
    """
    if split not in {"train", "val", "test"}:
        raise ValueError(
            f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
            f"but got split={split}"
        )

    # Expand "~" in string paths; Path objects are kept as-is.
    self.split = split
    self.root = os.path.expanduser(root) if isinstance(root, str) else root

    self._download()

    # Read every record from the raw JSON file.
    source_path = os.path.join(self.raw_folder, self.resources[0][0])
    with open(source_path, "r", encoding="utf-8") as handle:
        records: List[Dict] = json.load(handle)

    # Fixed-seed shuffle keeps the split reproducible across runs.
    random.Random(0).shuffle(records)

    total = len(records)
    train_size = int(total * 0.33)
    val_size = int(total * 0.33)

    # Map each split name to its [start, stop) window over the shuffled list.
    windows = {
        "train": (0, train_size),
        "val": (train_size, train_size + val_size),
        "test": (train_size + val_size, total),
    }
    start, stop = windows[split]
    self.data = records[start:stop]

__getitem__(index)

Fetches a data sample (message, (urgency, sentiment, categories)) for a given index.

Parameters:

Name Type Description Default
index int

The index of the data sample to fetch.

required

Returns:

Type Description
tuple[Variable, tuple[Variable, Variable, Variable]]

A tuple containing input message and a tuple of output variables (urgency, sentiment, categories) (see below note for more details).

Note

The return value is a tuple of the form (message, (urgency, sentiment, categories)) where:

  • message is a Variable containing the input email or message text.
  • urgency is a Variable containing the urgency label (low, medium, high).
  • sentiment is a Variable containing the sentiment label (negative, neutral, positive).
  • categories is a Variable containing a JSON string of the relevant service request categories (e.g., cleaning, IT support, maintenance).
Source code in afnio/utils/datasets/facility_support.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def __getitem__(
    self, index: int
) -> tuple[Variable, tuple[Variable, Variable, Variable]]:
    """Returns the sample `(message, (urgency, sentiment, categories))` at `index`.

    Args:
        index: Position of the sample within the split.

    Returns:
        A pair of the input `message` and the output triple \
        `(urgency, sentiment, categories)`.

    Note:
        Each element of the result is a [`Variable`][afnio.Variable]:

        - `message` holds the raw email/message text.
        - `urgency` holds the urgency label (low, medium, high).
        - `sentiment` holds the sentiment label (negative, neutral, positive).
        - `categories` holds a JSON-encoded string of the relevant service
            request categories (e.g., cleaning, IT support, maintenance).
    """
    if index < 0 or index >= len(self.data):
        raise IndexError("Index out of range.")

    record = self.data[index]

    # The annotations live in a JSON string under "answer".
    labels: dict = json.loads(record["answer"])

    message = Variable(
        data=record["fields"]["input"],
        role="input email or message",
    )
    urgency_var = Variable(data=labels.get("urgency"), role="output urgency")
    sentiment_var = Variable(data=labels.get("sentiment"), role="output sentiment")
    categories_var = Variable(
        data=json.dumps(labels.get("categories")), role="output categories"
    )
    return message, (urgency_var, sentiment_var, categories_var)

afnio.utils.datasets.TREC

Bases: Dataset

The Text REtrieval Conference (TREC) Question Classification dataset contains 5452 labeled questions in the training set (before removing duplicates) and 5382 unique labeled questions (after removing duplicates), along with another 500 questions for the test set.

The dataset has 6 coarse class labels and 50 fine class labels. Average length of each sentence is 10, vocabulary size of 8700.

Data are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set. These questions were manually labeled.

TREC provides a stratified train set and validation set, ensuring that both splits maintain the same class distribution proportions as in the original dataset.

References: TREC Question Classification Dataset — https://cogcomp.seas.upenn.edu/Data/QA/QC/

Source code in afnio/utils/datasets/trec.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
class TREC(Dataset):
    """The Text REtrieval Conference (TREC) Question Classification dataset contains
    5452 labeled questions in the training set (before removing duplicates) and 5382
    unique labeled questions (after removing duplicates), along with another 500
    questions for the test set.

    The dataset has 6 coarse class labels and 50 fine class labels. Average length of
    each sentence is 10, vocabulary size of 8700.

    Data are collected from four sources: 4,500 English questions published by USC
    (Hovy et al., 2001), about 500 manually constructed questions for a few rare
    classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which
    serves as the test set. These questions were manually labeled.

    `TREC` provides a stratified train set and validation set, ensuring that both
    splits maintain the same class distribution proportions as in the original dataset.

    **References:**

    - *TREC Question Classification Dataset*
        [https://cogcomp.seas.upenn.edu/Data/QA/QC/](https://cogcomp.seas.upenn.edu/Data/QA/QC/)
    """

    # Base URLs tried in order until one download succeeds.
    mirrors = ["https://cogcomp.seas.upenn.edu/Data/QA/QC/"]

    # (filename, expected md5) pairs: train/val source file and test file.
    resources = [
        ("train_5500.label", "073462e3fcefaae31e00edb1f18d2d02"),
        ("TREC_10.label", "323a3554401d86e650717e2d2f942589"),
    ]

    _repr_indent = 4

    def __init__(
        self,
        task: str = None,
        split: str = None,
        validation_split: Optional[float] = 0.0,
        root: Union[str, Path] = None,
    ) -> None:
        """Initializes the `TREC` dataset.

        Args:
            task: Defines the classes to classify between `["coarse", "fine"]`.
            split: The dataset split in `["train", "val", "test"]`.
            validation_split: Float between `0` and `1`. Fraction of the training data
                to be used as validation data.
            root: Root directory of dataset where `TREC/raw/train_5500.label` and
                `TREC/raw/TREC_10.label` exist.

        Raises:
            ValueError: If `task`, `split`, or `validation_split` is invalid.
        """
        if task not in {"coarse", "fine"}:
            raise ValueError(
                f"TREC Dataset: expected classification task in ['coarse', 'fine'], "
                f"but got task={task}"
            )

        if split not in {"train", "val", "test"}:
            raise ValueError(
                f"TREC Dataset: expected split in ['train', 'val', 'test'], "
                f"but got split={split}"
            )

        if validation_split < 0.0 or validation_split > 1.0:
            raise ValueError(
                f"TREC Dataset: expected validation_split in [0.0, 1.0], "
                f"but got validation_split={validation_split}"
            )

        if isinstance(root, str):
            root = os.path.expanduser(root)

        self.task = task
        self.split = split
        self.root = root

        self._download()

        # Train and val come from the same stratified split of the training file;
        # test comes straight from the TREC 10 file.
        if split == "train":
            (self.data, self.targets), (_, _) = self._load_train_and_val_data(
                task=self.task, validation_split=validation_split
            )
        elif split == "val":
            (_, _), (self.data, self.targets) = self._load_train_and_val_data(
                task=self.task, validation_split=validation_split
            )
        elif split == "test":
            self.data, self.targets = self._load_test_data()
        else:
            self.data, self.targets = None, None

    def __getitem__(self, index) -> Tuple[str, Tuple[str, str]]:
        """Fetches a data sample `(question, (fine_label, coarse_label))`
        for a given `index`.

        Args:
            index: The index of the data sample to fetch.

        Returns:
            A tuple containing the input `question` and a tuple of output labels \
            `(fine_label, coarse_label)` (see below note for more details).

        Note:
            The return value is a tuple of the form
            `(question, (fine_label, coarse_label))` where:

            - `question` is a string containing the text of the question.
            - `fine_label` is a string containing the fine-grained class label
                for the question (e.g., `"DESC:manner"`, `"ABBR:exp"`, etc.).
            - `coarse_label` is a string containing the coarse-grained class label
                for the question (e.g., `"DESC"`, `"ABBR"`, etc.).
        """
        return self.data[index], self.targets[index]

    def __len__(self):
        """Returns the number of samples in the selected split."""
        return len(self.data)

    def extra_repr(self) -> str:
        """Returns a human-readable description of the selected split and task."""
        split_map = {"train": "Train", "val": "Validation", "test": "Test"}
        task_map = {"coarse": "Classify Coarse Labels", "fine": "Classify Fine Labels"}

        try:
            split = split_map[self.split]
        except KeyError:
            raise ValueError(
                f"Invalid split value: {self.split}. "
                f"Expected one of ['train', 'val', 'test']."
            )
        try:
            task = task_map[self.task]
        except KeyError:
            raise ValueError(
                f"Invalid task value: {self.task}. Expected one of ['coarse', 'fine']."
            )

        return f"Split: {split}\nTask: {task}"

    def __repr__(self) -> str:
        head = "Dataset " + self.__class__.__name__
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    @property
    def raw_folder(self) -> str:
        """Directory where the raw downloaded files are stored."""
        return os.path.join(self.root, self.__class__.__name__, "raw")

    def _check_exists(self) -> bool:
        """Returns True if every raw resource file is already present.

        Fixed: the original checked the extension-stripped basename
        ("train_5500" instead of "train_5500.label"), which never exists
        because `download` saves the file under its full name — so the data
        was re-downloaded on every instantiation.
        """
        return all(
            check_integrity(os.path.join(self.raw_folder, filename))
            for filename, _ in self.resources
        )

    def _download(self) -> None:
        """Download the TREC data if it doesn't exist already."""

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # download files
        for filename, md5 in self.resources:
            for mirror in self.mirrors:
                # Fixed: the URL must append the resource filename to the mirror.
                url = f"{mirror}{filename}"
                try:
                    download(
                        url, download_root=self.raw_folder, filename=filename, md5=md5
                    )
                except URLError as error:
                    print(f"Failed to download (trying next):\n{error}")
                    continue
                finally:
                    print()
                break
            else:
                raise RuntimeError(f"Error downloading {filename}")

    def _load_train_and_val_data(self, task: str = None, validation_split: float = 0.0):
        """Loads the training file, de-duplicates it, and stratifies it into
        train and validation sets grouped by the task's label.

        Returns:
            `((train_data, train_targets), (val_data, val_targets))`.
        """
        train_file_path = os.path.join(self.raw_folder, self.resources[0][0])

        data = []
        targets = []
        unique_samples = set()  # A set to track unique samples

        with open(train_file_path, "rb") as f:
            for row in f:
                # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
                fine_label, _, text = (
                    row.replace(b"\xf0", b" ").strip().decode().partition(" ")
                )
                coarse_label = fine_label.split(":")[0]
                sample = (text, (fine_label, coarse_label))

                # Only add unique samples
                if sample not in unique_samples:
                    unique_samples.add(sample)
                    data.append(text)
                    targets.append((fine_label, coarse_label))

        # Group data by either fine_label or coarse_label based on the task
        label_to_data = defaultdict(list)
        for text, (fine_label, coarse_label) in zip(data, targets):
            label = fine_label if task == "fine" else coarse_label
            label_to_data[label].append((text, fine_label, coarse_label))

        # Split the data based on validation_split
        train_data = []
        train_targets = []
        val_data = []
        val_targets = []

        # Seed the global RNG so the per-label shuffle below is deterministic.
        # NOTE(review): this mutates global `random` state; confirm callers accept it.
        random.seed(42)

        for label, samples in label_to_data.items():
            # NOTE(review): this guard can never fire for validation_split in
            # [0, 1] (len(samples) < len(samples) * validation_split is always
            # False there); the empty-split checks below are the real protection.
            if len(samples) < (len(samples) * validation_split):
                raise ValueError(
                    f"Not enough data for label '{label}' to respect the validation split."  # noqa: E501
                )

            random.shuffle(samples)
            split_idx = int(len(samples) * (1 - validation_split))

            if len(samples[:split_idx]) == 0:
                raise ValueError(f"Label {label} missing from the training set.")
            if len(samples[split_idx:]) == 0 and validation_split > 0.0:
                raise ValueError(f"Label {label} missing from the validation set.")

            # Add to training set
            for sample in samples[:split_idx]:
                train_data.append(sample[0])
                train_targets.append((sample[1], sample[2]))

            # Add to validation set
            for sample in samples[split_idx:]:
                val_data.append(sample[0])
                val_targets.append((sample[1], sample[2]))

        return (train_data, train_targets), (val_data, val_targets)

    def _load_test_data(self):
        """Loads the TREC 10 test file.

        Returns:
            `(data, targets)` where `targets` holds `(fine_label, coarse_label)`
            pairs.
        """
        test_file_path = os.path.join(self.raw_folder, self.resources[1][0])

        data = []
        targets = []

        with open(test_file_path, "rb") as f:
            for row in f:
                # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
                fine_label, _, text = (
                    row.replace(b"\xf0", b" ").strip().decode().partition(" ")
                )
                coarse_label = fine_label.split(":")[0]
                data.append(text)
                targets.append((fine_label, coarse_label))

        return data, targets

__init__(task=None, split=None, validation_split=0.0, root=None)

Initializes the TREC dataset.

Parameters:

Name Type Description Default
task str

Defines the classes to classify between ["coarse", "fine"].

None
split str

The dataset split in ["train", "val", "test"].

None
validation_split float | None

Float between 0 and 1. Fraction of the training data to be used as validation data.

0.0
root str | Path

Root directory of dataset where TREC/raw/train_5500.label and TREC/raw/TREC_10.label exist.

None
Source code in afnio/utils/datasets/trec.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def __init__(
    self,
    task: str = None,
    split: str = None,
    validation_split: Optional[float] = 0.0,
    root: Union[str, Path] = None,
) -> None:
    """Builds the `TREC` dataset for one task and split.

    Args:
        task: Classification granularity; one of `"coarse"` or `"fine"`.
        split: Which portion of the data to expose; one of `"train"`,
            `"val"`, or `"test"`.
        validation_split: Fraction (between `0` and `1`) of the training
            data held out for validation.
        root: Root directory containing `TREC/raw/train_5500.label` and
            `TREC/raw/TREC_10.label`.
    """
    if task not in {"coarse", "fine"}:
        raise ValueError(
            f"TREC Dataset: expected classification task in ['coarse', 'fine'], "
            f"but got task={task}"
        )

    if split not in {"train", "val", "test"}:
        raise ValueError(
            f"TREC Dataset: expected split in ['train', 'val', 'test'], "
            f"but got split={split}"
        )

    if validation_split < 0.0 or validation_split > 1.0:
        raise ValueError(
            f"TREC Dataset: expected validation_split in [0.0, 1.0], "
            f"but got validation_split={validation_split}"
        )

    # Expand "~" in string paths; Path objects are kept as-is.
    self.task = task
    self.split = split
    self.root = os.path.expanduser(root) if isinstance(root, str) else root

    self._download()

    # "test" reads the TREC 10 file directly; "train"/"val" share one
    # stratified partition of the training file.
    if split == "test":
        self.data, self.targets = self._load_test_data()
    elif split in ("train", "val"):
        train_part, val_part = self._load_train_and_val_data(
            task=self.task, validation_split=validation_split
        )
        self.data, self.targets = train_part if split == "train" else val_part
    else:
        self.data, self.targets = None, None

__getitem__(index)

Fetches a data sample (question, (fine_label, coarse_label)) for a given index.

Parameters:

Name Type Description Default
index

The index of the data sample to fetch.

required

Returns:

Type Description
tuple[str, tuple[str, str]]

A tuple containing the input question and a tuple of output labels (fine_label, coarse_label) (see below note for more details).

Note

The return value is a tuple of the form (question, (fine_label, coarse_label)) where:

  • question is a string containing the text of the question.
  • fine_label is a string containing the fine-grained class label for the question (e.g., "DESC:manner", "ABBR:exp", etc.).
  • coarse_label is a string containing the coarse-grained class label for the question (e.g., "DESC", "ABBR", etc.).
Source code in afnio/utils/datasets/trec.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def __getitem__(self, index) -> Tuple[str, Tuple[str, str]]:
    """Returns the sample `(question, (fine_label, coarse_label))` at `index`.

    Args:
        index: Position of the sample within the split.

    Returns:
        A pair of the input `question` string and its label tuple \
        `(fine_label, coarse_label)`.

    Note:
        - `question` is the question text.
        - `fine_label` is the fine-grained class label
            (e.g., `"DESC:manner"`, `"ABBR:exp"`, etc.).
        - `coarse_label` is the coarse-grained class label
            (e.g., `"DESC"`, `"ABBR"`, etc.).
    """
    question = self.data[index]
    labels = self.targets[index]
    return question, labels