Skip to content

afnio.utils.datasets

afnio.utils.datasets.FacilitySupport

Bases: Dataset

The Meta Facility Support Analyzer dataset consists of 200 real-world emails or messages sent in enterprise settings related to facility maintenance or support requests. Each example is annotated with:

  • urgency (low, medium, high)
  • sentiment (negative, neutral, positive)
  • relevant service request categories (e.g., cleaning, IT support, maintenance)

The dataset is split into train, validation, and test sets with a 33%/33%/34% ratio. The split is deterministic, ensuring reproducibility across different runs.

References: Meta Facility Support Analyzer Dataset — https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer

Source code in afnio/utils/datasets/facility_support.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
class FacilitySupport(Dataset):
    """The Meta Facility Support Analyzer dataset consists of 200 real-world emails or
    messages sent in enterprise settings related to facility maintenance or support
    requests. Each example is annotated with:

      - urgency (low, medium, high)
      - sentiment (negative, neutral, positive)
      - relevant service request categories (e.g., cleaning, IT support, maintenance)

    The dataset is split into train, validation, and test sets with a 33%/33%/34%
    ratio. The split is deterministic, ensuring reproducibility across different runs.

    **References:**

    - *Meta Facility Support Analyzer Dataset*
        [https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer](https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer)
    """

    # Base URLs tried in order until one download succeeds.
    mirrors = [
        "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/"  # noqa: E501
    ]

    # (filename, expected md5) pairs for the raw files.
    resources = [
        ("dataset.json", "530dc66b1b07c9b15b19f08891e9bfa0"),
    ]

    _repr_indent = 4

    def __init__(self, split: str, root: Union[str, Path] = None) -> None:
        """
        Initializes the `FacilitySupport` dataset.

        Args:
            split: The dataset split to load. Must be either `"train"`, `"val"`,
                or `"test"`.
            root: The root directory where JSON files are stored.

        Raises:
            ValueError: If `split` is not one of `"train"`, `"val"`, `"test"`.
        """
        if split not in {"train", "val", "test"}:
            raise ValueError(
                f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
                f"but got split={split}"
            )

        if isinstance(root, str):
            root = os.path.expanduser(root)

        self.split = split
        self.root = root

        self._download()

        # Load dataset from JSON
        file_path = os.path.join(self.raw_folder, self.resources[0][0])
        with open(file_path, "r", encoding="utf-8") as f:
            dataset: List[Dict] = json.load(f)

        # Shuffle with a fixed seed so the split is deterministic across runs.
        random.Random(0).shuffle(dataset)

        # 33% train / 33% val / remainder (~34%) test.
        n = len(dataset)
        n_train = int(n * 0.33)
        n_val = int(n * 0.33)

        if split == "train":
            self.data = dataset[:n_train]
        elif split == "val":
            self.data = dataset[n_train : n_train + n_val]  # noqa: E203
        else:  # test
            self.data = dataset[n_train + n_val :]  # noqa: E203

    def __getitem__(
        self, index: int
    ) -> tuple[Variable, tuple[Variable, Variable, Variable]]:
        """Fetches a data sample `(message, (urgency, sentiment, categories))`
        for a given `index`.

        Args:
            index: The index of the data sample to fetch.

        Returns:
            A tuple containing input `message` and a tuple of output variables \
            `(urgency, sentiment, categories)` (see below note for more details).

        Raises:
            IndexError: If `index` is outside `[0, len(self))`.

        Note:
            The return value is a tuple of the form
            `(message, (urgency, sentiment, categories))` where:

            - `message` is a [`Variable`][afnio.Variable] containing the input
                email or message text.
            - `urgency` is a [`Variable`][afnio.Variable] containing the urgency
                label (low, medium, high).
            - `sentiment` is a [`Variable`][afnio.Variable] containing the
                sentiment label (negative, neutral, positive).
            - `categories` is a [`Variable`][afnio.Variable] containing a JSON
                string of the relevant service request categories (e.g., cleaning,
                IT support, maintenance).
        """
        if not (0 <= index < len(self.data)):
            raise IndexError("Index out of range.")

        item = self.data[index]

        # The annotation payload is stored as a JSON string inside the record.
        answer: dict = json.loads(item["answer"])
        urgency = answer.get("urgency", None)
        sentiment = answer.get("sentiment", None)
        categories = answer.get("categories", None)

        message = Variable(
            data=item["fields"]["input"],
            role="input email or message",
        )
        urgency = Variable(data=urgency, role="output urgency")
        sentiment = Variable(data=sentiment, role="output sentiment")
        # Categories stay JSON-encoded (a list/dict in the raw data).
        categories = Variable(data=json.dumps(categories), role="output categories")
        return message, (urgency, sentiment, categories)

    def __len__(self) -> int:
        """Returns the number of samples in the selected split."""
        return len(self.data)

    def extra_repr(self) -> str:
        """Returns a human-readable description of the selected split."""
        split_map = {"train": "Train", "val": "Validation", "test": "Test"}

        try:
            split = split_map[self.split]
        except KeyError:
            raise ValueError(
                f"Invalid split value: {self.split}. "
                f"Expected one of ['train', 'val', 'test']."
            )

        return f"Split: {split}"

    def __repr__(self) -> str:
        head = "Dataset " + self.__class__.__name__
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    @property
    def raw_folder(self) -> str:
        """Directory where the raw downloaded files are stored."""
        return os.path.join(self.root, self.__class__.__name__, "raw")

    def _check_exists(self) -> bool:
        """Returns True if every raw resource file is already present.

        Fixed: the original checked the extension-stripped basename
        ("dataset" instead of "dataset.json"), which never exists because
        `download` saves the file under its full name — so the dataset was
        re-downloaded on every instantiation.
        """
        return all(
            check_integrity(os.path.join(self.raw_folder, filename))
            for filename, _ in self.resources
        )

    def _download(self) -> None:
        """Download the Facility Support data if it doesn't exist already."""

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # download files
        for filename, md5 in self.resources:
            for mirror in self.mirrors:
                # Fixed: the URL must append the resource filename to the mirror.
                url = f"{mirror}{filename}"
                try:
                    download(
                        url, download_root=self.raw_folder, filename=filename, md5=md5
                    )
                except URLError as error:
                    print(f"Failed to download (trying next):\n{error}")
                    continue
                finally:
                    print()
                break
            else:
                raise RuntimeError(f"Error downloading {filename}")

__init__(split, root=None)

Initializes the FacilitySupport dataset.

Parameters:

Name Type Description Default
split str

The dataset split to load. Must be either "train", "val", or "test".

required
root str | Path

The root directory where JSON files are stored.

None
Source code in afnio/utils/datasets/facility_support.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def __init__(self, split: str, root: Union[str, Path] = None) -> None:
    """
    Builds the `FacilitySupport` dataset for one split.

    Args:
        split: Which portion of the data to expose; one of `"train"`,
            `"val"`, or `"test"`.
        root: Directory under which the raw JSON files live.
    """
    if split not in {"train", "val", "test"}:
        raise ValueError(
            f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
            f"but got split={split}"
        )

    # Expand "~" in string paths; Path objects are kept as-is.
    self.split = split
    self.root = os.path.expanduser(root) if isinstance(root, str) else root

    self._download()

    # Read every record from the raw JSON file.
    source_path = os.path.join(self.raw_folder, self.resources[0][0])
    with open(source_path, "r", encoding="utf-8") as handle:
        records: List[Dict] = json.load(handle)

    # Fixed-seed shuffle keeps the split reproducible across runs.
    random.Random(0).shuffle(records)

    total = len(records)
    train_size = int(total * 0.33)
    val_size = int(total * 0.33)

    # Map each split name to its [start, stop) window over the shuffled list.
    windows = {
        "train": (0, train_size),
        "val": (train_size, train_size + val_size),
        "test": (train_size + val_size, total),
    }
    start, stop = windows[split]
    self.data = records[start:stop]

__getitem__(index)

Fetches a data sample (message, (urgency, sentiment, categories)) for a given index.

Parameters:

Name Type Description Default
index int

The index of the data sample to fetch.

required

Returns:

Type Description
tuple[Variable, tuple[Variable, Variable, Variable]]

A tuple containing input message and a tuple of output variables (urgency, sentiment, categories) (see below note for more details).

Note

The return value is a tuple of the form (message, (urgency, sentiment, categories)) where:

  • message is a Variable containing the input email or message text.
  • urgency is a Variable containing the urgency label (low, medium, high).
  • sentiment is a Variable containing the sentiment label (negative, neutral, positive).
  • categories is a Variable containing a JSON string of the relevant service request categories (e.g., cleaning, IT support, maintenance).
Source code in afnio/utils/datasets/facility_support.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def __getitem__(
    self, index: int
) -> tuple[Variable, tuple[Variable, Variable, Variable]]:
    """Returns the sample `(message, (urgency, sentiment, categories))` at `index`.

    Args:
        index: Position of the sample within the split.

    Returns:
        A pair of the input `message` and the output triple \
        `(urgency, sentiment, categories)`.

    Note:
        Each element of the result is a [`Variable`][afnio.Variable]:

        - `message` holds the raw email/message text.
        - `urgency` holds the urgency label (low, medium, high).
        - `sentiment` holds the sentiment label (negative, neutral, positive).
        - `categories` holds a JSON-encoded string of the relevant service
            request categories (e.g., cleaning, IT support, maintenance).
    """
    if index < 0 or index >= len(self.data):
        raise IndexError("Index out of range.")

    record = self.data[index]

    # The annotations live in a JSON string under "answer".
    labels: dict = json.loads(record["answer"])

    message = Variable(
        data=record["fields"]["input"],
        role="input email or message",
    )
    urgency_var = Variable(data=labels.get("urgency"), role="output urgency")
    sentiment_var = Variable(data=labels.get("sentiment"), role="output sentiment")
    categories_var = Variable(
        data=json.dumps(labels.get("categories")), role="output categories"
    )
    return message, (urgency_var, sentiment_var, categories_var)

afnio.utils.datasets.TREC

Bases: Dataset

The Text REtrieval Conference (TREC) Question Classification dataset contains 5452 labeled questions in the training set (before removing duplicates) and 5382 unique labeled questions (after removing duplicates), along with another 500 questions for the test set.

The dataset has 6 coarse class labels and 50 fine class labels. Average length of each sentence is 10, vocabulary size of 8700.

Data are collected from four sources: 4,500 English questions published by USC (Hovy et al., 2001), about 500 manually constructed questions for a few rare classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which serves as the test set. These questions were manually labeled.

TREC provides a stratified train set and validation set, ensuring that both splits maintain the same class distribution proportions as in the original dataset.

References: TREC Question Classification Dataset — https://cogcomp.seas.upenn.edu/Data/QA/QC/

Source code in afnio/utils/datasets/trec.py
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
class TREC(Dataset):
    """The Text REtrieval Conference (TREC) Question Classification dataset contains
    5452 labeled questions in the training set (before removing duplicates) and 5382
    unique labeled questions (after removing duplicates), along with another 500
    questions for the test set.

    The dataset has 6 coarse class labels and 50 fine class labels. Average length of
    each sentence is 10, vocabulary size of 8700.

    Data are collected from four sources: 4,500 English questions published by USC
    (Hovy et al., 2001), about 500 manually constructed questions for a few rare
    classes, 894 TREC 8 and TREC 9 questions, and also 500 questions from TREC 10 which
    serves as the test set. These questions were manually labeled.

    `TREC` provides a stratified train set and validation set, ensuring that both
    splits maintain the same class distribution proportions as in the original dataset.

    **References:**

    - *TREC Question Classification Dataset*
        [https://cogcomp.seas.upenn.edu/Data/QA/QC/](https://cogcomp.seas.upenn.edu/Data/QA/QC/)
    """

    # Base URLs tried in order until one download succeeds.
    mirrors = ["https://cogcomp.seas.upenn.edu/Data/QA/QC/"]

    # (filename, expected md5) pairs: train/val source file and test file.
    resources = [
        ("train_5500.label", "073462e3fcefaae31e00edb1f18d2d02"),
        ("TREC_10.label", "323a3554401d86e650717e2d2f942589"),
    ]

    _repr_indent = 4

    def __init__(
        self,
        task: str = None,
        split: str = None,
        validation_split: Optional[float] = 0.0,
        root: Union[str, Path] = None,
    ) -> None:
        """Initializes the `TREC` dataset.

        Args:
            task: Defines the classes to classify between `["coarse", "fine"]`.
            split: The dataset split in `["train", "val", "test"]`.
            validation_split: Float between `0` and `1`. Fraction of the training data
                to be used as validation data.
            root: Root directory of dataset where `TREC/raw/train_5500.label` and
                `TREC/raw/TREC_10.label` exist.

        Raises:
            ValueError: If `task`, `split`, or `validation_split` is invalid.
        """
        if task not in {"coarse", "fine"}:
            raise ValueError(
                f"TREC Dataset: expected classification task in ['coarse', 'fine'], "
                f"but got task={task}"
            )

        if split not in {"train", "val", "test"}:
            raise ValueError(
                f"TREC Dataset: expected split in ['train', 'val', 'test'], "
                f"but got split={split}"
            )

        if validation_split < 0.0 or validation_split > 1.0:
            raise ValueError(
                f"TREC Dataset: expected validation_split in [0.0, 1.0], "
                f"but got validation_split={validation_split}"
            )

        if isinstance(root, str):
            root = os.path.expanduser(root)

        self.task = task
        self.split = split
        self.root = root

        self._download()

        # Train and val come from the same stratified split of the training file;
        # test comes straight from the TREC 10 file.
        if split == "train":
            (self.data, self.targets), (_, _) = self._load_train_and_val_data(
                task=self.task, validation_split=validation_split
            )
        elif split == "val":
            (_, _), (self.data, self.targets) = self._load_train_and_val_data(
                task=self.task, validation_split=validation_split
            )
        elif split == "test":
            self.data, self.targets = self._load_test_data()
        else:
            self.data, self.targets = None, None

    def __getitem__(self, index) -> Tuple[str, Tuple[str, str]]:
        """Fetches a data sample `(question, (fine_label, coarse_label))`
        for a given `index`.

        Args:
            index: The index of the data sample to fetch.

        Returns:
            A tuple containing the input `question` and a tuple of output labels \
            `(fine_label, coarse_label)` (see below note for more details).

        Note:
            The return value is a tuple of the form
            `(question, (fine_label, coarse_label))` where:

            - `question` is a string containing the text of the question.
            - `fine_label` is a string containing the fine-grained class label
                for the question (e.g., `"DESC:manner"`, `"ABBR:exp"`, etc.).
            - `coarse_label` is a string containing the coarse-grained class label
                for the question (e.g., `"DESC"`, `"ABBR"`, etc.).
        """
        return self.data[index], self.targets[index]

    def __len__(self):
        """Returns the number of samples in the selected split."""
        return len(self.data)

    def extra_repr(self) -> str:
        """Returns a human-readable description of the selected split and task."""
        split_map = {"train": "Train", "val": "Validation", "test": "Test"}
        task_map = {"coarse": "Classify Coarse Labels", "fine": "Classify Fine Labels"}

        try:
            split = split_map[self.split]
        except KeyError:
            raise ValueError(
                f"Invalid split value: {self.split}. "
                f"Expected one of ['train', 'val', 'test']."
            )
        try:
            task = task_map[self.task]
        except KeyError:
            raise ValueError(
                f"Invalid task value: {self.task}. Expected one of ['coarse', 'fine']."
            )

        return f"Split: {split}\nTask: {task}"

    def __repr__(self) -> str:
        head = "Dataset " + self.__class__.__name__
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    @property
    def raw_folder(self) -> str:
        """Directory where the raw downloaded files are stored."""
        return os.path.join(self.root, self.__class__.__name__, "raw")

    def _check_exists(self) -> bool:
        """Returns True if every raw resource file is already present.

        Fixed: the original checked the extension-stripped basename
        ("train_5500" instead of "train_5500.label"), which never exists
        because `download` saves the file under its full name — so the data
        was re-downloaded on every instantiation.
        """
        return all(
            check_integrity(os.path.join(self.raw_folder, filename))
            for filename, _ in self.resources
        )

    def _download(self) -> None:
        """Download the TREC data if it doesn't exist already."""

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # download files
        for filename, md5 in self.resources:
            for mirror in self.mirrors:
                # Fixed: the URL must append the resource filename to the mirror.
                url = f"{mirror}{filename}"
                try:
                    download(
                        url, download_root=self.raw_folder, filename=filename, md5=md5
                    )
                except URLError as error:
                    print(f"Failed to download (trying next):\n{error}")
                    continue
                finally:
                    print()
                break
            else:
                raise RuntimeError(f"Error downloading {filename}")

    def _load_train_and_val_data(self, task: str = None, validation_split: float = 0.0):
        """Loads the training file, de-duplicates it, and stratifies it into
        train and validation sets grouped by the task's label.

        Returns:
            `((train_data, train_targets), (val_data, val_targets))`.
        """
        train_file_path = os.path.join(self.raw_folder, self.resources[0][0])

        data = []
        targets = []
        unique_samples = set()  # A set to track unique samples

        with open(train_file_path, "rb") as f:
            for row in f:
                # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
                fine_label, _, text = (
                    row.replace(b"\xf0", b" ").strip().decode().partition(" ")
                )
                coarse_label = fine_label.split(":")[0]
                sample = (text, (fine_label, coarse_label))

                # Only add unique samples
                if sample not in unique_samples:
                    unique_samples.add(sample)
                    data.append(text)
                    targets.append((fine_label, coarse_label))

        # Group data by either fine_label or coarse_label based on the task
        label_to_data = defaultdict(list)
        for text, (fine_label, coarse_label) in zip(data, targets):
            label = fine_label if task == "fine" else coarse_label
            label_to_data[label].append((text, fine_label, coarse_label))

        # Split the data based on validation_split
        train_data = []
        train_targets = []
        val_data = []
        val_targets = []

        # Seed the global RNG so the per-label shuffle below is deterministic.
        # NOTE(review): this mutates global `random` state; confirm callers accept it.
        random.seed(42)

        for label, samples in label_to_data.items():
            # NOTE(review): this guard can never fire for validation_split in
            # [0, 1] (len(samples) < len(samples) * validation_split is always
            # False there); the empty-split checks below are the real protection.
            if len(samples) < (len(samples) * validation_split):
                raise ValueError(
                    f"Not enough data for label '{label}' to respect the validation split."  # noqa: E501
                )

            random.shuffle(samples)
            split_idx = int(len(samples) * (1 - validation_split))

            if len(samples[:split_idx]) == 0:
                raise ValueError(f"Label {label} missing from the training set.")
            if len(samples[split_idx:]) == 0 and validation_split > 0.0:
                raise ValueError(f"Label {label} missing from the validation set.")

            # Add to training set
            for sample in samples[:split_idx]:
                train_data.append(sample[0])
                train_targets.append((sample[1], sample[2]))

            # Add to validation set
            for sample in samples[split_idx:]:
                val_data.append(sample[0])
                val_targets.append((sample[1], sample[2]))

        return (train_data, train_targets), (val_data, val_targets)

    def _load_test_data(self):
        """Loads the TREC 10 test file.

        Returns:
            `(data, targets)` where `targets` holds `(fine_label, coarse_label)`
            pairs.
        """
        test_file_path = os.path.join(self.raw_folder, self.resources[1][0])

        data = []
        targets = []

        with open(test_file_path, "rb") as f:
            for row in f:
                # One non-ASCII byte: sisterBADBYTEcity. We replace it with a space
                fine_label, _, text = (
                    row.replace(b"\xf0", b" ").strip().decode().partition(" ")
                )
                coarse_label = fine_label.split(":")[0]
                data.append(text)
                targets.append((fine_label, coarse_label))

        return data, targets

__init__(task=None, split=None, validation_split=0.0, root=None)

Initializes the TREC dataset.

Parameters:

Name Type Description Default
task str

Defines the classes to classify between ["coarse", "fine"].

None
split str

The dataset split in ["train", "val", "test"].

None
validation_split float | None

Float between 0 and 1. Fraction of the training data to be used as validation data.

0.0
root str | Path

Root directory of dataset where TREC/raw/train_5500.label and TREC/raw/TREC_10.label exist.

None
Source code in afnio/utils/datasets/trec.py
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
def __init__(
    self,
    task: str = None,
    split: str = None,
    validation_split: Optional[float] = 0.0,
    root: Union[str, Path] = None,
) -> None:
    """Builds the `TREC` dataset for one task and split.

    Args:
        task: Classification granularity; one of `"coarse"` or `"fine"`.
        split: Which portion of the data to expose; one of `"train"`,
            `"val"`, or `"test"`.
        validation_split: Fraction (between `0` and `1`) of the training
            data held out for validation.
        root: Root directory containing `TREC/raw/train_5500.label` and
            `TREC/raw/TREC_10.label`.
    """
    if task not in {"coarse", "fine"}:
        raise ValueError(
            f"TREC Dataset: expected classification task in ['coarse', 'fine'], "
            f"but got task={task}"
        )

    if split not in {"train", "val", "test"}:
        raise ValueError(
            f"TREC Dataset: expected split in ['train', 'val', 'test'], "
            f"but got split={split}"
        )

    if validation_split < 0.0 or validation_split > 1.0:
        raise ValueError(
            f"TREC Dataset: expected validation_split in [0.0, 1.0], "
            f"but got validation_split={validation_split}"
        )

    # Expand "~" in string paths; Path objects are kept as-is.
    self.task = task
    self.split = split
    self.root = os.path.expanduser(root) if isinstance(root, str) else root

    self._download()

    # "test" reads the TREC 10 file directly; "train"/"val" share one
    # stratified partition of the training file.
    if split == "test":
        self.data, self.targets = self._load_test_data()
    elif split in ("train", "val"):
        train_part, val_part = self._load_train_and_val_data(
            task=self.task, validation_split=validation_split
        )
        self.data, self.targets = train_part if split == "train" else val_part
    else:
        self.data, self.targets = None, None

__getitem__(index)

Fetches a data sample (question, (fine_label, coarse_label)) for a given index.

Parameters:

Name Type Description Default
index

The index of the data sample to fetch.

required

Returns:

Type Description
tuple[str, tuple[str, str]]

A tuple containing the input question and a tuple of output labels (fine_label, coarse_label) (see below note for more details).

Note

The return value is a tuple of the form (question, (fine_label, coarse_label)) where:

  • question is a string containing the text of the question.
  • fine_label is a string containing the fine-grained class label for the question (e.g., "DESC:manner", "ABBR:exp", etc.).
  • coarse_label is a string containing the coarse-grained class label for the question (e.g., "DESC", "ABBR", etc.).
Source code in afnio/utils/datasets/trec.py
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
def __getitem__(self, index) -> Tuple[str, Tuple[str, str]]:
    """Returns the sample `(question, (fine_label, coarse_label))` at `index`.

    Args:
        index: Position of the sample within the split.

    Returns:
        A pair of the input `question` string and its label tuple \
        `(fine_label, coarse_label)`.

    Note:
        - `question` is the question text.
        - `fine_label` is the fine-grained class label
            (e.g., `"DESC:manner"`, `"ABBR:exp"`, etc.).
        - `coarse_label` is the coarse-grained class label
            (e.g., `"DESC"`, `"ABBR"`, etc.).
    """
    question = self.data[index]
    labels = self.targets[index]
    return question, labels