Skip to content

afnio.utils.datasets.facility_support

Meta's Facility Support Analyzer dataset.

afnio.utils.datasets.facility_support.FacilitySupport

Bases: Dataset

The Meta Facility Support Analyzer dataset consists of 200 real-world emails or messages sent in enterprise settings related to facility maintenance or support requests. Each example is annotated with:

  • urgency (low, medium, high)
  • sentiment (negative, neutral, positive)
  • relevant service request categories (e.g., cleaning, IT support, maintenance)

The dataset is split into train, validation, and test sets with a 33%/33%/34% ratio. The split is deterministic, ensuring reproducibility across different runs.

References:

  • Meta Facility Support Analyzer Dataset: https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer

Source code in afnio/utils/datasets/facility_support.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
class FacilitySupport(Dataset):
    """The Meta Facility Support Analyzer dataset consists of 200 real-world emails or
    messages sent in enterprise settings related to facility maintenance or support
    requests. Each example is annotated with:

      - urgency (low, medium, high)
      - sentiment (negative, neutral, positive)
      - relevant service request categories (e.g., cleaning, IT support, maintenance)

    The dataset is split into train, validation, and test sets with a 33%/33%/34%
    ratio. The split is deterministic, ensuring reproducibility across different runs.

    **References:**

    - *Meta Facility Support Analyzer Dataset*
        [https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer](https://github.com/meta-llama/prompt-ops/tree/main/use-cases/facility-support-analyzer)
    """

    # Base URLs tried in order when downloading the raw dataset file.
    mirrors = [
        "https://raw.githubusercontent.com/meta-llama/llama-prompt-ops/refs/heads/main/use-cases/facility-support-analyzer/"  # noqa: E501
    ]

    # (filename, expected MD5 checksum) pairs for every file the dataset needs.
    resources = [
        ("dataset.json", "530dc66b1b07c9b15b19f08891e9bfa0"),
    ]

    # Number of spaces used to indent the body lines of __repr__.
    _repr_indent = 4

    def __init__(self, split: str, root: Union[str, Path] = None) -> None:
        """
        Initializes the `FacilitySupport` dataset.

        Args:
            split: The dataset split to load. Must be either `"train"`, `"val"`,
                or `"test"`.
            root: The root directory where JSON files are stored.

        Raises:
            ValueError: If `split` is not one of `"train"`, `"val"`, `"test"`,
                or if `root` is `None`.
        """
        if split not in {"train", "val", "test"}:
            raise ValueError(
                f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
                f"but got split={split}"
            )

        # Fail early with a clear message: `raw_folder` joins paths on `root`,
        # which would otherwise raise an opaque TypeError for root=None.
        if root is None:
            raise ValueError(
                "FacilitySupport Dataset: `root` must be a directory path, not None"
            )
        if isinstance(root, str):
            # Expand "~" so user-relative paths work.
            root = os.path.expanduser(root)

        self.split = split
        self.root = root

        # Fetch the raw JSON file if it is not already cached on disk.
        self._download()

        # Load the full dataset from the downloaded JSON file.
        file_path = os.path.join(self.raw_folder, self.resources[0][0])
        with open(file_path, "r", encoding="utf-8") as f:
            dataset: List[Dict] = json.load(f)

        # Shuffle with a fixed seed so every run produces the same split.
        random.Random(0).shuffle(dataset)

        # 33% train, 33% validation, and the remainder (~34%) test.
        n = len(dataset)
        n_train = int(n * 0.33)
        n_val = int(n * 0.33)

        if split == "train":
            self.data = dataset[:n_train]
        elif split == "val":
            self.data = dataset[n_train : n_train + n_val]  # noqa: E203
        else:  # test
            self.data = dataset[n_train + n_val :]  # noqa: E203

    def __getitem__(
        self, index: int
    ) -> tuple[Variable, tuple[Variable, Variable, Variable]]:
        """Fetches a data sample `(message, (urgency, sentiment, categories))`
        for a given `index`.

        Args:
            index: The index of the data sample to fetch.

        Returns:
            A tuple containing input `message` and a tuple of output variables \
            `(urgency, sentiment, categories)` (see below note for more details).

        Raises:
            IndexError: If `index` is negative or >= `len(self)`.

        Note:
            The return value is a tuple of the form
            `(message, (urgency, sentiment, categories))` where:

            - `message` is a [`Variable`][afnio.Variable] containing the input
                email or message text.
            - `urgency` is a [`Variable`][afnio.Variable] containing the urgency
                label (low, medium, high).
            - `sentiment` is a [`Variable`][afnio.Variable] containing the
                sentiment label (negative, neutral, positive).
            - `categories` is a [`Variable`][afnio.Variable] containing a JSON
                string of the relevant service request categories (e.g., cleaning,
                IT support, maintenance).
        """
        # Explicit bounds check: negative indices are rejected rather than
        # wrapping around Python-style.
        if not (0 <= index < len(self.data)):
            raise IndexError("Index out of range.")

        item = self.data[index]

        # The gold labels are stored as a JSON-encoded string under "answer".
        answer: dict = json.loads(item["answer"])
        urgency = answer.get("urgency", None)
        sentiment = answer.get("sentiment", None)
        categories = answer.get("categories", None)

        message = Variable(
            data=item["fields"]["input"],
            role="input email or message",
        )
        urgency = Variable(data=urgency, role="output urgency")
        sentiment = Variable(data=sentiment, role="output sentiment")
        # Categories (a list) are re-serialized to a JSON string for the Variable.
        categories = Variable(data=json.dumps(categories), role="output categories")
        return message, (urgency, sentiment, categories)

    def __len__(self) -> int:
        """Returns the number of samples in the selected split."""
        return len(self.data)

    def extra_repr(self) -> str:
        """Returns the split description appended to `__repr__`'s output.

        Raises:
            ValueError: If `self.split` was mutated to an invalid value.
        """
        split_map = {"train": "Train", "val": "Validation", "test": "Test"}

        try:
            split = split_map[self.split]
        except KeyError:
            raise ValueError(
                f"Invalid split value: {self.split}. "
                f"Expected one of ['train', 'val', 'test']."
            )

        return f"Split: {split}"

    def __repr__(self) -> str:
        head = "Dataset " + self.__class__.__name__
        body = [f"Number of datapoints: {self.__len__()}"]
        if self.root is not None:
            body.append(f"Root location: {self.root}")
        body += self.extra_repr().splitlines()
        lines = [head] + [" " * self._repr_indent + line for line in body]
        return "\n".join(lines)

    @property
    def raw_folder(self) -> str:
        """Directory where the raw downloaded files are stored."""
        return os.path.join(self.root, self.__class__.__name__, "raw")

    def _check_exists(self) -> bool:
        """Returns True if every resource file is already present and intact.

        The resource tuples store plain filenames (not URLs), so the file on
        disk is checked under exactly that name. (The previous
        `os.path.splitext(...)[0]` stripped the ".json" extension and looked
        for a file that is never created, forcing a re-download every run.)
        """
        return all(
            check_integrity(os.path.join(self.raw_folder, filename))
            for filename, _ in self.resources
        )

    def _download(self) -> None:
        """Download the Facility Support data if it doesn't exist already."""

        if self._check_exists():
            return

        os.makedirs(self.raw_folder, exist_ok=True)

        # Try each mirror in turn for every resource file; stop at the first
        # mirror that succeeds, and fail only once all mirrors are exhausted.
        for filename, md5 in self.resources:
            for mirror in self.mirrors:
                url = f"{mirror}{filename}"
                try:
                    download(
                        url, download_root=self.raw_folder, filename=filename, md5=md5
                    )
                except URLError as error:
                    print(f"Failed to download (trying next):\n{error}")
                    continue
                finally:
                    print()
                break
            else:
                raise RuntimeError(f"Error downloading {filename}")
__init__(split, root=None)

Initializes the FacilitySupport dataset.

Parameters:

Name Type Description Default
split str

The dataset split to load. Must be either "train", "val", or "test".

required
root str | Path

The root directory where JSON files are stored.

None
Source code in afnio/utils/datasets/facility_support.py
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
def __init__(self, split: str, root: Union[str, Path] = None) -> None:
    """
    Initializes the `FacilitySupport` dataset.

    Args:
        split: The dataset split to load. Must be either `"train"`, `"val"`,
            or `"test"`.
        root: The root directory where JSON files are stored.

    Raises:
        ValueError: If `split` is not one of `"train"`, `"val"`, or `"test"`.
    """
    if split not in {"train", "val", "test"}:
        raise ValueError(
            f"FacilitySupport Dataset: expected split in ['train', 'val', 'test'], "
            f"but got split={split}"
        )

    # Expand "~" so user-relative paths work.
    if isinstance(root, str):
        root = os.path.expanduser(root)

    self.split = split
    self.root = root

    # Fetch the raw JSON file if it is not already on disk.
    self._download()

    # Load the full dataset from the downloaded JSON file.
    file_path = os.path.join(self.raw_folder, self.resources[0][0])
    with open(file_path, "r", encoding="utf-8") as f:
        dataset: List[Dict] = json.load(f)

    # Shuffle with a fixed seed (0) so the split is reproducible across runs.
    random.Random(0).shuffle(dataset)

    # 33% train, 33% validation, remainder (~34%) test.
    n = len(dataset)
    n_train = int(n * 0.33)
    n_val = int(n * 0.33)

    if split == "train":
        self.data = dataset[:n_train]
    elif split == "val":
        self.data = dataset[n_train : n_train + n_val]  # noqa: E203
    else:  # test
        self.data = dataset[n_train + n_val :]  # noqa: E203

__getitem__(index)

Fetches a data sample (message, (urgency, sentiment, categories)) for a given index.

Parameters:

Name Type Description Default
index int

The index of the data sample to fetch.

required

Returns:

Type Description
tuple[Variable, tuple[Variable, Variable, Variable]]

A tuple containing input message and a tuple of output variables (urgency, sentiment, categories) (see below note for more details).

Note

The return value is a tuple of the form (message, (urgency, sentiment, categories)) where:

  • message is a Variable containing the input email or message text.
  • urgency is a Variable containing the urgency label (low, medium, high).
  • sentiment is a Variable containing the sentiment label (negative, neutral, positive).
  • categories is a Variable containing a JSON string of the relevant service request categories (e.g., cleaning, IT support, maintenance).
Source code in afnio/utils/datasets/facility_support.py
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
def __getitem__(
    self, index: int
) -> tuple[Variable, tuple[Variable, Variable, Variable]]:
    """Fetches a data sample `(message, (urgency, sentiment, categories))`
    for a given `index`.

    Args:
        index: The index of the data sample to fetch.

    Returns:
        A tuple containing input `message` and a tuple of output variables \
        `(urgency, sentiment, categories)` (see below note for more details).

    Raises:
        IndexError: If `index` is negative or >= `len(self)`.

    Note:
        The return value is a tuple of the form
        `(message, (urgency, sentiment, categories))` where:

        - `message` is a [`Variable`][afnio.Variable] containing the input
            email or message text.
        - `urgency` is a [`Variable`][afnio.Variable] containing the urgency
            label (low, medium, high).
        - `sentiment` is a [`Variable`][afnio.Variable] containing the
            sentiment label (negative, neutral, positive).
        - `categories` is a [`Variable`][afnio.Variable] containing a JSON
            string of the relevant service request categories (e.g., cleaning,
            IT support, maintenance).
    """
    # Explicit bounds check: negative indices are rejected rather than
    # wrapping around Python-style.
    if not (0 <= index < len(self.data)):
        raise IndexError("Index out of range.")

    item = self.data[index]

    # The gold labels are stored as a JSON-encoded string under "answer".
    answer: dict = json.loads(item["answer"])
    urgency = answer.get("urgency", None)
    sentiment = answer.get("sentiment", None)
    categories = answer.get("categories", None)

    message = Variable(
        data=item["fields"]["input"],
        role="input email or message",
    )
    urgency = Variable(data=urgency, role="output urgency")
    sentiment = Variable(data=sentiment, role="output sentiment")
    # Categories (a list) are re-serialized to a JSON string for the Variable.
    categories = Variable(data=json.dumps(categories), role="output categories")
    return message, (urgency, sentiment, categories)