Skip to content

afnio.models.openai

afnio.models.openai.Omit

In certain situations you need to be able to represent a case where a default value has to be explicitly removed and None is not an appropriate substitute.

Examples:

>>> # as the default `Content-Type` header is `application/json` that will be sent
>>> client.post("/upload/files", files={"file": b"my raw file content"})
>>>
>>> # you can't explicitly override the header as it has to be dynamically generated
>>> # to look something like: 'multipart/form-data; boundary=0d8382fcf5f8c3be01ca2e11002d2983'
>>> client.post(..., headers={"Content-Type": "multipart/form-data"})
>>>
>>> # instead you can remove the default `application/json` header by passing Omit
>>> client.post(..., headers={"Content-Type": Omit()})
Source code in afnio/models/openai.py
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
class Omit:
    """Sentinel type used to explicitly strip a default value in places where
    `None` would be ambiguous or is itself a meaningful value.

    Examples:
        >>> # as the default `Content-Type` header is `application/json` that will be sent
        >>> client.post("/upload/files", files={"file": b"my raw file content"})
        >>>
        >>> # you can't explicitly override the header as it has to be dynamically generated
        >>> # to look something like: 'multipart/form-data; boundary=0d8382fcf5f8c3be01ca2e11002d2983'
        >>> client.post(..., headers={"Content-Type": "multipart/form-data"})
        >>>
        >>> # instead you can remove the default `application/json` header by passing Omit
        >>> client.post(..., headers={"Content-Type": Omit()})
    """  # noqa: E501

    def __bool__(self) -> Literal[False]:
        # Instances are always falsy so truthiness checks treat an omitted
        # value exactly like an absent one.
        return False

afnio.models.openai.OpenAI

Bases: TextCompletionModel, ChatCompletionModel, EmbeddingModel, OpenAI

OpenAI synchronous client to perform multiple language model operations.

Source code in afnio/models/openai.py
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
class OpenAI(
    TextCompletionModel,
    ChatCompletionModel,
    EmbeddingModel,
    OpenAICli,
):
    """
    OpenAI synchronous client to perform multiple language model operations.
    """

    # class-level stubs so Sphinx/autodoc can inspect these attributes safely
    api_key: Optional[str] = None
    organization: Optional[str] = None
    project: Optional[str] = None
    webhook_secret: Optional[str] = None
    websocket_base_url: Optional[Union[str, httpx.URL]] = None

    def __init__(
        self,
        api_key: Optional[str] = None,
        organization: Optional[str] = None,
        project: Optional[str] = None,
        base_url: Optional[Union[str, httpx.URL]] = None,
        websocket_base_url: Optional[Union[str, httpx.URL]] = None,
        timeout: Union[float, httpx.Timeout, None, NotGiven] = NOT_GIVEN,
        max_retries: int = DEFAULT_MAX_RETRIES,
        default_headers: Optional[Mapping[str, str]] = None,
        default_query: Optional[Mapping[str, object]] = None,
        http_client: Optional[httpx.Client] = None,
    ):
        """Initialize the OpenAI client.

        Args:
            api_key: The API key for authentication. Falls back to the
                `OPENAI_API_KEY` environment variable when not provided.
            organization: The organization ID.
            project: The project ID.
            base_url: The base URL for the API.
            websocket_base_url: The base URL for the WebSocket.
            timeout: The timeout for requests.
            max_retries: The maximum number of retries for requests.
            default_headers: Default headers for requests.
            default_query: Default query parameters for requests.
            http_client: The HTTP client instance. Not stored in `config`
                because it is not serializable.
        """
        usage = copy.deepcopy(INITIAL_USAGE)  # Ensure a fresh copy

        # Validate and build config
        config = {
            "api_key": api_key or os.getenv("OPENAI_API_KEY"),
            "organization": organization,
            "project": project,
            "base_url": base_url,
            "websocket_base_url": websocket_base_url,
            "timeout": timeout,
            "max_retries": max_retries,
            "default_headers": default_headers,
            "default_query": default_query,
        }
        # Remove None and NOT_GIVEN values
        config = {
            k: v for k, v in config.items() if v is not None and v is not NOT_GIVEN
        }
        # Validate serializability
        for k, v in config.items():
            _validate_config_param(k, v)

        self._client = OpenAICli(
            api_key=api_key or os.getenv("OPENAI_API_KEY"),
            organization=organization,
            project=project,
            base_url=base_url,
            websocket_base_url=websocket_base_url,
            timeout=timeout,
            max_retries=max_retries,
            default_headers=default_headers,
            default_query=default_query,
            http_client=http_client,
        )
        super().__init__(provider=PROVIDER, config=config, usage=usage)

    # TODO: Finalize implementation
    def complete(self, prompt: str) -> str:
        """
        Synchronous method to generate a completion for the given prompt.

        Args:
            prompt: The input text for which the model should generate a completion.

        Returns:
            A string containing the generated completion.
        """
        raise NotImplementedError

    def chat(
        self,
        *,
        messages: Iterable[ChatCompletionMessageParam],
        model: Union[str, ChatModel],
        audio: Union[Optional[ChatCompletionAudioParam], NotGiven] = NOT_GIVEN,
        frequency_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
        function_call: Union[
            completion_create_params.FunctionCall, NotGiven
        ] = NOT_GIVEN,
        functions: Union[
            Iterable[completion_create_params.Function], NotGiven
        ] = NOT_GIVEN,
        logit_bias: Union[Optional[Dict[str, int]], NotGiven] = NOT_GIVEN,
        logprobs: Union[Optional[bool], NotGiven] = NOT_GIVEN,
        max_completion_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
        max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
        metadata: Union[Optional[Metadata], NotGiven] = NOT_GIVEN,
        modalities: Union[
            Optional[List[Literal["text", "audio"]]], NotGiven
        ] = NOT_GIVEN,
        n: Union[Optional[int], NotGiven] = NOT_GIVEN,
        parallel_tool_calls: Union[bool, NotGiven] = NOT_GIVEN,
        prediction: Union[
            Optional[ChatCompletionPredictionContentParam], NotGiven
        ] = NOT_GIVEN,
        presence_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
        prompt_cache_key: Union[str, NotGiven] = NOT_GIVEN,
        reasoning_effort: Union[ReasoningEffort, NotGiven] = NOT_GIVEN,
        response_format: Union[
            completion_create_params.ResponseFormat, NotGiven
        ] = NOT_GIVEN,
        safety_identifier: Union[str, NotGiven] = NOT_GIVEN,
        seed: Union[Optional[int], NotGiven] = NOT_GIVEN,
        service_tier: Union[
            Optional[Literal["auto", "default", "flex", "scale", "priority"]], NotGiven
        ] = NOT_GIVEN,
        stop: Union[
            Union[Optional[str], SequenceNotStr[str], None], NotGiven
        ] = NOT_GIVEN,
        store: Union[Optional[bool], NotGiven] = NOT_GIVEN,
        # TODO: `stream` can be useful during inference, but forbid during training or backpropagation
        stream: Union[Optional[Literal[False]], NotGiven] = NOT_GIVEN,
        stream_options: Union[
            Optional[ChatCompletionStreamOptionsParam], NotGiven
        ] = NOT_GIVEN,
        temperature: Union[Optional[float], NotGiven] = NOT_GIVEN,
        tool_choice: Union[ChatCompletionToolChoiceOptionParam, NotGiven] = NOT_GIVEN,
        tools: Union[Iterable[ChatCompletionToolUnionParam], NotGiven] = NOT_GIVEN,
        top_logprobs: Union[Optional[int], NotGiven] = NOT_GIVEN,
        top_p: Union[Optional[float], NotGiven] = NOT_GIVEN,
        user: Union[str, NotGiven] = NOT_GIVEN,
        verbosity: Union[
            Optional[Literal["low", "medium", "high"]], NotGiven
        ] = NOT_GIVEN,
        web_search_options: Union[
            completion_create_params.WebSearchOptions, NotGiven
        ] = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Optional[Headers] = None,
        extra_query: Optional[Query] = None,
        extra_body: Optional[Body] = None,
        timeout: Optional[Union[float, httpx.Timeout, NotGiven]] = NOT_GIVEN,
    ) -> str:
        """Synchronously creates a model response for the given chat conversation.
        Learn more in the
        [text generation](https://platform.openai.com/docs/guides/text-generation),
        [vision](https://platform.openai.com/docs/guides/vision), and
        [audio](https://platform.openai.com/docs/guides/audio) guides.

        Parameter support can differ depending on the model used to generate the
        response, particularly for newer reasoning models. Parameters that are only
        supported for reasoning models are noted below. For the current state of
        unsupported parameters in reasoning models,
        [refer to the reasoning guide](https://platform.openai.com/docs/guides/reasoning).

        Args:
          messages: A list of messages comprising the conversation so far. Depending on the
              [model](https://platform.openai.com/docs/models) you use, different message
              types (modalities) are supported, like
              [text](https://platform.openai.com/docs/guides/text-generation),
              [images](https://platform.openai.com/docs/guides/vision), and
              [audio](https://platform.openai.com/docs/guides/audio).

          model: Model ID used to generate the response, like `gpt-4o` or `o3`. OpenAI offers a
              wide range of models with different capabilities, performance characteristics,
              and price points. Refer to the
              [model guide](https://platform.openai.com/docs/models) to browse and compare
              available models.

          audio: Parameters for audio output. Required when audio output is requested with
              `modalities: ["audio"]`.
              [Learn more](https://platform.openai.com/docs/guides/audio).

          frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their
              existing frequency in the text so far, decreasing the model's likelihood to
              repeat the same line verbatim.

          function_call: Deprecated in favor of `tool_choice`.

              Controls which (if any) function is called by the model.

              `none` means the model will not call a function and instead generates a message.

              `auto` means the model can pick between generating a message or calling a
              function.

              Specifying a particular function via `{"name": "my_function"}` forces the model
              to call that function.

              `none` is the default when no functions are present. `auto` is the default if
              functions are present.

          functions: Deprecated in favor of `tools`.

              A list of functions the model may generate JSON inputs for.

          logit_bias: Modify the likelihood of specified tokens appearing in the completion.

              Accepts a JSON object that maps tokens (specified by their token ID in the
              tokenizer) to an associated bias value from -100 to 100. Mathematically, the
              bias is added to the logits generated by the model prior to sampling. The exact
              effect will vary per model, but values between -1 and 1 should decrease or
              increase likelihood of selection; values like -100 or 100 should result in a ban
              or exclusive selection of the relevant token.

          logprobs: Whether to return log probabilities of the output tokens or not. If true,
              returns the log probabilities of each output token returned in the `content` of
              `message`.

          max_completion_tokens: An upper bound for the number of tokens that can be generated for a completion,
              including visible output tokens and
              [reasoning tokens](https://platform.openai.com/docs/guides/reasoning).

          max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat
              completion. This value can be used to control
              [costs](https://openai.com/api/pricing/) for text generated via API.

              This value is now deprecated in favor of `max_completion_tokens`, and is not
              compatible with
              [o-series models](https://platform.openai.com/docs/guides/reasoning).

          metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful
              for storing additional information about the object in a structured format, and
              querying for objects via API or the dashboard.

              Keys are strings with a maximum length of 64 characters. Values are strings with
              a maximum length of 512 characters.

          modalities: Output types that you would like the model to generate. Most models are capable
              of generating text, which is the default:

              `["text"]`

              The `gpt-4o-audio-preview` model can also be used to
              [generate audio](https://platform.openai.com/docs/guides/audio). To request that
              this model generate both text and audio responses, you can use:

              `["text", "audio"]`

          n: How many chat completion choices to generate for each input message. Note that
              you will be charged based on the number of generated tokens across all of the
              choices. Keep `n` as `1` to minimize costs.

          parallel_tool_calls: Whether to enable
              [parallel function calling](https://platform.openai.com/docs/guides/function-calling#configuring-parallel-function-calling)
              during tool use.

          prediction: Static predicted output content, such as the content of a text file that is
              being regenerated.

          presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on
              whether they appear in the text so far, increasing the model's likelihood to
              talk about new topics.

          prompt_cache_key: Used by OpenAI to cache responses for similar requests to optimize your cache
              hit rates. Replaces the `user` field.
              [Learn more](https://platform.openai.com/docs/guides/prompt-caching).

          reasoning_effort: Constrains effort on reasoning for
              [reasoning models](https://platform.openai.com/docs/guides/reasoning). Currently
              supported values are `minimal`, `low`, `medium`, and `high`. Reducing reasoning
              effort can result in faster responses and fewer tokens used on reasoning in a
              response.

          response_format: An object specifying the format that the model must output.

              Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured
              Outputs which ensures the model will match your supplied JSON schema. Learn more
              in the
              [Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs).

              Setting to `{ "type": "json_object" }` enables the older JSON mode, which
              ensures the message the model generates is valid JSON. Using `json_schema` is
              preferred for models that support it.

          safety_identifier: A stable identifier used to help detect users of your application that may be
              violating OpenAI's usage policies. The IDs should be a string that uniquely
              identifies each user. We recommend hashing their username or email address, in
              order to avoid sending us any identifying information.
              [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

          seed: This feature is in Beta. If specified, our system will make a best effort to
              sample deterministically, such that repeated requests with the same `seed` and
              parameters should return the same result. Determinism is not guaranteed, and you
              should refer to the `system_fingerprint` response parameter to monitor changes
              in the backend.

          service_tier: Specifies the processing type used for serving the request.

              - If set to 'auto', then the request will be processed with the service tier
                configured in the Project settings. Unless otherwise configured, the Project
                will use 'default'.
              - If set to 'default', then the request will be processed with the standard
                pricing and performance for the selected model.
              - If set to '[flex](https://platform.openai.com/docs/guides/flex-processing)' or
                '[priority](https://openai.com/api-priority-processing/)', then the request
                will be processed with the corresponding service tier.
              - When not set, the default behavior is 'auto'.

              When the `service_tier` parameter is set, the response body will include the
              `service_tier` value based on the processing mode actually used to serve the
              request. This response value may be different from the value set in the
              parameter.

          stop: Not supported with latest reasoning models `o3` and `o4-mini`.

              Up to 4 sequences where the API will stop generating further tokens. The
              returned text will not contain the stop sequence.

          store: Whether or not to store the output of this chat completion request for use in
              our [model distillation](https://platform.openai.com/docs/guides/distillation)
              or [evals](https://platform.openai.com/docs/guides/evals) products.

              Supports text and image inputs. Note: image inputs over 8MB will be dropped.

          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section below](https://platform.openai.com/docs/api-reference/chat/streaming)
              for more information, along with the
              [streaming responses](https://platform.openai.com/docs/guides/streaming-responses)
              guide for more information on how to handle the streaming events.

          stream_options: Options for streaming response. Only set this when you set `stream: true`.

          temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
              make the output more random, while lower values like 0.2 will make it more
              focused and deterministic. We generally recommend altering this or `top_p` but
              not both.

          tool_choice: Controls which (if any) tool is called by the model. `none` means the model will
              not call any tool and instead generates a message. `auto` means the model can
              pick between generating a message or calling one or more tools. `required` means
              the model must call one or more tools. Specifying a particular tool via
              `{"type": "function", "function": {"name": "my_function"}}` forces the model to
              call that tool.

              `none` is the default when no tools are present. `auto` is the default if tools
              are present.

          tools: A list of tools the model may call. You can provide either
              [custom tools](https://platform.openai.com/docs/guides/function-calling#custom-tools)
              or [function tools](https://platform.openai.com/docs/guides/function-calling).

          top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to
              return at each token position, each with an associated log probability.
              `logprobs` must be set to `true` if this parameter is used.

          top_p: An alternative to sampling with temperature, called nucleus sampling, where the
              model considers the results of the tokens with top_p probability mass. So 0.1
              means only the tokens comprising the top 10% probability mass are considered.

              We generally recommend altering this or `temperature` but not both.

          user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
              `prompt_cache_key` instead to maintain caching optimizations. A stable
              identifier for your end-users. Used to boost cache hit rates by better bucketing
              similar requests and to help OpenAI detect and prevent abuse.
              [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

          verbosity: Constrains the verbosity of the model's response. Lower values will result in
              more concise responses, while higher values will result in more verbose
              responses. Currently supported values are `low`, `medium`, and `high`.

          web_search_options: This tool searches the web for relevant results to use in a response. Learn more
              about the
              [web search tool](https://platform.openai.com/docs/guides/tools-web-search?api-mode=chat).

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Returns:
          The generated chat completion content. If `response_format` is used, \
          the format of the returned content will depend on the specified format. \
          Otherwise, this will return a string containing the generated completion.

        Raises:
          ValueError: If `n > 1` is requested (multiple completions are not
              supported yet).
        """  # noqa: E501
        # TODO: handle `n > 1`` that returns multiple completions
        if isinstance(n, int) and n > 1:
            raise ValueError("n > 1 is not supported for chat completions.")

        response = self._client.chat.completions.create(
            messages=messages,
            model=model,
            audio=audio,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            metadata=metadata,
            modalities=modalities,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            prediction=prediction,
            presence_penalty=presence_penalty,
            prompt_cache_key=prompt_cache_key,
            reasoning_effort=reasoning_effort,
            response_format=response_format,
            safety_identifier=safety_identifier,
            seed=seed,
            service_tier=service_tier,
            stop=stop,
            store=store,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
            verbosity=verbosity,
            web_search_options=web_search_options,
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )

        # Update usage statistics
        if hasattr(response, "usage") and response.usage:
            self.update_usage(response.usage, model)

        return response.choices[0].message.content

    # TODO: Finalize implementation
    def embed(self, input: List[str]) -> List[List[float]]:
        """
        Synchronous method to generate embeddings for the given input texts.

        Args:
            input: A list of input strings for which embeddings should be generated.

        Returns:
            A list of embeddings, where each embedding is represented \
            as a list of floats corresponding to the input strings.
        """
        raise NotImplementedError

    def update_usage(
        self, usage: CompletionUsage, model_name: Optional[str] = None
    ) -> None:
        """Updates the internal usage counters with values from a new API response.

        Args:
            usage: The usage object returned by the OpenAI API (either a
                `CompletionUsage` instance or an equivalent dict).
            model_name (str, optional): The name of the model for which the usage
                is being updated. If None, cost is copied from usage if available.
        """
        if not hasattr(self, "_usage"):
            # Initialize counters on first use. The previous code called
            # `self._usage.update(...)` here, which raised AttributeError
            # because `_usage` did not exist yet by the guard's own condition.
            self._usage = copy.deepcopy(INITIAL_USAGE)  # Ensure a fresh copy

        # Ensure we convert CompletionUsage to dict properly
        if isinstance(usage, CompletionUsage):
            usage = usage.model_dump()

        # Update core token usage fields. `or 0` also guards against keys
        # that are present but explicitly set to None in the dumped payload.
        self._usage["completion_tokens"] += usage.get("completion_tokens") or 0
        self._usage["prompt_tokens"] += usage.get("prompt_tokens") or 0
        self._usage["total_tokens"] += usage.get("total_tokens") or 0

        # Update prompt tokens details. `or {}` guards against an explicit
        # None value (a plain `.get(key, {})` would return None in that case
        # and crash the nested lookups below).
        prompt_tokens_details = usage.get("prompt_tokens_details") or {}
        for field in ("cached_tokens", "audio_tokens"):
            self._usage["prompt_tokens_details"][field] += (
                prompt_tokens_details.get(field) or 0
            )

        # Update completion tokens details (same None-guarding as above)
        completion_tokens_details = usage.get("completion_tokens_details") or {}
        for field in (
            "reasoning_tokens",
            "audio_tokens",
            "accepted_prediction_tokens",
            "rejected_prediction_tokens",
        ):
            self._usage["completion_tokens_details"][field] += (
                completion_tokens_details.get(field) or 0
            )

        # Update cost
        if model_name is not None:
            pricing = _get_pricing_for_model(self.provider, model_name)
            cost = _calculate_cost(usage, pricing)
            self._usage["cost"]["amount"] += cost
        else:
            # If cost is present in usage, copy it directly
            cost_info = usage.get("cost") or {}
            if "amount" in cost_info:
                self._usage["cost"]["amount"] = cost_info["amount"]
__init__(api_key=None, organization=None, project=None, base_url=None, websocket_base_url=None, timeout=NOT_GIVEN, max_retries=DEFAULT_MAX_RETRIES, default_headers=None, default_query=None, http_client=None)

Initialize the OpenAI client.

Parameters:

Name Type Description Default
api_key str | None

The API key for authentication.

None
organization str | None

The organization ID.

None
project str | None

The project ID.

None
base_url str | URL | None

The base URL for the API.

None
websocket_base_url str | URL | None

The base URL for the WebSocket.

None
timeout float | Timeout | None | NotGiven

The timeout for requests.

NOT_GIVEN
max_retries int

The maximum number of retries for requests.

DEFAULT_MAX_RETRIES
default_headers Mapping[str, str] | None

Default headers for requests.

None
default_query Mapping[str, object] | None

Default query parameters for requests.

None
http_client Client | None

The HTTP client instance.

None
Source code in afnio/models/openai.py
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
def __init__(
    self,
    api_key: Optional[str] = None,
    organization: Optional[str] = None,
    project: Optional[str] = None,
    base_url: Optional[Union[str, httpx.URL]] = None,
    websocket_base_url: Optional[Union[str, httpx.URL]] = None,
    timeout: Union[float, httpx.Timeout, None, NotGiven] = NOT_GIVEN,
    max_retries: int = DEFAULT_MAX_RETRIES,
    default_headers: Optional[Mapping[str, str]] = None,
    default_query: Optional[Mapping[str, object]] = None,
    http_client: Optional[httpx.Client] = None,
):
    """Initialize the OpenAI client.

    Args:
        api_key: The API key for authentication. Falls back to the
            ``OPENAI_API_KEY`` environment variable when not provided.
        organization: The organization ID.
        project: The project ID.
        base_url: The base URL for the API.
        websocket_base_url: The base URL for the WebSocket.
        timeout: The timeout for requests.
        max_retries: The maximum number of retries for requests.
        default_headers: Default headers for requests.
        default_query: Default query parameters for requests.
        http_client: The HTTP client instance.
    """
    usage = copy.deepcopy(INITIAL_USAGE)  # Ensure a fresh copy

    # Resolve the API key exactly once so the stored config and the
    # underlying client are guaranteed to agree (previously the env var
    # was read twice, which could diverge if the environment changed
    # between the two reads).
    resolved_api_key = api_key or os.getenv("OPENAI_API_KEY")

    # Validate and build config
    config = {
        "api_key": resolved_api_key,
        "organization": organization,
        "project": project,
        "base_url": base_url,
        "websocket_base_url": websocket_base_url,
        "timeout": timeout,
        "max_retries": max_retries,
        "default_headers": default_headers,
        "default_query": default_query,
    }
    # Remove None and NOT_GIVEN values
    config = {
        k: v for k, v in config.items() if v is not None and v is not NOT_GIVEN
    }
    # Validate serializability
    for k, v in config.items():
        _validate_config_param(k, v)

    # NOTE: `http_client` is intentionally kept out of `config` (it is not a
    # serializable setting) and handed straight to the underlying client.
    self._client = OpenAICli(
        api_key=resolved_api_key,
        organization=organization,
        project=project,
        base_url=base_url,
        websocket_base_url=websocket_base_url,
        timeout=timeout,
        max_retries=max_retries,
        default_headers=default_headers,
        default_query=default_query,
        http_client=http_client,
    )
    super().__init__(provider=PROVIDER, config=config, usage=usage)

complete(prompt)

Synchronous method to generate a completion for the given prompt.

Parameters:

Name Type Description Default
prompt str

The input text for which the model should generate a completion.

required

Returns:

Type Description
str

A string containing the generated completion.

Source code in afnio/models/openai.py
160
161
162
163
164
165
166
167
168
169
170
def complete(self, prompt: str) -> str:
    """Generate a completion for the given prompt (synchronous).

    Args:
        prompt: The input text for which the model should generate a completion.

    Returns:
        A string containing the generated completion.

    Raises:
        NotImplementedError: Always; text completion is not implemented
            for this client.
    """
    raise NotImplementedError

chat(*, messages, model, audio=NOT_GIVEN, frequency_penalty=NOT_GIVEN, function_call=NOT_GIVEN, functions=NOT_GIVEN, logit_bias=NOT_GIVEN, logprobs=NOT_GIVEN, max_completion_tokens=NOT_GIVEN, max_tokens=NOT_GIVEN, metadata=NOT_GIVEN, modalities=NOT_GIVEN, n=NOT_GIVEN, parallel_tool_calls=NOT_GIVEN, prediction=NOT_GIVEN, presence_penalty=NOT_GIVEN, prompt_cache_key=NOT_GIVEN, reasoning_effort=NOT_GIVEN, response_format=NOT_GIVEN, safety_identifier=NOT_GIVEN, seed=NOT_GIVEN, service_tier=NOT_GIVEN, stop=NOT_GIVEN, store=NOT_GIVEN, stream=NOT_GIVEN, stream_options=NOT_GIVEN, temperature=NOT_GIVEN, tool_choice=NOT_GIVEN, tools=NOT_GIVEN, top_logprobs=NOT_GIVEN, top_p=NOT_GIVEN, user=NOT_GIVEN, verbosity=NOT_GIVEN, web_search_options=NOT_GIVEN, extra_headers=None, extra_query=None, extra_body=None, timeout=NOT_GIVEN)

Synchronously creates a model response for the given chat conversation. Learn more in the text generation, vision, and audio guides.

Parameter support can differ depending on the model used to generate the response, particularly for newer reasoning models. Parameters that are only supported for reasoning models are noted below. For the current state of unsupported parameters in reasoning models, refer to the reasoning guide.

Parameters:

Name Type Description Default
messages Iterable[ChatCompletionMessageParam]

A list of messages comprising the conversation so far. Depending on the model you use, different message types (modalities) are supported, like text, images, and audio.

required
model str | ChatModel

Model ID used to generate the response, like gpt-4o or o3. OpenAI offers a wide range of models with different capabilities, performance characteristics, and price points. Refer to the model guide to browse and compare available models.

required
audio ChatCompletionAudioParam | None | NotGiven

Parameters for audio output. Required when audio output is requested with modalities: ["audio"]. Learn more.

NOT_GIVEN
frequency_penalty float | None | NotGiven

Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.

NOT_GIVEN
function_call FunctionCall | NotGiven

Deprecated in favor of tool_choice.

Controls which (if any) function is called by the model.

none means the model will not call a function and instead generates a message.

auto means the model can pick between generating a message or calling a function.

Specifying a particular function via {"name": "my_function"} forces the model to call that function.

none is the default when no functions are present. auto is the default if functions are present.

NOT_GIVEN
functions Iterable[Function] | NotGiven

Deprecated in favor of tools.

A list of functions the model may generate JSON inputs for.

NOT_GIVEN
logit_bias dict[str, int] | None | NotGiven

Modify the likelihood of specified tokens appearing in the completion.

Accepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.

NOT_GIVEN
logprobs bool | None | NotGiven

Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.

NOT_GIVEN
max_completion_tokens int | None | NotGiven

An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.

NOT_GIVEN
max_tokens int | None | NotGiven

The maximum number of tokens that can be generated in the chat completion. This value can be used to control costs for text generated via API.

This value is now deprecated in favor of max_completion_tokens, and is not compatible with o-series models.

NOT_GIVEN
metadata Metadata | None | NotGiven

Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard.

Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters.

NOT_GIVEN
modalities list[Literal['text', 'audio']] | None | NotGiven

Output types that you would like the model to generate. Most models are capable of generating text, which is the default:

["text"]

The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use:

["text", "audio"]

NOT_GIVEN
n int | None | NotGiven

How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep n as 1 to minimize costs.

NOT_GIVEN
parallel_tool_calls bool | NotGiven

Whether to enable parallel function calling during tool use.

NOT_GIVEN
prediction ChatCompletionPredictionContentParam | None | NotGiven

Static predicted output content, such as the content of a text file that is being regenerated.

NOT_GIVEN
presence_penalty float | None | NotGiven

Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.

NOT_GIVEN
prompt_cache_key str | NotGiven

Used by OpenAI to cache responses for similar requests to optimize your cache hit rates. Replaces the user field. Learn more.

NOT_GIVEN
reasoning_effort ReasoningEffort | NotGiven

Constrains effort on reasoning for reasoning models. Currently supported values are minimal, low, medium, and high. Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in a response.

NOT_GIVEN
response_format ResponseFormat | NotGiven

An object specifying the format that the model must output.

Setting to { "type": "json_schema", "json_schema": {...} } enables Structured Outputs which ensures the model will match your supplied JSON schema. Learn more in the Structured Outputs guide.

Setting to { "type": "json_object" } enables the older JSON mode, which ensures the message the model generates is valid JSON. Using json_schema is preferred for models that support it.

NOT_GIVEN
safety_identifier str | NotGiven

A stable identifier used to help detect users of your application that may be violating OpenAI's usage policies. The IDs should be a string that uniquely identifies each user. We recommend hashing their username or email address, in order to avoid sending us any identifying information. Learn more.

NOT_GIVEN
seed int | None | NotGiven

This feature is in Beta. If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed, and you should refer to the system_fingerprint response parameter to monitor changes in the backend.

NOT_GIVEN
service_tier Literal['auto', 'default', 'flex', 'scale', 'priority'] | None | NotGiven

Specifies the processing type used for serving the request.

  • If set to 'auto', then the request will be processed with the service tier configured in the Project settings. Unless otherwise configured, the Project will use 'default'.
  • If set to 'default', then the request will be processed with the standard pricing and performance for the selected model.
  • If set to 'flex' or 'priority', then the request will be processed with the corresponding service tier.
  • When not set, the default behavior is 'auto'.

When the service_tier parameter is set, the response body will include the service_tier value based on the processing mode actually used to serve the request. This response value may be different from the value set in the parameter.

NOT_GIVEN
stop str | None | SequenceNotStr[str] | None | NotGiven

Not supported with latest reasoning models o3 and o4-mini.

Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.

NOT_GIVEN
store bool | None | NotGiven

Whether or not to store the output of this chat completion request for use in our model distillation or evals products.

Supports text and image inputs. Note: image inputs over 8MB will be dropped.

NOT_GIVEN
stream Literal[False] | None | NotGiven

If set to true, the model response data will be streamed to the client as it is generated using server-sent events. See the Streaming section below for more information, along with the streaming responses guide for more information on how to handle the streaming events.

NOT_GIVEN
stream_options ChatCompletionStreamOptionsParam | None | NotGiven

Options for streaming response. Only set this when you set stream: true.

NOT_GIVEN
temperature float | None | NotGiven

What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or top_p but not both.

NOT_GIVEN
tool_choice ChatCompletionToolChoiceOptionParam | NotGiven

Controls which (if any) tool is called by the model. none means the model will not call any tool and instead generates a message. auto means the model can pick between generating a message or calling one or more tools. required means the model must call one or more tools. Specifying a particular tool via {"type": "function", "function": {"name": "my_function"}} forces the model to call that tool.

none is the default when no tools are present. auto is the default if tools are present.

NOT_GIVEN
tools Iterable[ChatCompletionToolUnionParam] | NotGiven

A list of tools the model may call. You can provide either custom tools or function tools.

NOT_GIVEN
top_logprobs int | None | NotGiven

An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.

NOT_GIVEN
top_p float | None | NotGiven

An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.

We generally recommend altering this or temperature but not both.

NOT_GIVEN
user str | NotGiven

This field is being replaced by safety_identifier and prompt_cache_key. Use prompt_cache_key instead to maintain caching optimizations. A stable identifier for your end-users. Used to boost cache hit rates by better bucketing similar requests and to help OpenAI detect and prevent abuse. Learn more.

NOT_GIVEN
verbosity Literal['low', 'medium', 'high'] | None | NotGiven

Constrains the verbosity of the model's response. Lower values will result in more concise responses, while higher values will result in more verbose responses. Currently supported values are low, medium, and high.

NOT_GIVEN
web_search_options WebSearchOptions | NotGiven

This tool searches the web for relevant results to use in a response. Learn more about the web search tool.

NOT_GIVEN
extra_headers Headers | None

Send extra headers

None
extra_query Query | None

Add additional query parameters to the request

None
extra_body Body | None

Add additional JSON properties to the request

None
timeout float | Timeout | NotGiven | None

Override the client-level default timeout for this request, in seconds

NOT_GIVEN

Returns:

Type Description
str

The generated chat completion content. If response_format is used, the format of the returned content will depend on the specified format. Otherwise, this will return a string containing the generated completion.

Source code in afnio/models/openai.py
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
def chat(
    self,
    *,
    messages: Iterable[ChatCompletionMessageParam],
    model: Union[str, ChatModel],
    audio: Union[Optional[ChatCompletionAudioParam], NotGiven] = NOT_GIVEN,
    frequency_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
    function_call: Union[
        completion_create_params.FunctionCall, NotGiven
    ] = NOT_GIVEN,
    functions: Union[
        Iterable[completion_create_params.Function], NotGiven
    ] = NOT_GIVEN,
    logit_bias: Union[Optional[Dict[str, int]], NotGiven] = NOT_GIVEN,
    logprobs: Union[Optional[bool], NotGiven] = NOT_GIVEN,
    max_completion_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
    max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
    metadata: Union[Optional[Metadata], NotGiven] = NOT_GIVEN,
    modalities: Union[
        Optional[List[Literal["text", "audio"]]], NotGiven
    ] = NOT_GIVEN,
    n: Union[Optional[int], NotGiven] = NOT_GIVEN,
    parallel_tool_calls: Union[bool, NotGiven] = NOT_GIVEN,
    prediction: Union[
        Optional[ChatCompletionPredictionContentParam], NotGiven
    ] = NOT_GIVEN,
    presence_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
    prompt_cache_key: Union[str, NotGiven] = NOT_GIVEN,
    reasoning_effort: Union[ReasoningEffort, NotGiven] = NOT_GIVEN,
    response_format: Union[
        completion_create_params.ResponseFormat, NotGiven
    ] = NOT_GIVEN,
    safety_identifier: Union[str, NotGiven] = NOT_GIVEN,
    seed: Union[Optional[int], NotGiven] = NOT_GIVEN,
    service_tier: Union[
        Optional[Literal["auto", "default", "flex", "scale", "priority"]], NotGiven
    ] = NOT_GIVEN,
    stop: Union[
        Union[Optional[str], SequenceNotStr[str], None], NotGiven
    ] = NOT_GIVEN,
    store: Union[Optional[bool], NotGiven] = NOT_GIVEN,
    # TODO: `stream` can be useful during inference, but forbid during training or backpropagation
    stream: Union[Optional[Literal[False]], NotGiven] = NOT_GIVEN,
    stream_options: Union[
        Optional[ChatCompletionStreamOptionsParam], NotGiven
    ] = NOT_GIVEN,
    temperature: Union[Optional[float], NotGiven] = NOT_GIVEN,
    tool_choice: Union[ChatCompletionToolChoiceOptionParam, NotGiven] = NOT_GIVEN,
    tools: Union[Iterable[ChatCompletionToolUnionParam], NotGiven] = NOT_GIVEN,
    top_logprobs: Union[Optional[int], NotGiven] = NOT_GIVEN,
    top_p: Union[Optional[float], NotGiven] = NOT_GIVEN,
    user: Union[str, NotGiven] = NOT_GIVEN,
    verbosity: Union[
        Optional[Literal["low", "medium", "high"]], NotGiven
    ] = NOT_GIVEN,
    web_search_options: Union[
        completion_create_params.WebSearchOptions, NotGiven
    ] = NOT_GIVEN,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Optional[Headers] = None,
    extra_query: Optional[Query] = None,
    extra_body: Optional[Body] = None,
    timeout: Optional[Union[float, httpx.Timeout, NotGiven]] = NOT_GIVEN,
) -> str:
    """Synchronously creates a model response for the given chat conversation.
    Learn more in the
    [text generation](https://platform.openai.com/docs/guides/text-generation),
    [vision](https://platform.openai.com/docs/guides/vision), and
    [audio](https://platform.openai.com/docs/guides/audio) guides.

    Parameter support can differ depending on the model used to generate the
    response, particularly for newer reasoning models. Parameters that are only
    supported for reasoning models are noted below. For the current state of
    unsupported parameters in reasoning models,
    [refer to the reasoning guide](https://platform.openai.com/docs/guides/reasoning).

    Args:
      messages: A list of messages comprising the conversation so far. Depending on the
          [model](https://platform.openai.com/docs/models) you use, different message
          types (modalities) are supported, like
          [text](https://platform.openai.com/docs/guides/text-generation),
          [images](https://platform.openai.com/docs/guides/vision), and
          [audio](https://platform.openai.com/docs/guides/audio).

      model: Model ID used to generate the response, like `gpt-4o` or `o3`. OpenAI offers a
          wide range of models with different capabilities, performance characteristics,
          and price points. Refer to the
          [model guide](https://platform.openai.com/docs/models) to browse and compare
          available models.

      audio: Parameters for audio output. Required when audio output is requested with
          `modalities: ["audio"]`.
          [Learn more](https://platform.openai.com/docs/guides/audio).

      frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their
          existing frequency in the text so far, decreasing the model's likelihood to
          repeat the same line verbatim.

      function_call: Deprecated in favor of `tool_choice`.

          Controls which (if any) function is called by the model.

          `none` means the model will not call a function and instead generates a message.

          `auto` means the model can pick between generating a message or calling a
          function.

          Specifying a particular function via `{"name": "my_function"}` forces the model
          to call that function.

          `none` is the default when no functions are present. `auto` is the default if
          functions are present.

      functions: Deprecated in favor of `tools`.

          A list of functions the model may generate JSON inputs for.

      logit_bias: Modify the likelihood of specified tokens appearing in the completion.

          Accepts a JSON object that maps tokens (specified by their token ID in the
          tokenizer) to an associated bias value from -100 to 100. Mathematically, the
          bias is added to the logits generated by the model prior to sampling. The exact
          effect will vary per model, but values between -1 and 1 should decrease or
          increase likelihood of selection; values like -100 or 100 should result in a ban
          or exclusive selection of the relevant token.

      logprobs: Whether to return log probabilities of the output tokens or not. If true,
          returns the log probabilities of each output token returned in the `content` of
          `message`.

      max_completion_tokens: An upper bound for the number of tokens that can be generated for a completion,
          including visible output tokens and
          [reasoning tokens](https://platform.openai.com/docs/guides/reasoning).

      max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat
          completion. This value can be used to control
          [costs](https://openai.com/api/pricing/) for text generated via API.

          This value is now deprecated in favor of `max_completion_tokens`, and is not
          compatible with
          [o-series models](https://platform.openai.com/docs/guides/reasoning).

      metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful
          for storing additional information about the object in a structured format, and
          querying for objects via API or the dashboard.

          Keys are strings with a maximum length of 64 characters. Values are strings with
          a maximum length of 512 characters.

      modalities: Output types that you would like the model to generate. Most models are capable
          of generating text, which is the default:

          `["text"]`

          The `gpt-4o-audio-preview` model can also be used to
          [generate audio](https://platform.openai.com/docs/guides/audio). To request that
          this model generate both text and audio responses, you can use:

          `["text", "audio"]`

      n: How many chat completion choices to generate for each input message. Note that
          you will be charged based on the number of generated tokens across all of the
          choices. Keep `n` as `1` to minimize costs.

      parallel_tool_calls: Whether to enable
          [parallel function calling](https://platform.openai.com/docs/guides/function-calling#configuring-parallel-function-calling)
          during tool use.

      prediction: Static predicted output content, such as the content of a text file that is
          being regenerated.

      presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on
          whether they appear in the text so far, increasing the model's likelihood to
          talk about new topics.

      prompt_cache_key: Used by OpenAI to cache responses for similar requests to optimize your cache
          hit rates. Replaces the `user` field.
          [Learn more](https://platform.openai.com/docs/guides/prompt-caching).

      reasoning_effort: Constrains effort on reasoning for
          [reasoning models](https://platform.openai.com/docs/guides/reasoning). Currently
          supported values are `minimal`, `low`, `medium`, and `high`. Reducing reasoning
          effort can result in faster responses and fewer tokens used on reasoning in a
          response.

      response_format: An object specifying the format that the model must output.

          Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured
          Outputs which ensures the model will match your supplied JSON schema. Learn more
          in the
          [Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs).

          Setting to `{ "type": "json_object" }` enables the older JSON mode, which
          ensures the message the model generates is valid JSON. Using `json_schema` is
          preferred for models that support it.

      safety_identifier: A stable identifier used to help detect users of your application that may be
          violating OpenAI's usage policies. The IDs should be a string that uniquely
          identifies each user. We recommend hashing their username or email address, in
          order to avoid sending us any identifying information.
          [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

      seed: This feature is in Beta. If specified, our system will make a best effort to
          sample deterministically, such that repeated requests with the same `seed` and
          parameters should return the same result. Determinism is not guaranteed, and you
          should refer to the `system_fingerprint` response parameter to monitor changes
          in the backend.

      service_tier: Specifies the processing type used for serving the request.

          - If set to 'auto', then the request will be processed with the service tier
            configured in the Project settings. Unless otherwise configured, the Project
            will use 'default'.
          - If set to 'default', then the request will be processed with the standard
            pricing and performance for the selected model.
          - If set to '[flex](https://platform.openai.com/docs/guides/flex-processing)' or
            '[priority](https://openai.com/api-priority-processing/)', then the request
            will be processed with the corresponding service tier.
          - When not set, the default behavior is 'auto'.

          When the `service_tier` parameter is set, the response body will include the
          `service_tier` value based on the processing mode actually used to serve the
          request. This response value may be different from the value set in the
          parameter.

      stop: Not supported with latest reasoning models `o3` and `o4-mini`.

          Up to 4 sequences where the API will stop generating further tokens. The
          returned text will not contain the stop sequence.

      store: Whether or not to store the output of this chat completion request for use in
          our [model distillation](https://platform.openai.com/docs/guides/distillation)
          or [evals](https://platform.openai.com/docs/guides/evals) products.

          Supports text and image inputs. Note: image inputs over 8MB will be dropped.

      stream: If set to true, the model response data will be streamed to the client as it is
          generated using
          [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
          See the
          [Streaming section below](https://platform.openai.com/docs/api-reference/chat/streaming)
          for more information, along with the
          [streaming responses](https://platform.openai.com/docs/guides/streaming-responses)
          guide for more information on how to handle the streaming events.

      stream_options: Options for streaming response. Only set this when you set `stream: true`.

      temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
          make the output more random, while lower values like 0.2 will make it more
          focused and deterministic. We generally recommend altering this or `top_p` but
          not both.

      tool_choice: Controls which (if any) tool is called by the model. `none` means the model will
          not call any tool and instead generates a message. `auto` means the model can
          pick between generating a message or calling one or more tools. `required` means
          the model must call one or more tools. Specifying a particular tool via
          `{"type": "function", "function": {"name": "my_function"}}` forces the model to
          call that tool.

          `none` is the default when no tools are present. `auto` is the default if tools
          are present.

      tools: A list of tools the model may call. You can provide either
          [custom tools](https://platform.openai.com/docs/guides/function-calling#custom-tools)
          or [function tools](https://platform.openai.com/docs/guides/function-calling).

      top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to
          return at each token position, each with an associated log probability.
          `logprobs` must be set to `true` if this parameter is used.

      top_p: An alternative to sampling with temperature, called nucleus sampling, where the
          model considers the results of the tokens with top_p probability mass. So 0.1
          means only the tokens comprising the top 10% probability mass are considered.

          We generally recommend altering this or `temperature` but not both.

      user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
          `prompt_cache_key` instead to maintain caching optimizations. A stable
          identifier for your end-users. Used to boost cache hit rates by better bucketing
          similar requests and to help OpenAI detect and prevent abuse.
          [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

      verbosity: Constrains the verbosity of the model's response. Lower values will result in
          more concise responses, while higher values will result in more verbose
          responses. Currently supported values are `low`, `medium`, and `high`.

      web_search_options: This tool searches the web for relevant results to use in a response. Learn more
          about the
          [web search tool](https://platform.openai.com/docs/guides/tools-web-search?api-mode=chat).

      extra_headers: Send extra headers

      extra_query: Add additional query parameters to the request

      extra_body: Add additional JSON properties to the request

      timeout: Override the client-level default timeout for this request, in seconds

    Returns:
      The generated chat completion content. If `response_format` is used, \
      the format of the returned content will depend on the specified format. \
      Otherwise, this will return a string containing the generated completion.
    """  # noqa: E501
    # TODO: handle `n > 1` that returns multiple completions
    if isinstance(n, int) and n > 1:
        raise ValueError("n > 1 is not supported for chat completions.")

    # Forward every keyword argument unchanged to the underlying OpenAI SDK
    # client; this method adds no request-shaping logic of its own.
    response = self._client.chat.completions.create(
        messages=messages,
        model=model,
        audio=audio,
        frequency_penalty=frequency_penalty,
        function_call=function_call,
        functions=functions,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_completion_tokens=max_completion_tokens,
        max_tokens=max_tokens,
        metadata=metadata,
        modalities=modalities,
        n=n,
        parallel_tool_calls=parallel_tool_calls,
        prediction=prediction,
        presence_penalty=presence_penalty,
        prompt_cache_key=prompt_cache_key,
        reasoning_effort=reasoning_effort,
        response_format=response_format,
        safety_identifier=safety_identifier,
        seed=seed,
        service_tier=service_tier,
        stop=stop,
        store=store,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        tool_choice=tool_choice,
        tools=tools,
        top_logprobs=top_logprobs,
        top_p=top_p,
        user=user,
        verbosity=verbosity,
        web_search_options=web_search_options,
        extra_headers=extra_headers,
        extra_query=extra_query,
        extra_body=extra_body,
        timeout=timeout,
    )

    # Update usage statistics
    if hasattr(response, "usage") and response.usage:
        self.update_usage(response.usage, model)

    # NOTE(review): assumes the response has at least one choice and that the
    # message carries text `content`; for tool-call responses `content` may be
    # None — confirm expected behavior for tool-use flows.
    return response.choices[0].message.content

embed(input)

Synchronous method to generate embeddings for the given input texts.

Parameters:

Name Type Description Default
input list[str]

A list of input strings for which embeddings should be generated.

required

Returns:

Type Description
list[list[float]]

A list of embeddings, where each embedding is represented as a list of floats corresponding to the input strings.

Source code in afnio/models/openai.py
528
529
530
531
532
533
534
535
536
537
538
539
def embed(self, input: List[str]) -> List[List[float]]:
    """Generate embeddings for the given input texts (blocking call).

    Args:
        input: Input strings to embed.

    Returns:
        One embedding (a list of floats) per input string, in the same
        order as the inputs.
    """
    raise NotImplementedError

update_usage(usage, model_name=None)

Updates the internal usage counters with values from a new API response.

Parameters:

Name Type Description Default
usage CompletionUsage

The usage object returned by the OpenAI API.

required
model_name str

The name of the model for which the usage is being updated. If None, cost is copied from usage if available.

None
Source code in afnio/models/openai.py
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
def update_usage(self, usage: "CompletionUsage", model_name: Optional[str] = None) -> None:
    """Updates the internal usage counters with values from a new API response.

    Args:
        usage: The usage object returned by the OpenAI API, either as a
            ``CompletionUsage`` model or an already-serialized dict.
        model_name: The name of the model for which the usage is being
            updated. If None, cost is copied from ``usage`` if available.
    """
    # Bug fix: the previous code called `self._usage.update(...)` on an
    # attribute that did not exist yet, which raised AttributeError.
    # Assign a fresh copy instead.
    if not hasattr(self, "_usage"):
        self._usage = copy.deepcopy(INITIAL_USAGE)  # Ensure a fresh copy

    # Duck-type the conversion: any pydantic-style usage object is dumped
    # to a dict, while plain dicts pass through unchanged.
    if not isinstance(usage, dict):
        usage = usage.model_dump()

    # Update core token usage fields
    self._usage["completion_tokens"] += usage.get("completion_tokens", 0)
    self._usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
    self._usage["total_tokens"] += usage.get("total_tokens", 0)

    # Update prompt tokens details. `or {}` guards against an explicit None
    # value, which `model_dump()` emits when the API omits the details.
    prompt_tokens_details = usage.get("prompt_tokens_details") or {}
    self._usage["prompt_tokens_details"][
        "cached_tokens"
    ] += prompt_tokens_details.get("cached_tokens", 0)
    self._usage["prompt_tokens_details"][
        "audio_tokens"
    ] += prompt_tokens_details.get("audio_tokens", 0)

    # Update completion tokens details (same None guard as above)
    completion_tokens_details = usage.get("completion_tokens_details") or {}
    for detail_key in (
        "reasoning_tokens",
        "audio_tokens",
        "accepted_prediction_tokens",
        "rejected_prediction_tokens",
    ):
        self._usage["completion_tokens_details"][
            detail_key
        ] += completion_tokens_details.get(detail_key, 0)

    # Update cost
    if model_name is not None:
        pricing = _get_pricing_for_model(self.provider, model_name)
        cost = _calculate_cost(usage, pricing)
        self._usage["cost"]["amount"] += cost
    else:
        # If cost is present in usage, copy it directly (guarding against
        # a missing or None "cost" entry).
        cost_info = usage.get("cost") or {}
        if "amount" in cost_info:
            self._usage["cost"]["amount"] = cost_info["amount"]

afnio.models.openai.AsyncOpenAI

Bases: TextCompletionModel, ChatCompletionModel, EmbeddingModel, AsyncOpenAI

OpenAI asynchronous client to perform multiple language model operations.

Source code in afnio/models/openai.py
 596
 597
 598
 599
 600
 601
 602
 603
 604
 605
 606
 607
 608
 609
 610
 611
 612
 613
 614
 615
 616
 617
 618
 619
 620
 621
 622
 623
 624
 625
 626
 627
 628
 629
 630
 631
 632
 633
 634
 635
 636
 637
 638
 639
 640
 641
 642
 643
 644
 645
 646
 647
 648
 649
 650
 651
 652
 653
 654
 655
 656
 657
 658
 659
 660
 661
 662
 663
 664
 665
 666
 667
 668
 669
 670
 671
 672
 673
 674
 675
 676
 677
 678
 679
 680
 681
 682
 683
 684
 685
 686
 687
 688
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
class AsyncOpenAI(
    TextCompletionModel,
    ChatCompletionModel,
    EmbeddingModel,
    AsyncOpenAICli,
):
    """OpenAI asynchronous client to perform multiple language model operations.

    Combines the text-completion, chat-completion, and embedding interfaces
    with the official async OpenAI client (imported as ``AsyncOpenAICli``).
    """

    # Class-level stubs so Sphinx/autodoc can inspect these attributes safely.
    # They default to None and are not assigned directly in __init__ below.
    api_key: Optional[str] = None
    organization: Optional[str] = None
    project: Optional[str] = None
    webhook_secret: Optional[str] = None
    websocket_base_url: Optional[Union[str, httpx.URL]] = None

    def __init__(
        self,
        api_key: Optional[str] = None,
        organization: Optional[str] = None,
        project: Optional[str] = None,
        base_url: Optional[Union[str, httpx.URL]] = None,
        websocket_base_url: Optional[Union[str, httpx.URL]] = None,
        timeout: Union[float, httpx.Timeout, None, NotGiven] = NOT_GIVEN,
        max_retries: int = DEFAULT_MAX_RETRIES,
        default_headers: Optional[Mapping[str, str]] = None,
        default_query: Optional[Mapping[str, object]] = None,
        http_client: Optional[httpx.AsyncClient] = None,
    ):
        """Initialize the OpenAI asynchronous client.

        Args:
            api_key: The API key for authentication. Falls back to the
                ``OPENAI_API_KEY`` environment variable when not provided.
            organization: The organization ID.
            project: The project ID.
            base_url: The base URL for the API.
            websocket_base_url: The base URL for the WebSocket.
            timeout: The timeout for requests.
            max_retries: The maximum number of retries for requests.
            default_headers: Default headers for requests.
            default_query: Default query parameters for requests.
            http_client: The HTTP client instance.
        """
        usage = copy.deepcopy(INITIAL_USAGE)  # Ensure a fresh copy

        # Resolve the API key exactly once so the stored config and the
        # wrapped client are guaranteed to agree (previously the environment
        # variable was read twice).
        resolved_api_key = api_key or os.getenv("OPENAI_API_KEY")

        # Validate and build config. `http_client` is deliberately excluded —
        # presumably because a live client object is not serializable; confirm
        # against _validate_config_param if this ever changes.
        config = {
            "api_key": resolved_api_key,
            "organization": organization,
            "project": project,
            "base_url": base_url,
            "websocket_base_url": websocket_base_url,
            "timeout": timeout,
            "max_retries": max_retries,
            "default_headers": default_headers,
            "default_query": default_query,
        }
        # Remove None and NOT_GIVEN values
        config = {
            k: v for k, v in config.items() if v is not None and v is not NOT_GIVEN
        }
        # Validate serializability
        for k, v in config.items():
            _validate_config_param(k, v)

        self._aclient = AsyncOpenAICli(
            api_key=resolved_api_key,
            organization=organization,
            project=project,
            base_url=base_url,
            websocket_base_url=websocket_base_url,
            timeout=timeout,
            max_retries=max_retries,
            default_headers=default_headers,
            default_query=default_query,
            http_client=http_client,
        )
        super().__init__(provider=PROVIDER, config=config, usage=usage)

    # TODO: Finalize implementation
    async def acomplete(self, prompt: str) -> str:
        """Asynchronously generate a completion for *prompt*.

        Args:
            prompt: Input text the model should complete.

        Returns:
            The generated completion as a string.
        """
        raise NotImplementedError

    async def achat(
        self,
        *,
        messages: Iterable[ChatCompletionMessageParam],
        model: Union[str, ChatModel],
        audio: Union[Optional[ChatCompletionAudioParam], NotGiven] = NOT_GIVEN,
        frequency_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
        function_call: Union[
            completion_create_params.FunctionCall, NotGiven
        ] = NOT_GIVEN,
        functions: Union[
            Iterable[completion_create_params.Function], NotGiven
        ] = NOT_GIVEN,
        logit_bias: Union[Optional[Dict[str, int]], NotGiven] = NOT_GIVEN,
        logprobs: Union[Optional[bool], NotGiven] = NOT_GIVEN,
        max_completion_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
        max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
        metadata: Union[Optional[Metadata], NotGiven] = NOT_GIVEN,
        modalities: Union[
            Optional[List[Literal["text", "audio"]]], NotGiven
        ] = NOT_GIVEN,
        n: Union[Optional[int], NotGiven] = NOT_GIVEN,
        parallel_tool_calls: Union[bool, NotGiven] = NOT_GIVEN,
        prediction: Union[
            Optional[ChatCompletionPredictionContentParam], NotGiven
        ] = NOT_GIVEN,
        presence_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
        prompt_cache_key: Union[str, NotGiven] = NOT_GIVEN,
        reasoning_effort: Union[ReasoningEffort, NotGiven] = NOT_GIVEN,
        response_format: Union[
            completion_create_params.ResponseFormat, NotGiven
        ] = NOT_GIVEN,
        safety_identifier: Union[str, NotGiven] = NOT_GIVEN,
        seed: Union[Optional[int], NotGiven] = NOT_GIVEN,
        service_tier: Union[
            Optional[Literal["auto", "default", "flex", "scale", "priority"]], NotGiven
        ] = NOT_GIVEN,
        stop: Union[
            Union[Optional[str], SequenceNotStr[str], None], NotGiven
        ] = NOT_GIVEN,
        store: Union[Optional[bool], NotGiven] = NOT_GIVEN,
        # TODO: `stream` can be useful during inference, but forbid during training or backpropagation
        stream: Union[Optional[Literal[False]], NotGiven] = NOT_GIVEN,
        stream_options: Union[
            Optional[ChatCompletionStreamOptionsParam], NotGiven
        ] = NOT_GIVEN,
        temperature: Union[Optional[float], NotGiven] = NOT_GIVEN,
        tool_choice: Union[ChatCompletionToolChoiceOptionParam, NotGiven] = NOT_GIVEN,
        tools: Union[Iterable[ChatCompletionToolUnionParam], NotGiven] = NOT_GIVEN,
        top_logprobs: Union[Optional[int], NotGiven] = NOT_GIVEN,
        top_p: Union[Optional[float], NotGiven] = NOT_GIVEN,
        user: Union[str, NotGiven] = NOT_GIVEN,
        verbosity: Union[
            Optional[Literal["low", "medium", "high"]], NotGiven
        ] = NOT_GIVEN,
        web_search_options: Union[
            completion_create_params.WebSearchOptions, NotGiven
        ] = NOT_GIVEN,
        # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
        # The extra values given here take precedence over values defined on the client or passed to this method.
        extra_headers: Optional[Headers] = None,
        extra_query: Optional[Query] = None,
        extra_body: Optional[Body] = None,
        timeout: Optional[Union[float, httpx.Timeout, NotGiven]] = NOT_GIVEN,
    ) -> str:
        """Asynchronously creates a model response for the given chat conversation.
        Learn more in the
        [text generation](https://platform.openai.com/docs/guides/text-generation),
        [vision](https://platform.openai.com/docs/guides/vision), and
        [audio](https://platform.openai.com/docs/guides/audio) guides.

        Parameter support can differ depending on the model used to generate the
        response, particularly for newer reasoning models. Parameters that are only
        supported for reasoning models are noted below. For the current state of
        unsupported parameters in reasoning models,
        [refer to the reasoning guide](https://platform.openai.com/docs/guides/reasoning).

        Args:
          messages: A list of messages comprising the conversation so far. Depending on the
              [model](https://platform.openai.com/docs/models) you use, different message
              types (modalities) are supported, like
              [text](https://platform.openai.com/docs/guides/text-generation),
              [images](https://platform.openai.com/docs/guides/vision), and
              [audio](https://platform.openai.com/docs/guides/audio).

          model: Model ID used to generate the response, like `gpt-4o` or `o3`. OpenAI offers a
              wide range of models with different capabilities, performance characteristics,
              and price points. Refer to the
              [model guide](https://platform.openai.com/docs/models) to browse and compare
              available models.

          audio: Parameters for audio output. Required when audio output is requested with
              `modalities: ["audio"]`.
              [Learn more](https://platform.openai.com/docs/guides/audio).

          frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their
              existing frequency in the text so far, decreasing the model's likelihood to
              repeat the same line verbatim.

          function_call: Deprecated in favor of `tool_choice`.

              Controls which (if any) function is called by the model.

              `none` means the model will not call a function and instead generates a message.

              `auto` means the model can pick between generating a message or calling a
              function.

              Specifying a particular function via `{"name": "my_function"}` forces the model
              to call that function.

              `none` is the default when no functions are present. `auto` is the default if
              functions are present.

          functions: Deprecated in favor of `tools`.

              A list of functions the model may generate JSON inputs for.

          logit_bias: Modify the likelihood of specified tokens appearing in the completion.

              Accepts a JSON object that maps tokens (specified by their token ID in the
              tokenizer) to an associated bias value from -100 to 100. Mathematically, the
              bias is added to the logits generated by the model prior to sampling. The exact
              effect will vary per model, but values between -1 and 1 should decrease or
              increase likelihood of selection; values like -100 or 100 should result in a ban
              or exclusive selection of the relevant token.

          logprobs: Whether to return log probabilities of the output tokens or not. If true,
              returns the log probabilities of each output token returned in the `content` of
              `message`.

          max_completion_tokens: An upper bound for the number of tokens that can be generated for a completion,
              including visible output tokens and
              [reasoning tokens](https://platform.openai.com/docs/guides/reasoning).

          max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat
              completion. This value can be used to control
              [costs](https://openai.com/api/pricing/) for text generated via API.

              This value is now deprecated in favor of `max_completion_tokens`, and is not
              compatible with
              [o-series models](https://platform.openai.com/docs/guides/reasoning).

          metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful
              for storing additional information about the object in a structured format, and
              querying for objects via API or the dashboard.

              Keys are strings with a maximum length of 64 characters. Values are strings with
              a maximum length of 512 characters.

          modalities: Output types that you would like the model to generate. Most models are capable
              of generating text, which is the default:

              `["text"]`

              The `gpt-4o-audio-preview` model can also be used to
              [generate audio](https://platform.openai.com/docs/guides/audio). To request that
              this model generate both text and audio responses, you can use:

              `["text", "audio"]`

          n: How many chat completion choices to generate for each input message. Note that
              you will be charged based on the number of generated tokens across all of the
              choices. Keep `n` as `1` to minimize costs.

          parallel_tool_calls: Whether to enable
              [parallel function calling](https://platform.openai.com/docs/guides/function-calling#configuring-parallel-function-calling)
              during tool use.

          prediction: Static predicted output content, such as the content of a text file that is
              being regenerated.

          presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on
              whether they appear in the text so far, increasing the model's likelihood to
              talk about new topics.

          prompt_cache_key: Used by OpenAI to cache responses for similar requests to optimize your cache
              hit rates. Replaces the `user` field.
              [Learn more](https://platform.openai.com/docs/guides/prompt-caching).

          reasoning_effort: Constrains effort on reasoning for
              [reasoning models](https://platform.openai.com/docs/guides/reasoning). Currently
              supported values are `minimal`, `low`, `medium`, and `high`. Reducing reasoning
              effort can result in faster responses and fewer tokens used on reasoning in a
              response.

          response_format: An object specifying the format that the model must output.

              Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured
              Outputs which ensures the model will match your supplied JSON schema. Learn more
              in the
              [Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs).

              Setting to `{ "type": "json_object" }` enables the older JSON mode, which
              ensures the message the model generates is valid JSON. Using `json_schema` is
              preferred for models that support it.

          safety_identifier: A stable identifier used to help detect users of your application that may be
              violating OpenAI's usage policies. The IDs should be a string that uniquely
              identifies each user. We recommend hashing their username or email address, in
              order to avoid sending us any identifying information.
              [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

          seed: This feature is in Beta. If specified, our system will make a best effort to
              sample deterministically, such that repeated requests with the same `seed` and
              parameters should return the same result. Determinism is not guaranteed, and you
              should refer to the `system_fingerprint` response parameter to monitor changes
              in the backend.

          service_tier: Specifies the processing type used for serving the request.

              - If set to 'auto', then the request will be processed with the service tier
                configured in the Project settings. Unless otherwise configured, the Project
                will use 'default'.
              - If set to 'default', then the request will be processed with the standard
                pricing and performance for the selected model.
              - If set to '[flex](https://platform.openai.com/docs/guides/flex-processing)' or
                '[priority](https://openai.com/api-priority-processing/)', then the request
                will be processed with the corresponding service tier.
              - When not set, the default behavior is 'auto'.

              When the `service_tier` parameter is set, the response body will include the
              `service_tier` value based on the processing mode actually used to serve the
              request. This response value may be different from the value set in the
              parameter.

          stop: Not supported with latest reasoning models `o3` and `o4-mini`.

              Up to 4 sequences where the API will stop generating further tokens. The
              returned text will not contain the stop sequence.

          store: Whether or not to store the output of this chat completion request for use in
              our [model distillation](https://platform.openai.com/docs/guides/distillation)
              or [evals](https://platform.openai.com/docs/guides/evals) products.

              Supports text and image inputs. Note: image inputs over 8MB will be dropped.

          stream: If set to true, the model response data will be streamed to the client as it is
              generated using
              [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
              See the
              [Streaming section below](https://platform.openai.com/docs/api-reference/chat/streaming)
              for more information, along with the
              [streaming responses](https://platform.openai.com/docs/guides/streaming-responses)
              guide for more information on how to handle the streaming events.

          stream_options: Options for streaming response. Only set this when you set `stream: true`.

          temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
              make the output more random, while lower values like 0.2 will make it more
              focused and deterministic. We generally recommend altering this or `top_p` but
              not both.

          tool_choice: Controls which (if any) tool is called by the model. `none` means the model will
              not call any tool and instead generates a message. `auto` means the model can
              pick between generating a message or calling one or more tools. `required` means
              the model must call one or more tools. Specifying a particular tool via
              `{"type": "function", "function": {"name": "my_function"}}` forces the model to
              call that tool.

              `none` is the default when no tools are present. `auto` is the default if tools
              are present.

          tools: A list of tools the model may call. You can provide either
              [custom tools](https://platform.openai.com/docs/guides/function-calling#custom-tools)
              or [function tools](https://platform.openai.com/docs/guides/function-calling).

          top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to
              return at each token position, each with an associated log probability.
              `logprobs` must be set to `true` if this parameter is used.

          top_p: An alternative to sampling with temperature, called nucleus sampling, where the
              model considers the results of the tokens with top_p probability mass. So 0.1
              means only the tokens comprising the top 10% probability mass are considered.

              We generally recommend altering this or `temperature` but not both.

          user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
              `prompt_cache_key` instead to maintain caching optimizations. A stable
              identifier for your end-users. Used to boost cache hit rates by better bucketing
              similar requests and to help OpenAI detect and prevent abuse.
              [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

          verbosity: Constrains the verbosity of the model's response. Lower values will result in
              more concise responses, while higher values will result in more verbose
              responses. Currently supported values are `low`, `medium`, and `high`.

          web_search_options: This tool searches the web for relevant results to use in a response. Learn more
              about the
              [web search tool](https://platform.openai.com/docs/guides/tools-web-search?api-mode=chat).

          extra_headers: Send extra headers

          extra_query: Add additional query parameters to the request

          extra_body: Add additional JSON properties to the request

          timeout: Override the client-level default timeout for this request, in seconds

        Returns:
          The generated chat completion content. If `response_format` is used, \
          the format of the returned content will depend on the specified format. \
          Otherwise, this will return a string containing the generated completion.
        """  # noqa: E501
        # TODO: handle `n > 1`, which returns multiple completions
        if isinstance(n, int) and n > 1:
            raise ValueError("n > 1 is not supported for async chat completions.")

        # Forward every argument unchanged to the wrapped async OpenAI client.
        response = await self._aclient.chat.completions.create(
            messages=messages,
            model=model,
            audio=audio,
            frequency_penalty=frequency_penalty,
            function_call=function_call,
            functions=functions,
            logit_bias=logit_bias,
            logprobs=logprobs,
            max_completion_tokens=max_completion_tokens,
            max_tokens=max_tokens,
            metadata=metadata,
            modalities=modalities,
            n=n,
            parallel_tool_calls=parallel_tool_calls,
            prediction=prediction,
            presence_penalty=presence_penalty,
            prompt_cache_key=prompt_cache_key,
            reasoning_effort=reasoning_effort,
            response_format=response_format,
            safety_identifier=safety_identifier,
            seed=seed,
            service_tier=service_tier,
            stop=stop,
            store=store,
            stream=stream,
            stream_options=stream_options,
            temperature=temperature,
            tool_choice=tool_choice,
            tools=tools,
            top_logprobs=top_logprobs,
            top_p=top_p,
            user=user,
            verbosity=verbosity,
            web_search_options=web_search_options,
            extra_headers=extra_headers,
            extra_query=extra_query,
            extra_body=extra_body,
            timeout=timeout,
        )

        # Update usage statistics
        if hasattr(response, "usage") and response.usage:
            self.update_usage(response.usage, model)

        # NOTE(review): assumes a non-streaming response; `content` may be None
        # when the model returns tool calls — confirm upstream handling.
        return response.choices[0].message.content

    # TODO: Finalize implementation
    async def aembed(self, input: List[str]) -> List[List[float]]:
        """Asynchronously generate embeddings for the given input texts.

        Args:
            input: Input strings to embed.

        Returns:
            One embedding (a list of floats) per input string, in the
            same order as the inputs.
        """
        raise NotImplementedError

    def update_usage(self, usage: "CompletionUsage", model_name: Optional[str] = None) -> None:
        """Updates the internal usage counters with values from a new API response.

        Args:
            usage: The usage object returned by the OpenAI API, either as a
                ``CompletionUsage`` model or an already-serialized dict.
            model_name: The name of the model for which the usage is being
                updated. If None, cost is copied from ``usage`` if available.
        """
        # Bug fix: the previous code called `self._usage.update(...)` on an
        # attribute that did not exist yet, which raised AttributeError.
        # Assign a fresh copy instead.
        if not hasattr(self, "_usage"):
            self._usage = copy.deepcopy(INITIAL_USAGE)  # Ensure a fresh copy

        # Duck-type the conversion: any pydantic-style usage object is dumped
        # to a dict, while plain dicts pass through unchanged.
        if not isinstance(usage, dict):
            usage = usage.model_dump()

        # Update core token usage fields
        self._usage["completion_tokens"] += usage.get("completion_tokens", 0)
        self._usage["prompt_tokens"] += usage.get("prompt_tokens", 0)
        self._usage["total_tokens"] += usage.get("total_tokens", 0)

        # Update prompt tokens details. `or {}` guards against an explicit None
        # value, which `model_dump()` emits when the API omits the details.
        prompt_tokens_details = usage.get("prompt_tokens_details") or {}
        self._usage["prompt_tokens_details"][
            "cached_tokens"
        ] += prompt_tokens_details.get("cached_tokens", 0)
        self._usage["prompt_tokens_details"][
            "audio_tokens"
        ] += prompt_tokens_details.get("audio_tokens", 0)

        # Update completion tokens details (same None guard as above)
        completion_tokens_details = usage.get("completion_tokens_details") or {}
        for detail_key in (
            "reasoning_tokens",
            "audio_tokens",
            "accepted_prediction_tokens",
            "rejected_prediction_tokens",
        ):
            self._usage["completion_tokens_details"][
                detail_key
            ] += completion_tokens_details.get(detail_key, 0)

        # Update cost
        if model_name is not None:
            pricing = _get_pricing_for_model(self.provider, model_name)
            cost = _calculate_cost(usage, pricing)
            self._usage["cost"]["amount"] += cost
        else:
            # If cost is present in usage, copy it directly (guarding against
            # a missing or None "cost" entry).
            cost_info = usage.get("cost") or {}
            if "amount" in cost_info:
                self._usage["cost"]["amount"] = cost_info["amount"]

__init__(api_key=None, organization=None, project=None, base_url=None, websocket_base_url=None, timeout=NOT_GIVEN, max_retries=DEFAULT_MAX_RETRIES, default_headers=None, default_query=None, http_client=None)

Initialize the OpenAI asynchronous client.

Parameters:

Name Type Description Default
api_key str | None

The API key for authentication.

None
organization str | None

The organization ID.

None
project str | None

The project ID.

None
base_url str | URL | None

The base URL for the API.

None
websocket_base_url str | URL | None

The base URL for the WebSocket.

None
timeout float | Timeout | None | NotGiven

The timeout for requests.

NOT_GIVEN
max_retries int

The maximum number of retries for requests.

DEFAULT_MAX_RETRIES
default_headers Mapping[str, str] | None

Default headers for requests.

None
default_query Mapping[str, object] | None

Default query parameters for requests.

None
http_client AsyncClient | None

The HTTP client instance.

None
Source code in afnio/models/openai.py
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
def __init__(
    self,
    api_key: Optional[str] = None,
    organization: Optional[str] = None,
    project: Optional[str] = None,
    base_url: Optional[Union[str, httpx.URL]] = None,
    websocket_base_url: Optional[Union[str, httpx.URL]] = None,
    timeout: Union[float, httpx.Timeout, None, NotGiven] = NOT_GIVEN,
    max_retries: int = DEFAULT_MAX_RETRIES,
    default_headers: Optional[Mapping[str, str]] = None,
    default_query: Optional[Mapping[str, object]] = None,
    http_client: Optional[httpx.AsyncClient] = None,
):
    """Initialize the OpenAI asynchronous client.

    Args:
        api_key: The API key for authentication. Falls back to the
            ``OPENAI_API_KEY`` environment variable when omitted.
        organization: The organization ID.
        project: The project ID.
        base_url: The base URL for the API.
        websocket_base_url: The base URL for the WebSocket.
        timeout: The timeout for requests.
        max_retries: The maximum number of retries for requests.
        default_headers: Default headers for requests.
        default_query: Default query parameters for requests.
        http_client: The HTTP client instance.
    """
    # Each instance must start from its own usage counters, never a shared one.
    fresh_usage = copy.deepcopy(INITIAL_USAGE)

    # Resolve the key once so the constructor config and the underlying
    # client are guaranteed to see the same value.
    resolved_api_key = api_key or os.getenv("OPENAI_API_KEY")

    # Gather the serializable client settings. `http_client` is deliberately
    # excluded: a live HTTP client object is not a serializable config value.
    candidate_config = {
        "api_key": resolved_api_key,
        "organization": organization,
        "project": project,
        "base_url": base_url,
        "websocket_base_url": websocket_base_url,
        "timeout": timeout,
        "max_retries": max_retries,
        "default_headers": default_headers,
        "default_query": default_query,
    }

    # Keep only values that were actually provided, validating each one
    # for serializability as it is accepted.
    config = {}
    for key, value in candidate_config.items():
        if value is None or value is NOT_GIVEN:
            continue
        _validate_config_param(key, value)
        config[key] = value

    self._aclient = AsyncOpenAICli(
        api_key=resolved_api_key,
        organization=organization,
        project=project,
        base_url=base_url,
        websocket_base_url=websocket_base_url,
        timeout=timeout,
        max_retries=max_retries,
        default_headers=default_headers,
        default_query=default_query,
        http_client=http_client,
    )
    super().__init__(provider=PROVIDER, config=config, usage=fresh_usage)

acomplete(prompt) async

Asynchronous method to generate a completion for the given prompt.

Parameters:

Name Type Description Default
prompt str

The input text for which the model should generate a completion.

required

Returns:

Type Description
str

A string containing the generated completion.

Source code in afnio/models/openai.py
677
678
679
680
681
682
683
684
685
686
687
async def acomplete(self, prompt: str) -> str:
    """Asynchronously generate a completion for the given prompt.

    Not yet implemented for this client.

    Args:
        prompt: The input text for which the model should generate a completion.

    Returns:
        A string containing the generated completion.

    Raises:
        NotImplementedError: Always; text completions are not supported yet.
    """
    raise NotImplementedError

achat(*, messages, model, audio=NOT_GIVEN, frequency_penalty=NOT_GIVEN, function_call=NOT_GIVEN, functions=NOT_GIVEN, logit_bias=NOT_GIVEN, logprobs=NOT_GIVEN, max_completion_tokens=NOT_GIVEN, max_tokens=NOT_GIVEN, metadata=NOT_GIVEN, modalities=NOT_GIVEN, n=NOT_GIVEN, parallel_tool_calls=NOT_GIVEN, prediction=NOT_GIVEN, presence_penalty=NOT_GIVEN, prompt_cache_key=NOT_GIVEN, reasoning_effort=NOT_GIVEN, response_format=NOT_GIVEN, safety_identifier=NOT_GIVEN, seed=NOT_GIVEN, service_tier=NOT_GIVEN, stop=NOT_GIVEN, store=NOT_GIVEN, stream=NOT_GIVEN, stream_options=NOT_GIVEN, temperature=NOT_GIVEN, tool_choice=NOT_GIVEN, tools=NOT_GIVEN, top_logprobs=NOT_GIVEN, top_p=NOT_GIVEN, user=NOT_GIVEN, verbosity=NOT_GIVEN, web_search_options=NOT_GIVEN, extra_headers=None, extra_query=None, extra_body=None, timeout=NOT_GIVEN) async

Asynchronously creates a model response for the given chat conversation. Learn more in the text generation, vision, and audio guides.

Parameter support can differ depending on the model used to generate the response, particularly for newer reasoning models. Parameters that are only supported for reasoning models are noted below. For the current state of unsupported parameters in reasoning models, refer to the reasoning guide.

Parameters:

Name Type Description Default
messages Iterable[ChatCompletionMessageParam]

A list of messages comprising the conversation so far. Depending on the model you use, different message types (modalities) are supported, like text, images, and audio.

required
model str | ChatModel

Model ID used to generate the response, like gpt-4o or o3. OpenAI offers a wide range of models with different capabilities, performance characteristics, and price points. Refer to the model guide to browse and compare available models.

required
audio ChatCompletionAudioParam | None | NotGiven

Parameters for audio output. Required when audio output is requested with modalities: ["audio"]. Learn more.

NOT_GIVEN
frequency_penalty float | None | NotGiven

Number between -2.0 and 2.0. Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.

NOT_GIVEN
function_call FunctionCall | NotGiven

Deprecated in favor of tool_choice.

Controls which (if any) function is called by the model.

none means the model will not call a function and instead generates a message.

auto means the model can pick between generating a message or calling a function.

Specifying a particular function via {"name": "my_function"} forces the model to call that function.

none is the default when no functions are present. auto is the default if functions are present.

NOT_GIVEN
functions Iterable[Function] | NotGiven

Deprecated in favor of tools.

A list of functions the model may generate JSON inputs for.

NOT_GIVEN
logit_bias dict[str, int] | None | NotGiven

Modify the likelihood of specified tokens appearing in the completion.

Accepts a JSON object that maps tokens (specified by their token ID in the tokenizer) to an associated bias value from -100 to 100. Mathematically, the bias is added to the logits generated by the model prior to sampling. The exact effect will vary per model, but values between -1 and 1 should decrease or increase likelihood of selection; values like -100 or 100 should result in a ban or exclusive selection of the relevant token.

NOT_GIVEN
logprobs bool | None | NotGiven

Whether to return log probabilities of the output tokens or not. If true, returns the log probabilities of each output token returned in the content of message.

NOT_GIVEN
max_completion_tokens int | None | NotGiven

An upper bound for the number of tokens that can be generated for a completion, including visible output tokens and reasoning tokens.

NOT_GIVEN
max_tokens int | None | NotGiven

The maximum number of tokens that can be generated in the chat completion. This value can be used to control costs for text generated via API.

This value is now deprecated in favor of max_completion_tokens, and is not compatible with o-series models.

NOT_GIVEN
metadata Metadata | None | NotGiven

Set of 16 key-value pairs that can be attached to an object. This can be useful for storing additional information about the object in a structured format, and querying for objects via API or the dashboard.

Keys are strings with a maximum length of 64 characters. Values are strings with a maximum length of 512 characters.

NOT_GIVEN
modalities list[Literal['text', 'audio']] | None | NotGiven

Output types that you would like the model to generate. Most models are capable of generating text, which is the default:

["text"]

The gpt-4o-audio-preview model can also be used to generate audio. To request that this model generate both text and audio responses, you can use:

["text", "audio"]

NOT_GIVEN
n int | None | NotGiven

How many chat completion choices to generate for each input message. Note that you will be charged based on the number of generated tokens across all of the choices. Keep n as 1 to minimize costs.

NOT_GIVEN
parallel_tool_calls bool | NotGiven

Whether to enable parallel function calling during tool use.

NOT_GIVEN
prediction ChatCompletionPredictionContentParam | None | NotGiven

Static predicted output content, such as the content of a text file that is being regenerated.

NOT_GIVEN
presence_penalty float | None | NotGiven

Number between -2.0 and 2.0. Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.

NOT_GIVEN
prompt_cache_key str | NotGiven

Used by OpenAI to cache responses for similar requests to optimize your cache hit rates. Replaces the user field. Learn more.

NOT_GIVEN
reasoning_effort ReasoningEffort | NotGiven

Constrains effort on reasoning for reasoning models. Currently supported values are minimal, low, medium, and high. Reducing reasoning effort can result in faster responses and fewer tokens used on reasoning in a response.

NOT_GIVEN
response_format ResponseFormat | NotGiven

An object specifying the format that the model must output.

Setting to { "type": "json_schema", "json_schema": {...} } enables Structured Outputs which ensures the model will match your supplied JSON schema. Learn more in the Structured Outputs guide.

Setting to { "type": "json_object" } enables the older JSON mode, which ensures the message the model generates is valid JSON. Using json_schema is preferred for models that support it.

NOT_GIVEN
safety_identifier str | NotGiven

A stable identifier used to help detect users of your application that may be violating OpenAI's usage policies. The IDs should be a string that uniquely identifies each user. We recommend hashing their username or email address, in order to avoid sending us any identifying information. Learn more.

NOT_GIVEN
seed int | None | NotGiven

This feature is in Beta. If specified, our system will make a best effort to sample deterministically, such that repeated requests with the same seed and parameters should return the same result. Determinism is not guaranteed, and you should refer to the system_fingerprint response parameter to monitor changes in the backend.

NOT_GIVEN
service_tier Literal['auto', 'default', 'flex', 'scale', 'priority'] | None | NotGiven

Specifies the processing type used for serving the request.

  • If set to 'auto', then the request will be processed with the service tier configured in the Project settings. Unless otherwise configured, the Project will use 'default'.
  • If set to 'default', then the request will be processed with the standard pricing and performance for the selected model.
  • If set to 'flex' or 'priority', then the request will be processed with the corresponding service tier.
  • When not set, the default behavior is 'auto'.

When the service_tier parameter is set, the response body will include the service_tier value based on the processing mode actually used to serve the request. This response value may be different from the value set in the parameter.

NOT_GIVEN
stop str | None | SequenceNotStr[str] | None | NotGiven

Not supported with latest reasoning models o3 and o4-mini.

Up to 4 sequences where the API will stop generating further tokens. The returned text will not contain the stop sequence.

NOT_GIVEN
store bool | None | NotGiven

Whether or not to store the output of this chat completion request for use in our model distillation or evals products.

Supports text and image inputs. Note: image inputs over 8MB will be dropped.

NOT_GIVEN
stream Literal[False] | None | NotGiven

If set to true, the model response data will be streamed to the client as it is generated using server-sent events. See the Streaming section below for more information, along with the streaming responses guide for more information on how to handle the streaming events.

NOT_GIVEN
stream_options ChatCompletionStreamOptionsParam | None | NotGiven

Options for streaming response. Only set this when you set stream: true.

NOT_GIVEN
temperature float | None | NotGiven

What sampling temperature to use, between 0 and 2. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and deterministic. We generally recommend altering this or top_p but not both.

NOT_GIVEN
tool_choice ChatCompletionToolChoiceOptionParam | NotGiven

Controls which (if any) tool is called by the model. none means the model will not call any tool and instead generates a message. auto means the model can pick between generating a message or calling one or more tools. required means the model must call one or more tools. Specifying a particular tool via {"type": "function", "function": {"name": "my_function"}} forces the model to call that tool.

none is the default when no tools are present. auto is the default if tools are present.

NOT_GIVEN
tools Iterable[ChatCompletionToolUnionParam] | NotGiven

A list of tools the model may call. You can provide either custom tools or function tools.

NOT_GIVEN
top_logprobs int | None | NotGiven

An integer between 0 and 20 specifying the number of most likely tokens to return at each token position, each with an associated log probability. logprobs must be set to true if this parameter is used.

NOT_GIVEN
top_p float | None | NotGiven

An alternative to sampling with temperature, called nucleus sampling, where the model considers the results of the tokens with top_p probability mass. So 0.1 means only the tokens comprising the top 10% probability mass are considered.

We generally recommend altering this or temperature but not both.

NOT_GIVEN
user str | NotGiven

This field is being replaced by safety_identifier and prompt_cache_key. Use prompt_cache_key instead to maintain caching optimizations. A stable identifier for your end-users. Used to boost cache hit rates by better bucketing similar requests and to help OpenAI detect and prevent abuse. Learn more.

NOT_GIVEN
verbosity Literal['low', 'medium', 'high'] | None | NotGiven

Constrains the verbosity of the model's response. Lower values will result in more concise responses, while higher values will result in more verbose responses. Currently supported values are low, medium, and high.

NOT_GIVEN
web_search_options WebSearchOptions | NotGiven

This tool searches the web for relevant results to use in a response. Learn more about the web search tool.

NOT_GIVEN
extra_headers Headers | None

Send extra headers

None
extra_query Query | None

Add additional query parameters to the request

None
extra_body Body | None

Add additional JSON properties to the request

None
timeout float | Timeout | NotGiven | None

Override the client-level default timeout for this request, in seconds

NOT_GIVEN

Returns:

Type Description
str

The generated chat completion content. If response_format is used, the format of the returned content will depend on the specified format. Otherwise, this will return a string containing the generated completion.

Source code in afnio/models/openai.py
 689
 690
 691
 692
 693
 694
 695
 696
 697
 698
 699
 700
 701
 702
 703
 704
 705
 706
 707
 708
 709
 710
 711
 712
 713
 714
 715
 716
 717
 718
 719
 720
 721
 722
 723
 724
 725
 726
 727
 728
 729
 730
 731
 732
 733
 734
 735
 736
 737
 738
 739
 740
 741
 742
 743
 744
 745
 746
 747
 748
 749
 750
 751
 752
 753
 754
 755
 756
 757
 758
 759
 760
 761
 762
 763
 764
 765
 766
 767
 768
 769
 770
 771
 772
 773
 774
 775
 776
 777
 778
 779
 780
 781
 782
 783
 784
 785
 786
 787
 788
 789
 790
 791
 792
 793
 794
 795
 796
 797
 798
 799
 800
 801
 802
 803
 804
 805
 806
 807
 808
 809
 810
 811
 812
 813
 814
 815
 816
 817
 818
 819
 820
 821
 822
 823
 824
 825
 826
 827
 828
 829
 830
 831
 832
 833
 834
 835
 836
 837
 838
 839
 840
 841
 842
 843
 844
 845
 846
 847
 848
 849
 850
 851
 852
 853
 854
 855
 856
 857
 858
 859
 860
 861
 862
 863
 864
 865
 866
 867
 868
 869
 870
 871
 872
 873
 874
 875
 876
 877
 878
 879
 880
 881
 882
 883
 884
 885
 886
 887
 888
 889
 890
 891
 892
 893
 894
 895
 896
 897
 898
 899
 900
 901
 902
 903
 904
 905
 906
 907
 908
 909
 910
 911
 912
 913
 914
 915
 916
 917
 918
 919
 920
 921
 922
 923
 924
 925
 926
 927
 928
 929
 930
 931
 932
 933
 934
 935
 936
 937
 938
 939
 940
 941
 942
 943
 944
 945
 946
 947
 948
 949
 950
 951
 952
 953
 954
 955
 956
 957
 958
 959
 960
 961
 962
 963
 964
 965
 966
 967
 968
 969
 970
 971
 972
 973
 974
 975
 976
 977
 978
 979
 980
 981
 982
 983
 984
 985
 986
 987
 988
 989
 990
 991
 992
 993
 994
 995
 996
 997
 998
 999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
async def achat(
    self,
    *,
    messages: Iterable[ChatCompletionMessageParam],
    model: Union[str, ChatModel],
    audio: Union[Optional[ChatCompletionAudioParam], NotGiven] = NOT_GIVEN,
    frequency_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
    function_call: Union[
        completion_create_params.FunctionCall, NotGiven
    ] = NOT_GIVEN,
    functions: Union[
        Iterable[completion_create_params.Function], NotGiven
    ] = NOT_GIVEN,
    logit_bias: Union[Optional[Dict[str, int]], NotGiven] = NOT_GIVEN,
    logprobs: Union[Optional[bool], NotGiven] = NOT_GIVEN,
    max_completion_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
    max_tokens: Union[Optional[int], NotGiven] = NOT_GIVEN,
    metadata: Union[Optional[Metadata], NotGiven] = NOT_GIVEN,
    modalities: Union[
        Optional[List[Literal["text", "audio"]]], NotGiven
    ] = NOT_GIVEN,
    n: Union[Optional[int], NotGiven] = NOT_GIVEN,
    parallel_tool_calls: Union[bool, NotGiven] = NOT_GIVEN,
    prediction: Union[
        Optional[ChatCompletionPredictionContentParam], NotGiven
    ] = NOT_GIVEN,
    presence_penalty: Union[Optional[float], NotGiven] = NOT_GIVEN,
    prompt_cache_key: Union[str, NotGiven] = NOT_GIVEN,
    reasoning_effort: Union[ReasoningEffort, NotGiven] = NOT_GIVEN,
    response_format: Union[
        completion_create_params.ResponseFormat, NotGiven
    ] = NOT_GIVEN,
    safety_identifier: Union[str, NotGiven] = NOT_GIVEN,
    seed: Union[Optional[int], NotGiven] = NOT_GIVEN,
    service_tier: Union[
        Optional[Literal["auto", "default", "flex", "scale", "priority"]], NotGiven
    ] = NOT_GIVEN,
    stop: Union[
        Union[Optional[str], SequenceNotStr[str], None], NotGiven
    ] = NOT_GIVEN,
    store: Union[Optional[bool], NotGiven] = NOT_GIVEN,
    # TODO: `stream` can be useful during inference, but forbid during training or backpropagation
    stream: Union[Optional[Literal[False]], NotGiven] = NOT_GIVEN,
    stream_options: Union[
        Optional[ChatCompletionStreamOptionsParam], NotGiven
    ] = NOT_GIVEN,
    temperature: Union[Optional[float], NotGiven] = NOT_GIVEN,
    tool_choice: Union[ChatCompletionToolChoiceOptionParam, NotGiven] = NOT_GIVEN,
    tools: Union[Iterable[ChatCompletionToolUnionParam], NotGiven] = NOT_GIVEN,
    top_logprobs: Union[Optional[int], NotGiven] = NOT_GIVEN,
    top_p: Union[Optional[float], NotGiven] = NOT_GIVEN,
    user: Union[str, NotGiven] = NOT_GIVEN,
    verbosity: Union[
        Optional[Literal["low", "medium", "high"]], NotGiven
    ] = NOT_GIVEN,
    web_search_options: Union[
        completion_create_params.WebSearchOptions, NotGiven
    ] = NOT_GIVEN,
    # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs.
    # The extra values given here take precedence over values defined on the client or passed to this method.
    extra_headers: Optional[Headers] = None,
    extra_query: Optional[Query] = None,
    extra_body: Optional[Body] = None,
    timeout: Optional[Union[float, httpx.Timeout, NotGiven]] = NOT_GIVEN,
) -> str:
    """Asynchronously creates a model response for the given chat conversation.
    Learn more in the
    [text generation](https://platform.openai.com/docs/guides/text-generation),
    [vision](https://platform.openai.com/docs/guides/vision), and
    [audio](https://platform.openai.com/docs/guides/audio) guides.

    Parameter support can differ depending on the model used to generate the
    response, particularly for newer reasoning models. Parameters that are only
    supported for reasoning models are noted below. For the current state of
    unsupported parameters in reasoning models,
    [refer to the reasoning guide](https://platform.openai.com/docs/guides/reasoning).

    Args:
      messages: A list of messages comprising the conversation so far. Depending on the
          [model](https://platform.openai.com/docs/models) you use, different message
          types (modalities) are supported, like
          [text](https://platform.openai.com/docs/guides/text-generation),
          [images](https://platform.openai.com/docs/guides/vision), and
          [audio](https://platform.openai.com/docs/guides/audio).

      model: Model ID used to generate the response, like `gpt-4o` or `o3`. OpenAI offers a
          wide range of models with different capabilities, performance characteristics,
          and price points. Refer to the
          [model guide](https://platform.openai.com/docs/models) to browse and compare
          available models.

      audio: Parameters for audio output. Required when audio output is requested with
          `modalities: ["audio"]`.
          [Learn more](https://platform.openai.com/docs/guides/audio).

      frequency_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on their
          existing frequency in the text so far, decreasing the model's likelihood to
          repeat the same line verbatim.

      function_call: Deprecated in favor of `tool_choice`.

          Controls which (if any) function is called by the model.

          `none` means the model will not call a function and instead generates a message.

          `auto` means the model can pick between generating a message or calling a
          function.

          Specifying a particular function via `{"name": "my_function"}` forces the model
          to call that function.

          `none` is the default when no functions are present. `auto` is the default if
          functions are present.

      functions: Deprecated in favor of `tools`.

          A list of functions the model may generate JSON inputs for.

      logit_bias: Modify the likelihood of specified tokens appearing in the completion.

          Accepts a JSON object that maps tokens (specified by their token ID in the
          tokenizer) to an associated bias value from -100 to 100. Mathematically, the
          bias is added to the logits generated by the model prior to sampling. The exact
          effect will vary per model, but values between -1 and 1 should decrease or
          increase likelihood of selection; values like -100 or 100 should result in a ban
          or exclusive selection of the relevant token.

      logprobs: Whether to return log probabilities of the output tokens or not. If true,
          returns the log probabilities of each output token returned in the `content` of
          `message`.

      max_completion_tokens: An upper bound for the number of tokens that can be generated for a completion,
          including visible output tokens and
          [reasoning tokens](https://platform.openai.com/docs/guides/reasoning).

      max_tokens: The maximum number of [tokens](/tokenizer) that can be generated in the chat
          completion. This value can be used to control
          [costs](https://openai.com/api/pricing/) for text generated via API.

          This value is now deprecated in favor of `max_completion_tokens`, and is not
          compatible with
          [o-series models](https://platform.openai.com/docs/guides/reasoning).

      metadata: Set of 16 key-value pairs that can be attached to an object. This can be useful
          for storing additional information about the object in a structured format, and
          querying for objects via API or the dashboard.

          Keys are strings with a maximum length of 64 characters. Values are strings with
          a maximum length of 512 characters.

      modalities: Output types that you would like the model to generate. Most models are capable
          of generating text, which is the default:

          `["text"]`

          The `gpt-4o-audio-preview` model can also be used to
          [generate audio](https://platform.openai.com/docs/guides/audio). To request that
          this model generate both text and audio responses, you can use:

          `["text", "audio"]`

      n: How many chat completion choices to generate for each input message. Note that
          you will be charged based on the number of generated tokens across all of the
          choices. Keep `n` as `1` to minimize costs.

      parallel_tool_calls: Whether to enable
          [parallel function calling](https://platform.openai.com/docs/guides/function-calling#configuring-parallel-function-calling)
          during tool use.

      prediction: Static predicted output content, such as the content of a text file that is
          being regenerated.

      presence_penalty: Number between -2.0 and 2.0. Positive values penalize new tokens based on
          whether they appear in the text so far, increasing the model's likelihood to
          talk about new topics.

      prompt_cache_key: Used by OpenAI to cache responses for similar requests to optimize your cache
          hit rates. Replaces the `user` field.
          [Learn more](https://platform.openai.com/docs/guides/prompt-caching).

      reasoning_effort: Constrains effort on reasoning for
          [reasoning models](https://platform.openai.com/docs/guides/reasoning). Currently
          supported values are `minimal`, `low`, `medium`, and `high`. Reducing reasoning
          effort can result in faster responses and fewer tokens used on reasoning in a
          response.

      response_format: An object specifying the format that the model must output.

          Setting to `{ "type": "json_schema", "json_schema": {...} }` enables Structured
          Outputs which ensures the model will match your supplied JSON schema. Learn more
          in the
          [Structured Outputs guide](https://platform.openai.com/docs/guides/structured-outputs).

          Setting to `{ "type": "json_object" }` enables the older JSON mode, which
          ensures the message the model generates is valid JSON. Using `json_schema` is
          preferred for models that support it.

      safety_identifier: A stable identifier used to help detect users of your application that may be
          violating OpenAI's usage policies. The IDs should be a string that uniquely
          identifies each user. We recommend hashing their username or email address, in
          order to avoid sending us any identifying information.
          [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

      seed: This feature is in Beta. If specified, our system will make a best effort to
          sample deterministically, such that repeated requests with the same `seed` and
          parameters should return the same result. Determinism is not guaranteed, and you
          should refer to the `system_fingerprint` response parameter to monitor changes
          in the backend.

      service_tier: Specifies the processing type used for serving the request.

          - If set to 'auto', then the request will be processed with the service tier
            configured in the Project settings. Unless otherwise configured, the Project
            will use 'default'.
          - If set to 'default', then the request will be processed with the standard
            pricing and performance for the selected model.
          - If set to '[flex](https://platform.openai.com/docs/guides/flex-processing)' or
            '[priority](https://openai.com/api-priority-processing/)', then the request
            will be processed with the corresponding service tier.
          - When not set, the default behavior is 'auto'.

          When the `service_tier` parameter is set, the response body will include the
          `service_tier` value based on the processing mode actually used to serve the
          request. This response value may be different from the value set in the
          parameter.

      stop: Not supported with latest reasoning models `o3` and `o4-mini`.

          Up to 4 sequences where the API will stop generating further tokens. The
          returned text will not contain the stop sequence.

      store: Whether or not to store the output of this chat completion request for use in
          our [model distillation](https://platform.openai.com/docs/guides/distillation)
          or [evals](https://platform.openai.com/docs/guides/evals) products.

          Supports text and image inputs. Note: image inputs over 8MB will be dropped.

      stream: If set to true, the model response data will be streamed to the client as it is
          generated using
          [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format).
          See the
          [Streaming section below](https://platform.openai.com/docs/api-reference/chat/streaming)
          for more information, along with the
          [streaming responses](https://platform.openai.com/docs/guides/streaming-responses)
          guide for more information on how to handle the streaming events.

      stream_options: Options for streaming response. Only set this when you set `stream: true`.

      temperature: What sampling temperature to use, between 0 and 2. Higher values like 0.8 will
          make the output more random, while lower values like 0.2 will make it more
          focused and deterministic. We generally recommend altering this or `top_p` but
          not both.

      tool_choice: Controls which (if any) tool is called by the model. `none` means the model will
          not call any tool and instead generates a message. `auto` means the model can
          pick between generating a message or calling one or more tools. `required` means
          the model must call one or more tools. Specifying a particular tool via
          `{"type": "function", "function": {"name": "my_function"}}` forces the model to
          call that tool.

          `none` is the default when no tools are present. `auto` is the default if tools
          are present.

      tools: A list of tools the model may call. You can provide either
          [custom tools](https://platform.openai.com/docs/guides/function-calling#custom-tools)
          or [function tools](https://platform.openai.com/docs/guides/function-calling).

      top_logprobs: An integer between 0 and 20 specifying the number of most likely tokens to
          return at each token position, each with an associated log probability.
          `logprobs` must be set to `true` if this parameter is used.

      top_p: An alternative to sampling with temperature, called nucleus sampling, where the
          model considers the results of the tokens with top_p probability mass. So 0.1
          means only the tokens comprising the top 10% probability mass are considered.

          We generally recommend altering this or `temperature` but not both.

      user: This field is being replaced by `safety_identifier` and `prompt_cache_key`. Use
          `prompt_cache_key` instead to maintain caching optimizations. A stable
          identifier for your end-users. Used to boost cache hit rates by better bucketing
          similar requests and to help OpenAI detect and prevent abuse.
          [Learn more](https://platform.openai.com/docs/guides/safety-best-practices#safety-identifiers).

      verbosity: Constrains the verbosity of the model's response. Lower values will result in
          more concise responses, while higher values will result in more verbose
          responses. Currently supported values are `low`, `medium`, and `high`.

      web_search_options: This tool searches the web for relevant results to use in a response. Learn more
          about the
          [web search tool](https://platform.openai.com/docs/guides/tools-web-search?api-mode=chat).

      extra_headers: Send extra headers

      extra_query: Add additional query parameters to the request

      extra_body: Add additional JSON properties to the request

      timeout: Override the client-level default timeout for this request, in seconds

    Returns:
      The generated chat completion content. If `response_format` is used, \
      the format of the returned content will depend on the specified format. \
      Otherwise, this will return a string containing the generated completion.
    """  # noqa: E501
    # TODO: handle `n > 1`` that returns multiple completions
    # Only single-choice responses are supported: the return value below reads
    # choices[0] exclusively, so reject multi-choice requests up front.
    if isinstance(n, int) and n > 1:
        raise ValueError("n > 1 is not supported for async chat completions.")

    # Every keyword is forwarded unchanged to the underlying SDK call;
    # NOT_GIVEN sentinels are omitted from the request by the OpenAI client.
    response = await self._aclient.chat.completions.create(
        messages=messages,
        model=model,
        audio=audio,
        frequency_penalty=frequency_penalty,
        function_call=function_call,
        functions=functions,
        logit_bias=logit_bias,
        logprobs=logprobs,
        max_completion_tokens=max_completion_tokens,
        max_tokens=max_tokens,
        metadata=metadata,
        modalities=modalities,
        n=n,
        parallel_tool_calls=parallel_tool_calls,
        prediction=prediction,
        presence_penalty=presence_penalty,
        prompt_cache_key=prompt_cache_key,
        reasoning_effort=reasoning_effort,
        response_format=response_format,
        safety_identifier=safety_identifier,
        seed=seed,
        service_tier=service_tier,
        stop=stop,
        store=store,
        stream=stream,
        stream_options=stream_options,
        temperature=temperature,
        tool_choice=tool_choice,
        tools=tools,
        top_logprobs=top_logprobs,
        top_p=top_p,
        user=user,
        verbosity=verbosity,
        web_search_options=web_search_options,
        extra_headers=extra_headers,
        extra_query=extra_query,
        extra_body=extra_body,
        timeout=timeout,
    )

    # Update usage statistics
    if hasattr(response, "usage") and response.usage:
        self.update_usage(response.usage, model)

    # NOTE(review): message.content can be None (e.g. when the model answers
    # with tool calls) — confirm callers handle a None return.
    return response.choices[0].message.content

aembed(input) async

Asynchronous method to generate embeddings for the given input texts.

Parameters:

Name Type Description Default
input list[str]

A list of input strings for which embeddings should be generated.

required

Returns:

Type Description
list[list[float]]

A list of embeddings, where each embedding is represented as a list of floats corresponding to the input strings.

Source code in afnio/models/openai.py
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
async def aembed(self, input: List[str]) -> List[List[float]]:
    """Create embedding vectors for each string in ``input`` (async).

    Not implemented on this class; subclasses providing embedding support
    must override it.

    Args:
        input: The texts to embed.

    Returns:
        One embedding — a list of floats — per input string, in order.
    """
    raise NotImplementedError

update_usage(usage, model_name=None)

Updates the internal usage counters with values from a new API response.

Parameters:

Name Type Description Default
usage CompletionUsage

The usage object returned by the OpenAI API.

required
model_name str

The name of the model for which the usage is being updated. If None, cost is copied from usage if available.

None
Source code in afnio/models/openai.py
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
def update_usage(self, usage: "CompletionUsage", model_name: str = None) -> None:
    """Accumulate token and cost usage from a new API response.

    Args:
        usage: The usage payload returned by the OpenAI API — either a
            ``CompletionUsage`` model (or any object exposing
            ``model_dump()``) or an equivalent plain dict.
        model_name (str, optional): Name of the model the usage belongs to.
            When provided, cost is computed from the provider's pricing
            table and added to the running total; when ``None``, any
            pre-computed cost carried in ``usage`` is copied over instead.
    """
    # BUG FIX: the previous guard called `self._usage.update(...)` on the
    # very attribute it had just proven missing, which raised
    # AttributeError. Initialize the counters with a fresh deep copy.
    if not hasattr(self, "_usage"):
        self._usage = copy.deepcopy(INITIAL_USAGE)

    # Accept any pydantic-style model (e.g. CompletionUsage) by duck
    # typing; plain dicts pass through unchanged.
    if not isinstance(usage, dict):
        usage = usage.model_dump()

    # Core token counters.
    self._usage["completion_tokens"] += usage.get("completion_tokens", 0) or 0
    self._usage["prompt_tokens"] += usage.get("prompt_tokens", 0) or 0
    self._usage["total_tokens"] += usage.get("total_tokens", 0) or 0

    # Detail sub-dicts may be present but None in `model_dump()` output,
    # so `or {}` is needed on top of the `.get` default.
    prompt_details = usage.get("prompt_tokens_details") or {}
    for key in ("cached_tokens", "audio_tokens"):
        self._usage["prompt_tokens_details"][key] += prompt_details.get(key, 0) or 0

    completion_details = usage.get("completion_tokens_details") or {}
    for key in (
        "reasoning_tokens",
        "audio_tokens",
        "accepted_prediction_tokens",
        "rejected_prediction_tokens",
    ):
        self._usage["completion_tokens_details"][key] += (
            completion_details.get(key, 0) or 0
        )

    # Cost: compute from the pricing table when the model is known,
    # otherwise copy any pre-computed amount carried in `usage`.
    if model_name is not None:
        pricing = _get_pricing_for_model(self.provider, model_name)
        self._usage["cost"]["amount"] += _calculate_cost(usage, pricing)
    elif "cost" in usage and "amount" in usage["cost"]:
        self._usage["cost"]["amount"] = usage["cost"]["amount"]