diff --git a/.stats.yml b/.stats.yml
index d223c8f1f..9600edae3 100644
--- a/.stats.yml
+++ b/.stats.yml
@@ -1,2 +1,2 @@
-configured_endpoints: 68
-openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-02200a58ed631064b6419711da99fefd6e97bdbbeb577a80a1a6e0c8dbcb18f5.yml
+configured_endpoints: 69
+openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-b5b0e2c794b012919701c3fd43286af10fa25d33ceb8a881bec2636028f446e0.yml
diff --git a/api.md b/api.md
index e971585de..10f88e5e0 100644
--- a/api.md
+++ b/api.md
@@ -212,6 +212,66 @@ Methods:
# Beta
+## Realtime
+
+Types:
+
+- ConversationCreatedEvent
+- ConversationItem
+- ConversationItemContent
+- ConversationItemCreateEvent
+- ConversationItemCreatedEvent
+- ConversationItemDeleteEvent
+- ConversationItemDeletedEvent
+- ConversationItemInputAudioTranscriptionCompletedEvent
+- ConversationItemInputAudioTranscriptionFailedEvent
+- ConversationItemTruncateEvent
+- ConversationItemTruncatedEvent
+- ErrorEvent
+- InputAudioBufferAppendEvent
+- InputAudioBufferClearEvent
+- InputAudioBufferClearedEvent
+- InputAudioBufferCommitEvent
+- InputAudioBufferCommittedEvent
+- InputAudioBufferSpeechStartedEvent
+- InputAudioBufferSpeechStoppedEvent
+- RateLimitsUpdatedEvent
+- RealtimeClientEvent
+- RealtimeResponse
+- RealtimeResponseStatus
+- RealtimeResponseUsage
+- RealtimeServerEvent
+- ResponseAudioDeltaEvent
+- ResponseAudioDoneEvent
+- ResponseAudioTranscriptDeltaEvent
+- ResponseAudioTranscriptDoneEvent
+- ResponseCancelEvent
+- ResponseContentPartAddedEvent
+- ResponseContentPartDoneEvent
+- ResponseCreateEvent
+- ResponseCreatedEvent
+- ResponseDoneEvent
+- ResponseFunctionCallArgumentsDeltaEvent
+- ResponseFunctionCallArgumentsDoneEvent
+- ResponseOutputItemAddedEvent
+- ResponseOutputItemDoneEvent
+- ResponseTextDeltaEvent
+- ResponseTextDoneEvent
+- SessionCreatedEvent
+- SessionUpdateEvent
+- SessionUpdatedEvent
+
+### Sessions
+
+Types:
+
+- Session
+- SessionCreateResponse
+
+Methods:
+
+- client.beta.realtime.sessions.create({ ...params }) -> SessionCreateResponse
+
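For illustration, a minimal sketch of calling the new endpoint from application code (assuming the usual `OpenAI` client entry point; the full set of accepted parameters is defined by `SessionCreateParams`):

```ts
import OpenAI from 'openai';

const client = new OpenAI();

// Mint an ephemeral token for a browser client; the model name is an example value.
const session = await client.beta.realtime.sessions.create({
  model: 'gpt-4o-realtime-preview-2024-12-17',
});
console.log(session.client_secret);
```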
## VectorStores
Types:
diff --git a/src/resources/beta/beta.ts b/src/resources/beta/beta.ts
index 75035d600..48cc92369 100644
--- a/src/resources/beta/beta.ts
+++ b/src/resources/beta/beta.ts
@@ -20,6 +20,8 @@ import {
RunStreamEvent,
ThreadStreamEvent,
} from './assistants';
+import * as RealtimeAPI from './realtime/realtime';
+import { Realtime } from './realtime/realtime';
import * as ThreadsAPI from './threads/threads';
import {
AssistantResponseFormatOption,
@@ -54,11 +56,13 @@ import {
} from './vector-stores/vector-stores';
export class Beta extends APIResource {
+ realtime: RealtimeAPI.Realtime = new RealtimeAPI.Realtime(this._client);
vectorStores: VectorStoresAPI.VectorStores = new VectorStoresAPI.VectorStores(this._client);
assistants: AssistantsAPI.Assistants = new AssistantsAPI.Assistants(this._client);
threads: ThreadsAPI.Threads = new ThreadsAPI.Threads(this._client);
}
+Beta.Realtime = Realtime;
Beta.VectorStores = VectorStores;
Beta.VectorStoresPage = VectorStoresPage;
Beta.Assistants = Assistants;
@@ -66,6 +70,8 @@ Beta.AssistantsPage = AssistantsPage;
Beta.Threads = Threads;
export declare namespace Beta {
+ export { Realtime as Realtime };
+
export {
VectorStores as VectorStores,
type AutoFileChunkingStrategyParam as AutoFileChunkingStrategyParam,
diff --git a/src/resources/beta/index.ts b/src/resources/beta/index.ts
index 10212d390..b8db6f846 100644
--- a/src/resources/beta/index.ts
+++ b/src/resources/beta/index.ts
@@ -19,6 +19,7 @@ export {
type AssistantListParams,
} from './assistants';
export { Beta } from './beta';
+export { Realtime } from './realtime/index';
export {
Threads,
type AssistantResponseFormatOption,
diff --git a/src/resources/beta/realtime/index.ts b/src/resources/beta/realtime/index.ts
new file mode 100644
index 000000000..66c3ecaae
--- /dev/null
+++ b/src/resources/beta/realtime/index.ts
@@ -0,0 +1,4 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+export { Realtime } from './realtime';
+export { Sessions, type Session, type SessionCreateResponse, type SessionCreateParams } from './sessions';
diff --git a/src/resources/beta/realtime/realtime.ts b/src/resources/beta/realtime/realtime.ts
new file mode 100644
index 000000000..5de06917a
--- /dev/null
+++ b/src/resources/beta/realtime/realtime.ts
@@ -0,0 +1,1904 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import { APIResource } from '../../../resource';
+import * as RealtimeAPI from './realtime';
+import * as SessionsAPI from './sessions';
+import {
+ Session as SessionsAPISession,
+ SessionCreateParams,
+ SessionCreateResponse,
+ Sessions,
+} from './sessions';
+
+export class Realtime extends APIResource {
+ sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client);
+}
+
+/**
+ * Returned when a conversation is created. Emitted right after session creation.
+ */
+export interface ConversationCreatedEvent {
+ /**
+ * The conversation resource.
+ */
+ conversation: ConversationCreatedEvent.Conversation;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The event type, must be `conversation.created`.
+ */
+ type: 'conversation.created';
+}
+
+export namespace ConversationCreatedEvent {
+ /**
+ * The conversation resource.
+ */
+ export interface Conversation {
+ /**
+ * The unique ID of the conversation.
+ */
+ id?: string;
+
+ /**
+ * The object type, must be `realtime.conversation`.
+ */
+ object?: 'realtime.conversation';
+ }
+}
+
+/**
+ * The item to add to the conversation.
+ */
+export interface ConversationItem {
+ /**
+ * The unique ID of the item. This can be generated by the client to help manage
+ * server-side context, but it is not required because the server will generate one
+ * if not provided.
+ */
+ id?: string;
+
+ /**
+ * The arguments of the function call (for `function_call` items).
+ */
+ arguments?: string;
+
+ /**
+ * The ID of the function call (for `function_call` and `function_call_output`
+ * items). If passed on a `function_call_output` item, the server will check that a
+ * `function_call` item with the same ID exists in the conversation history.
+ */
+ call_id?: string;
+
+ /**
+ * The content of the message, applicable for `message` items.
+ *
+ * - Message items of role `system` support only `input_text` content
+ * - Message items of role `user` support `input_text` and `input_audio` content
+ * - Message items of role `assistant` support `text` content.
+ */
+ content?: Array<ConversationItemContent>;
+
+ /**
+ * The name of the function being called (for `function_call` items).
+ */
+ name?: string;
+
+ /**
+ * Identifier for the API object being returned - always `realtime.item`.
+ */
+ object?: 'realtime.item';
+
+ /**
+ * The output of the function call (for `function_call_output` items).
+ */
+ output?: string;
+
+ /**
+ * The role of the message sender (`user`, `assistant`, `system`), only applicable
+ * for `message` items.
+ */
+ role?: 'user' | 'assistant' | 'system';
+
+ /**
+ * The status of the item (`completed`, `incomplete`). These have no effect on the
+ * conversation, but are accepted for consistency with the
+ * `conversation.item.created` event.
+ */
+ status?: 'completed' | 'incomplete';
+
+ /**
+ * The type of the item (`message`, `function_call`, `function_call_output`).
+ */
+ type?: 'message' | 'function_call' | 'function_call_output';
+}
+
+export interface ConversationItemContent {
+ /**
+ * ID of a previous conversation item to reference (for `item_reference` content
+ * types in `response.create` events). These can reference both client and server
+ * created items.
+ */
+ id?: string;
+
+ /**
+ * Base64-encoded audio bytes, used for `input_audio` content type.
+ */
+ audio?: string;
+
+ /**
+ * The text content, used for `input_text` and `text` content types.
+ */
+ text?: string;
+
+ /**
+ * The transcript of the audio, used for `input_audio` content type.
+ */
+ transcript?: string;
+
+ /**
+ * The content type (`input_text`, `input_audio`, `item_reference`, `text`).
+ */
+ type?: 'input_text' | 'input_audio' | 'item_reference' | 'text';
+}
+
+/**
+ * Add a new Item to the Conversation's context, including messages, function
+ * calls, and function call responses. This event can be used both to populate a
+ * "history" of the conversation and to add new items mid-stream, but has the
+ * current limitation that it cannot populate assistant audio messages.
+ *
+ * If successful, the server will respond with a `conversation.item.created` event;
+ * otherwise an `error` event will be sent.
+ */
+export interface ConversationItemCreateEvent {
+ /**
+ * The item to add to the conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The event type, must be `conversation.item.create`.
+ */
+ type: 'conversation.item.create';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+
+ /**
+ * The ID of the preceding item after which the new item will be inserted. If not
+ * set, the new item will be appended to the end of the conversation. If set, it
+ * allows an item to be inserted mid-conversation. If the ID cannot be found, an
+ * error will be returned and the item will not be added.
+ */
+ previous_item_id?: string;
+}
+
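As a rough sketch, a client could build this event as a plain typed object and serialize it onto its Realtime connection (the transport itself is an assumption, not part of this module):

```ts
const createItem: ConversationItemCreateEvent = {
  type: 'conversation.item.create',
  item: {
    type: 'message',
    role: 'user',
    content: [{ type: 'input_text', text: 'What can you do?' }],
  },
};
// e.g. ws.send(JSON.stringify(createItem)) over an open Realtime WebSocket.
```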
+/**
+ * Returned when a conversation item is created. There are several scenarios that
+ * produce this event:
+ *
+ * - The server is generating a Response, which if successful will produce either
+ * one or two Items, which will be of type `message` (role `assistant`) or type
+ * `function_call`.
+ * - The input audio buffer has been committed, either by the client or the server
+ * (in `server_vad` mode). The server will take the content of the input audio
+ * buffer and add it to a new user message Item.
+ * - The client has sent a `conversation.item.create` event to add a new Item to
+ * the Conversation.
+ */
+export interface ConversationItemCreatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The item to add to the conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The ID of the preceding item in the Conversation context, allows the client to
+ * understand the order of the conversation.
+ */
+ previous_item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.created`.
+ */
+ type: 'conversation.item.created';
+}
+
+/**
+ * Send this event when you want to remove any item from the conversation history.
+ * The server will respond with a `conversation.item.deleted` event, unless the
+ * item does not exist in the conversation history, in which case the server will
+ * respond with an error.
+ */
+export interface ConversationItemDeleteEvent {
+ /**
+ * The ID of the item to delete.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.delete`.
+ */
+ type: 'conversation.item.delete';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Returned when an item in the conversation is deleted by the client with a
+ * `conversation.item.delete` event. This event is used to synchronize the server's
+ * understanding of the conversation history with the client's view.
+ */
+export interface ConversationItemDeletedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item that was deleted.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.deleted`.
+ */
+ type: 'conversation.item.deleted';
+}
+
+/**
+ * This event is the output of audio transcription for user audio written to the
+ * user audio buffer. Transcription begins when the input audio buffer is committed
+ * by the client or server (in `server_vad` mode). Transcription runs
+ * asynchronously with Response creation, so this event may come before or after
+ * the Response events.
+ *
+ * Realtime API models accept audio natively, and thus input transcription is a
+ * separate process run on a separate ASR (Automatic Speech Recognition) model,
+ * currently always `whisper-1`. Thus the transcript may diverge somewhat from the
+ * model's interpretation, and should be treated as a rough guide.
+ */
+export interface ConversationItemInputAudioTranscriptionCompletedEvent {
+ /**
+ * The index of the content part containing the audio.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item containing the audio.
+ */
+ item_id: string;
+
+ /**
+ * The transcribed text.
+ */
+ transcript: string;
+
+ /**
+ * The event type, must be `conversation.item.input_audio_transcription.completed`.
+ */
+ type: 'conversation.item.input_audio_transcription.completed';
+}
+
+/**
+ * Returned when input audio transcription is configured, and a transcription
+ * request for a user message failed. These events are separate from other `error`
+ * events so that the client can identify the related Item.
+ */
+export interface ConversationItemInputAudioTranscriptionFailedEvent {
+ /**
+ * The index of the content part containing the audio.
+ */
+ content_index: number;
+
+ /**
+ * Details of the transcription error.
+ */
+ error: ConversationItemInputAudioTranscriptionFailedEvent.Error;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.input_audio_transcription.failed`.
+ */
+ type: 'conversation.item.input_audio_transcription.failed';
+}
+
+export namespace ConversationItemInputAudioTranscriptionFailedEvent {
+ /**
+ * Details of the transcription error.
+ */
+ export interface Error {
+ /**
+ * Error code, if any.
+ */
+ code?: string;
+
+ /**
+ * A human-readable error message.
+ */
+ message?: string;
+
+ /**
+ * Parameter related to the error, if any.
+ */
+ param?: string;
+
+ /**
+ * The type of error.
+ */
+ type?: string;
+ }
+}
+
+/**
+ * Send this event to truncate a previous assistant message’s audio. The server
+ * will produce audio faster than realtime, so this event is useful when the user
+ * interrupts to truncate audio that has already been sent to the client but not
+ * yet played. This will synchronize the server's understanding of the audio with
+ * the client's playback.
+ *
+ * Truncating audio will delete the server-side text transcript to ensure there is
+ * no text in the context that hasn't been heard by the user.
+ *
+ * If successful, the server will respond with a `conversation.item.truncated`
+ * event.
+ */
+export interface ConversationItemTruncateEvent {
+ /**
+ * Inclusive duration up to which audio is truncated, in milliseconds. If the
+ * `audio_end_ms` is greater than the actual audio duration, the server will respond
+ * with an error.
+ */
+ audio_end_ms: number;
+
+ /**
+ * The index of the content part to truncate. Set this to 0.
+ */
+ content_index: number;
+
+ /**
+ * The ID of the assistant message item to truncate. Only assistant message items
+ * can be truncated.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.truncate`.
+ */
+ type: 'conversation.item.truncate';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
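A hedged sketch of how a client might emit this event when the user interrupts playback, assuming it tracks how many milliseconds of assistant audio have actually been played:

```ts
// `playedMs` is a hypothetical client-side counter of audio already played back.
function truncateOnInterrupt(itemId: string, playedMs: number): ConversationItemTruncateEvent {
  return {
    type: 'conversation.item.truncate',
    item_id: itemId,
    content_index: 0, // per the docs above, always 0
    audio_end_ms: playedMs,
  };
}
```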
+/**
+ * Returned when an earlier assistant audio message item is truncated by the client
+ * with a `conversation.item.truncate` event. This event is used to synchronize the
+ * server's understanding of the audio with the client's playback.
+ *
+ * This action will truncate the audio and remove the server-side text transcript
+ * to ensure there is no text in the context that hasn't been heard by the user.
+ */
+export interface ConversationItemTruncatedEvent {
+ /**
+ * The duration up to which the audio was truncated, in milliseconds.
+ */
+ audio_end_ms: number;
+
+ /**
+ * The index of the content part that was truncated.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the assistant message item that was truncated.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `conversation.item.truncated`.
+ */
+ type: 'conversation.item.truncated';
+}
+
+/**
+ * Returned when an error occurs, which could be a client problem or a server
+ * problem. Most errors are recoverable and the session will stay open; we
+ * recommend that implementors monitor and log error messages by default.
+ */
+export interface ErrorEvent {
+ /**
+ * Details of the error.
+ */
+ error: ErrorEvent.Error;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The event type, must be `error`.
+ */
+ type: 'error';
+}
+
+export namespace ErrorEvent {
+ /**
+ * Details of the error.
+ */
+ export interface Error {
+ /**
+ * A human-readable error message.
+ */
+ message: string;
+
+ /**
+ * The type of error (e.g., "invalid_request_error", "server_error").
+ */
+ type: string;
+
+ /**
+ * Error code, if any.
+ */
+ code?: string | null;
+
+ /**
+ * The event_id of the client event that caused the error, if applicable.
+ */
+ event_id?: string | null;
+
+ /**
+ * Parameter related to the error, if any.
+ */
+ param?: string | null;
+ }
+}
+
+/**
+ * Send this event to append audio bytes to the input audio buffer. The audio
+ * buffer is temporary storage you can write to and later commit. In Server VAD
+ * mode, the audio buffer is used to detect speech and the server will decide when
+ * to commit. When Server VAD is disabled, you must commit the audio buffer
+ * manually.
+ *
+ * The client may choose how much audio to place in each event, up to a maximum of
+ * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD
+ * to be more responsive. Unlike most other client events, the server will not send
+ * a confirmation response to this event.
+ */
+export interface InputAudioBufferAppendEvent {
+ /**
+ * Base64-encoded audio bytes. This must be in the format specified by the
+ * `input_audio_format` field in the session configuration.
+ */
+ audio: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.append`.
+ */
+ type: 'input_audio_buffer.append';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
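For example, a Node.js client capturing raw PCM16 audio could base64-encode each chunk like this (a sketch; the capture source and chunking are assumptions):

```ts
function toAppendEvent(pcm16Chunk: Buffer): InputAudioBufferAppendEvent {
  return {
    type: 'input_audio_buffer.append',
    // Must match the session's `input_audio_format` (pcm16 assumed here).
    audio: pcm16Chunk.toString('base64'),
  };
}
```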
+/**
+ * Send this event to clear the audio bytes in the buffer. The server will respond
+ * with an `input_audio_buffer.cleared` event.
+ */
+export interface InputAudioBufferClearEvent {
+ /**
+ * The event type, must be `input_audio_buffer.clear`.
+ */
+ type: 'input_audio_buffer.clear';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Returned when the input audio buffer is cleared by the client with an
+ * `input_audio_buffer.clear` event.
+ */
+export interface InputAudioBufferClearedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.cleared`.
+ */
+ type: 'input_audio_buffer.cleared';
+}
+
+/**
+ * Send this event to commit the user input audio buffer, which will create a new
+ * user message item in the conversation. This event will produce an error if the
+ * input audio buffer is empty. When in Server VAD mode, the client does not need
+ * to send this event; the server will commit the audio buffer automatically.
+ *
+ * Committing the input audio buffer will trigger input audio transcription (if
+ * enabled in session configuration), but it will not create a response from the
+ * model. The server will respond with an `input_audio_buffer.committed` event.
+ */
+export interface InputAudioBufferCommitEvent {
+ /**
+ * The event type, must be `input_audio_buffer.commit`.
+ */
+ type: 'input_audio_buffer.commit';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+/**
+ * Returned when an input audio buffer is committed, either by the client or
+ * automatically in server VAD mode. The `item_id` property is the ID of the user
+ * message item that will be created, thus a `conversation.item.created` event will
+ * also be sent to the client.
+ */
+export interface InputAudioBufferCommittedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item that will be created.
+ */
+ item_id: string;
+
+ /**
+ * The ID of the preceding item after which the new item will be inserted.
+ */
+ previous_item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.committed`.
+ */
+ type: 'input_audio_buffer.committed';
+}
+
+/**
+ * Sent by the server when in `server_vad` mode to indicate that speech has been
+ * detected in the audio buffer. This can happen any time audio is added to the
+ * buffer (unless speech is already detected). The client may want to use this
+ * event to interrupt audio playback or provide visual feedback to the user.
+ *
+ * The client should expect to receive an `input_audio_buffer.speech_stopped` event
+ * when speech stops. The `item_id` property is the ID of the user message item
+ * that will be created when speech stops and will also be included in the
+ * `input_audio_buffer.speech_stopped` event (unless the client manually commits
+ * the audio buffer during VAD activation).
+ */
+export interface InputAudioBufferSpeechStartedEvent {
+ /**
+ * Milliseconds from the start of all audio written to the buffer during the
+ * session when speech was first detected. This will correspond to the beginning of
+ * audio sent to the model, and thus includes the `prefix_padding_ms` configured in
+ * the Session.
+ */
+ audio_start_ms: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item that will be created when speech stops.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.speech_started`.
+ */
+ type: 'input_audio_buffer.speech_started';
+}
+
+/**
+ * Returned in `server_vad` mode when the server detects the end of speech in the
+ * audio buffer. The server will also send a `conversation.item.created` event
+ * with the user message item that is created from the audio buffer.
+ */
+export interface InputAudioBufferSpeechStoppedEvent {
+ /**
+ * Milliseconds since the session started when speech stopped. This will correspond
+ * to the end of audio sent to the model, and thus includes the
+ * `min_silence_duration_ms` configured in the Session.
+ */
+ audio_end_ms: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the user message item that will be created.
+ */
+ item_id: string;
+
+ /**
+ * The event type, must be `input_audio_buffer.speech_stopped`.
+ */
+ type: 'input_audio_buffer.speech_stopped';
+}
+
+/**
+ * Emitted at the beginning of a Response to indicate the updated rate limits. When
+ * a Response is created, some tokens will be "reserved" for the output tokens; the
+ * rate limits shown here reflect that reservation, which is then adjusted
+ * accordingly once the Response is completed.
+ */
+export interface RateLimitsUpdatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * List of rate limit information.
+ */
+ rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>;
+
+ /**
+ * The event type, must be `rate_limits.updated`.
+ */
+ type: 'rate_limits.updated';
+}
+
+export namespace RateLimitsUpdatedEvent {
+ export interface RateLimit {
+ /**
+ * The maximum allowed value for the rate limit.
+ */
+ limit?: number;
+
+ /**
+ * The name of the rate limit (`requests`, `tokens`).
+ */
+ name?: 'requests' | 'tokens';
+
+ /**
+ * The remaining value before the limit is reached.
+ */
+ remaining?: number;
+
+ /**
+ * Seconds until the rate limit resets.
+ */
+ reset_seconds?: number;
+ }
+}
+
+/**
+ * All events that the client can send to the Realtime API
+ */
+export type RealtimeClientEvent =
+ | SessionUpdateEvent
+ | InputAudioBufferAppendEvent
+ | InputAudioBufferCommitEvent
+ | InputAudioBufferClearEvent
+ | ConversationItemCreateEvent
+ | ConversationItemTruncateEvent
+ | ConversationItemDeleteEvent
+ | ResponseCreateEvent
+ | ResponseCancelEvent;
+
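A small sketch of a typed send helper; the WebSocket handle and connection setup are assumptions rather than part of this module:

```ts
function sendClientEvent(ws: { send(data: string): void }, event: RealtimeClientEvent): void {
  // Every client event is a JSON object with a literal `type` discriminator.
  ws.send(JSON.stringify(event));
}
```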
+/**
+ * The response resource.
+ */
+export interface RealtimeResponse {
+ /**
+ * The unique ID of the response.
+ */
+ id?: string;
+
+ /**
+ * Developer-provided string key-value pairs associated with this response.
+ */
+ metadata?: unknown | null;
+
+ /**
+ * The object type, must be `realtime.response`.
+ */
+ object?: 'realtime.response';
+
+ /**
+ * The list of output items generated by the response.
+ */
+ output?: Array<ConversationItem>;
+
+ /**
+ * The final status of the response (`completed`, `cancelled`, `failed`, or
+ * `incomplete`).
+ */
+ status?: 'completed' | 'cancelled' | 'failed' | 'incomplete';
+
+ /**
+ * Additional details about the status.
+ */
+ status_details?: RealtimeResponseStatus;
+
+ /**
+ * Usage statistics for the Response; this will correspond to billing. A Realtime
+ * API session will maintain a conversation context and append new Items to the
+ * Conversation, thus output from previous turns (text and audio tokens) will
+ * become the input for later turns.
+ */
+ usage?: RealtimeResponseUsage;
+}
+
+/**
+ * Additional details about the status.
+ */
+export interface RealtimeResponseStatus {
+ /**
+ * A description of the error that caused the response to fail, populated when the
+ * `status` is `failed`.
+ */
+ error?: RealtimeResponseStatus.Error;
+
+ /**
+ * The reason the Response did not complete. For a `cancelled` Response, one of
+ * `turn_detected` (the server VAD detected a new start of speech) or
+ * `client_cancelled` (the client sent a cancel event). For an `incomplete`
+ * Response, one of `max_output_tokens` or `content_filter` (the server-side safety
+ * filter activated and cut off the response).
+ */
+ reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter';
+
+ /**
+ * The type of error that caused the response to fail, corresponding with the
+ * `status` field (`completed`, `cancelled`, `incomplete`, `failed`).
+ */
+ type?: 'completed' | 'cancelled' | 'incomplete' | 'failed';
+}
+
+export namespace RealtimeResponseStatus {
+ /**
+ * A description of the error that caused the response to fail, populated when the
+ * `status` is `failed`.
+ */
+ export interface Error {
+ /**
+ * Error code, if any.
+ */
+ code?: string;
+
+ /**
+ * The type of error.
+ */
+ type?: string;
+ }
+}
+
+/**
+ * Usage statistics for the Response; this will correspond to billing. A Realtime
+ * API session will maintain a conversation context and append new Items to the
+ * Conversation, thus output from previous turns (text and audio tokens) will
+ * become the input for later turns.
+ */
+export interface RealtimeResponseUsage {
+ /**
+ * Details about the input tokens used in the Response.
+ */
+ input_token_details?: RealtimeResponseUsage.InputTokenDetails;
+
+ /**
+ * The number of input tokens used in the Response, including text and audio
+ * tokens.
+ */
+ input_tokens?: number;
+
+ /**
+ * Details about the output tokens used in the Response.
+ */
+ output_token_details?: RealtimeResponseUsage.OutputTokenDetails;
+
+ /**
+ * The number of output tokens sent in the Response, including text and audio
+ * tokens.
+ */
+ output_tokens?: number;
+
+ /**
+ * The total number of tokens in the Response including input and output text and
+ * audio tokens.
+ */
+ total_tokens?: number;
+}
+
+export namespace RealtimeResponseUsage {
+ /**
+ * Details about the input tokens used in the Response.
+ */
+ export interface InputTokenDetails {
+ /**
+ * The number of audio tokens used in the Response.
+ */
+ audio_tokens?: number;
+
+ /**
+ * The number of cached tokens used in the Response.
+ */
+ cached_tokens?: number;
+
+ /**
+ * The number of text tokens used in the Response.
+ */
+ text_tokens?: number;
+ }
+
+ /**
+ * Details about the output tokens used in the Response.
+ */
+ export interface OutputTokenDetails {
+ /**
+ * The number of audio tokens used in the Response.
+ */
+ audio_tokens?: number;
+
+ /**
+ * The number of text tokens used in the Response.
+ */
+ text_tokens?: number;
+ }
+}
+
+/**
+ * All events that the Realtime API can send back
+ */
+export type RealtimeServerEvent =
+ | ErrorEvent
+ | SessionCreatedEvent
+ | SessionUpdatedEvent
+ | ConversationCreatedEvent
+ | InputAudioBufferCommittedEvent
+ | InputAudioBufferClearedEvent
+ | InputAudioBufferSpeechStartedEvent
+ | InputAudioBufferSpeechStoppedEvent
+ | ConversationItemCreatedEvent
+ | ConversationItemInputAudioTranscriptionCompletedEvent
+ | ConversationItemInputAudioTranscriptionFailedEvent
+ | ConversationItemTruncatedEvent
+ | ConversationItemDeletedEvent
+ | ResponseCreatedEvent
+ | ResponseDoneEvent
+ | ResponseOutputItemAddedEvent
+ | ResponseOutputItemDoneEvent
+ | ResponseContentPartAddedEvent
+ | ResponseContentPartDoneEvent
+ | ResponseTextDeltaEvent
+ | ResponseTextDoneEvent
+ | ResponseAudioTranscriptDeltaEvent
+ | ResponseAudioTranscriptDoneEvent
+ | ResponseAudioDeltaEvent
+ | ResponseAudioDoneEvent
+ | ResponseFunctionCallArgumentsDeltaEvent
+ | ResponseFunctionCallArgumentsDoneEvent
+ | RateLimitsUpdatedEvent;
+
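Because every server event carries a literal `type`, the union narrows with an ordinary switch; a brief sketch:

```ts
function handleServerEvent(event: RealtimeServerEvent): void {
  switch (event.type) {
    case 'error':
      console.error(event.error.message);
      break;
    case 'response.text.delta':
      process.stdout.write(event.delta);
      break;
    case 'response.done':
      console.log('total tokens:', event.response.usage?.total_tokens);
      break;
    default:
      // Remaining event types are ignored in this sketch.
      break;
  }
}
```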
+/**
+ * Returned when the model-generated audio is updated.
+ */
+export interface ResponseAudioDeltaEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * Base64-encoded audio data delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.audio.delta`.
+ */
+ type: 'response.audio.delta';
+}
+
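As a sketch, the base64 deltas can be decoded and buffered per item until the matching `response.audio.done` event arrives (Node.js `Buffer` assumed):

```ts
const audioChunks = new Map<string, Buffer[]>();

function onAudioDelta(event: ResponseAudioDeltaEvent): void {
  const chunks = audioChunks.get(event.item_id) ?? [];
  chunks.push(Buffer.from(event.delta, 'base64'));
  audioChunks.set(event.item_id, chunks);
}
```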
+/**
+ * Returned when the model-generated audio is done. Also emitted when a Response is
+ * interrupted, incomplete, or cancelled.
+ */
+export interface ResponseAudioDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.audio.done`.
+ */
+ type: 'response.audio.done';
+}
+
+/**
+ * Returned when the model-generated transcription of audio output is updated.
+ */
+export interface ResponseAudioTranscriptDeltaEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The transcript delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.audio_transcript.delta`.
+ */
+ type: 'response.audio_transcript.delta';
+}
+
+/**
+ * Returned when the model-generated transcription of audio output is done
+ * streaming. Also emitted when a Response is interrupted, incomplete, or
+ * cancelled.
+ */
+export interface ResponseAudioTranscriptDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The final transcript of the audio.
+ */
+ transcript: string;
+
+ /**
+ * The event type, must be `response.audio_transcript.done`.
+ */
+ type: 'response.audio_transcript.done';
+}
+
+/**
+ * Send this event to cancel an in-progress response. The server will respond with
+ * a `response.cancelled` event or an error if there is no response to cancel.
+ */
+export interface ResponseCancelEvent {
+ /**
+ * The event type, must be `response.cancel`.
+ */
+ type: 'response.cancel';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+
+ /**
+ * A specific response ID to cancel - if not provided, will cancel an in-progress
+ * response in the default conversation.
+ */
+ response_id?: string;
+}
+
+/**
+ * Returned when a new content part is added to an assistant message item during
+ * response generation.
+ */
+export interface ResponseContentPartAddedEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item to which the content part was added.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The content part that was added.
+ */
+ part: ResponseContentPartAddedEvent.Part;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.content_part.added`.
+ */
+ type: 'response.content_part.added';
+}
+
+export namespace ResponseContentPartAddedEvent {
+ /**
+ * The content part that was added.
+ */
+ export interface Part {
+ /**
+ * Base64-encoded audio data (if type is "audio").
+ */
+ audio?: string;
+
+ /**
+ * The text content (if type is "text").
+ */
+ text?: string;
+
+ /**
+ * The transcript of the audio (if type is "audio").
+ */
+ transcript?: string;
+
+ /**
+ * The content type ("text", "audio").
+ */
+ type?: 'text' | 'audio';
+ }
+}
+
+/**
+ * Returned when a content part is done streaming in an assistant message item.
+ * Also emitted when a Response is interrupted, incomplete, or cancelled.
+ */
+export interface ResponseContentPartDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The content part that is done.
+ */
+ part: ResponseContentPartDoneEvent.Part;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.content_part.done`.
+ */
+ type: 'response.content_part.done';
+}
+
+export namespace ResponseContentPartDoneEvent {
+ /**
+ * The content part that is done.
+ */
+ export interface Part {
+ /**
+ * Base64-encoded audio data (if type is "audio").
+ */
+ audio?: string;
+
+ /**
+ * The text content (if type is "text").
+ */
+ text?: string;
+
+ /**
+ * The transcript of the audio (if type is "audio").
+ */
+ transcript?: string;
+
+ /**
+ * The content type ("text", "audio").
+ */
+ type?: 'text' | 'audio';
+ }
+}
+
+/**
+ * This event instructs the server to create a Response, which means triggering
+ * model inference. When in Server VAD mode, the server will create Responses
+ * automatically.
+ *
+ * A Response will include at least one Item, and may have two, in which case the
+ * second will be a function call. These Items will be appended to the conversation
+ * history.
+ *
+ * The server will respond with a `response.created` event, events for Items and
+ * content created, and finally a `response.done` event to indicate the Response is
+ * complete.
+ *
+ * The `response.create` event includes inference configuration like
+ * `instructions` and `temperature`. These fields will override the Session's
+ * configuration for this Response only.
+ */
+export interface ResponseCreateEvent {
+ /**
+ * The event type, must be `response.create`.
+ */
+ type: 'response.create';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+
+ /**
+ * Create a new Realtime response with these parameters
+ */
+ response?: ResponseCreateEvent.Response;
+}
+
+export namespace ResponseCreateEvent {
+ /**
+ * Create a new Realtime response with these parameters
+ */
+ export interface Response {
+ /**
+ * Controls which conversation the response is added to. Currently supports `auto`
+ * and `none`, with `auto` as the default value. The `auto` value means that the
+ * contents of the response will be added to the default conversation. Set this to
+ * `none` to create an out-of-band response which will not add items to the default
+ * conversation.
+ */
+ conversation?: (string & {}) | 'auto' | 'none';
+
+ /**
+ * Input items to include in the prompt for the model. Creates a new context for
+ * this response, without including the default conversation. Can include
+ * references to items from the default conversation.
+ */
+ input?: Array<RealtimeAPI.ConversationItem>;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_response_output_tokens?: number | 'inf';
+
+ /**
+ * Set of 16 key-value pairs that can be attached to an object. This can be useful
+ * for storing additional information about the object in a structured format. Keys
+ * can be a maximum of 64 characters long and values can be a maximum of 512
+ * characters long.
+ */
+ metadata?: unknown | null;
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function, like `{"type": "function", "function": {"name": "my_function"}}`.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+ tools?: Array<Response.Tool>;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+ }
+
+ export namespace Response {
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+ }
+}
+
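A hedged example of requesting an out-of-band response with the inline configuration described above (all values are illustrative):

```ts
const createResponse: ResponseCreateEvent = {
  type: 'response.create',
  response: {
    conversation: 'none', // do not add the output to the default conversation
    modalities: ['text'],
    instructions: 'Summarize the conversation so far in one sentence.',
    max_response_output_tokens: 200,
  },
};
```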
+/**
+ * Returned when a new Response is created. The first event of response creation,
+ * where the response is in an initial state of `in_progress`.
+ */
+export interface ResponseCreatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The response resource.
+ */
+ response: RealtimeResponse;
+
+ /**
+ * The event type, must be `response.created`.
+ */
+ type: 'response.created';
+}
+
+/**
+ * Returned when a Response is done streaming. Always emitted, no matter the final
+ * state. The Response object included in the `response.done` event will include
+ * all output Items in the Response but will omit the raw audio data.
+ */
+export interface ResponseDoneEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The response resource.
+ */
+ response: RealtimeResponse;
+
+ /**
+ * The event type, must be `response.done`.
+ */
+ type: 'response.done';
+}
+
+/**
+ * Returned when the model-generated function call arguments are updated.
+ */
+export interface ResponseFunctionCallArgumentsDeltaEvent {
+ /**
+ * The ID of the function call.
+ */
+ call_id: string;
+
+ /**
+ * The arguments delta as a JSON string.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the function call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.function_call_arguments.delta`.
+ */
+ type: 'response.function_call_arguments.delta';
+}
+
+/**
+ * Returned when the model-generated function call arguments are done streaming.
+ * Also emitted when a Response is interrupted, incomplete, or cancelled.
+ */
+export interface ResponseFunctionCallArgumentsDoneEvent {
+ /**
+ * The final arguments as a JSON string.
+ */
+ arguments: string;
+
+ /**
+ * The ID of the function call.
+ */
+ call_id: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the function call item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.function_call_arguments.done`.
+ */
+ type: 'response.function_call_arguments.done';
+}
+
+/**
+ * Returned when a new Item is created during Response generation.
+ */
+export interface ResponseOutputItemAddedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The item to add to the conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The index of the output item in the Response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the Response to which the item belongs.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_item.added`.
+ */
+ type: 'response.output_item.added';
+}
+
+/**
+ * Returned when an Item is done streaming. Also emitted when a Response is
+ * interrupted, incomplete, or cancelled.
+ */
+export interface ResponseOutputItemDoneEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The item to add to the conversation.
+ */
+ item: ConversationItem;
+
+ /**
+ * The index of the output item in the Response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the Response to which the item belongs.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.output_item.done`.
+ */
+ type: 'response.output_item.done';
+}
+
+/**
+ * Returned when the text value of a "text" content part is updated.
+ */
+export interface ResponseTextDeltaEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The text delta.
+ */
+ delta: string;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The event type, must be `response.text.delta`.
+ */
+ type: 'response.text.delta';
+}
+
+/**
+ * Returned when the text value of a "text" content part is done streaming. Also
+ * emitted when a Response is interrupted, incomplete, or cancelled.
+ */
+export interface ResponseTextDoneEvent {
+ /**
+ * The index of the content part in the item's content array.
+ */
+ content_index: number;
+
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * The ID of the item.
+ */
+ item_id: string;
+
+ /**
+ * The index of the output item in the response.
+ */
+ output_index: number;
+
+ /**
+ * The ID of the response.
+ */
+ response_id: string;
+
+ /**
+ * The final text content.
+ */
+ text: string;
+
+ /**
+ * The event type, must be `response.text.done`.
+ */
+ type: 'response.text.done';
+}
+
+/**
+ * Returned when a Session is created. Emitted automatically when a new connection
+ * is established as the first server event. This event will contain the default
+ * Session configuration.
+ */
+export interface SessionCreatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * Realtime session object configuration.
+ */
+ session: SessionsAPI.Session;
+
+ /**
+ * The event type, must be `session.created`.
+ */
+ type: 'session.created';
+}
+
+/**
+ * Send this event to update the session’s default configuration. The client may
+ * send this event at any time to update the session configuration, and any field
+ * may be updated at any time, except for "voice". The server will respond with a
+ * `session.updated` event that shows the full effective configuration. Only fields
+ * that are present are updated, thus the correct way to clear a field like
+ * "instructions" is to pass an empty string.
+ */
+export interface SessionUpdateEvent {
+ /**
+ * Realtime session object configuration.
+ */
+ session: SessionUpdateEvent.Session;
+
+ /**
+ * The event type, must be `session.update`.
+ */
+ type: 'session.update';
+
+ /**
+ * Optional client-generated ID used to identify this event.
+ */
+ event_id?: string;
+}
+
+export namespace SessionUpdateEvent {
+ /**
+ * Realtime session object configuration.
+ */
+ export interface Session {
+ /**
+ * The Realtime model used for this session.
+ */
+ model:
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ input_audio_transcription?: Session.InputAudioTranscription;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_response_output_tokens?: number | 'inf';
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+ tools?: Array<Session.Tool>;
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: Session.TurnDetection;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+ }
+
+ export namespace Session {
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ export interface InputAudioTranscription {
+ /**
+ * The model to use for transcription, `whisper-1` is the only currently supported
+ * model.
+ */
+ model?: string;
+ }
+
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Whether or not to automatically generate a response when VAD is enabled. `true`
+ * by default.
+ */
+ create_response?: boolean;
+
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Activation threshold for VAD (0.0 to 1.0); this defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection, only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+ }
+}
+
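For instance, enabling server VAD with a slightly higher activation threshold might look like the following sketch (values are illustrative; `model` is required by this event's `Session` type):

```ts
const updateSession: SessionUpdateEvent = {
  type: 'session.update',
  session: {
    model: 'gpt-4o-realtime-preview-2024-12-17',
    modalities: ['text', 'audio'],
    input_audio_format: 'pcm16',
    turn_detection: {
      type: 'server_vad',
      threshold: 0.6,
      silence_duration_ms: 700,
    },
  },
};
```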
+/**
+ * Returned when a session is updated with a `session.update` event, unless there
+ * is an error.
+ */
+export interface SessionUpdatedEvent {
+ /**
+ * The unique ID of the server event.
+ */
+ event_id: string;
+
+ /**
+ * Realtime session object configuration.
+ */
+ session: SessionsAPI.Session;
+
+ /**
+ * The event type, must be `session.updated`.
+ */
+ type: 'session.updated';
+}
+
+Realtime.Sessions = Sessions;
+
+export declare namespace Realtime {
+ export {
+ Sessions as Sessions,
+ type SessionsAPISession as Session,
+ type SessionCreateResponse as SessionCreateResponse,
+ type SessionCreateParams as SessionCreateParams,
+ };
+}
diff --git a/src/resources/beta/realtime/sessions.ts b/src/resources/beta/realtime/sessions.ts
new file mode 100644
index 000000000..c1082d236
--- /dev/null
+++ b/src/resources/beta/realtime/sessions.ts
@@ -0,0 +1,546 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import { APIResource } from '../../../resource';
+import * as Core from '../../../core';
+
+export class Sessions extends APIResource {
+ /**
+ * Create an ephemeral API token for use in client-side applications with the
+ * Realtime API. Can be configured with the same session parameters as the
+ * `session.update` client event.
+ *
+ * It responds with a session object, plus a `client_secret` key which contains a
+ * usable ephemeral API token that can be used to authenticate browser clients for
+ * the Realtime API.
+ */
+ create(body: SessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<SessionCreateResponse> {
+ return this._client.post('/realtime/sessions', {
+ body,
+ ...options,
+ headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers },
+ });
+ }
+}
+
+/**
+ * Realtime session object configuration.
+ */
+export interface Session {
+ /**
+ * Unique identifier for the session object.
+ */
+ id?: string;
+
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ input_audio_transcription?: Session.InputAudioTranscription;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_response_output_tokens?: number | 'inf';
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The Realtime model used for this session.
+ */
+ model?:
+ | (string & {})
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+ tools?: Array<Session.Tool>;
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: Session.TurnDetection | null;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace Session {
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ export interface InputAudioTranscription {
+ /**
+ * The model to use for transcription; `whisper-1` is the only currently supported
+ * model.
+ */
+ model?: string;
+ }
+
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection; only `server_vad` is currently supported.
+ */
+ type?: 'server_vad';
+ }
+}
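+
+// Illustrative sketch of a server-VAD turn detection config using the
+// Session.TurnDetection shape above; the numbers are simply the documented
+// defaults written out explicitly.
+//
+//   const turnDetection: Session.TurnDetection = {
+//     type: 'server_vad',
+//     threshold: 0.5,
+//     prefix_padding_ms: 300,
+//     silence_duration_ms: 500,
+//   };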
+
+/**
+ * A new Realtime session configuration, with an ephemeral key. Default TTL for
+ * keys is one minute.
+ */
+export interface SessionCreateResponse {
+ /**
+ * Ephemeral key returned by the API.
+ */
+ client_secret?: SessionCreateResponse.ClientSecret;
+
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ input_audio_format?: string;
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_response_output_tokens?: number | 'inf';
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: string;
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+ tools?: Array<SessionCreateResponse.Tool>;
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: SessionCreateResponse.TurnDetection;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace SessionCreateResponse {
+ /**
+ * Ephemeral key returned by the API.
+ */
+ export interface ClientSecret {
+ /**
+ * Timestamp for when the token expires. Currently, all tokens expire after one
+ * minute.
+ */
+ expires_at?: number;
+
+ /**
+ * Ephemeral key usable in client environments to authenticate connections to the
+ * Realtime API. Use this in client-side environments rather than a standard API
+ * token, which should only be used server-side.
+ */
+ value?: string;
+ }
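+
+ // Illustrative sketch: `expires_at` is assumed to be a Unix timestamp in
+ // seconds, so a server can check freshness before forwarding the key to a
+ // browser client (hypothetical helper, not part of this module).
+ //
+ //   function isUsable(secret: ClientSecret): boolean {
+ //     return typeof secret.expires_at === 'number' && secret.expires_at * 1000 > Date.now();
+ //   }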
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ export interface InputAudioTranscription {
+ /**
+ * The model to use for transcription; `whisper-1` is the only currently supported
+ * model.
+ */
+ model?: string;
+ }
+
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection; only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+}
+
+export interface SessionCreateParams {
+ /**
+ * The Realtime model used for this session.
+ */
+ model:
+ | 'gpt-4o-realtime-preview'
+ | 'gpt-4o-realtime-preview-2024-10-01'
+ | 'gpt-4o-realtime-preview-2024-12-17'
+ | 'gpt-4o-mini-realtime-preview'
+ | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+ /**
+ * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
+
+ /**
+ * The default system instructions (i.e. system message) prepended to model calls.
+ * This field allows the client to guide the model on desired responses. The model
+ * can be instructed on response content and format (e.g. "be extremely succinct",
+ * "act friendly", "here are examples of good responses") and on audio behavior
+ * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The
+ * instructions are not guaranteed to be followed by the model, but they provide
+ * guidance to the model on the desired behavior.
+ *
+ * Note that the server sets default instructions which will be used if this field
+ * is not set and are visible in the `session.created` event at the start of the
+ * session.
+ */
+ instructions?: string;
+
+ /**
+ * Maximum number of output tokens for a single assistant response, inclusive of
+ * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+ * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+ */
+ max_response_output_tokens?: number | 'inf';
+
+ /**
+ * The set of modalities the model can respond with. To disable audio, set this to
+ * ["text"].
+ */
+ modalities?: Array<'text' | 'audio'>;
+
+ /**
+ * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+ */
+ output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+ /**
+ * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+ */
+ temperature?: number;
+
+ /**
+ * How the model chooses tools. Options are `auto`, `none`, `required`, or specify
+ * a function.
+ */
+ tool_choice?: string;
+
+ /**
+ * Tools (functions) available to the model.
+ */
+ tools?: Array<SessionCreateParams.Tool>;
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ turn_detection?: SessionCreateParams.TurnDetection;
+
+ /**
+ * The voice the model uses to respond. Voice cannot be changed during the session
+ * once the model has responded with audio at least once. Current voice options are
+ * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and `verse`.
+ */
+ voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace SessionCreateParams {
+ /**
+ * Configuration for input audio transcription, defaults to off and can be set to
+ * `null` to turn off once on. Input audio transcription is not native to the
+ * model, since the model consumes audio directly. Transcription runs
+ * asynchronously through Whisper and should be treated as rough guidance rather
+ * than the representation understood by the model.
+ */
+ export interface InputAudioTranscription {
+ /**
+ * The model to use for transcription; `whisper-1` is the only currently supported
+ * model.
+ */
+ model?: string;
+ }
+
+ export interface Tool {
+ /**
+ * The description of the function, including guidance on when and how to call it,
+ * and guidance about what to tell the user when calling (if anything).
+ */
+ description?: string;
+
+ /**
+ * The name of the function.
+ */
+ name?: string;
+
+ /**
+ * Parameters of the function in JSON Schema.
+ */
+ parameters?: unknown;
+
+ /**
+ * The type of the tool, i.e. `function`.
+ */
+ type?: 'function';
+ }
+
+ /**
+ * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+ * means that the model will detect the start and end of speech based on audio
+ * volume and respond at the end of user speech.
+ */
+ export interface TurnDetection {
+ /**
+ * Whether or not to automatically generate a response when VAD is enabled. `true`
+ * by default.
+ */
+ create_response?: boolean;
+
+ /**
+ * Amount of audio to include before the VAD detected speech (in milliseconds).
+ * Defaults to 300ms.
+ */
+ prefix_padding_ms?: number;
+
+ /**
+ * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms.
+ * With shorter values the model will respond more quickly, but may jump in on
+ * short pauses from the user.
+ */
+ silence_duration_ms?: number;
+
+ /**
+ * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+ * threshold will require louder audio to activate the model, and thus might
+ * perform better in noisy environments.
+ */
+ threshold?: number;
+
+ /**
+ * Type of turn detection; only `server_vad` is currently supported.
+ */
+ type?: string;
+ }
+}
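+
+// Illustrative sketch of a function tool entry matching SessionCreateParams.Tool;
+// the tool name, description, and JSON Schema below are made-up examples.
+//
+//   const lookupWeather: SessionCreateParams.Tool = {
+//     type: 'function',
+//     name: 'lookup_weather',
+//     description: 'Return the current weather for a given city.',
+//     parameters: {
+//       type: 'object',
+//       properties: { city: { type: 'string' } },
+//       required: ['city'],
+//     },
+//   };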
+
+export declare namespace Sessions {
+ export {
+ type Session as Session,
+ type SessionCreateResponse as SessionCreateResponse,
+ type SessionCreateParams as SessionCreateParams,
+ };
+}
diff --git a/tests/api-resources/beta/realtime/sessions.test.ts b/tests/api-resources/beta/realtime/sessions.test.ts
new file mode 100644
index 000000000..0ed998c27
--- /dev/null
+++ b/tests/api-resources/beta/realtime/sessions.test.ts
@@ -0,0 +1,45 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import OpenAI from 'openai';
+import { Response } from 'node-fetch';
+
+const client = new OpenAI({
+ apiKey: 'My API Key',
+ baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010',
+});
+
+describe('resource sessions', () => {
+ test('create: only required params', async () => {
+ const responsePromise = client.beta.realtime.sessions.create({ model: 'gpt-4o-realtime-preview' });
+ const rawResponse = await responsePromise.asResponse();
+ expect(rawResponse).toBeInstanceOf(Response);
+ const response = await responsePromise;
+ expect(response).not.toBeInstanceOf(Response);
+ const dataAndResponse = await responsePromise.withResponse();
+ expect(dataAndResponse.data).toBe(response);
+ expect(dataAndResponse.response).toBe(rawResponse);
+ });
+
+ test('create: required and optional params', async () => {
+ const response = await client.beta.realtime.sessions.create({
+ model: 'gpt-4o-realtime-preview',
+ input_audio_format: 'pcm16',
+ input_audio_transcription: { model: 'model' },
+ instructions: 'instructions',
+ max_response_output_tokens: 0,
+ modalities: ['text'],
+ output_audio_format: 'pcm16',
+ temperature: 0,
+ tool_choice: 'tool_choice',
+ tools: [{ description: 'description', name: 'name', parameters: {}, type: 'function' }],
+ turn_detection: {
+ create_response: true,
+ prefix_padding_ms: 0,
+ silence_duration_ms: 0,
+ threshold: 0,
+ type: 'type',
+ },
+ voice: 'alloy',
+ });
+ });
+});