diff --git a/.stats.yml b/.stats.yml index d223c8f1f..9600edae3 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,2 +1,2 @@ -configured_endpoints: 68 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-02200a58ed631064b6419711da99fefd6e97bdbbeb577a80a1a6e0c8dbcb18f5.yml +configured_endpoints: 69 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-b5b0e2c794b012919701c3fd43286af10fa25d33ceb8a881bec2636028f446e0.yml diff --git a/api.md b/api.md index e971585de..10f88e5e0 100644 --- a/api.md +++ b/api.md @@ -212,6 +212,66 @@ Methods: # Beta +## Realtime + +Types: + +- ConversationCreatedEvent +- ConversationItem +- ConversationItemContent +- ConversationItemCreateEvent +- ConversationItemCreatedEvent +- ConversationItemDeleteEvent +- ConversationItemDeletedEvent +- ConversationItemInputAudioTranscriptionCompletedEvent +- ConversationItemInputAudioTranscriptionFailedEvent +- ConversationItemTruncateEvent +- ConversationItemTruncatedEvent +- ErrorEvent +- InputAudioBufferAppendEvent +- InputAudioBufferClearEvent +- InputAudioBufferClearedEvent +- InputAudioBufferCommitEvent +- InputAudioBufferCommittedEvent +- InputAudioBufferSpeechStartedEvent +- InputAudioBufferSpeechStoppedEvent +- RateLimitsUpdatedEvent +- RealtimeClientEvent +- RealtimeResponse +- RealtimeResponseStatus +- RealtimeResponseUsage +- RealtimeServerEvent +- ResponseAudioDeltaEvent +- ResponseAudioDoneEvent +- ResponseAudioTranscriptDeltaEvent +- ResponseAudioTranscriptDoneEvent +- ResponseCancelEvent +- ResponseContentPartAddedEvent +- ResponseContentPartDoneEvent +- ResponseCreateEvent +- ResponseCreatedEvent +- ResponseDoneEvent +- ResponseFunctionCallArgumentsDeltaEvent +- ResponseFunctionCallArgumentsDoneEvent +- ResponseOutputItemAddedEvent +- ResponseOutputItemDoneEvent +- ResponseTextDeltaEvent +- ResponseTextDoneEvent +- SessionCreatedEvent +- SessionUpdateEvent +- SessionUpdatedEvent + +### Sessions + +Types: + +- Session +- SessionCreateResponse + +Methods: + +- client.beta.realtime.sessions.create({ ...params }) -> SessionCreateResponse + ## VectorStores Types: diff --git a/src/resources/beta/beta.ts b/src/resources/beta/beta.ts index 75035d600..48cc92369 100644 --- a/src/resources/beta/beta.ts +++ b/src/resources/beta/beta.ts @@ -20,6 +20,8 @@ import { RunStreamEvent, ThreadStreamEvent, } from './assistants'; +import * as RealtimeAPI from './realtime/realtime'; +import { Realtime } from './realtime/realtime'; import * as ThreadsAPI from './threads/threads'; import { AssistantResponseFormatOption, @@ -54,11 +56,13 @@ import { } from './vector-stores/vector-stores'; export class Beta extends APIResource { + realtime: RealtimeAPI.Realtime = new RealtimeAPI.Realtime(this._client); vectorStores: VectorStoresAPI.VectorStores = new VectorStoresAPI.VectorStores(this._client); assistants: AssistantsAPI.Assistants = new AssistantsAPI.Assistants(this._client); threads: ThreadsAPI.Threads = new ThreadsAPI.Threads(this._client); } +Beta.Realtime = Realtime; Beta.VectorStores = VectorStores; Beta.VectorStoresPage = VectorStoresPage; Beta.Assistants = Assistants; @@ -66,6 +70,8 @@ Beta.AssistantsPage = AssistantsPage; Beta.Threads = Threads; export declare namespace Beta { + export { Realtime as Realtime }; + export { VectorStores as VectorStores, type AutoFileChunkingStrategyParam as AutoFileChunkingStrategyParam, diff --git a/src/resources/beta/index.ts b/src/resources/beta/index.ts index 10212d390..b8db6f846 100644 --- a/src/resources/beta/index.ts +++ 
b/src/resources/beta/index.ts @@ -19,6 +19,7 @@ export { type AssistantListParams, } from './assistants'; export { Beta } from './beta'; +export { Realtime } from './realtime/index'; export { Threads, type AssistantResponseFormatOption, diff --git a/src/resources/beta/realtime/index.ts b/src/resources/beta/realtime/index.ts new file mode 100644 index 000000000..66c3ecaae --- /dev/null +++ b/src/resources/beta/realtime/index.ts @@ -0,0 +1,4 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +export { Realtime } from './realtime'; +export { Sessions, type Session, type SessionCreateResponse, type SessionCreateParams } from './sessions'; diff --git a/src/resources/beta/realtime/realtime.ts b/src/resources/beta/realtime/realtime.ts new file mode 100644 index 000000000..5de06917a --- /dev/null +++ b/src/resources/beta/realtime/realtime.ts @@ -0,0 +1,1904 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +import { APIResource } from '../../../resource'; +import * as RealtimeAPI from './realtime'; +import * as SessionsAPI from './sessions'; +import { + Session as SessionsAPISession, + SessionCreateParams, + SessionCreateResponse, + Sessions, +} from './sessions'; + +export class Realtime extends APIResource { + sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client); +} + +/** + * Returned when a conversation is created. Emitted right after session creation. + */ +export interface ConversationCreatedEvent { + /** + * The conversation resource. + */ + conversation: ConversationCreatedEvent.Conversation; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `conversation.created`. + */ + type: 'conversation.created'; +} + +export namespace ConversationCreatedEvent { + /** + * The conversation resource. + */ + export interface Conversation { + /** + * The unique ID of the conversation. + */ + id?: string; + + /** + * The object type, must be `realtime.conversation`. + */ + object?: 'realtime.conversation'; + } +} + +/** + * The item to add to the conversation. + */ +export interface ConversationItem { + /** + * The unique ID of the item. This can be generated by the client to help manage + * server-side context, but is not required because the server will generate one if + * not provided. + */ + id?: string; + + /** + * The arguments of the function call (for `function_call` items). + */ + arguments?: string; + + /** + * The ID of the function call (for `function_call` and `function_call_output` + * items). If passed on a `function_call_output` item, the server will check that a + * `function_call` item with the same ID exists in the conversation history. + */ + call_id?: string; + + /** + * The content of the message, applicable for `message` items. + * + * - Message items of role `system` support only `input_text` content + * - Message items of role `user` support `input_text` and `input_audio` content + * - Message items of role `assistant` support `text` content. + */ + content?: Array<ConversationItemContent>; + + /** + * The name of the function being called (for `function_call` items). + */ + name?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The output of the function call (for `function_call_output` items). + */ + output?: string; + + /** + * The role of the message sender (`user`, `assistant`, `system`), only applicable + * for `message` items.
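+ * As an illustrative sketch (not part of the generated docs), a user message
+ * item pairs `role: 'user'` with `type: 'message'` and text content:
+ *
+ * ```ts
+ * const item: ConversationItem = {
+ *   type: 'message',
+ *   role: 'user',
+ *   content: [{ type: 'input_text', text: 'Hello!' }],
+ * };
+ * ```
+ *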
+ */ + role?: 'user' | 'assistant' | 'system'; + + /** + * The status of the item (`completed`, `incomplete`). These have no effect on the + * conversation, but are accepted for consistency with the + * `conversation.item.created` event. + */ + status?: 'completed' | 'incomplete'; + + /** + * The type of the item (`message`, `function_call`, `function_call_output`). + */ + type?: 'message' | 'function_call' | 'function_call_output'; +} + +export interface ConversationItemContent { + /** + * ID of a previous conversation item to reference (for `item_reference` content + * types in `response.create` events). These can reference both client and server + * created items. + */ + id?: string; + + /** + * Base64-encoded audio bytes, used for `input_audio` content type. + */ + audio?: string; + + /** + * The text content, used for `input_text` and `text` content types. + */ + text?: string; + + /** + * The transcript of the audio, used for `input_audio` content type. + */ + transcript?: string; + + /** + * The content type (`input_text`, `input_audio`, `item_reference`, `text`). + */ + type?: 'input_text' | 'input_audio' | 'item_reference' | 'text'; +} + +/** + * Add a new Item to the Conversation's context, including messages, function + * calls, and function call responses. This event can be used both to populate a + * "history" of the conversation and to add new items mid-stream, but has the + * current limitation that it cannot populate assistant audio messages. + * + * If successful, the server will respond with a `conversation.item.created` event, + * otherwise an `error` event will be sent. + */ +export interface ConversationItemCreateEvent { + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The event type, must be `conversation.item.create`. + */ + type: 'conversation.item.create'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * The ID of the preceding item after which the new item will be inserted. If not + * set, the new item will be appended to the end of the conversation. If set, it + * allows an item to be inserted mid-conversation. If the ID cannot be found, an + * error will be returned and the item will not be added. + */ + previous_item_id?: string; +} + +/** + * Returned when a conversation item is created. There are several scenarios that + * produce this event: + * + * - The server is generating a Response, which if successful will produce either + * one or two Items, which will be of type `message` (role `assistant`) or type + * `function_call`. + * - The input audio buffer has been committed, either by the client or the server + * (in `server_vad` mode). The server will take the content of the input audio + * buffer and add it to a new user message Item. + * - The client has sent a `conversation.item.create` event to add a new Item to + * the Conversation. + */ +export interface ConversationItemCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The ID of the preceding item in the Conversation context, allows the client to + * understand the order of the conversation. + */ + previous_item_id: string; + + /** + * The event type, must be `conversation.item.created`. + */ + type: 'conversation.item.created'; +} + +/** + * Send this event when you want to remove any item from the conversation history. 
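+ * A minimal client event, as a sketch (the `item_id` value here is hypothetical):
+ *
+ * ```ts
+ * const event: ConversationItemDeleteEvent = {
+ *   type: 'conversation.item.delete',
+ *   item_id: 'item_abc123', // ID of a previously created item
+ * };
+ * ```
+ *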
+ * The server will respond with a `conversation.item.deleted` event, unless the + * item does not exist in the conversation history, in which case the server will + * respond with an error. + */ +export interface ConversationItemDeleteEvent { + /** + * The ID of the item to delete. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.delete`. + */ + type: 'conversation.item.delete'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an item in the conversation is deleted by the client with a + * `conversation.item.delete` event. This event is used to synchronize the server's + * understanding of the conversation history with the client's view. + */ +export interface ConversationItemDeletedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item that was deleted. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.deleted`. + */ + type: 'conversation.item.deleted'; +} + +/** + * This event is the output of audio transcription for user audio written to the + * user audio buffer. Transcription begins when the input audio buffer is committed + * by the client or server (in `server_vad` mode). Transcription runs + * asynchronously with Response creation, so this event may come before or after + * the Response events. + * + * Realtime API models accept audio natively, and thus input transcription is a + * separate process run on a separate ASR (Automatic Speech Recognition) model, + * currently always `whisper-1`. Thus the transcript may diverge somewhat from the + * model's interpretation, and should be treated as a rough guide. + */ +export interface ConversationItemInputAudioTranscriptionCompletedEvent { + /** + * The index of the content part containing the audio. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item containing the audio. + */ + item_id: string; + + /** + * The transcribed text. + */ + transcript: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.completed`. + */ + type: 'conversation.item.input_audio_transcription.completed'; +} + +/** + * Returned when input audio transcription is configured, and a transcription + * request for a user message failed. These events are separate from other `error` + * events so that the client can identify the related Item. + */ +export interface ConversationItemInputAudioTranscriptionFailedEvent { + /** + * The index of the content part containing the audio. + */ + content_index: number; + + /** + * Details of the transcription error. + */ + error: ConversationItemInputAudioTranscriptionFailedEvent.Error; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.failed`. + */ + type: 'conversation.item.input_audio_transcription.failed'; +} + +export namespace ConversationItemInputAudioTranscriptionFailedEvent { + /** + * Details of the transcription error. + */ + export interface Error { + /** + * Error code, if any. + */ + code?: string; + + /** + * A human-readable error message. + */ + message?: string; + + /** + * Parameter related to the error, if any. + */ + param?: string; + + /** + * The type of error. 
+ */ + type?: string; + } +} + +/** + * Send this event to truncate a previous assistant message’s audio. The server + * will produce audio faster than realtime, so this event is useful when the user + * interrupts to truncate audio that has already been sent to the client but not + * yet played. This will synchronize the server's understanding of the audio with + * the client's playback. + * + * Truncating audio will delete the server-side text transcript to ensure there is + * no text in the context that hasn't been heard by the user. + * + * If successful, the server will respond with a `conversation.item.truncated` + * event. + */ +export interface ConversationItemTruncateEvent { + /** + * Inclusive duration up to which audio is truncated, in milliseconds. If the + * audio_end_ms is greater than the actual audio duration, the server will respond + * with an error. + */ + audio_end_ms: number; + + /** + * The index of the content part to truncate. Set this to 0. + */ + content_index: number; + + /** + * The ID of the assistant message item to truncate. Only assistant message items + * can be truncated. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.truncate`. + */ + type: 'conversation.item.truncate'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an earlier assistant audio message item is truncated by the client + * with a `conversation.item.truncate` event. This event is used to synchronize the + * server's understanding of the audio with the client's playback. + * + * This action will truncate the audio and remove the server-side text transcript + * to ensure there is no text in the context that hasn't been heard by the user. + */ +export interface ConversationItemTruncatedEvent { + /** + * The duration up to which the audio was truncated, in milliseconds. + */ + audio_end_ms: number; + + /** + * The index of the content part that was truncated. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the assistant message item that was truncated. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.truncated`. + */ + type: 'conversation.item.truncated'; +} + +/** + * Returned when an error occurs, which could be a client problem or a server + * problem. Most errors are recoverable and the session will stay open; we + * recommend that implementers monitor and log error messages by default. + */ +export interface ErrorEvent { + /** + * Details of the error. + */ + error: ErrorEvent.Error; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `error`. + */ + type: 'error'; +} + +export namespace ErrorEvent { + /** + * Details of the error. + */ + export interface Error { + /** + * A human-readable error message. + */ + message: string; + + /** + * The type of error (e.g., "invalid_request_error", "server_error"). + */ + type: string; + + /** + * Error code, if any. + */ + code?: string | null; + + /** + * The event_id of the client event that caused the error, if applicable. + */ + event_id?: string | null; + + /** + * Parameter related to the error, if any. + */ + param?: string | null; + } +} + +/** + * Send this event to append audio bytes to the input audio buffer. The audio + * buffer is temporary storage you can write to and later commit.
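+ * For example, a client streaming microphone audio might send small chunks; a
+ * sketch, assuming `chunk` holds raw PCM16 bytes matching the session's
+ * `input_audio_format`:
+ *
+ * ```ts
+ * const event: InputAudioBufferAppendEvent = {
+ *   type: 'input_audio_buffer.append',
+ *   audio: Buffer.from(chunk).toString('base64'), // base64-encode the raw bytes
+ * };
+ * ```
+ *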
In Server VAD + * mode, the audio buffer is used to detect speech and the server will decide when + * to commit. When Server VAD is disabled, you must commit the audio buffer + * manually. + * + * The client may choose how much audio to place in each event up to a maximum of + * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD + * to be more responsive. Unlike most other client events, the server will not send + * a confirmation response to this event. + */ +export interface InputAudioBufferAppendEvent { + /** + * Base64-encoded audio bytes. This must be in the format specified by the + * `input_audio_format` field in the session configuration. + */ + audio: string; + + /** + * The event type, must be `input_audio_buffer.append`. + */ + type: 'input_audio_buffer.append'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Send this event to clear the audio bytes in the buffer. The server will respond + * with an `input_audio_buffer.cleared` event. + */ +export interface InputAudioBufferClearEvent { + /** + * The event type, must be `input_audio_buffer.clear`. + */ + type: 'input_audio_buffer.clear'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when the input audio buffer is cleared by the client with an + * `input_audio_buffer.clear` event. + */ +export interface InputAudioBufferClearedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `input_audio_buffer.cleared`. + */ + type: 'input_audio_buffer.cleared'; +} + +/** + * Send this event to commit the user input audio buffer, which will create a new + * user message item in the conversation. This event will produce an error if the + * input audio buffer is empty. When in Server VAD mode, the client does not need + * to send this event; the server will commit the audio buffer automatically. + * + * Committing the input audio buffer will trigger input audio transcription (if + * enabled in session configuration), but it will not create a response from the + * model. The server will respond with an `input_audio_buffer.committed` event. + */ +export interface InputAudioBufferCommitEvent { + /** + * The event type, must be `input_audio_buffer.commit`. + */ + type: 'input_audio_buffer.commit'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an input audio buffer is committed, either by the client or + * automatically in server VAD mode. The `item_id` property is the ID of the user + * message item that will be created, thus a `conversation.item.created` event will + * also be sent to the client. + */ +export interface InputAudioBufferCommittedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created. + */ + item_id: string; + + /** + * The ID of the preceding item after which the new item will be inserted. + */ + previous_item_id: string; + + /** + * The event type, must be `input_audio_buffer.committed`. + */ + type: 'input_audio_buffer.committed'; +} + +/** + * Sent by the server when in `server_vad` mode to indicate that speech has been + * detected in the audio buffer. This can happen any time audio is added to the + * buffer (unless speech is already detected).
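+ * A sketch of a client-side handler (the `player` playback wrapper here is
+ * hypothetical):
+ *
+ * ```ts
+ * function onSpeechStarted(event: InputAudioBufferSpeechStartedEvent, player: { pause(): void }) {
+ *   player.pause(); // stop assistant audio so the user can barge in
+ * }
+ * ```
+ *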
The client may want to use this + * event to interrupt audio playback or provide visual feedback to the user. + * + * The client should expect to receive an `input_audio_buffer.speech_stopped` event + * when speech stops. The `item_id` property is the ID of the user message item + * that will be created when speech stops and will also be included in the + * `input_audio_buffer.speech_stopped` event (unless the client manually commits + * the audio buffer during VAD activation). + */ +export interface InputAudioBufferSpeechStartedEvent { + /** + * Milliseconds from the start of all audio written to the buffer during the + * session when speech was first detected. This will correspond to the beginning of + * audio sent to the model, and thus includes the `prefix_padding_ms` configured in + * the Session. + */ + audio_start_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created when speech stops. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.speech_started`. + */ + type: 'input_audio_buffer.speech_started'; +} + +/** + * Returned in `server_vad` mode when the server detects the end of speech in the + * audio buffer. The server will also send a `conversation.item.created` event + * with the user message item that is created from the audio buffer. + */ +export interface InputAudioBufferSpeechStoppedEvent { + /** + * Milliseconds since the session started when speech stopped. This will correspond + * to the end of audio sent to the model, and thus includes the + * `min_silence_duration_ms` configured in the Session. + */ + audio_end_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.speech_stopped`. + */ + type: 'input_audio_buffer.speech_stopped'; +} + +/** + * Emitted at the beginning of a Response to indicate the updated rate limits. When + * a Response is created, some tokens will be "reserved" for the output tokens; the + * rate limits shown here reflect that reservation, which is then adjusted + * accordingly once the Response is completed. + */ +export interface RateLimitsUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * List of rate limit information. + */ + rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>; + + /** + * The event type, must be `rate_limits.updated`. + */ + type: 'rate_limits.updated'; +} + +export namespace RateLimitsUpdatedEvent { + export interface RateLimit { + /** + * The maximum allowed value for the rate limit. + */ + limit?: number; + + /** + * The name of the rate limit (`requests`, `tokens`). + */ + name?: 'requests' | 'tokens'; + + /** + * The remaining value before the limit is reached. + */ + remaining?: number; + + /** + * Seconds until the rate limit resets. + */ + reset_seconds?: number; + } +} + +/** + * All events that the client can send to the Realtime API + */ +export type RealtimeClientEvent = + | SessionUpdateEvent + | InputAudioBufferAppendEvent + | InputAudioBufferCommitEvent + | InputAudioBufferClearEvent + | ConversationItemCreateEvent + | ConversationItemTruncateEvent + | ConversationItemDeleteEvent + | ResponseCreateEvent + | ResponseCancelEvent; + +/** + * The response resource. + */ +export interface RealtimeResponse { + /** + * The unique ID of the response.
+ */ + id?: string; + + /** + * Developer-provided string key-value pairs associated with this response. + */ + metadata?: unknown | null; + + /** + * The object type, must be `realtime.response`. + */ + object?: 'realtime.response'; + + /** + * The list of output items generated by the response. + */ + output?: Array<ConversationItem>; + + /** + * The final status of the response (`completed`, `cancelled`, `failed`, or + * `incomplete`). + */ + status?: 'completed' | 'cancelled' | 'failed' | 'incomplete'; + + /** + * Additional details about the status. + */ + status_details?: RealtimeResponseStatus; + + /** + * Usage statistics for the Response; this will correspond to billing. A Realtime + * API session will maintain a conversation context and append new Items to the + * Conversation, thus output from previous turns (text and audio tokens) will + * become the input for later turns. + */ + usage?: RealtimeResponseUsage; +} + +/** + * Additional details about the status. + */ +export interface RealtimeResponseStatus { + /** + * A description of the error that caused the response to fail, populated when the + * `status` is `failed`. + */ + error?: RealtimeResponseStatus.Error; + + /** + * The reason the Response did not complete. For a `cancelled` Response, one of + * `turn_detected` (the server VAD detected a new start of speech) or + * `client_cancelled` (the client sent a cancel event). For an `incomplete` + * Response, one of `max_output_tokens` or `content_filter` (the server-side safety + * filter activated and cut off the response). + */ + reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter'; + + /** + * The type of error that caused the response to fail, corresponding with the + * `status` field (`completed`, `cancelled`, `incomplete`, `failed`). + */ + type?: 'completed' | 'cancelled' | 'incomplete' | 'failed'; +} + +export namespace RealtimeResponseStatus { + /** + * A description of the error that caused the response to fail, populated when the + * `status` is `failed`. + */ + export interface Error { + /** + * Error code, if any. + */ + code?: string; + + /** + * The type of error. + */ + type?: string; + } +} + +/** + * Usage statistics for the Response; this will correspond to billing. A Realtime + * API session will maintain a conversation context and append new Items to the + * Conversation, thus output from previous turns (text and audio tokens) will + * become the input for later turns. + */ +export interface RealtimeResponseUsage { + /** + * Details about the input tokens used in the Response. + */ + input_token_details?: RealtimeResponseUsage.InputTokenDetails; + + /** + * The number of input tokens used in the Response, including text and audio + * tokens. + */ + input_tokens?: number; + + /** + * Details about the output tokens used in the Response. + */ + output_token_details?: RealtimeResponseUsage.OutputTokenDetails; + + /** + * The number of output tokens sent in the Response, including text and audio + * tokens. + */ + output_tokens?: number; + + /** + * The total number of tokens in the Response including input and output text and + * audio tokens. + */ + total_tokens?: number; +} + +export namespace RealtimeResponseUsage { + /** + * Details about the input tokens used in the Response. + */ + export interface InputTokenDetails { + /** + * The number of audio tokens used in the Response. + */ + audio_tokens?: number; + + /** + * The number of cached tokens used in the Response.
+ */ + cached_tokens?: number; + + /** + * The number of text tokens used in the Response. + */ + text_tokens?: number; + } + + /** + * Details about the output tokens used in the Response. + */ + export interface OutputTokenDetails { + /** + * The number of audio tokens used in the Response. + */ + audio_tokens?: number; + + /** + * The number of text tokens used in the Response. + */ + text_tokens?: number; + } +} + +/** + * All events that the Realtime API can send back + */ +export type RealtimeServerEvent = + | ErrorEvent + | SessionCreatedEvent + | SessionUpdatedEvent + | ConversationCreatedEvent + | InputAudioBufferCommittedEvent + | InputAudioBufferClearedEvent + | InputAudioBufferSpeechStartedEvent + | InputAudioBufferSpeechStoppedEvent + | ConversationItemCreatedEvent + | ConversationItemInputAudioTranscriptionCompletedEvent + | ConversationItemInputAudioTranscriptionFailedEvent + | ConversationItemTruncatedEvent + | ConversationItemDeletedEvent + | ResponseCreatedEvent + | ResponseDoneEvent + | ResponseOutputItemAddedEvent + | ResponseOutputItemDoneEvent + | ResponseContentPartAddedEvent + | ResponseContentPartDoneEvent + | ResponseTextDeltaEvent + | ResponseTextDoneEvent + | ResponseAudioTranscriptDeltaEvent + | ResponseAudioTranscriptDoneEvent + | ResponseAudioDeltaEvent + | ResponseAudioDoneEvent + | ResponseFunctionCallArgumentsDeltaEvent + | ResponseFunctionCallArgumentsDoneEvent + | RateLimitsUpdatedEvent; + +/** + * Returned when the model-generated audio is updated. + */ +export interface ResponseAudioDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * Base64-encoded audio data delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.audio.delta`. + */ + type: 'response.audio.delta'; +} + +/** + * Returned when the model-generated audio is done. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +export interface ResponseAudioDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.audio.done`. + */ + type: 'response.audio.done'; +} + +/** + * Returned when the model-generated transcription of audio output is updated. + */ +export interface ResponseAudioTranscriptDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The transcript delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.audio_transcript.delta`. 
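+ * Clients typically concatenate these deltas into a running transcript; a
+ * sketch:
+ *
+ * ```ts
+ * let transcript = '';
+ * function onTranscriptDelta(event: ResponseAudioTranscriptDeltaEvent) {
+ *   transcript += event.delta; // complete once response.audio_transcript.done arrives
+ * }
+ * ```
+ *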
+ */ + type: 'response.audio_transcript.delta'; +} + +/** + * Returned when the model-generated transcription of audio output is done + * streaming. Also emitted when a Response is interrupted, incomplete, or + * cancelled. + */ +export interface ResponseAudioTranscriptDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The final transcript of the audio. + */ + transcript: string; + + /** + * The event type, must be `response.audio_transcript.done`. + */ + type: 'response.audio_transcript.done'; +} + +/** + * Send this event to cancel an in-progress response. The server will respond with + * a `response.cancelled` event or an error if there is no response to cancel. + */ +export interface ResponseCancelEvent { + /** + * The event type, must be `response.cancel`. + */ + type: 'response.cancel'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * A specific response ID to cancel - if not provided, will cancel an in-progress + * response in the default conversation. + */ + response_id?: string; +} + +/** + * Returned when a new content part is added to an assistant message item during + * response generation. + */ +export interface ResponseContentPartAddedEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item to which the content part was added. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The content part that was added. + */ + part: ResponseContentPartAddedEvent.Part; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.content_part.added`. + */ + type: 'response.content_part.added'; +} + +export namespace ResponseContentPartAddedEvent { + /** + * The content part that was added. + */ + export interface Part { + /** + * Base64-encoded audio data (if type is "audio"). + */ + audio?: string; + + /** + * The text content (if type is "text"). + */ + text?: string; + + /** + * The transcript of the audio (if type is "audio"). + */ + transcript?: string; + + /** + * The content type ("text", "audio"). + */ + type?: 'text' | 'audio'; + } +} + +/** + * Returned when a content part is done streaming in an assistant message item. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseContentPartDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The content part that is done. + */ + part: ResponseContentPartDoneEvent.Part; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.content_part.done`. + */ + type: 'response.content_part.done'; +} + +export namespace ResponseContentPartDoneEvent { + /** + * The content part that is done. 
+ */ + export interface Part { + /** + * Base64-encoded audio data (if type is "audio"). + */ + audio?: string; + + /** + * The text content (if type is "text"). + */ + text?: string; + + /** + * The transcript of the audio (if type is "audio"). + */ + transcript?: string; + + /** + * The content type ("text", "audio"). + */ + type?: 'text' | 'audio'; + } +} + +/** + * This event instructs the server to create a Response, which means triggering + * model inference. When in Server VAD mode, the server will create Responses + * automatically. + * + * A Response will include at least one Item, and may have two, in which case the + * second will be a function call. These Items will be appended to the conversation + * history. + * + * The server will respond with a `response.created` event, events for Items and + * content created, and finally a `response.done` event to indicate the Response is + * complete. + * + * The `response.create` event includes inference configuration like + * `instructions` and `temperature`. These fields will override the Session's + * configuration for this Response only. + */ +export interface ResponseCreateEvent { + /** + * The event type, must be `response.create`. + */ + type: 'response.create'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * Create a new Realtime response with these parameters + */ + response?: ResponseCreateEvent.Response; +} + +export namespace ResponseCreateEvent { + /** + * Create a new Realtime response with these parameters + */ + export interface Response { + /** + * Controls which conversation the response is added to. Currently supports `auto` + * and `none`, with `auto` as the default value. The `auto` value means that the + * contents of the response will be added to the default conversation. Set this to + * `none` to create an out-of-band response which will not add items to the default + * conversation. + */ + conversation?: (string & {}) | 'auto' | 'none'; + + /** + * Input items to include in the prompt for the model. Creates a new context for + * this response, without including the default conversation. Can include + * references to items from the default conversation. + */ + input?: Array<RealtimeAPI.ConversationItem>; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * Set of 16 key-value pairs that can be attached to an object. This can be useful + * for storing additional information about the object in a structured format.
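+ * For example, `metadata: { user_id: 'user_123' }` (a hypothetical key).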
Keys + * can be a maximum of 64 characters long and values can be a maximum of 512 + * characters long. + */ + metadata?: unknown | null; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. Options are `auto`, `none`, `required`, or specify + * a function, like `{"type": "function", "function": {"name": "my_function"}}`. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array<Response.Tool>; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`. + */ + voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; + } + + export namespace Response { + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + } +} + +/** + * Returned when a new Response is created. The first event of response creation, + * where the response is in an initial state of `in_progress`. + */ +export interface ResponseCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The response resource. + */ + response: RealtimeResponse; + + /** + * The event type, must be `response.created`. + */ + type: 'response.created'; +} + +/** + * Returned when a Response is done streaming. Always emitted, no matter the final + * state. The Response object included in the `response.done` event will include + * all output Items in the Response but will omit the raw audio data. + */ +export interface ResponseDoneEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The response resource. + */ + response: RealtimeResponse; + + /** + * The event type, must be `response.done`. + */ + type: 'response.done'; +} + +/** + * Returned when the model-generated function call arguments are updated. + */ +export interface ResponseFunctionCallArgumentsDeltaEvent { + /** + * The ID of the function call. + */ + call_id: string; + + /** + * The arguments delta as a JSON string. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the function call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.function_call_arguments.delta`. + */ + type: 'response.function_call_arguments.delta'; +} + +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled.
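+ * On receiving this event, a client usually parses the final arguments and
+ * returns the result as a `function_call_output` item; a sketch, assuming `send`
+ * transmits a client event over the open connection and `runTool` is your own
+ * function:
+ *
+ * ```ts
+ * function onArgsDone(event: ResponseFunctionCallArgumentsDoneEvent) {
+ *   const args = JSON.parse(event.arguments); // final JSON arguments
+ *   const output = JSON.stringify(runTool(args));
+ *   send({
+ *     type: 'conversation.item.create',
+ *     item: { type: 'function_call_output', call_id: event.call_id, output },
+ *   });
+ *   send({ type: 'response.create' }); // ask the model to continue with the result
+ * }
+ * ```
+ *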
+ */ +export interface ResponseFunctionCallArgumentsDoneEvent { + /** + * The final arguments as a JSON string. + */ + arguments: string; + + /** + * The ID of the function call. + */ + call_id: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the function call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.function_call_arguments.done`. + */ + type: 'response.function_call_arguments.done'; +} + +/** + * Returned when a new Item is created during Response generation. + */ +export interface ResponseOutputItemAddedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The index of the output item in the Response. + */ + output_index: number; + + /** + * The ID of the Response to which the item belongs. + */ + response_id: string; + + /** + * The event type, must be `response.output_item.added`. + */ + type: 'response.output_item.added'; +} + +/** + * Returned when an Item is done streaming. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +export interface ResponseOutputItemDoneEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The index of the output item in the Response. + */ + output_index: number; + + /** + * The ID of the Response to which the item belongs. + */ + response_id: string; + + /** + * The event type, must be `response.output_item.done`. + */ + type: 'response.output_item.done'; +} + +/** + * Returned when the text value of a "text" content part is updated. + */ +export interface ResponseTextDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The text delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.text.delta`. + */ + type: 'response.text.delta'; +} + +/** + * Returned when the text value of a "text" content part is done streaming. Also + * emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseTextDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The final text content. + */ + text: string; + + /** + * The event type, must be `response.text.done`. + */ + type: 'response.text.done'; +} + +/** + * Returned when a Session is created. Emitted automatically when a new connection + * is established as the first server event. This event will contain the default + * Session configuration. + */ +export interface SessionCreatedEvent { + /** + * The unique ID of the server event. 
+ */ + event_id: string; + + /** + * Realtime session object configuration. + */ + session: SessionsAPI.Session; + + /** + * The event type, must be `session.created`. + */ + type: 'session.created'; +} + +/** + * Send this event to update the session’s default configuration. The client may + * send this event at any time to update the session configuration, and any field + * may be updated at any time, except for "voice". The server will respond with a + * `session.updated` event that shows the full effective configuration. Only fields + * that are present are updated, thus the correct way to clear a field like + * "instructions" is to pass an empty string. + */ +export interface SessionUpdateEvent { + /** + * Realtime session object configuration. + */ + session: SessionUpdateEvent.Session; + + /** + * The event type, must be `session.update`. + */ + type: 'session.update'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +export namespace SessionUpdateEvent { + /** + * Realtime session object configuration. + */ + export interface Session { + /** + * The Realtime model used for this session. + */ + model: + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + input_audio_transcription?: Session.InputAudioTranscription; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. 
Options are `auto`, `none`, `required`, or specify + * a function. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array<Session.Tool>; + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + turn_detection?: Session.TurnDetection; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`. + */ + voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; + } + + export namespace Session { + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + export interface InputAudioTranscription { + /** + * The model to use for transcription, `whisper-1` is the only currently supported + * model. + */ + model?: string; + } + + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Whether or not to automatically generate a response when VAD is enabled. `true` + * by default. + */ + create_response?: boolean; + + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } + } +} + +/** + * Returned when a session is updated with a `session.update` event, unless there + * is an error. + */ +export interface SessionUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * Realtime session object configuration. + */ + session: SessionsAPI.Session; + + /** + * The event type, must be `session.updated`.
+ */ + type: 'session.updated'; +} + +Realtime.Sessions = Sessions; + +export declare namespace Realtime { + export { + Sessions as Sessions, + type SessionsAPISession as Session, + type SessionCreateResponse as SessionCreateResponse, + type SessionCreateParams as SessionCreateParams, + }; +} diff --git a/src/resources/beta/realtime/sessions.ts b/src/resources/beta/realtime/sessions.ts new file mode 100644 index 000000000..c1082d236 --- /dev/null +++ b/src/resources/beta/realtime/sessions.ts @@ -0,0 +1,546 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +import { APIResource } from '../../../resource'; +import * as Core from '../../../core'; + +export class Sessions extends APIResource { + /** + * Create an ephemeral API token for use in client-side applications with the + * Realtime API. Can be configured with the same session parameters as the + * `session.update` client event. + * + * It responds with a session object, plus a `client_secret` key which contains a + * usable ephemeral API token that can be used to authenticate browser clients for + * the Realtime API. + */ + create(body: SessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<SessionCreateResponse> { + return this._client.post('/realtime/sessions', { + body, + ...options, + headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers }, + }); + } +} + +/** + * Realtime session object configuration. + */ +export interface Session { + /** + * Unique identifier for the session object. + */ + id?: string; + + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + input_audio_transcription?: Session.InputAudioTranscription; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The Realtime model used for this session.
+ */ + model?: + | (string & {}) + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. Options are `auto`, `none`, `required`, or specify + * a function. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array<Session.Tool>; + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + turn_detection?: Session.TurnDetection | null; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`. + */ + voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; +} + +export namespace Session { + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + export interface InputAudioTranscription { + /** + * The model to use for transcription, `whisper-1` is the only currently supported + * model. + */ + model?: string; + } + + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: 'server_vad'; + } +} + +/** + * A new Realtime session configuration, with an ephemeral key. Default TTL for + * keys is one minute.
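+ * Minting a token server-side might look like this sketch (`client` is an
+ * instance of this SDK's OpenAI client; error handling omitted):
+ *
+ * ```ts
+ * const session = await client.beta.realtime.sessions.create({
+ *   model: 'gpt-4o-realtime-preview-2024-12-17',
+ * });
+ * const ephemeralKey = session.client_secret?.value; // hand this to the browser
+ * ```
+ *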
+ */
+export interface SessionCreateResponse {
+  /**
+   * Ephemeral key returned by the API.
+   */
+  client_secret?: SessionCreateResponse.ClientSecret;
+
+  /**
+   * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  input_audio_format?: string;
+
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
+
+  /**
+   * The default system instructions (i.e. system message) prepended to model
+   * calls. This field allows the client to guide the model on desired responses.
+   * The model can be instructed on response content and format (e.g. "be extremely
+   * succinct", "act friendly", "here are examples of good responses") and on audio
+   * behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh
+   * frequently"). The instructions are not guaranteed to be followed by the model,
+   * but they provide guidance to the model on the desired behavior.
+   *
+   * Note that the server sets default instructions which will be used if this
+   * field is not set and are visible in the `session.created` event at the start
+   * of the session.
+   */
+  instructions?: string;
+
+  /**
+   * Maximum number of output tokens for a single assistant response, inclusive of
+   * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+   * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+   */
+  max_response_output_tokens?: number | 'inf';
+
+  /**
+   * The set of modalities the model can respond with. To disable audio, set this
+   * to ["text"].
+   */
+  modalities?: Array<'text' | 'audio'>;
+
+  /**
+   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  output_audio_format?: string;
+
+  /**
+   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+   */
+  temperature?: number;
+
+  /**
+   * How the model chooses tools. Options are `auto`, `none`, `required`, or
+   * specify a function.
+   */
+  tool_choice?: string;
+
+  /**
+   * Tools (functions) available to the model.
+   */
+  tools?: Array<SessionCreateResponse.Tool>;
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  turn_detection?: SessionCreateResponse.TurnDetection;
+
+  /**
+   * The voice the model uses to respond. Voice cannot be changed during the
+   * session once the model has responded with audio at least once. Current voice
+   * options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and
+   * `verse`.
+   */
+  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace SessionCreateResponse {
+  /**
+   * Ephemeral key returned by the API.
+   */
+  export interface ClientSecret {
+    /**
+     * Timestamp for when the token expires. Currently, all tokens expire after one
+     * minute.
+     */
+    expires_at?: number;
+
+    /**
+     * Ephemeral key usable in client environments to authenticate connections to
+     * the Realtime API. Use this in client-side environments rather than a
+     * standard API token, which should only be used server-side.
+     */
+    value?: string;
+  }
+
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  export interface InputAudioTranscription {
+    /**
+     * The model to use for transcription; `whisper-1` is the only currently
+     * supported model.
+     */
+    model?: string;
+  }
+
+  export interface Tool {
+    /**
+     * The description of the function, including guidance on when and how to call
+     * it, and guidance about what to tell the user when calling (if anything).
+     */
+    description?: string;
+
+    /**
+     * The name of the function.
+     */
+    name?: string;
+
+    /**
+     * Parameters of the function in JSON Schema.
+     */
+    parameters?: unknown;
+
+    /**
+     * The type of the tool, i.e. `function`.
+     */
+    type?: 'function';
+  }
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  export interface TurnDetection {
+    /**
+     * Amount of audio to include before the VAD detected speech (in milliseconds).
+     * Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
+
+    /**
+     * Duration of silence to detect speech stop (in milliseconds). Defaults to
+     * 500ms. With shorter values the model will respond more quickly, but may jump
+     * in on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+
+    /**
+     * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+     * threshold will require louder audio to activate the model, and thus might
+     * perform better in noisy environments.
+     */
+    threshold?: number;
+
+    /**
+     * Type of turn detection, only `server_vad` is currently supported.
+     */
+    type?: string;
+  }
+}
+
+export interface SessionCreateParams {
+  /**
+   * The Realtime model used for this session.
+   */
+  model:
+    | 'gpt-4o-realtime-preview'
+    | 'gpt-4o-realtime-preview-2024-10-01'
+    | 'gpt-4o-realtime-preview-2024-12-17'
+    | 'gpt-4o-mini-realtime-preview'
+    | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+  /**
+   * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
+
+  /**
+   * The default system instructions (i.e. system message) prepended to model
+   * calls. This field allows the client to guide the model on desired responses.
+   * The model can be instructed on response content and format (e.g. "be extremely
+   * succinct", "act friendly", "here are examples of good responses") and on audio
+   * behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh
+   * frequently"). The instructions are not guaranteed to be followed by the model,
+   * but they provide guidance to the model on the desired behavior.
+   *
+   * Note that the server sets default instructions which will be used if this
+   * field is not set and are visible in the `session.created` event at the start
+   * of the session.
+   */
+  instructions?: string;
+
+  /**
+   * Maximum number of output tokens for a single assistant response, inclusive of
+   * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+   * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+   */
+  max_response_output_tokens?: number | 'inf';
+
+  /**
+   * The set of modalities the model can respond with. To disable audio, set this
+   * to ["text"].
+   */
+  modalities?: Array<'text' | 'audio'>;
+
+  /**
+   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+  /**
+   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+   */
+  temperature?: number;
+
+  /**
+   * How the model chooses tools. Options are `auto`, `none`, `required`, or
+   * specify a function.
+   */
+  tool_choice?: string;
+
+  /**
+   * Tools (functions) available to the model.
+   */
+  tools?: Array<SessionCreateParams.Tool>;
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  turn_detection?: SessionCreateParams.TurnDetection;
+
+  /**
+   * The voice the model uses to respond. Voice cannot be changed during the
+   * session once the model has responded with audio at least once. Current voice
+   * options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and
+   * `verse`.
+   */
+  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace SessionCreateParams {
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  export interface InputAudioTranscription {
+    /**
+     * The model to use for transcription; `whisper-1` is the only currently
+     * supported model.
+     */
+    model?: string;
+  }
+
+  export interface Tool {
+    /**
+     * The description of the function, including guidance on when and how to call
+     * it, and guidance about what to tell the user when calling (if anything).
+     */
+    description?: string;
+
+    /**
+     * The name of the function.
+     */
+    name?: string;
+
+    /**
+     * Parameters of the function in JSON Schema.
+     */
+    parameters?: unknown;
+
+    /**
+     * The type of the tool, i.e. `function`.
+     */
+    type?: 'function';
+  }
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  export interface TurnDetection {
+    /**
+     * Whether or not to automatically generate a response when VAD is enabled.
+     * `true` by default.
+     */
+    create_response?: boolean;
+
+    /**
+     * Amount of audio to include before the VAD detected speech (in milliseconds).
+     * Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
+
+    /**
+     * Duration of silence to detect speech stop (in milliseconds). Defaults to
+     * 500ms.
+     * With shorter values the model will respond more quickly, but may jump in
+     * on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+
+    /**
+     * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+     * threshold will require louder audio to activate the model, and thus might
+     * perform better in noisy environments.
+     */
+    threshold?: number;
+
+    /**
+     * Type of turn detection, only `server_vad` is currently supported.
+     */
+    type?: string;
+  }
+}
+
+export declare namespace Sessions {
+  export {
+    type Session as Session,
+    type SessionCreateResponse as SessionCreateResponse,
+    type SessionCreateParams as SessionCreateParams,
+  };
+}
diff --git a/tests/api-resources/beta/realtime/sessions.test.ts b/tests/api-resources/beta/realtime/sessions.test.ts
new file mode 100644
index 000000000..0ed998c27
--- /dev/null
+++ b/tests/api-resources/beta/realtime/sessions.test.ts
@@ -0,0 +1,45 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import OpenAI from 'openai';
+import { Response } from 'node-fetch';
+
+const client = new OpenAI({
+  apiKey: 'My API Key',
+  baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010',
+});
+
+describe('resource sessions', () => {
+  test('create: only required params', async () => {
+    const responsePromise = client.beta.realtime.sessions.create({ model: 'gpt-4o-realtime-preview' });
+    const rawResponse = await responsePromise.asResponse();
+    expect(rawResponse).toBeInstanceOf(Response);
+    const response = await responsePromise;
+    expect(response).not.toBeInstanceOf(Response);
+    const dataAndResponse = await responsePromise.withResponse();
+    expect(dataAndResponse.data).toBe(response);
+    expect(dataAndResponse.response).toBe(rawResponse);
+  });
+
+  test('create: required and optional params', async () => {
+    const response = await client.beta.realtime.sessions.create({
+      model: 'gpt-4o-realtime-preview',
+      input_audio_format: 'pcm16',
+      input_audio_transcription: { model: 'model' },
+      instructions: 'instructions',
+      max_response_output_tokens: 0,
+      modalities: ['text'],
+      output_audio_format: 'pcm16',
+      temperature: 0,
+      tool_choice: 'tool_choice',
+      tools: [{ description: 'description', name: 'name', parameters: {}, type: 'function' }],
+      turn_detection: {
+        create_response: true,
+        prefix_padding_ms: 0,
+        silence_duration_ms: 0,
+        threshold: 0,
+        type: 'type',
+      },
+      voice: 'alloy',
+    });
+  });
+});
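
A note on intended usage: the endpoint added above exists so that a server holding a real API key can mint a short-lived credential for a browser. A minimal sketch of that flow, using only what this diff adds; the helper name and surrounding server code are illustrative assumptions, not part of the SDK:

// server-side sketch: mint an ephemeral Realtime key for a browser client.
// Runs with a standard API key; never ship that key to the browser.
import OpenAI from 'openai';

const client = new OpenAI(); // reads OPENAI_API_KEY from the environment

export async function mintRealtimeKey(): Promise<string> {
  const session = await client.beta.realtime.sessions.create({
    model: 'gpt-4o-realtime-preview-2024-12-17',
    modalities: ['audio', 'text'],
    voice: 'verse',
  });
  // client_secret.value is the ephemeral token; per the types above it expires
  // roughly one minute after issuance, so hand it to the browser immediately.
  return session.client_secret?.value ?? '';
}

The browser then authenticates its own Realtime connection with that value in place of a standard API key; the exact WebSocket handshake is documented separately and is not part of this diff.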
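The `tools` and `turn_detection` shapes in `SessionCreateParams` also admit a fuller create call. A sketch exercising them, where the function name and the threshold/silence values are illustrative assumptions only:

const session = await client.beta.realtime.sessions.create({
  model: 'gpt-4o-realtime-preview-2024-12-17',
  instructions: 'Be succinct and friendly.',
  tools: [
    {
      type: 'function',
      name: 'get_weather', // hypothetical tool, for illustration only
      description: 'Look up the current weather for a given city.',
      parameters: {
        type: 'object',
        properties: { city: { type: 'string' } },
        required: ['city'],
      },
    },
  ],
  turn_detection: {
    type: 'server_vad',
    threshold: 0.6, // above the 0.5 default: needs louder audio, better in noise
    silence_duration_ms: 700, // above the 500ms default: fewer mid-pause interruptions
    create_response: true, // auto-respond at end of detected user speech
  },
});

Raising `threshold` and `silence_duration_ms` trades responsiveness for fewer false activations, per the field docs above.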