diff --git a/.stats.yml b/.stats.yml index d223c8f1f..9600edae3 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,2 +1,2 @@ -configured_endpoints: 68 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-02200a58ed631064b6419711da99fefd6e97bdbbeb577a80a1a6e0c8dbcb18f5.yml +configured_endpoints: 69 +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai-b5b0e2c794b012919701c3fd43286af10fa25d33ceb8a881bec2636028f446e0.yml diff --git a/api.md b/api.md index e971585de..10f88e5e0 100644 --- a/api.md +++ b/api.md @@ -212,6 +212,66 @@ Methods: # Beta +## Realtime + +Types: + +- ConversationCreatedEvent +- ConversationItem +- ConversationItemContent +- ConversationItemCreateEvent +- ConversationItemCreatedEvent +- ConversationItemDeleteEvent +- ConversationItemDeletedEvent +- ConversationItemInputAudioTranscriptionCompletedEvent +- ConversationItemInputAudioTranscriptionFailedEvent +- ConversationItemTruncateEvent +- ConversationItemTruncatedEvent +- ErrorEvent +- InputAudioBufferAppendEvent +- InputAudioBufferClearEvent +- InputAudioBufferClearedEvent +- InputAudioBufferCommitEvent +- InputAudioBufferCommittedEvent +- InputAudioBufferSpeechStartedEvent +- InputAudioBufferSpeechStoppedEvent +- RateLimitsUpdatedEvent +- RealtimeClientEvent +- RealtimeResponse +- RealtimeResponseStatus +- RealtimeResponseUsage +- RealtimeServerEvent +- ResponseAudioDeltaEvent +- ResponseAudioDoneEvent +- ResponseAudioTranscriptDeltaEvent +- ResponseAudioTranscriptDoneEvent +- ResponseCancelEvent +- ResponseContentPartAddedEvent +- ResponseContentPartDoneEvent +- ResponseCreateEvent +- ResponseCreatedEvent +- ResponseDoneEvent +- ResponseFunctionCallArgumentsDeltaEvent +- ResponseFunctionCallArgumentsDoneEvent +- ResponseOutputItemAddedEvent +- ResponseOutputItemDoneEvent +- ResponseTextDeltaEvent +- ResponseTextDoneEvent +- SessionCreatedEvent +- SessionUpdateEvent +- SessionUpdatedEvent + +### Sessions + +Types: + +- Session +- SessionCreateResponse + +Methods: + +- client.beta.realtime.sessions.create({ ...params }) -> SessionCreateResponse + ## VectorStores Types: diff --git a/src/resources/beta/beta.ts b/src/resources/beta/beta.ts index 75035d600..48cc92369 100644 --- a/src/resources/beta/beta.ts +++ b/src/resources/beta/beta.ts @@ -20,6 +20,8 @@ import { RunStreamEvent, ThreadStreamEvent, } from './assistants'; +import * as RealtimeAPI from './realtime/realtime'; +import { Realtime } from './realtime/realtime'; import * as ThreadsAPI from './threads/threads'; import { AssistantResponseFormatOption, @@ -54,11 +56,13 @@ import { } from './vector-stores/vector-stores'; export class Beta extends APIResource { + realtime: RealtimeAPI.Realtime = new RealtimeAPI.Realtime(this._client); vectorStores: VectorStoresAPI.VectorStores = new VectorStoresAPI.VectorStores(this._client); assistants: AssistantsAPI.Assistants = new AssistantsAPI.Assistants(this._client); threads: ThreadsAPI.Threads = new ThreadsAPI.Threads(this._client); } +Beta.Realtime = Realtime; Beta.VectorStores = VectorStores; Beta.VectorStoresPage = VectorStoresPage; Beta.Assistants = Assistants; @@ -66,6 +70,8 @@ Beta.AssistantsPage = AssistantsPage; Beta.Threads = Threads; export declare namespace Beta { + export { Realtime as Realtime }; + export { VectorStores as VectorStores, type AutoFileChunkingStrategyParam as AutoFileChunkingStrategyParam, diff --git a/src/resources/beta/index.ts b/src/resources/beta/index.ts index 10212d390..b8db6f846 100644 --- a/src/resources/beta/index.ts +++ 
b/src/resources/beta/index.ts @@ -19,6 +19,7 @@ export { type AssistantListParams, } from './assistants'; export { Beta } from './beta'; +export { Realtime } from './realtime/index'; export { Threads, type AssistantResponseFormatOption, diff --git a/src/resources/beta/realtime/index.ts b/src/resources/beta/realtime/index.ts new file mode 100644 index 000000000..66c3ecaae --- /dev/null +++ b/src/resources/beta/realtime/index.ts @@ -0,0 +1,4 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +export { Realtime } from './realtime'; +export { Sessions, type Session, type SessionCreateResponse, type SessionCreateParams } from './sessions'; diff --git a/src/resources/beta/realtime/realtime.ts b/src/resources/beta/realtime/realtime.ts new file mode 100644 index 000000000..5de06917a --- /dev/null +++ b/src/resources/beta/realtime/realtime.ts @@ -0,0 +1,1904 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +import { APIResource } from '../../../resource'; +import * as RealtimeAPI from './realtime'; +import * as SessionsAPI from './sessions'; +import { + Session as SessionsAPISession, + SessionCreateParams, + SessionCreateResponse, + Sessions, +} from './sessions'; + +export class Realtime extends APIResource { + sessions: SessionsAPI.Sessions = new SessionsAPI.Sessions(this._client); +} + +/** + * Returned when a conversation is created. Emitted right after session creation. + */ +export interface ConversationCreatedEvent { + /** + * The conversation resource. + */ + conversation: ConversationCreatedEvent.Conversation; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `conversation.created`. + */ + type: 'conversation.created'; +} + +export namespace ConversationCreatedEvent { + /** + * The conversation resource. + */ + export interface Conversation { + /** + * The unique ID of the conversation. + */ + id?: string; + + /** + * The object type, must be `realtime.conversation`. + */ + object?: 'realtime.conversation'; + } +} + +/** + * The item to add to the conversation. + */ +export interface ConversationItem { + /** + * The unique ID of the item. This can be generated by the client to help manage + * server-side context, but is not required because the server will generate one if + * not provided. + */ + id?: string; + + /** + * The arguments of the function call (for `function_call` items). + */ + arguments?: string; + + /** + * The ID of the function call (for `function_call` and `function_call_output` + * items). If passed on a `function_call_output` item, the server will check that a + * `function_call` item with the same ID exists in the conversation history. + */ + call_id?: string; + + /** + * The content of the message, applicable for `message` items. + * + * - Message items of role `system` support only `input_text` content + * - Message items of role `user` support `input_text` and `input_audio` content + * - Message items of role `assistant` support `text` content. + */ + content?: Array<ConversationItemContent>; + + /** + * The name of the function being called (for `function_call` items). + */ + name?: string; + + /** + * Identifier for the API object being returned - always `realtime.item`. + */ + object?: 'realtime.item'; + + /** + * The output of the function call (for `function_call_output` items). + */ + output?: string; + + /** + * The role of the message sender (`user`, `assistant`, `system`), only applicable + * for `message` items.
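+ * As an illustrative sketch (not part of the generated docs), a user message
+ * item pairs `role: 'user'` with `type: 'message'` and text content:
+ *
+ * ```ts
+ * const item: ConversationItem = {
+ *   type: 'message',
+ *   role: 'user',
+ *   content: [{ type: 'input_text', text: 'Hello!' }],
+ * };
+ * ```
+ *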
+ */ + role?: 'user' | 'assistant' | 'system'; + + /** + * The status of the item (`completed`, `incomplete`). These have no effect on the + * conversation, but are accepted for consistency with the + * `conversation.item.created` event. + */ + status?: 'completed' | 'incomplete'; + + /** + * The type of the item (`message`, `function_call`, `function_call_output`). + */ + type?: 'message' | 'function_call' | 'function_call_output'; +} + +export interface ConversationItemContent { + /** + * ID of a previous conversation item to reference (for `item_reference` content + * types in `response.create` events). These can reference both client and server + * created items. + */ + id?: string; + + /** + * Base64-encoded audio bytes, used for `input_audio` content type. + */ + audio?: string; + + /** + * The text content, used for `input_text` and `text` content types. + */ + text?: string; + + /** + * The transcript of the audio, used for `input_audio` content type. + */ + transcript?: string; + + /** + * The content type (`input_text`, `input_audio`, `item_reference`, `text`). + */ + type?: 'input_text' | 'input_audio' | 'item_reference' | 'text'; +} + +/** + * Add a new Item to the Conversation's context, including messages, function + * calls, and function call responses. This event can be used both to populate a + * "history" of the conversation and to add new items mid-stream, but has the + * current limitation that it cannot populate assistant audio messages. + * + * If successful, the server will respond with a `conversation.item.created` event, + * otherwise an `error` event will be sent. + */ +export interface ConversationItemCreateEvent { + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The event type, must be `conversation.item.create`. + */ + type: 'conversation.item.create'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * The ID of the preceding item after which the new item will be inserted. If not + * set, the new item will be appended to the end of the conversation. If set, it + * allows an item to be inserted mid-conversation. If the ID cannot be found, an + * error will be returned and the item will not be added. + */ + previous_item_id?: string; +} + +/** + * Returned when a conversation item is created. There are several scenarios that + * produce this event: + * + * - The server is generating a Response, which if successful will produce either + * one or two Items, which will be of type `message` (role `assistant`) or type + * `function_call`. + * - The input audio buffer has been committed, either by the client or the server + * (in `server_vad` mode). The server will take the content of the input audio + * buffer and add it to a new user message Item. + * - The client has sent a `conversation.item.create` event to add a new Item to + * the Conversation. + */ +export interface ConversationItemCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The ID of the preceding item in the Conversation context, allows the client to + * understand the order of the conversation. + */ + previous_item_id: string; + + /** + * The event type, must be `conversation.item.created`. + */ + type: 'conversation.item.created'; +} + +/** + * Send this event when you want to remove any item from the conversation history. 
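+ * A minimal client event, as a sketch (the `item_id` value here is hypothetical):
+ *
+ * ```ts
+ * const event: ConversationItemDeleteEvent = {
+ *   type: 'conversation.item.delete',
+ *   item_id: 'item_abc123', // ID of a previously created item
+ * };
+ * ```
+ *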
+ * The server will respond with a `conversation.item.deleted` event, unless the + * item does not exist in the conversation history, in which case the server will + * respond with an error. + */ +export interface ConversationItemDeleteEvent { + /** + * The ID of the item to delete. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.delete`. + */ + type: 'conversation.item.delete'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an item in the conversation is deleted by the client with a + * `conversation.item.delete` event. This event is used to synchronize the server's + * understanding of the conversation history with the client's view. + */ +export interface ConversationItemDeletedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item that was deleted. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.deleted`. + */ + type: 'conversation.item.deleted'; +} + +/** + * This event is the output of audio transcription for user audio written to the + * user audio buffer. Transcription begins when the input audio buffer is committed + * by the client or server (in `server_vad` mode). Transcription runs + * asynchronously with Response creation, so this event may come before or after + * the Response events. + * + * Realtime API models accept audio natively, and thus input transcription is a + * separate process run on a separate ASR (Automatic Speech Recognition) model, + * currently always `whisper-1`. Thus the transcript may diverge somewhat from the + * model's interpretation, and should be treated as a rough guide. + */ +export interface ConversationItemInputAudioTranscriptionCompletedEvent { + /** + * The index of the content part containing the audio. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item containing the audio. + */ + item_id: string; + + /** + * The transcribed text. + */ + transcript: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.completed`. + */ + type: 'conversation.item.input_audio_transcription.completed'; +} + +/** + * Returned when input audio transcription is configured, and a transcription + * request for a user message failed. These events are separate from other `error` + * events so that the client can identify the related Item. + */ +export interface ConversationItemInputAudioTranscriptionFailedEvent { + /** + * The index of the content part containing the audio. + */ + content_index: number; + + /** + * Details of the transcription error. + */ + error: ConversationItemInputAudioTranscriptionFailedEvent.Error; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.input_audio_transcription.failed`. + */ + type: 'conversation.item.input_audio_transcription.failed'; +} + +export namespace ConversationItemInputAudioTranscriptionFailedEvent { + /** + * Details of the transcription error. + */ + export interface Error { + /** + * Error code, if any. + */ + code?: string; + + /** + * A human-readable error message. + */ + message?: string; + + /** + * Parameter related to the error, if any. + */ + param?: string; + + /** + * The type of error. 
+ */ + type?: string; + } +} + +/** + * Send this event to truncate a previous assistant message’s audio. The server + * will produce audio faster than realtime, so this event is useful when the user + * interrupts to truncate audio that has already been sent to the client but not + * yet played. This will synchronize the server's understanding of the audio with + * the client's playback. + * + * Truncating audio will delete the server-side text transcript to ensure there is + * no text in the context that hasn't been heard by the user. + * + * If successful, the server will respond with a `conversation.item.truncated` + * event. + */ +export interface ConversationItemTruncateEvent { + /** + * Inclusive duration up to which audio is truncated, in milliseconds. If the + * audio_end_ms is greater than the actual audio duration, the server will respond + * with an error. + */ + audio_end_ms: number; + + /** + * The index of the content part to truncate. Set this to 0. + */ + content_index: number; + + /** + * The ID of the assistant message item to truncate. Only assistant message items + * can be truncated. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.truncate`. + */ + type: 'conversation.item.truncate'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an earlier assistant audio message item is truncated by the client + * with a `conversation.item.truncate` event. This event is used to synchronize the + * server's understanding of the audio with the client's playback. + * + * This action will truncate the audio and remove the server-side text transcript + * to ensure there is no text in the context that hasn't been heard by the user. + */ +export interface ConversationItemTruncatedEvent { + /** + * The duration up to which the audio was truncated, in milliseconds. + */ + audio_end_ms: number; + + /** + * The index of the content part that was truncated. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the assistant message item that was truncated. + */ + item_id: string; + + /** + * The event type, must be `conversation.item.truncated`. + */ + type: 'conversation.item.truncated'; +} + +/** + * Returned when an error occurs, which could be a client problem or a server + * problem. Most errors are recoverable and the session will stay open; we + * recommend that implementers monitor and log error messages by default. + */ +export interface ErrorEvent { + /** + * Details of the error. + */ + error: ErrorEvent.Error; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `error`. + */ + type: 'error'; +} + +export namespace ErrorEvent { + /** + * Details of the error. + */ + export interface Error { + /** + * A human-readable error message. + */ + message: string; + + /** + * The type of error (e.g., "invalid_request_error", "server_error"). + */ + type: string; + + /** + * Error code, if any. + */ + code?: string | null; + + /** + * The event_id of the client event that caused the error, if applicable. + */ + event_id?: string | null; + + /** + * Parameter related to the error, if any. + */ + param?: string | null; + } +} + +/** + * Send this event to append audio bytes to the input audio buffer. The audio + * buffer is temporary storage you can write to and later commit.
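+ * For example, a client streaming microphone audio might send small chunks; a
+ * sketch, assuming `chunk` holds raw PCM16 bytes matching the session's
+ * `input_audio_format`:
+ *
+ * ```ts
+ * const event: InputAudioBufferAppendEvent = {
+ *   type: 'input_audio_buffer.append',
+ *   audio: Buffer.from(chunk).toString('base64'), // base64-encode the raw bytes
+ * };
+ * ```
+ *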
In Server VAD + * mode, the audio buffer is used to detect speech and the server will decide when + * to commit. When Server VAD is disabled, you must commit the audio buffer + * manually. + * + * The client may choose how much audio to place in each event up to a maximum of + * 15 MiB; for example, streaming smaller chunks from the client may allow the VAD + * to be more responsive. Unlike most other client events, the server will not send + * a confirmation response to this event. + */ +export interface InputAudioBufferAppendEvent { + /** + * Base64-encoded audio bytes. This must be in the format specified by the + * `input_audio_format` field in the session configuration. + */ + audio: string; + + /** + * The event type, must be `input_audio_buffer.append`. + */ + type: 'input_audio_buffer.append'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Send this event to clear the audio bytes in the buffer. The server will respond + * with an `input_audio_buffer.cleared` event. + */ +export interface InputAudioBufferClearEvent { + /** + * The event type, must be `input_audio_buffer.clear`. + */ + type: 'input_audio_buffer.clear'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when the input audio buffer is cleared by the client with an + * `input_audio_buffer.clear` event. + */ +export interface InputAudioBufferClearedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The event type, must be `input_audio_buffer.cleared`. + */ + type: 'input_audio_buffer.cleared'; +} + +/** + * Send this event to commit the user input audio buffer, which will create a new + * user message item in the conversation. This event will produce an error if the + * input audio buffer is empty. When in Server VAD mode, the client does not need + * to send this event; the server will commit the audio buffer automatically. + * + * Committing the input audio buffer will trigger input audio transcription (if + * enabled in session configuration), but it will not create a response from the + * model. The server will respond with an `input_audio_buffer.committed` event. + */ +export interface InputAudioBufferCommitEvent { + /** + * The event type, must be `input_audio_buffer.commit`. + */ + type: 'input_audio_buffer.commit'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +/** + * Returned when an input audio buffer is committed, either by the client or + * automatically in server VAD mode. The `item_id` property is the ID of the user + * message item that will be created, thus a `conversation.item.created` event will + * also be sent to the client. + */ +export interface InputAudioBufferCommittedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created. + */ + item_id: string; + + /** + * The ID of the preceding item after which the new item will be inserted. + */ + previous_item_id: string; + + /** + * The event type, must be `input_audio_buffer.committed`. + */ + type: 'input_audio_buffer.committed'; +} + +/** + * Sent by the server when in `server_vad` mode to indicate that speech has been + * detected in the audio buffer. This can happen any time audio is added to the + * buffer (unless speech is already detected).
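+ * A sketch of a client-side handler (the `player` playback wrapper here is
+ * hypothetical):
+ *
+ * ```ts
+ * function onSpeechStarted(event: InputAudioBufferSpeechStartedEvent, player: { pause(): void }) {
+ *   player.pause(); // stop assistant audio so the user can barge in
+ * }
+ * ```
+ *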
The client may want to use this + * event to interrupt audio playback or provide visual feedback to the user. + * + * The client should expect to receive an `input_audio_buffer.speech_stopped` event + * when speech stops. The `item_id` property is the ID of the user message item + * that will be created when speech stops and will also be included in the + * `input_audio_buffer.speech_stopped` event (unless the client manually commits + * the audio buffer during VAD activation). + */ +export interface InputAudioBufferSpeechStartedEvent { + /** + * Milliseconds from the start of all audio written to the buffer during the + * session when speech was first detected. This will correspond to the beginning of + * audio sent to the model, and thus includes the `prefix_padding_ms` configured in + * the Session. + */ + audio_start_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created when speech stops. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.speech_started`. + */ + type: 'input_audio_buffer.speech_started'; +} + +/** + * Returned in `server_vad` mode when the server detects the end of speech in the + * audio buffer. The server will also send a `conversation.item.created` event + * with the user message item that is created from the audio buffer. + */ +export interface InputAudioBufferSpeechStoppedEvent { + /** + * Milliseconds since the session started when speech stopped. This will correspond + * to the end of audio sent to the model, and thus includes the + * `min_silence_duration_ms` configured in the Session. + */ + audio_end_ms: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the user message item that will be created. + */ + item_id: string; + + /** + * The event type, must be `input_audio_buffer.speech_stopped`. + */ + type: 'input_audio_buffer.speech_stopped'; +} + +/** + * Emitted at the beginning of a Response to indicate the updated rate limits. When + * a Response is created, some tokens will be "reserved" for the output tokens; the + * rate limits shown here reflect that reservation, which is then adjusted + * accordingly once the Response is completed. + */ +export interface RateLimitsUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * List of rate limit information. + */ + rate_limits: Array<RateLimitsUpdatedEvent.RateLimit>; + + /** + * The event type, must be `rate_limits.updated`. + */ + type: 'rate_limits.updated'; +} + +export namespace RateLimitsUpdatedEvent { + export interface RateLimit { + /** + * The maximum allowed value for the rate limit. + */ + limit?: number; + + /** + * The name of the rate limit (`requests`, `tokens`). + */ + name?: 'requests' | 'tokens'; + + /** + * The remaining value before the limit is reached. + */ + remaining?: number; + + /** + * Seconds until the rate limit resets. + */ + reset_seconds?: number; + } +} + +/** + * All events that the client can send to the Realtime API + */ +export type RealtimeClientEvent = + | SessionUpdateEvent + | InputAudioBufferAppendEvent + | InputAudioBufferCommitEvent + | InputAudioBufferClearEvent + | ConversationItemCreateEvent + | ConversationItemTruncateEvent + | ConversationItemDeleteEvent + | ResponseCreateEvent + | ResponseCancelEvent; + +/** + * The response resource. + */ +export interface RealtimeResponse { + /** + * The unique ID of the response.
+ */ + id?: string; + + /** + * Developer-provided string key-value pairs associated with this response. + */ + metadata?: unknown | null; + + /** + * The object type, must be `realtime.response`. + */ + object?: 'realtime.response'; + + /** + * The list of output items generated by the response. + */ + output?: Array<ConversationItem>; + + /** + * The final status of the response (`completed`, `cancelled`, `failed`, or + * `incomplete`). + */ + status?: 'completed' | 'cancelled' | 'failed' | 'incomplete'; + + /** + * Additional details about the status. + */ + status_details?: RealtimeResponseStatus; + + /** + * Usage statistics for the Response; this will correspond to billing. A Realtime + * API session will maintain a conversation context and append new Items to the + * Conversation, thus output from previous turns (text and audio tokens) will + * become the input for later turns. + */ + usage?: RealtimeResponseUsage; +} + +/** + * Additional details about the status. + */ +export interface RealtimeResponseStatus { + /** + * A description of the error that caused the response to fail, populated when the + * `status` is `failed`. + */ + error?: RealtimeResponseStatus.Error; + + /** + * The reason the Response did not complete. For a `cancelled` Response, one of + * `turn_detected` (the server VAD detected a new start of speech) or + * `client_cancelled` (the client sent a cancel event). For an `incomplete` + * Response, one of `max_output_tokens` or `content_filter` (the server-side safety + * filter activated and cut off the response). + */ + reason?: 'turn_detected' | 'client_cancelled' | 'max_output_tokens' | 'content_filter'; + + /** + * The type of error that caused the response to fail, corresponding with the + * `status` field (`completed`, `cancelled`, `incomplete`, `failed`). + */ + type?: 'completed' | 'cancelled' | 'incomplete' | 'failed'; +} + +export namespace RealtimeResponseStatus { + /** + * A description of the error that caused the response to fail, populated when the + * `status` is `failed`. + */ + export interface Error { + /** + * Error code, if any. + */ + code?: string; + + /** + * The type of error. + */ + type?: string; + } +} + +/** + * Usage statistics for the Response; this will correspond to billing. A Realtime + * API session will maintain a conversation context and append new Items to the + * Conversation, thus output from previous turns (text and audio tokens) will + * become the input for later turns. + */ +export interface RealtimeResponseUsage { + /** + * Details about the input tokens used in the Response. + */ + input_token_details?: RealtimeResponseUsage.InputTokenDetails; + + /** + * The number of input tokens used in the Response, including text and audio + * tokens. + */ + input_tokens?: number; + + /** + * Details about the output tokens used in the Response. + */ + output_token_details?: RealtimeResponseUsage.OutputTokenDetails; + + /** + * The number of output tokens sent in the Response, including text and audio + * tokens. + */ + output_tokens?: number; + + /** + * The total number of tokens in the Response including input and output text and + * audio tokens. + */ + total_tokens?: number; +} + +export namespace RealtimeResponseUsage { + /** + * Details about the input tokens used in the Response. + */ + export interface InputTokenDetails { + /** + * The number of audio tokens used in the Response. + */ + audio_tokens?: number; + + /** + * The number of cached tokens used in the Response.
+ */ + cached_tokens?: number; + + /** + * The number of text tokens used in the Response. + */ + text_tokens?: number; + } + + /** + * Details about the output tokens used in the Response. + */ + export interface OutputTokenDetails { + /** + * The number of audio tokens used in the Response. + */ + audio_tokens?: number; + + /** + * The number of text tokens used in the Response. + */ + text_tokens?: number; + } +} + +/** + * All events that the Realtime API can send back + */ +export type RealtimeServerEvent = + | ErrorEvent + | SessionCreatedEvent + | SessionUpdatedEvent + | ConversationCreatedEvent + | InputAudioBufferCommittedEvent + | InputAudioBufferClearedEvent + | InputAudioBufferSpeechStartedEvent + | InputAudioBufferSpeechStoppedEvent + | ConversationItemCreatedEvent + | ConversationItemInputAudioTranscriptionCompletedEvent + | ConversationItemInputAudioTranscriptionFailedEvent + | ConversationItemTruncatedEvent + | ConversationItemDeletedEvent + | ResponseCreatedEvent + | ResponseDoneEvent + | ResponseOutputItemAddedEvent + | ResponseOutputItemDoneEvent + | ResponseContentPartAddedEvent + | ResponseContentPartDoneEvent + | ResponseTextDeltaEvent + | ResponseTextDoneEvent + | ResponseAudioTranscriptDeltaEvent + | ResponseAudioTranscriptDoneEvent + | ResponseAudioDeltaEvent + | ResponseAudioDoneEvent + | ResponseFunctionCallArgumentsDeltaEvent + | ResponseFunctionCallArgumentsDoneEvent + | RateLimitsUpdatedEvent; + +/** + * Returned when the model-generated audio is updated. + */ +export interface ResponseAudioDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * Base64-encoded audio data delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.audio.delta`. + */ + type: 'response.audio.delta'; +} + +/** + * Returned when the model-generated audio is done. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +export interface ResponseAudioDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.audio.done`. + */ + type: 'response.audio.done'; +} + +/** + * Returned when the model-generated transcription of audio output is updated. + */ +export interface ResponseAudioTranscriptDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The transcript delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.audio_transcript.delta`. 
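+ * Clients typically concatenate these deltas into a running transcript; a
+ * sketch:
+ *
+ * ```ts
+ * let transcript = '';
+ * function onTranscriptDelta(event: ResponseAudioTranscriptDeltaEvent) {
+ *   transcript += event.delta; // complete once response.audio_transcript.done arrives
+ * }
+ * ```
+ *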
+ */ + type: 'response.audio_transcript.delta'; +} + +/** + * Returned when the model-generated transcription of audio output is done + * streaming. Also emitted when a Response is interrupted, incomplete, or + * cancelled. + */ +export interface ResponseAudioTranscriptDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The final transcript of the audio. + */ + transcript: string; + + /** + * The event type, must be `response.audio_transcript.done`. + */ + type: 'response.audio_transcript.done'; +} + +/** + * Send this event to cancel an in-progress response. The server will respond with + * a `response.cancelled` event or an error if there is no response to cancel. + */ +export interface ResponseCancelEvent { + /** + * The event type, must be `response.cancel`. + */ + type: 'response.cancel'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * A specific response ID to cancel - if not provided, will cancel an in-progress + * response in the default conversation. + */ + response_id?: string; +} + +/** + * Returned when a new content part is added to an assistant message item during + * response generation. + */ +export interface ResponseContentPartAddedEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item to which the content part was added. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The content part that was added. + */ + part: ResponseContentPartAddedEvent.Part; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.content_part.added`. + */ + type: 'response.content_part.added'; +} + +export namespace ResponseContentPartAddedEvent { + /** + * The content part that was added. + */ + export interface Part { + /** + * Base64-encoded audio data (if type is "audio"). + */ + audio?: string; + + /** + * The text content (if type is "text"). + */ + text?: string; + + /** + * The transcript of the audio (if type is "audio"). + */ + transcript?: string; + + /** + * The content type ("text", "audio"). + */ + type?: 'text' | 'audio'; + } +} + +/** + * Returned when a content part is done streaming in an assistant message item. + * Also emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseContentPartDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The content part that is done. + */ + part: ResponseContentPartDoneEvent.Part; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.content_part.done`. + */ + type: 'response.content_part.done'; +} + +export namespace ResponseContentPartDoneEvent { + /** + * The content part that is done. 
+ */ + export interface Part { + /** + * Base64-encoded audio data (if type is "audio"). + */ + audio?: string; + + /** + * The text content (if type is "text"). + */ + text?: string; + + /** + * The transcript of the audio (if type is "audio"). + */ + transcript?: string; + + /** + * The content type ("text", "audio"). + */ + type?: 'text' | 'audio'; + } +} + +/** + * This event instructs the server to create a Response, which means triggering + * model inference. When in Server VAD mode, the server will create Responses + * automatically. + * + * A Response will include at least one Item, and may have two, in which case the + * second will be a function call. These Items will be appended to the conversation + * history. + * + * The server will respond with a `response.created` event, events for Items and + * content created, and finally a `response.done` event to indicate the Response is + * complete. + * + * The `response.create` event includes inference configuration like + * `instructions` and `temperature`. These fields will override the Session's + * configuration for this Response only. + */ +export interface ResponseCreateEvent { + /** + * The event type, must be `response.create`. + */ + type: 'response.create'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; + + /** + * Create a new Realtime response with these parameters + */ + response?: ResponseCreateEvent.Response; +} + +export namespace ResponseCreateEvent { + /** + * Create a new Realtime response with these parameters + */ + export interface Response { + /** + * Controls which conversation the response is added to. Currently supports `auto` + * and `none`, with `auto` as the default value. The `auto` value means that the + * contents of the response will be added to the default conversation. Set this to + * `none` to create an out-of-band response which will not add items to the default + * conversation. + */ + conversation?: (string & {}) | 'auto' | 'none'; + + /** + * Input items to include in the prompt for the model. Creates a new context for + * this response, without including the default conversation. Can include + * references to items from the default conversation. + */ + input?: Array<RealtimeAPI.ConversationItem>; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * Set of 16 key-value pairs that can be attached to an object. This can be useful + * for storing additional information about the object in a structured format.
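+ * For example, `metadata: { user_id: 'user_123' }` (a hypothetical key).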
Keys + * can be a maximum of 64 characters long and values can be a maximum of 512 + * characters long. + */ + metadata?: unknown | null; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. Options are `auto`, `none`, `required`, or specify + * a function, like `{"type": "function", "function": {"name": "my_function"}}`. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array<Response.Tool>; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`. + */ + voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; + } + + export namespace Response { + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + } +} + +/** + * Returned when a new Response is created. The first event of response creation, + * where the response is in an initial state of `in_progress`. + */ +export interface ResponseCreatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The response resource. + */ + response: RealtimeResponse; + + /** + * The event type, must be `response.created`. + */ + type: 'response.created'; +} + +/** + * Returned when a Response is done streaming. Always emitted, no matter the final + * state. The Response object included in the `response.done` event will include + * all output Items in the Response but will omit the raw audio data. + */ +export interface ResponseDoneEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The response resource. + */ + response: RealtimeResponse; + + /** + * The event type, must be `response.done`. + */ + type: 'response.done'; +} + +/** + * Returned when the model-generated function call arguments are updated. + */ +export interface ResponseFunctionCallArgumentsDeltaEvent { + /** + * The ID of the function call. + */ + call_id: string; + + /** + * The arguments delta as a JSON string. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the function call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.function_call_arguments.delta`. + */ + type: 'response.function_call_arguments.delta'; +} + +/** + * Returned when the model-generated function call arguments are done streaming. + * Also emitted when a Response is interrupted, incomplete, or cancelled.
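+ * On receiving this event, a client usually parses the final arguments and
+ * returns the result as a `function_call_output` item; a sketch, assuming `send`
+ * transmits a client event over the open connection and `runTool` is your own
+ * function:
+ *
+ * ```ts
+ * function onArgsDone(event: ResponseFunctionCallArgumentsDoneEvent) {
+ *   const args = JSON.parse(event.arguments); // final JSON arguments
+ *   const output = JSON.stringify(runTool(args));
+ *   send({
+ *     type: 'conversation.item.create',
+ *     item: { type: 'function_call_output', call_id: event.call_id, output },
+ *   });
+ *   send({ type: 'response.create' }); // ask the model to continue with the result
+ * }
+ * ```
+ *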
+ */ +export interface ResponseFunctionCallArgumentsDoneEvent { + /** + * The final arguments as a JSON string. + */ + arguments: string; + + /** + * The ID of the function call. + */ + call_id: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the function call item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.function_call_arguments.done`. + */ + type: 'response.function_call_arguments.done'; +} + +/** + * Returned when a new Item is created during Response generation. + */ +export interface ResponseOutputItemAddedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The index of the output item in the Response. + */ + output_index: number; + + /** + * The ID of the Response to which the item belongs. + */ + response_id: string; + + /** + * The event type, must be `response.output_item.added`. + */ + type: 'response.output_item.added'; +} + +/** + * Returned when an Item is done streaming. Also emitted when a Response is + * interrupted, incomplete, or cancelled. + */ +export interface ResponseOutputItemDoneEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The item to add to the conversation. + */ + item: ConversationItem; + + /** + * The index of the output item in the Response. + */ + output_index: number; + + /** + * The ID of the Response to which the item belongs. + */ + response_id: string; + + /** + * The event type, must be `response.output_item.done`. + */ + type: 'response.output_item.done'; +} + +/** + * Returned when the text value of a "text" content part is updated. + */ +export interface ResponseTextDeltaEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The text delta. + */ + delta: string; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The event type, must be `response.text.delta`. + */ + type: 'response.text.delta'; +} + +/** + * Returned when the text value of a "text" content part is done streaming. Also + * emitted when a Response is interrupted, incomplete, or cancelled. + */ +export interface ResponseTextDoneEvent { + /** + * The index of the content part in the item's content array. + */ + content_index: number; + + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * The ID of the item. + */ + item_id: string; + + /** + * The index of the output item in the response. + */ + output_index: number; + + /** + * The ID of the response. + */ + response_id: string; + + /** + * The final text content. + */ + text: string; + + /** + * The event type, must be `response.text.done`. + */ + type: 'response.text.done'; +} + +/** + * Returned when a Session is created. Emitted automatically when a new connection + * is established as the first server event. This event will contain the default + * Session configuration. + */ +export interface SessionCreatedEvent { + /** + * The unique ID of the server event. 
+ */ + event_id: string; + + /** + * Realtime session object configuration. + */ + session: SessionsAPI.Session; + + /** + * The event type, must be `session.created`. + */ + type: 'session.created'; +} + +/** + * Send this event to update the session’s default configuration. The client may + * send this event at any time to update the session configuration, and any field + * may be updated at any time, except for "voice". The server will respond with a + * `session.updated` event that shows the full effective configuration. Only fields + * that are present are updated, thus the correct way to clear a field like + * "instructions" is to pass an empty string. + */ +export interface SessionUpdateEvent { + /** + * Realtime session object configuration. + */ + session: SessionUpdateEvent.Session; + + /** + * The event type, must be `session.update`. + */ + type: 'session.update'; + + /** + * Optional client-generated ID used to identify this event. + */ + event_id?: string; +} + +export namespace SessionUpdateEvent { + /** + * Realtime session object configuration. + */ + export interface Session { + /** + * The Realtime model used for this session. + */ + model: + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + input_audio_transcription?: Session.InputAudioTranscription; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. 
Options are `auto`, `none`, `required`, or specify + * a function. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array<Session.Tool>; + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + turn_detection?: Session.TurnDetection; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`. + */ + voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; + } + + export namespace Session { + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + export interface InputAudioTranscription { + /** + * The model to use for transcription, `whisper-1` is the only currently supported + * model. + */ + model?: string; + } + + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Whether or not to automatically generate a response when VAD is enabled. `true` + * by default. + */ + create_response?: boolean; + + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: string; + } + } +} + +/** + * Returned when a session is updated with a `session.update` event, unless there + * is an error. + */ +export interface SessionUpdatedEvent { + /** + * The unique ID of the server event. + */ + event_id: string; + + /** + * Realtime session object configuration. + */ + session: SessionsAPI.Session; + + /** + * The event type, must be `session.updated`.
+ */ + type: 'session.updated'; +} + +Realtime.Sessions = Sessions; + +export declare namespace Realtime { + export { + Sessions as Sessions, + type SessionsAPISession as Session, + type SessionCreateResponse as SessionCreateResponse, + type SessionCreateParams as SessionCreateParams, + }; +} diff --git a/src/resources/beta/realtime/sessions.ts b/src/resources/beta/realtime/sessions.ts new file mode 100644 index 000000000..c1082d236 --- /dev/null +++ b/src/resources/beta/realtime/sessions.ts @@ -0,0 +1,546 @@ +// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +import { APIResource } from '../../../resource'; +import * as Core from '../../../core'; + +export class Sessions extends APIResource { + /** + * Create an ephemeral API token for use in client-side applications with the + * Realtime API. Can be configured with the same session parameters as the + * `session.update` client event. + * + * It responds with a session object, plus a `client_secret` key which contains a + * usable ephemeral API token that can be used to authenticate browser clients for + * the Realtime API. + */ + create(body: SessionCreateParams, options?: Core.RequestOptions): Core.APIPromise<SessionCreateResponse> { + return this._client.post('/realtime/sessions', { + body, + ...options, + headers: { 'OpenAI-Beta': 'assistants=v2', ...options?.headers }, + }); + } +} + +/** + * Realtime session object configuration. + */ +export interface Session { + /** + * Unique identifier for the session object. + */ + id?: string; + + /** + * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + input_audio_transcription?: Session.InputAudioTranscription; + + /** + * The default system instructions (i.e. system message) prepended to model calls. + * This field allows the client to guide the model on desired responses. The model + * can be instructed on response content and format, (e.g. "be extremely succinct", + * "act friendly", "here are examples of good responses") and on audio behavior + * (e.g. "talk quickly", "inject emotion into your voice", "laugh frequently"). The + * instructions are not guaranteed to be followed by the model, but they provide + * guidance to the model on the desired behavior. + * + * Note that the server sets default instructions which will be used if this field + * is not set and are visible in the `session.created` event at the start of the + * session. + */ + instructions?: string; + + /** + * Maximum number of output tokens for a single assistant response, inclusive of + * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or + * `inf` for the maximum available tokens for a given model. Defaults to `inf`. + */ + max_response_output_tokens?: number | 'inf'; + + /** + * The set of modalities the model can respond with. To disable audio, set this to + * ["text"]. + */ + modalities?: Array<'text' | 'audio'>; + + /** + * The Realtime model used for this session.
+ */ + model?: + | (string & {}) + | 'gpt-4o-realtime-preview' + | 'gpt-4o-realtime-preview-2024-10-01' + | 'gpt-4o-realtime-preview-2024-12-17' + | 'gpt-4o-mini-realtime-preview' + | 'gpt-4o-mini-realtime-preview-2024-12-17'; + + /** + * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`. + */ + output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw'; + + /** + * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8. + */ + temperature?: number; + + /** + * How the model chooses tools. Options are `auto`, `none`, `required`, or specify + * a function. + */ + tool_choice?: string; + + /** + * Tools (functions) available to the model. + */ + tools?: Array<Session.Tool>; + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + turn_detection?: Session.TurnDetection | null; + + /** + * The voice the model uses to respond. Voice cannot be changed during the session + * once the model has responded with audio at least once. Current voice options are + * `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer` and `verse`. + */ + voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse'; +} + +export namespace Session { + /** + * Configuration for input audio transcription, defaults to off and can be set to + * `null` to turn off once on. Input audio transcription is not native to the + * model, since the model consumes audio directly. Transcription runs + * asynchronously through Whisper and should be treated as rough guidance rather + * than the representation understood by the model. + */ + export interface InputAudioTranscription { + /** + * The model to use for transcription, `whisper-1` is the only currently supported + * model. + */ + model?: string; + } + + export interface Tool { + /** + * The description of the function, including guidance on when and how to call it, + * and guidance about what to tell the user when calling (if anything). + */ + description?: string; + + /** + * The name of the function. + */ + name?: string; + + /** + * Parameters of the function in JSON Schema. + */ + parameters?: unknown; + + /** + * The type of the tool, i.e. `function`. + */ + type?: 'function'; + } + + /** + * Configuration for turn detection. Can be set to `null` to turn off. Server VAD + * means that the model will detect the start and end of speech based on audio + * volume and respond at the end of user speech. + */ + export interface TurnDetection { + /** + * Amount of audio to include before the VAD detected speech (in milliseconds). + * Defaults to 300ms. + */ + prefix_padding_ms?: number; + + /** + * Duration of silence to detect speech stop (in milliseconds). Defaults to 500ms. + * With shorter values the model will respond more quickly, but may jump in on + * short pauses from the user. + */ + silence_duration_ms?: number; + + /** + * Activation threshold for VAD (0.0 to 1.0), this defaults to 0.5. A higher + * threshold will require louder audio to activate the model, and thus might + * perform better in noisy environments. + */ + threshold?: number; + + /** + * Type of turn detection, only `server_vad` is currently supported. + */ + type?: 'server_vad'; + } +} + +/** + * A new Realtime session configuration, with an ephemeral key. Default TTL for + * keys is one minute.
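+ * Minting a token server-side might look like this sketch (`client` is an
+ * instance of this SDK's OpenAI client; error handling omitted):
+ *
+ * ```ts
+ * const session = await client.beta.realtime.sessions.create({
+ *   model: 'gpt-4o-realtime-preview-2024-12-17',
+ * });
+ * const ephemeralKey = session.client_secret?.value; // hand this to the browser
+ * ```
+ *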
+ */
+export interface SessionCreateResponse {
+  /**
+   * Ephemeral key returned by the API.
+   */
+  client_secret?: SessionCreateResponse.ClientSecret;
+
+  /**
+   * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  input_audio_format?: string;
+
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  input_audio_transcription?: SessionCreateResponse.InputAudioTranscription;
+
+  /**
+   * The default system instructions (i.e. system message) prepended to model
+   * calls. This field allows the client to guide the model on desired responses.
+   * The model can be instructed on response content and format (e.g. "be extremely
+   * succinct", "act friendly", "here are examples of good responses") and on audio
+   * behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh
+   * frequently"). The instructions are not guaranteed to be followed by the model,
+   * but they provide guidance to the model on the desired behavior.
+   *
+   * Note that the server sets default instructions which will be used if this
+   * field is not set and are visible in the `session.created` event at the start
+   * of the session.
+   */
+  instructions?: string;
+
+  /**
+   * Maximum number of output tokens for a single assistant response, inclusive of
+   * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+   * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+   */
+  max_response_output_tokens?: number | 'inf';
+
+  /**
+   * The set of modalities the model can respond with. To disable audio, set this
+   * to ["text"].
+   */
+  modalities?: Array<'text' | 'audio'>;
+
+  /**
+   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  output_audio_format?: string;
+
+  /**
+   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+   */
+  temperature?: number;
+
+  /**
+   * How the model chooses tools. Options are `auto`, `none`, `required`, or
+   * specify a function.
+   */
+  tool_choice?: string;
+
+  /**
+   * Tools (functions) available to the model.
+   */
+  tools?: Array<SessionCreateResponse.Tool>;
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  turn_detection?: SessionCreateResponse.TurnDetection;
+
+  /**
+   * The voice the model uses to respond. Voice cannot be changed during the
+   * session once the model has responded with audio at least once. Current voice
+   * options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and
+   * `verse`.
+   */
+  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace SessionCreateResponse {
+  /**
+   * Ephemeral key returned by the API.
+   */
+  export interface ClientSecret {
+    /**
+     * Timestamp for when the token expires. Currently, all tokens expire after one
+     * minute.
+     */
+    expires_at?: number;
+
+    /**
+     * Ephemeral key usable in client environments to authenticate connections to
+     * the Realtime API. Use this in client-side environments rather than a
+     * standard API token, which should only be used server-side.
+     */
+    value?: string;
+  }
+
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  export interface InputAudioTranscription {
+    /**
+     * The model to use for transcription; `whisper-1` is the only currently
+     * supported model.
+     */
+    model?: string;
+  }
+
+  export interface Tool {
+    /**
+     * The description of the function, including guidance on when and how to call
+     * it, and guidance about what to tell the user when calling (if anything).
+     */
+    description?: string;
+
+    /**
+     * The name of the function.
+     */
+    name?: string;
+
+    /**
+     * Parameters of the function in JSON Schema.
+     */
+    parameters?: unknown;
+
+    /**
+     * The type of the tool, i.e. `function`.
+     */
+    type?: 'function';
+  }
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  export interface TurnDetection {
+    /**
+     * Amount of audio to include before the VAD detected speech (in milliseconds).
+     * Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
+
+    /**
+     * Duration of silence to detect speech stop (in milliseconds). Defaults to
+     * 500ms. With shorter values the model will respond more quickly, but may jump
+     * in on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+
+    /**
+     * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+     * threshold will require louder audio to activate the model, and thus might
+     * perform better in noisy environments.
+     */
+    threshold?: number;
+
+    /**
+     * Type of turn detection, only `server_vad` is currently supported.
+     */
+    type?: string;
+  }
+}
+
+export interface SessionCreateParams {
+  /**
+   * The Realtime model used for this session.
+   */
+  model:
+    | 'gpt-4o-realtime-preview'
+    | 'gpt-4o-realtime-preview-2024-10-01'
+    | 'gpt-4o-realtime-preview-2024-12-17'
+    | 'gpt-4o-mini-realtime-preview'
+    | 'gpt-4o-mini-realtime-preview-2024-12-17';
+
+  /**
+   * The format of input audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  input_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  input_audio_transcription?: SessionCreateParams.InputAudioTranscription;
+
+  /**
+   * The default system instructions (i.e. system message) prepended to model
+   * calls. This field allows the client to guide the model on desired responses.
+   * The model can be instructed on response content and format (e.g. "be extremely
+   * succinct", "act friendly", "here are examples of good responses") and on audio
+   * behavior (e.g. "talk quickly", "inject emotion into your voice", "laugh
+   * frequently"). The instructions are not guaranteed to be followed by the model,
+   * but they provide guidance to the model on the desired behavior.
+   *
+   * Note that the server sets default instructions which will be used if this
+   * field is not set and are visible in the `session.created` event at the start
+   * of the session.
+   */
+  instructions?: string;
+
+  /**
+   * Maximum number of output tokens for a single assistant response, inclusive of
+   * tool calls. Provide an integer between 1 and 4096 to limit output tokens, or
+   * `inf` for the maximum available tokens for a given model. Defaults to `inf`.
+   */
+  max_response_output_tokens?: number | 'inf';
+
+  /**
+   * The set of modalities the model can respond with. To disable audio, set this
+   * to ["text"].
+   */
+  modalities?: Array<'text' | 'audio'>;
+
+  /**
+   * The format of output audio. Options are `pcm16`, `g711_ulaw`, or `g711_alaw`.
+   */
+  output_audio_format?: 'pcm16' | 'g711_ulaw' | 'g711_alaw';
+
+  /**
+   * Sampling temperature for the model, limited to [0.6, 1.2]. Defaults to 0.8.
+   */
+  temperature?: number;
+
+  /**
+   * How the model chooses tools. Options are `auto`, `none`, `required`, or
+   * specify a function.
+   */
+  tool_choice?: string;
+
+  /**
+   * Tools (functions) available to the model.
+   */
+  tools?: Array<SessionCreateParams.Tool>;
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  turn_detection?: SessionCreateParams.TurnDetection;
+
+  /**
+   * The voice the model uses to respond. Voice cannot be changed during the
+   * session once the model has responded with audio at least once. Current voice
+   * options are `alloy`, `ash`, `ballad`, `coral`, `echo`, `sage`, `shimmer`, and
+   * `verse`.
+   */
+  voice?: 'alloy' | 'ash' | 'ballad' | 'coral' | 'echo' | 'sage' | 'shimmer' | 'verse';
+}
+
+export namespace SessionCreateParams {
+  /**
+   * Configuration for input audio transcription. Defaults to off; once enabled, it
+   * can be set to `null` to turn it off again. Input audio transcription is not
+   * native to the model, since the model consumes audio directly. Transcription
+   * runs asynchronously through Whisper and should be treated as rough guidance
+   * rather than the representation understood by the model.
+   */
+  export interface InputAudioTranscription {
+    /**
+     * The model to use for transcription; `whisper-1` is the only currently
+     * supported model.
+     */
+    model?: string;
+  }
+
+  export interface Tool {
+    /**
+     * The description of the function, including guidance on when and how to call
+     * it, and guidance about what to tell the user when calling (if anything).
+     */
+    description?: string;
+
+    /**
+     * The name of the function.
+     */
+    name?: string;
+
+    /**
+     * Parameters of the function in JSON Schema.
+     */
+    parameters?: unknown;
+
+    /**
+     * The type of the tool, i.e. `function`.
+     */
+    type?: 'function';
+  }
+
+  /**
+   * Configuration for turn detection. Can be set to `null` to turn off. Server VAD
+   * means that the model will detect the start and end of speech based on audio
+   * volume and respond at the end of user speech.
+   */
+  export interface TurnDetection {
+    /**
+     * Whether or not to automatically generate a response when VAD is enabled.
+     * `true` by default.
+     */
+    create_response?: boolean;
+
+    /**
+     * Amount of audio to include before the VAD detected speech (in milliseconds).
+     * Defaults to 300ms.
+     */
+    prefix_padding_ms?: number;
+
+    /**
+     * Duration of silence to detect speech stop (in milliseconds). Defaults to
+     * 500ms.
+     * With shorter values the model will respond more quickly, but may jump in
+     * on short pauses from the user.
+     */
+    silence_duration_ms?: number;
+
+    /**
+     * Activation threshold for VAD (0.0 to 1.0); defaults to 0.5. A higher
+     * threshold will require louder audio to activate the model, and thus might
+     * perform better in noisy environments.
+     */
+    threshold?: number;
+
+    /**
+     * Type of turn detection, only `server_vad` is currently supported.
+     */
+    type?: string;
+  }
+}
+
+export declare namespace Sessions {
+  export {
+    type Session as Session,
+    type SessionCreateResponse as SessionCreateResponse,
+    type SessionCreateParams as SessionCreateParams,
+  };
+}
diff --git a/tests/api-resources/beta/realtime/sessions.test.ts b/tests/api-resources/beta/realtime/sessions.test.ts
new file mode 100644
index 000000000..0ed998c27
--- /dev/null
+++ b/tests/api-resources/beta/realtime/sessions.test.ts
@@ -0,0 +1,45 @@
+// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
+
+import OpenAI from 'openai';
+import { Response } from 'node-fetch';
+
+const client = new OpenAI({
+  apiKey: 'My API Key',
+  baseURL: process.env['TEST_API_BASE_URL'] ?? 'http://127.0.0.1:4010',
+});
+
+describe('resource sessions', () => {
+  test('create: only required params', async () => {
+    const responsePromise = client.beta.realtime.sessions.create({ model: 'gpt-4o-realtime-preview' });
+    const rawResponse = await responsePromise.asResponse();
+    expect(rawResponse).toBeInstanceOf(Response);
+    const response = await responsePromise;
+    expect(response).not.toBeInstanceOf(Response);
+    const dataAndResponse = await responsePromise.withResponse();
+    expect(dataAndResponse.data).toBe(response);
+    expect(dataAndResponse.response).toBe(rawResponse);
+  });
+
+  test('create: required and optional params', async () => {
+    const response = await client.beta.realtime.sessions.create({
+      model: 'gpt-4o-realtime-preview',
+      input_audio_format: 'pcm16',
+      input_audio_transcription: { model: 'model' },
+      instructions: 'instructions',
+      max_response_output_tokens: 0,
+      modalities: ['text'],
+      output_audio_format: 'pcm16',
+      temperature: 0,
+      tool_choice: 'tool_choice',
+      tools: [{ description: 'description', name: 'name', parameters: {}, type: 'function' }],
+      turn_detection: {
+        create_response: true,
+        prefix_padding_ms: 0,
+        silence_duration_ms: 0,
+        threshold: 0,
+        type: 'type',
+      },
+      voice: 'alloy',
+    });
+  });
+});
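
A note on intended usage: the endpoint added above exists so that a server holding a real API key can mint a short-lived credential for a browser. A minimal sketch of that flow, using only what this diff adds; the helper name and surrounding server code are illustrative assumptions, not part of the SDK:

// server-side sketch: mint an ephemeral Realtime key for a browser client.
// Runs with a standard API key; never ship that key to the browser.
import OpenAI from 'openai';

const client = new OpenAI(); // reads OPENAI_API_KEY from the environment

export async function mintRealtimeKey(): Promise<string> {
  const session = await client.beta.realtime.sessions.create({
    model: 'gpt-4o-realtime-preview-2024-12-17',
    modalities: ['audio', 'text'],
    voice: 'verse',
  });
  // client_secret.value is the ephemeral token; per the types above it expires
  // roughly one minute after issuance, so hand it to the browser immediately.
  return session.client_secret?.value ?? '';
}

The browser then authenticates its own Realtime connection with that value in place of a standard API key; the exact WebSocket handshake is documented separately and is not part of this diff.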
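The `tools` and `turn_detection` shapes in `SessionCreateParams` also admit a fuller create call. A sketch exercising them, where the function name and the threshold/silence values are illustrative assumptions only:

const session = await client.beta.realtime.sessions.create({
  model: 'gpt-4o-realtime-preview-2024-12-17',
  instructions: 'Be succinct and friendly.',
  tools: [
    {
      type: 'function',
      name: 'get_weather', // hypothetical tool, for illustration only
      description: 'Look up the current weather for a given city.',
      parameters: {
        type: 'object',
        properties: { city: { type: 'string' } },
        required: ['city'],
      },
    },
  ],
  turn_detection: {
    type: 'server_vad',
    threshold: 0.6, // above the 0.5 default: needs louder audio, better in noise
    silence_duration_ms: 700, // above the 500ms default: fewer mid-pause interruptions
    create_response: true, // auto-respond at end of detected user speech
  },
});

Raising `threshold` and `silence_duration_ms` trades responsiveness for fewer false activations, per the field docs above.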