From b20a9e7b813a12b22202987f512d913433b07fcf Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 08:49:48 +0000 Subject: [PATCH 1/6] chore(internal): detect missing future annotations with ruff --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7197505229..0f773e5fa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -256,6 +256,8 @@ select = [ "B", # remove unused imports "F401", + # check for missing future annotations + "FA102", # bare except statements "E722", # unused arguments @@ -278,6 +280,8 @@ unfixable = [ "T203", ] +extend-safe-fixes = ["FA102"] + [tool.ruff.lint.flake8-tidy-imports.banned-api] "functools.lru_cache".msg = "This function does not retain type information for the wrapped function's arguments; The `lru_cache` function from `_utils` should be used instead" From d5c64434b7b1a500e074913cd87d8a6c09f1c13e Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 15 Oct 2025 20:10:15 +0000 Subject: [PATCH 2/6] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index b5d9915ab8..f6f2dd4705 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 136 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-242fe01994cc3c6c2b1a76d8e1eaa832303fa870e4e40de4a2303ac5ce17369a.yml openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b +config_hash: c6362759d174c1ff65e656b1cfb5efdb From 8cdfd0650ef548178939607eb39adf5df4af5b7d Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:46:54 +0000 Subject: [PATCH 3/6] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index f6f2dd4705..8bd7c486ba 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 136 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-242fe01994cc3c6c2b1a76d8e1eaa832303fa870e4e40de4a2303ac5ce17369a.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: c6362759d174c1ff65e656b1cfb5efdb +config_hash: f0940d0906846178759ef7128e4cb98e From 25cbb74f835206c497df2772205c7b2225951989 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:55:31 +0000 Subject: [PATCH 4/6] feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint --- .stats.yml | 6 +- api.md | 3 + src/openai/resources/audio/transcriptions.py | 278 +++++++++++++++--- src/openai/resources/audio/translations.py | 8 +- .../resources/vector_stores/vector_stores.py | 10 + src/openai/types/audio/__init__.py | 3 + .../audio/transcription_create_params.py | 41 ++- .../audio/transcription_create_response.py | 3 +- .../types/audio/transcription_diarized.py | 63 ++++ .../audio/transcription_diarized_segment.py | 32 ++ 
.../types/audio/transcription_stream_event.py | 4 +- .../audio/transcription_text_delta_event.py | 6 + .../audio/transcription_text_segment_event.py | 27 ++ src/openai/types/audio_model.py | 2 +- src/openai/types/audio_response_format.py | 2 +- .../types/realtime/audio_transcription.py | 15 +- .../realtime/audio_transcription_param.py | 11 +- .../types/vector_store_create_params.py | 6 + .../audio/test_transcriptions.py | 8 + tests/api_resources/test_vector_stores.py | 2 + tests/lib/test_audio.py | 26 +- 21 files changed, 475 insertions(+), 81 deletions(-) create mode 100644 src/openai/types/audio/transcription_diarized.py create mode 100644 src/openai/types/audio/transcription_diarized_segment.py create mode 100644 src/openai/types/audio/transcription_text_segment_event.py diff --git a/.stats.yml b/.stats.yml index 8bd7c486ba..d0ff2b0dc2 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 136 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml -openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: f0940d0906846178759ef7128e4cb98e +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml +openapi_spec_hash: fdc03ed84a65a31b80da909255e53924 +config_hash: 03b48e9b8c7231a902403210dbd7dfa0 diff --git a/api.md b/api.md index 1c170ccdd8..7eb296318f 100644 --- a/api.md +++ b/api.md @@ -171,11 +171,14 @@ Types: ```python from openai.types.audio import ( Transcription, + TranscriptionDiarized, + TranscriptionDiarizedSegment, TranscriptionInclude, TranscriptionSegment, TranscriptionStreamEvent, TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent, + TranscriptionTextSegmentEvent, TranscriptionVerbose, TranscriptionWord, TranscriptionCreateResponse, diff --git a/src/openai/resources/audio/transcriptions.py b/src/openai/resources/audio/transcriptions.py index 1fe8866562..52e44bffb7 100644 --- a/src/openai/resources/audio/transcriptions.py +++ b/src/openai/resources/audio/transcriptions.py @@ -9,8 +9,17 @@ import httpx from ... import _legacy_response -from ...types import AudioResponseFormat -from ..._types import Body, Omit, Query, Headers, NotGiven, FileTypes, omit, not_given +from ..._types import ( + Body, + Omit, + Query, + Headers, + NotGiven, + FileTypes, + SequenceNotStr, + omit, + not_given, +) from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource @@ -23,6 +32,7 @@ from ...types.audio_response_format import AudioResponseFormat from ...types.audio.transcription_include import TranscriptionInclude from ...types.audio.transcription_verbose import TranscriptionVerbose +from ...types.audio.transcription_diarized import TranscriptionDiarized from ...types.audio.transcription_stream_event import TranscriptionStreamEvent from ...types.audio.transcription_create_response import TranscriptionCreateResponse @@ -93,6 +103,66 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> TranscriptionVerbose: ... + model's confidence in the transcription. `logprobs` only works with + response_format set to `json` and only with the models `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. 
+ + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. + + language: The language of the input audio. Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + + prompt: An optional text to guide the model's style or continue a previous audio + segment. The + [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + response_format: The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the + output more random, while lower values like 0.2 will make it more focused and + deterministic. If set to 0, the model will use + [log probability](https://en.wikipedia.org/wiki/Log_probability) to + automatically increase the temperature until certain thresholds are hit. + + timestamp_granularities: The timestamp granularities to populate for this transcription. + `response_format` must be set `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: There + is no additional latency for segment timestamps, but generating word timestamps + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + ) -> Transcription: ... + @overload def create( self, @@ -114,6 +184,27 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> str: ... 
+ @overload + def create( + self, + *, + file: FileTypes, + model: Union[str, AudioModel], + chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, + response_format: Literal["diarized_json"], + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, + language: str | Omit = omit, + temperature: float | Omit = omit, + timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = not_given, + ) -> TranscriptionDiarized: ... + @overload def create( self, @@ -123,6 +214,8 @@ def create( stream: Literal[True], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -144,8 +237,8 @@ def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -160,12 +253,25 @@ def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -174,11 +280,14 @@ def create( prompt: An optional text to guide the model's style or continue a previous audio segment. 
The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -190,7 +299,8 @@ def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -211,6 +321,8 @@ def create( stream: bool, chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -232,8 +344,8 @@ def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -248,12 +360,25 @@ def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. 
Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -262,11 +387,14 @@ def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -278,7 +406,8 @@ def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -298,6 +427,8 @@ def create( model: Union[str, AudioModel], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -310,13 +441,15 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]: + ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]: body = deepcopy_minimal( { "file": file, "model": model, "chunking_strategy": chunking_strategy, "include": include, + "known_speaker_names": known_speaker_names, + "known_speaker_references": known_speaker_references, "language": language, "prompt": prompt, "response_format": response_format, @@ -376,6 +509,8 @@ async def create( model: Union[str, AudioModel], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[Literal["json"], Omit] = omit, @@ -398,19 +533,32 @@ async def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). 
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -419,11 +567,14 @@ async def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -444,7 +595,8 @@ async def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. 
extra_headers: Send extra headers @@ -502,6 +654,8 @@ async def create( stream: Literal[True], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -523,8 +677,8 @@ async def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -539,12 +693,25 @@ async def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -553,11 +720,14 @@ async def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. 
Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -569,7 +739,8 @@ async def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -590,6 +761,8 @@ async def create( stream: bool, chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -611,8 +784,8 @@ async def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -627,12 +800,25 @@ async def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -641,11 +827,14 @@ async def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. 
For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -657,7 +846,8 @@ async def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -677,6 +867,8 @@ async def create( model: Union[str, AudioModel], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -689,13 +881,15 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]: + ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]: body = deepcopy_minimal( { "file": file, "model": model, "chunking_strategy": chunking_strategy, "include": include, + "known_speaker_names": known_speaker_names, + "known_speaker_references": known_speaker_references, "language": language, "prompt": prompt, "response_format": response_format, @@ -764,8 +958,8 @@ def __init__(self, transcriptions: AsyncTranscriptions) -> None: def _get_response_format_type( - response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit, -) -> type[Transcription | TranscriptionVerbose | str]: + response_format: AudioResponseFormat | Omit, +) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]: if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison] return Transcription @@ -773,6 +967,8 @@ def _get_response_format_type( return Transcription elif response_format == "verbose_json": return TranscriptionVerbose + elif response_format == "diarized_json": + return TranscriptionDiarized elif response_format == "srt" or response_format == "text" or response_format == "vtt": return str elif TYPE_CHECKING: # type: ignore[unreachable] diff --git a/src/openai/resources/audio/translations.py b/src/openai/resources/audio/translations.py index a4f844db13..310f901fb3 100644 --- a/src/openai/resources/audio/translations.py +++ b/src/openai/resources/audio/translations.py @@ -349,7 +349,7 @@ def __init__(self, translations: AsyncTranslations) -> None: def _get_response_format_type( - response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit, + response_format: AudioResponseFormat | Omit, ) -> type[Translation | TranslationVerbose | str]: if isinstance(response_format, Omit) or 
response_format is None: # pyright: ignore[reportUnnecessaryComparison] return Translation @@ -360,8 +360,8 @@ def _get_response_format_type( return TranslationVerbose elif response_format == "srt" or response_format == "text" or response_format == "vtt": return str - elif TYPE_CHECKING: # type: ignore[unreachable] + elif TYPE_CHECKING and response_format != "diarized_json": # type: ignore[unreachable] assert_never(response_format) else: - log.warn("Unexpected audio response format: %s", response_format) - return Transcription + log.warning("Unexpected audio response format: %s", response_format) + return Translation diff --git a/src/openai/resources/vector_stores/vector_stores.py b/src/openai/resources/vector_stores/vector_stores.py index 39548936c8..490e3e7fdb 100644 --- a/src/openai/resources/vector_stores/vector_stores.py +++ b/src/openai/resources/vector_stores/vector_stores.py @@ -79,6 +79,7 @@ def create( self, *, chunking_strategy: FileChunkingStrategyParam | Omit = omit, + description: str | Omit = omit, expires_after: vector_store_create_params.ExpiresAfter | Omit = omit, file_ids: SequenceNotStr[str] | Omit = omit, metadata: Optional[Metadata] | Omit = omit, @@ -97,6 +98,9 @@ def create( chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto` strategy. Only applicable if `file_ids` is non-empty. + description: A description for the vector store. Can be used to describe the vector store's + purpose. + expires_after: The expiration policy for a vector store. file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that @@ -126,6 +130,7 @@ def create( body=maybe_transform( { "chunking_strategy": chunking_strategy, + "description": description, "expires_after": expires_after, "file_ids": file_ids, "metadata": metadata, @@ -424,6 +429,7 @@ async def create( self, *, chunking_strategy: FileChunkingStrategyParam | Omit = omit, + description: str | Omit = omit, expires_after: vector_store_create_params.ExpiresAfter | Omit = omit, file_ids: SequenceNotStr[str] | Omit = omit, metadata: Optional[Metadata] | Omit = omit, @@ -442,6 +448,9 @@ async def create( chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto` strategy. Only applicable if `file_ids` is non-empty. + description: A description for the vector store. Can be used to describe the vector store's + purpose. + expires_after: The expiration policy for a vector store. 
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that @@ -471,6 +480,7 @@ async def create( body=await async_maybe_transform( { "chunking_strategy": chunking_strategy, + "description": description, "expires_after": expires_after, "file_ids": file_ids, "metadata": metadata, diff --git a/src/openai/types/audio/__init__.py b/src/openai/types/audio/__init__.py index 396944ee47..2ff2b8185d 100644 --- a/src/openai/types/audio/__init__.py +++ b/src/openai/types/audio/__init__.py @@ -11,10 +11,13 @@ from .transcription_include import TranscriptionInclude as TranscriptionInclude from .transcription_segment import TranscriptionSegment as TranscriptionSegment from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose +from .transcription_diarized import TranscriptionDiarized as TranscriptionDiarized from .translation_create_params import TranslationCreateParams as TranslationCreateParams from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent +from .transcription_diarized_segment import TranscriptionDiarizedSegment as TranscriptionDiarizedSegment from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent +from .transcription_text_segment_event import TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent diff --git a/src/openai/types/audio/transcription_create_params.py b/src/openai/types/audio/transcription_create_params.py index f7abcced87..adaef9f5fe 100644 --- a/src/openai/types/audio/transcription_create_params.py +++ b/src/openai/types/audio/transcription_create_params.py @@ -5,7 +5,7 @@ from typing import List, Union, Optional from typing_extensions import Literal, Required, TypeAlias, TypedDict -from ..._types import FileTypes +from ..._types import FileTypes, SequenceNotStr from ..audio_model import AudioModel from .transcription_include import TranscriptionInclude from ..audio_response_format import AudioResponseFormat @@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): model: Required[Union[str, AudioModel]] """ID of the model to use. - The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` - (which is powered by our open source Whisper V2 model). + The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` + (which is powered by our open source Whisper V2 model), and + `gpt-4o-transcribe-diarize`. """ chunking_strategy: Optional[ChunkingStrategy] @@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): When set to `"auto"`, the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is - transcribed as a single block. + transcribed as a single block. Required when using `gpt-4o-transcribe-diarize` + for inputs longer than 30 seconds. 
""" include: List[TranscriptionInclude] @@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + """ + + known_speaker_names: SequenceNotStr[str] + """ + Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + """ + + known_speaker_references: SequenceNotStr[str] + """ + Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. """ language: str @@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. """ response_format: AudioResponseFormat """ The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. """ temperature: float @@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. """ diff --git a/src/openai/types/audio/transcription_create_response.py b/src/openai/types/audio/transcription_create_response.py index 2f7bed8114..5717a3e701 100644 --- a/src/openai/types/audio/transcription_create_response.py +++ b/src/openai/types/audio/transcription_create_response.py @@ -5,7 +5,8 @@ from .transcription import Transcription from .transcription_verbose import TranscriptionVerbose +from .transcription_diarized import TranscriptionDiarized __all__ = ["TranscriptionCreateResponse"] -TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionVerbose] +TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionDiarized, TranscriptionVerbose] diff --git a/src/openai/types/audio/transcription_diarized.py b/src/openai/types/audio/transcription_diarized.py new file mode 100644 index 0000000000..b7dd2b8ebb --- /dev/null +++ b/src/openai/types/audio/transcription_diarized.py @@ -0,0 +1,63 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import List, Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias + +from ..._utils import PropertyInfo +from ..._models import BaseModel +from .transcription_diarized_segment import TranscriptionDiarizedSegment + +__all__ = ["TranscriptionDiarized", "Usage", "UsageTokens", "UsageTokensInputTokenDetails", "UsageDuration"] + + +class UsageTokensInputTokenDetails(BaseModel): + audio_tokens: Optional[int] = None + """Number of audio tokens billed for this request.""" + + text_tokens: Optional[int] = None + """Number of text tokens billed for this request.""" + + +class UsageTokens(BaseModel): + input_tokens: int + """Number of input tokens billed for this request.""" + + output_tokens: int + """Number of output tokens generated.""" + + total_tokens: int + """Total number of tokens used (input + output).""" + + type: Literal["tokens"] + """The type of the usage object. Always `tokens` for this variant.""" + + input_token_details: Optional[UsageTokensInputTokenDetails] = None + """Details about the input tokens billed for this request.""" + + +class UsageDuration(BaseModel): + seconds: float + """Duration of the input audio in seconds.""" + + type: Literal["duration"] + """The type of the usage object. Always `duration` for this variant.""" + + +Usage: TypeAlias = Annotated[Union[UsageTokens, UsageDuration], PropertyInfo(discriminator="type")] + + +class TranscriptionDiarized(BaseModel): + duration: float + """Duration of the input audio in seconds.""" + + segments: List[TranscriptionDiarizedSegment] + """Segments of the transcript annotated with timestamps and speaker labels.""" + + task: Literal["transcribe"] + """The type of task that was run. Always `transcribe`.""" + + text: str + """The concatenated transcript text for the entire audio input.""" + + usage: Optional[Usage] = None + """Token or duration usage statistics for the request.""" diff --git a/src/openai/types/audio/transcription_diarized_segment.py b/src/openai/types/audio/transcription_diarized_segment.py new file mode 100644 index 0000000000..fe87bb4fb8 --- /dev/null +++ b/src/openai/types/audio/transcription_diarized_segment.py @@ -0,0 +1,32 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TranscriptionDiarizedSegment"] + + +class TranscriptionDiarizedSegment(BaseModel): + id: str + """Unique identifier for the segment.""" + + end: float + """End timestamp of the segment in seconds.""" + + speaker: str + """Speaker label for this segment. + + When known speakers are provided, the label matches `known_speaker_names[]`. + Otherwise speakers are labeled sequentially using capital letters (`A`, `B`, + ...). + """ + + start: float + """Start timestamp of the segment in seconds.""" + + text: str + """Transcript text for this segment.""" + + type: Literal["transcript.text.segment"] + """The type of the segment. 
Always `transcript.text.segment`.""" diff --git a/src/openai/types/audio/transcription_stream_event.py b/src/openai/types/audio/transcription_stream_event.py index 757077a280..77d3a3aeec 100644 --- a/src/openai/types/audio/transcription_stream_event.py +++ b/src/openai/types/audio/transcription_stream_event.py @@ -6,9 +6,11 @@ from ..._utils import PropertyInfo from .transcription_text_done_event import TranscriptionTextDoneEvent from .transcription_text_delta_event import TranscriptionTextDeltaEvent +from .transcription_text_segment_event import TranscriptionTextSegmentEvent __all__ = ["TranscriptionStreamEvent"] TranscriptionStreamEvent: TypeAlias = Annotated[ - Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type") + Union[TranscriptionTextSegmentEvent, TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], + PropertyInfo(discriminator="type"), ] diff --git a/src/openai/types/audio/transcription_text_delta_event.py b/src/openai/types/audio/transcription_text_delta_event.py index 36c52f0623..363b6a6335 100644 --- a/src/openai/types/audio/transcription_text_delta_event.py +++ b/src/openai/types/audio/transcription_text_delta_event.py @@ -33,3 +33,9 @@ class TranscriptionTextDeltaEvent(BaseModel): [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `include[]` parameter set to `logprobs`. """ + + segment_id: Optional[str] = None + """Identifier of the diarized segment that this delta belongs to. + + Only present when using `gpt-4o-transcribe-diarize`. + """ diff --git a/src/openai/types/audio/transcription_text_segment_event.py b/src/openai/types/audio/transcription_text_segment_event.py new file mode 100644 index 0000000000..d4f7664578 --- /dev/null +++ b/src/openai/types/audio/transcription_text_segment_event.py @@ -0,0 +1,27 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TranscriptionTextSegmentEvent"] + + +class TranscriptionTextSegmentEvent(BaseModel): + id: str + """Unique identifier for the segment.""" + + end: float + """End timestamp of the segment in seconds.""" + + speaker: str + """Speaker label for this segment.""" + + start: float + """Start timestamp of the segment in seconds.""" + + text: str + """Transcript text for this segment.""" + + type: Literal["transcript.text.segment"] + """The type of the event. 
Always `transcript.text.segment`.""" diff --git a/src/openai/types/audio_model.py b/src/openai/types/audio_model.py index 4d14d60181..68031a2198 100644 --- a/src/openai/types/audio_model.py +++ b/src/openai/types/audio_model.py @@ -4,4 +4,4 @@ __all__ = ["AudioModel"] -AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"] +AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-4o-transcribe-diarize"] diff --git a/src/openai/types/audio_response_format.py b/src/openai/types/audio_response_format.py index f8c8d45945..1897aaf6ed 100644 --- a/src/openai/types/audio_response_format.py +++ b/src/openai/types/audio_response_format.py @@ -4,4 +4,4 @@ __all__ = ["AudioResponseFormat"] -AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"] +AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt", "diarized_json"] diff --git a/src/openai/types/realtime/audio_transcription.py b/src/openai/types/realtime/audio_transcription.py index cf662b3aa2..3e5c8e0cb4 100644 --- a/src/openai/types/realtime/audio_transcription.py +++ b/src/openai/types/realtime/audio_transcription.py @@ -17,13 +17,14 @@ class AudioTranscription(BaseModel): format will improve accuracy and latency. """ - model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = ( - None - ) + model: Optional[ + Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"] + ] = None """The model to use for transcription. - Current options are `whisper-1`, `gpt-4o-transcribe-latest`, - `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, + and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need + diarization with speaker labels. """ prompt: Optional[str] = None @@ -31,6 +32,6 @@ class AudioTranscription(BaseModel): An optional text to guide the model's style or continue a previous audio segment. For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". + For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + prompt is a free text string, for example "expect words related to technology". """ diff --git a/src/openai/types/realtime/audio_transcription_param.py b/src/openai/types/realtime/audio_transcription_param.py index fb09f105b8..3b65e42c8f 100644 --- a/src/openai/types/realtime/audio_transcription_param.py +++ b/src/openai/types/realtime/audio_transcription_param.py @@ -16,11 +16,12 @@ class AudioTranscriptionParam(TypedDict, total=False): format will improve accuracy and latency. """ - model: Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"] + model: Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"] """The model to use for transcription. - Current options are `whisper-1`, `gpt-4o-transcribe-latest`, - `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, + and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need + diarization with speaker labels. 
""" prompt: str @@ -28,6 +29,6 @@ class AudioTranscriptionParam(TypedDict, total=False): An optional text to guide the model's style or continue a previous audio segment. For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". + For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + prompt is a free text string, for example "expect words related to technology". """ diff --git a/src/openai/types/vector_store_create_params.py b/src/openai/types/vector_store_create_params.py index 945a9886a3..f373a6ed28 100644 --- a/src/openai/types/vector_store_create_params.py +++ b/src/openai/types/vector_store_create_params.py @@ -20,6 +20,12 @@ class VectorStoreCreateParams(TypedDict, total=False): non-empty. """ + description: str + """A description for the vector store. + + Can be used to describe the vector store's purpose. + """ + expires_after: ExpiresAfter """The expiration policy for a vector store.""" diff --git a/tests/api_resources/audio/test_transcriptions.py b/tests/api_resources/audio/test_transcriptions.py index 11cbe2349c..b5eaa4be1f 100644 --- a/tests/api_resources/audio/test_transcriptions.py +++ b/tests/api_resources/audio/test_transcriptions.py @@ -32,6 +32,8 @@ def test_method_create_with_all_params_overload_1(self, client: OpenAI) -> None: model="gpt-4o-transcribe", chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", @@ -84,6 +86,8 @@ def test_method_create_with_all_params_overload_2(self, client: OpenAI) -> None: stream=True, chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", @@ -140,6 +144,8 @@ async def test_method_create_with_all_params_overload_1(self, async_client: Asyn model="gpt-4o-transcribe", chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", @@ -192,6 +198,8 @@ async def test_method_create_with_all_params_overload_2(self, async_client: Asyn stream=True, chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", diff --git a/tests/api_resources/test_vector_stores.py b/tests/api_resources/test_vector_stores.py index dffd2b1d07..cce9c52cea 100644 --- a/tests/api_resources/test_vector_stores.py +++ b/tests/api_resources/test_vector_stores.py @@ -31,6 +31,7 @@ def test_method_create(self, client: OpenAI) -> None: def test_method_create_with_all_params(self, client: OpenAI) -> None: vector_store = client.vector_stores.create( chunking_strategy={"type": "auto"}, + description="description", expires_after={ "anchor": "last_active_at", "days": 1, @@ -299,6 +300,7 @@ async def test_method_create(self, async_client: AsyncOpenAI) -> None: async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None: vector_store = await async_client.vector_stores.create( chunking_strategy={"type": "auto"}, + description="description", expires_after={ "anchor": "last_active_at", "days": 1, diff --git a/tests/lib/test_audio.py b/tests/lib/test_audio.py index 
ff8dba4714..93ed3a33b2 100644 --- a/tests/lib/test_audio.py +++ b/tests/lib/test_audio.py @@ -44,7 +44,8 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_ elif is_literal_type(typ): overload_response_formats.update(get_args(typ)) - src_response_formats: set[str] = set(get_args(AudioResponseFormat)) + # 'diarized_json' applies only to transcriptions, not translations. + src_response_formats: set[str] = set(get_args(AudioResponseFormat)) - {"diarized_json"} diff = src_response_formats.difference(overload_response_formats) assert len(diff) == 0, f"some response format options don't have overloads" @@ -57,18 +58,27 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn overload_response_formats: set[str] = set() for i, overload in enumerate(typing_extensions.get_overloads(fn)): - assert_signatures_in_sync( - fn, - overload, - exclude_params={"response_format", "stream"}, - description=f" for overload {i}", - ) - sig = inspect.signature(overload) typ = evaluate_forwardref( sig.parameters["response_format"].annotation, globalns=sys.modules[fn.__module__].__dict__, ) + + exclude_params = {"response_format", "stream"} + # known_speaker_names and known_speaker_references are only supported by diarized_json + if not (is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}): + exclude_params.update({"known_speaker_names", "known_speaker_references"}) + + # diarized_json does not support these parameters + if is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}: + exclude_params.update({"include", "prompt", "timestamp_granularities"}) + + assert_signatures_in_sync( + fn, + overload, + exclude_params=exclude_params, + description=f" for overload {i}", + ) if is_union_type(typ): for arg in get_args(typ): if not is_literal_type(arg): From e043d7b164c9ee9b34f7029606f08ed60d2d47db Mon Sep 17 00:00:00 2001 From: Alex Chang Date: Thu, 16 Oct 2025 11:08:09 -0400 Subject: [PATCH 5/6] chore: fix dangling comment --- src/openai/resources/audio/transcriptions.py | 60 -------------------- 1 file changed, 60 deletions(-) diff --git a/src/openai/resources/audio/transcriptions.py b/src/openai/resources/audio/transcriptions.py index 52e44bffb7..30ef39deec 100644 --- a/src/openai/resources/audio/transcriptions.py +++ b/src/openai/resources/audio/transcriptions.py @@ -103,66 +103,6 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> TranscriptionVerbose: ... - model's confidence in the transcription. `logprobs` only works with - response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. This field is not supported when using - `gpt-4o-transcribe-diarize`. - - known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in - `known_speaker_references[]`. Each entry should be a short identifier (for - example `customer` or `agent`). Up to 4 speakers are supported. - - known_speaker_references: Optional list of audio samples (as - [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) - that contain known speaker references matching `known_speaker_names[]`. Each - sample must be between 2 and 10 seconds, and can use any of the same input audio - formats supported by `file`. - - language: The language of the input audio. Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. 
- - prompt: An optional text to guide the model's style or continue a previous audio - segment. The - [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. This field is not supported when using - `gpt-4o-transcribe-diarize`. - - response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`, the only supported format is `json`. For - `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and - `diarized_json`, with `diarized_json` required to receive speaker annotations. - - stream: If set to true, the model response data will be streamed to the client as it is - generated using - [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). - See the - [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) - for more information. - - Note: Streaming is not supported for the `whisper-1` model and will be ignored. - - temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the - output more random, while lower values like 0.2 will make it more focused and - deterministic. If set to 0, the model will use - [log probability](https://en.wikipedia.org/wiki/Log_probability) to - automatically increase the temperature until certain thresholds are hit. - - timestamp_granularities: The timestamp granularities to populate for this transcription. - `response_format` must be set `verbose_json` to use timestamp granularities. - Either or both of these options are supported: `word`, or `segment`. Note: There - is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. This option is not available for - `gpt-4o-transcribe-diarize`. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - ) -> Transcription: ... 
- @overload def create( self, From ebf32212f7bf5bec6b24cc2276ac0d9a28dd63bb Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:08:49 +0000 Subject: [PATCH 6/6] release: 2.4.0 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 14 ++++++++++++++ pyproject.toml | 2 +- src/openai/_version.py | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 75ec52fc91..b44b287037 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "2.3.0" + ".": "2.4.0" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 66eec00fea..30f898c23b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## 2.4.0 (2025-10-16) + +Full Changelog: [v2.3.0...v2.4.0](https://github.com/openai/openai-python/compare/v2.3.0...v2.4.0) + +### Features + +* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([bdbe9b8](https://github.com/openai/openai-python/commit/bdbe9b8f440209afa2979db4a9eda9579b3d2550)) + + +### Chores + +* fix dangling comment ([da14e99](https://github.com/openai/openai-python/commit/da14e9960608f7ade6f5cdf91967830c8a6c1657)) +* **internal:** detect missing future annotations with ruff ([2672b8f](https://github.com/openai/openai-python/commit/2672b8f0726300f7c62c356f25545ef0b3c0bb2e)) + ## 2.3.0 (2025-10-10) Full Changelog: [v2.2.0...v2.3.0](https://github.com/openai/openai-python/compare/v2.2.0...v2.3.0) diff --git a/pyproject.toml b/pyproject.toml index 0f773e5fa4..43de9882f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openai" -version = "2.3.0" +version = "2.4.0" description = "The official Python library for the openai API" dynamic = ["readme"] license = "Apache-2.0" diff --git a/src/openai/_version.py b/src/openai/_version.py index f202a6d61c..e09654e09d 100644 --- a/src/openai/_version.py +++ b/src/openai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "openai" -__version__ = "2.3.0" # x-release-please-version +__version__ = "2.4.0" # x-release-please-version
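
Editor's note, not part of the patch series: the diarization feature added in PATCH 4/6 is easiest to understand from a call site. Below is a minimal usage sketch of the new `gpt-4o-transcribe-diarize` support on `client.audio.transcriptions.create()`. The parameter names (`known_speaker_names`, `known_speaker_references`), the `diarized_json` response format, and the data-URL requirement for reference samples all come from the diff above; the fields read off the result (`segments`, `speaker`, `text`) are assumptions inferred from the new `TranscriptionDiarized` / `TranscriptionDiarizedSegment` types, whose fields the diff does not show, so treat them as illustrative only. File names are placeholders.

import base64

from openai import OpenAI

client = OpenAI()


def to_data_url(path: str) -> str:
    # known_speaker_references[] entries are data URLs of 2-10 second audio
    # samples; any input format accepted by `file` can be used (wav assumed here).
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:audio/wav;base64,{encoded}"


with open("meeting.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-transcribe-diarize",
        # `diarized_json` is required to receive speaker annotations; `prompt`,
        # `include=["logprobs"]`, and `timestamp_granularities` are not
        # supported with this model per the docstrings above.
        response_format="diarized_json",
        known_speaker_names=["agent", "customer"],  # up to 4 speakers
        known_speaker_references=[
            to_data_url("agent_sample.wav"),
            to_data_url("customer_sample.wav"),
        ],
    )

# Assumed shape of the diarized result; the actual generated model may differ.
for segment in transcription.segments:
    print(segment.speaker, segment.text)

PATCH 4/6 also adds an optional `description` field to `client.vector_stores.create()`; it is a plain string and needs no special handling beyond passing it alongside the existing parameters.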