From b20a9e7b813a12b22202987f512d913433b07fcf Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Fri, 10 Oct 2025 08:49:48 +0000 Subject: [PATCH 1/6] chore(internal): detect missing future annotations with ruff --- pyproject.toml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 7197505229..0f773e5fa4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -256,6 +256,8 @@ select = [ "B", # remove unused imports "F401", + # check for missing future annotations + "FA102", # bare except statements "E722", # unused arguments @@ -278,6 +280,8 @@ unfixable = [ "T203", ] +extend-safe-fixes = ["FA102"] + [tool.ruff.lint.flake8-tidy-imports.banned-api] "functools.lru_cache".msg = "This function does not retain type information for the wrapped function's arguments; The `lru_cache` function from `_utils` should be used instead" From d5c64434b7b1a500e074913cd87d8a6c09f1c13e Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Wed, 15 Oct 2025 20:10:15 +0000 Subject: [PATCH 2/6] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index b5d9915ab8..f6f2dd4705 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 136 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-e66e85fb7f72477256dca1acb6b23396989d381c5c1b318de564195436bcb93f.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-242fe01994cc3c6c2b1a76d8e1eaa832303fa870e4e40de4a2303ac5ce17369a.yml openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: 89bf7bb3a1f9439ffc6ea0e7dc57ba9b +config_hash: c6362759d174c1ff65e656b1cfb5efdb From 8cdfd0650ef548178939607eb39adf5df4af5b7d Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:46:54 +0000 Subject: [PATCH 3/6] codegen metadata --- .stats.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.stats.yml b/.stats.yml index f6f2dd4705..8bd7c486ba 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 136 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-242fe01994cc3c6c2b1a76d8e1eaa832303fa870e4e40de4a2303ac5ce17369a.yml +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: c6362759d174c1ff65e656b1cfb5efdb +config_hash: f0940d0906846178759ef7128e4cb98e From 25cbb74f835206c497df2772205c7b2225951989 Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 14:55:31 +0000 Subject: [PATCH 4/6] feat(api): Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint --- .stats.yml | 6 +- api.md | 3 + src/openai/resources/audio/transcriptions.py | 278 +++++++++++++++--- src/openai/resources/audio/translations.py | 8 +- .../resources/vector_stores/vector_stores.py | 10 + src/openai/types/audio/__init__.py | 3 + .../audio/transcription_create_params.py | 41 ++- .../audio/transcription_create_response.py | 3 +- .../types/audio/transcription_diarized.py | 63 ++++ .../audio/transcription_diarized_segment.py | 32 ++ 
.../types/audio/transcription_stream_event.py | 4 +- .../audio/transcription_text_delta_event.py | 6 + .../audio/transcription_text_segment_event.py | 27 ++ src/openai/types/audio_model.py | 2 +- src/openai/types/audio_response_format.py | 2 +- .../types/realtime/audio_transcription.py | 15 +- .../realtime/audio_transcription_param.py | 11 +- .../types/vector_store_create_params.py | 6 + .../audio/test_transcriptions.py | 8 + tests/api_resources/test_vector_stores.py | 2 + tests/lib/test_audio.py | 26 +- 21 files changed, 475 insertions(+), 81 deletions(-) create mode 100644 src/openai/types/audio/transcription_diarized.py create mode 100644 src/openai/types/audio/transcription_diarized_segment.py create mode 100644 src/openai/types/audio/transcription_text_segment_event.py diff --git a/.stats.yml b/.stats.yml index 8bd7c486ba..d0ff2b0dc2 100644 --- a/.stats.yml +++ b/.stats.yml @@ -1,4 +1,4 @@ configured_endpoints: 136 -openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-11d308a9ef78ad01aa11c880a084a3982276800d7994db3f454aa515474977d7.yml -openapi_spec_hash: 0a4bbb5aa0ae532a072bd6b3854e70b1 -config_hash: f0940d0906846178759ef7128e4cb98e +openapi_spec_url: https://storage.googleapis.com/stainless-sdk-openapi-specs/openai%2Fopenai-104cced8f4c7436a76eea02e26307828166405ccfb296faffb008b72772c11a7.yml +openapi_spec_hash: fdc03ed84a65a31b80da909255e53924 +config_hash: 03b48e9b8c7231a902403210dbd7dfa0 diff --git a/api.md b/api.md index 1c170ccdd8..7eb296318f 100644 --- a/api.md +++ b/api.md @@ -171,11 +171,14 @@ Types: ```python from openai.types.audio import ( Transcription, + TranscriptionDiarized, + TranscriptionDiarizedSegment, TranscriptionInclude, TranscriptionSegment, TranscriptionStreamEvent, TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent, + TranscriptionTextSegmentEvent, TranscriptionVerbose, TranscriptionWord, TranscriptionCreateResponse, diff --git a/src/openai/resources/audio/transcriptions.py b/src/openai/resources/audio/transcriptions.py index 1fe8866562..52e44bffb7 100644 --- a/src/openai/resources/audio/transcriptions.py +++ b/src/openai/resources/audio/transcriptions.py @@ -9,8 +9,17 @@ import httpx from ... import _legacy_response -from ...types import AudioResponseFormat -from ..._types import Body, Omit, Query, Headers, NotGiven, FileTypes, omit, not_given +from ..._types import ( + Body, + Omit, + Query, + Headers, + NotGiven, + FileTypes, + SequenceNotStr, + omit, + not_given, +) from ..._utils import extract_files, required_args, maybe_transform, deepcopy_minimal, async_maybe_transform from ..._compat import cached_property from ..._resource import SyncAPIResource, AsyncAPIResource @@ -23,6 +32,7 @@ from ...types.audio_response_format import AudioResponseFormat from ...types.audio.transcription_include import TranscriptionInclude from ...types.audio.transcription_verbose import TranscriptionVerbose +from ...types.audio.transcription_diarized import TranscriptionDiarized from ...types.audio.transcription_stream_event import TranscriptionStreamEvent from ...types.audio.transcription_create_response import TranscriptionCreateResponse @@ -93,6 +103,66 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> TranscriptionVerbose: ... + model's confidence in the transcription. `logprobs` only works with + response_format set to `json` and only with the models `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. 
+ + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. + + language: The language of the input audio. Supplying the input language in + [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) + format will improve accuracy and latency. + + prompt: An optional text to guide the model's style or continue a previous audio + segment. The + [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + response_format: The format of the output, in one of these options: `json`, `text`, `srt`, + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. + + stream: If set to true, the model response data will be streamed to the client as it is + generated using + [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). + See the + [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) + for more information. + + Note: Streaming is not supported for the `whisper-1` model and will be ignored. + + temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the + output more random, while lower values like 0.2 will make it more focused and + deterministic. If set to 0, the model will use + [log probability](https://en.wikipedia.org/wiki/Log_probability) to + automatically increase the temperature until certain thresholds are hit. + + timestamp_granularities: The timestamp granularities to populate for this transcription. + `response_format` must be set `verbose_json` to use timestamp granularities. + Either or both of these options are supported: `word`, or `segment`. Note: There + is no additional latency for segment timestamps, but generating word timestamps + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. + + extra_headers: Send extra headers + + extra_query: Add additional query parameters to the request + + extra_body: Add additional JSON properties to the request + ) -> Transcription: ... + @overload def create( self, @@ -114,6 +184,27 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> str: ... 
+ @overload + def create( + self, + *, + file: FileTypes, + model: Union[str, AudioModel], + chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, + response_format: Literal["diarized_json"], + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, + language: str | Omit = omit, + temperature: float | Omit = omit, + timestamp_granularities: List[Literal["word", "segment"]] | Omit = omit, + # Use the following arguments if you need to pass additional parameters to the API that aren't available via kwargs. + # The extra values given here take precedence over values defined on the client or passed to this method. + extra_headers: Headers | None = None, + extra_query: Query | None = None, + extra_body: Body | None = None, + timeout: float | httpx.Timeout | None | NotGiven = not_given, + ) -> TranscriptionDiarized: ... + @overload def create( self, @@ -123,6 +214,8 @@ def create( stream: Literal[True], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -144,8 +237,8 @@ def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -160,12 +253,25 @@ def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -174,11 +280,14 @@ def create( prompt: An optional text to guide the model's style or continue a previous audio segment. 
The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -190,7 +299,8 @@ def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -211,6 +321,8 @@ def create( stream: bool, chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -232,8 +344,8 @@ def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -248,12 +360,25 @@ def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. 
Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -262,11 +387,14 @@ def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -278,7 +406,8 @@ def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -298,6 +427,8 @@ def create( model: Union[str, AudioModel], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -310,13 +441,15 @@ def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> str | Transcription | TranscriptionVerbose | Stream[TranscriptionStreamEvent]: + ) -> str | Transcription | TranscriptionDiarized | TranscriptionVerbose | Stream[TranscriptionStreamEvent]: body = deepcopy_minimal( { "file": file, "model": model, "chunking_strategy": chunking_strategy, "include": include, + "known_speaker_names": known_speaker_names, + "known_speaker_references": known_speaker_references, "language": language, "prompt": prompt, "response_format": response_format, @@ -376,6 +509,8 @@ async def create( model: Union[str, AudioModel], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[Literal["json"], Omit] = omit, @@ -398,19 +533,32 @@ async def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). 
+ `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. chunking_strategy: Controls how the audio is cut into chunks. When set to `"auto"`, the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -419,11 +567,14 @@ async def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -444,7 +595,8 @@ async def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. 
extra_headers: Send extra headers @@ -502,6 +654,8 @@ async def create( stream: Literal[True], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -523,8 +677,8 @@ async def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -539,12 +693,25 @@ async def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -553,11 +720,14 @@ async def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. 
Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -569,7 +739,8 @@ async def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -590,6 +761,8 @@ async def create( stream: bool, chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -611,8 +784,8 @@ async def create( flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm. model: ID of the model to use. The options are `gpt-4o-transcribe`, - `gpt-4o-mini-transcribe`, and `whisper-1` (which is powered by our open source - Whisper V2 model). + `gpt-4o-mini-transcribe`, `whisper-1` (which is powered by our open source + Whisper V2 model), and `gpt-4o-transcribe-diarize`. stream: If set to true, the model response data will be streamed to the client as it is generated using @@ -627,12 +800,25 @@ async def create( first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is transcribed as a single block. + Required when using `gpt-4o-transcribe-diarize` for inputs longer than 30 + seconds. include: Additional information to include in the transcription response. `logprobs` will return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + + known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + + known_speaker_references: Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. language: The language of the input audio. Supplying the input language in [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) @@ -641,11 +827,14 @@ async def create( prompt: An optional text to guide the model's style or continue a previous audio segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. 
For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the output more random, while lower values like 0.2 will make it more focused and @@ -657,7 +846,8 @@ async def create( `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. extra_headers: Send extra headers @@ -677,6 +867,8 @@ async def create( model: Union[str, AudioModel], chunking_strategy: Optional[transcription_create_params.ChunkingStrategy] | Omit = omit, include: List[TranscriptionInclude] | Omit = omit, + known_speaker_names: SequenceNotStr[str] | Omit = omit, + known_speaker_references: SequenceNotStr[str] | Omit = omit, language: str | Omit = omit, prompt: str | Omit = omit, response_format: Union[AudioResponseFormat, Omit] = omit, @@ -689,13 +881,15 @@ async def create( extra_query: Query | None = None, extra_body: Body | None = None, timeout: float | httpx.Timeout | None | NotGiven = not_given, - ) -> Transcription | TranscriptionVerbose | str | AsyncStream[TranscriptionStreamEvent]: + ) -> Transcription | TranscriptionVerbose | TranscriptionDiarized | str | AsyncStream[TranscriptionStreamEvent]: body = deepcopy_minimal( { "file": file, "model": model, "chunking_strategy": chunking_strategy, "include": include, + "known_speaker_names": known_speaker_names, + "known_speaker_references": known_speaker_references, "language": language, "prompt": prompt, "response_format": response_format, @@ -764,8 +958,8 @@ def __init__(self, transcriptions: AsyncTranscriptions) -> None: def _get_response_format_type( - response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit, -) -> type[Transcription | TranscriptionVerbose | str]: + response_format: AudioResponseFormat | Omit, +) -> type[Transcription | TranscriptionVerbose | TranscriptionDiarized | str]: if isinstance(response_format, Omit) or response_format is None: # pyright: ignore[reportUnnecessaryComparison] return Transcription @@ -773,6 +967,8 @@ def _get_response_format_type( return Transcription elif response_format == "verbose_json": return TranscriptionVerbose + elif response_format == "diarized_json": + return TranscriptionDiarized elif response_format == "srt" or response_format == "text" or response_format == "vtt": return str elif TYPE_CHECKING: # type: ignore[unreachable] diff --git a/src/openai/resources/audio/translations.py b/src/openai/resources/audio/translations.py index a4f844db13..310f901fb3 100644 --- a/src/openai/resources/audio/translations.py +++ b/src/openai/resources/audio/translations.py @@ -349,7 +349,7 @@ def __init__(self, translations: AsyncTranslations) -> None: def _get_response_format_type( - response_format: Literal["json", "text", "srt", "verbose_json", "vtt"] | Omit, + response_format: AudioResponseFormat | Omit, ) -> type[Translation | TranslationVerbose | str]: if isinstance(response_format, Omit) or 
response_format is None: # pyright: ignore[reportUnnecessaryComparison] return Translation @@ -360,8 +360,8 @@ def _get_response_format_type( return TranslationVerbose elif response_format == "srt" or response_format == "text" or response_format == "vtt": return str - elif TYPE_CHECKING: # type: ignore[unreachable] + elif TYPE_CHECKING and response_format != "diarized_json": # type: ignore[unreachable] assert_never(response_format) else: - log.warn("Unexpected audio response format: %s", response_format) - return Transcription + log.warning("Unexpected audio response format: %s", response_format) + return Translation diff --git a/src/openai/resources/vector_stores/vector_stores.py b/src/openai/resources/vector_stores/vector_stores.py index 39548936c8..490e3e7fdb 100644 --- a/src/openai/resources/vector_stores/vector_stores.py +++ b/src/openai/resources/vector_stores/vector_stores.py @@ -79,6 +79,7 @@ def create( self, *, chunking_strategy: FileChunkingStrategyParam | Omit = omit, + description: str | Omit = omit, expires_after: vector_store_create_params.ExpiresAfter | Omit = omit, file_ids: SequenceNotStr[str] | Omit = omit, metadata: Optional[Metadata] | Omit = omit, @@ -97,6 +98,9 @@ def create( chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto` strategy. Only applicable if `file_ids` is non-empty. + description: A description for the vector store. Can be used to describe the vector store's + purpose. + expires_after: The expiration policy for a vector store. file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that @@ -126,6 +130,7 @@ def create( body=maybe_transform( { "chunking_strategy": chunking_strategy, + "description": description, "expires_after": expires_after, "file_ids": file_ids, "metadata": metadata, @@ -424,6 +429,7 @@ async def create( self, *, chunking_strategy: FileChunkingStrategyParam | Omit = omit, + description: str | Omit = omit, expires_after: vector_store_create_params.ExpiresAfter | Omit = omit, file_ids: SequenceNotStr[str] | Omit = omit, metadata: Optional[Metadata] | Omit = omit, @@ -442,6 +448,9 @@ async def create( chunking_strategy: The chunking strategy used to chunk the file(s). If not set, will use the `auto` strategy. Only applicable if `file_ids` is non-empty. + description: A description for the vector store. Can be used to describe the vector store's + purpose. + expires_after: The expiration policy for a vector store. 
file_ids: A list of [File](https://platform.openai.com/docs/api-reference/files) IDs that @@ -471,6 +480,7 @@ async def create( body=await async_maybe_transform( { "chunking_strategy": chunking_strategy, + "description": description, "expires_after": expires_after, "file_ids": file_ids, "metadata": metadata, diff --git a/src/openai/types/audio/__init__.py b/src/openai/types/audio/__init__.py index 396944ee47..2ff2b8185d 100644 --- a/src/openai/types/audio/__init__.py +++ b/src/openai/types/audio/__init__.py @@ -11,10 +11,13 @@ from .transcription_include import TranscriptionInclude as TranscriptionInclude from .transcription_segment import TranscriptionSegment as TranscriptionSegment from .transcription_verbose import TranscriptionVerbose as TranscriptionVerbose +from .transcription_diarized import TranscriptionDiarized as TranscriptionDiarized from .translation_create_params import TranslationCreateParams as TranslationCreateParams from .transcription_stream_event import TranscriptionStreamEvent as TranscriptionStreamEvent from .transcription_create_params import TranscriptionCreateParams as TranscriptionCreateParams from .translation_create_response import TranslationCreateResponse as TranslationCreateResponse from .transcription_create_response import TranscriptionCreateResponse as TranscriptionCreateResponse from .transcription_text_done_event import TranscriptionTextDoneEvent as TranscriptionTextDoneEvent +from .transcription_diarized_segment import TranscriptionDiarizedSegment as TranscriptionDiarizedSegment from .transcription_text_delta_event import TranscriptionTextDeltaEvent as TranscriptionTextDeltaEvent +from .transcription_text_segment_event import TranscriptionTextSegmentEvent as TranscriptionTextSegmentEvent diff --git a/src/openai/types/audio/transcription_create_params.py b/src/openai/types/audio/transcription_create_params.py index f7abcced87..adaef9f5fe 100644 --- a/src/openai/types/audio/transcription_create_params.py +++ b/src/openai/types/audio/transcription_create_params.py @@ -5,7 +5,7 @@ from typing import List, Union, Optional from typing_extensions import Literal, Required, TypeAlias, TypedDict -from ..._types import FileTypes +from ..._types import FileTypes, SequenceNotStr from ..audio_model import AudioModel from .transcription_include import TranscriptionInclude from ..audio_response_format import AudioResponseFormat @@ -29,8 +29,9 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): model: Required[Union[str, AudioModel]] """ID of the model to use. - The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, and `whisper-1` - (which is powered by our open source Whisper V2 model). + The options are `gpt-4o-transcribe`, `gpt-4o-mini-transcribe`, `whisper-1` + (which is powered by our open source Whisper V2 model), and + `gpt-4o-transcribe-diarize`. """ chunking_strategy: Optional[ChunkingStrategy] @@ -39,7 +40,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): When set to `"auto"`, the server first normalizes loudness and then uses voice activity detection (VAD) to choose boundaries. `server_vad` object can be provided to tweak VAD detection parameters manually. If unset, the audio is - transcribed as a single block. + transcribed as a single block. Required when using `gpt-4o-transcribe-diarize` + for inputs longer than 30 seconds. 
""" include: List[TranscriptionInclude] @@ -48,7 +50,24 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): return the log probabilities of the tokens in the response to understand the model's confidence in the transcription. `logprobs` only works with response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. + `gpt-4o-mini-transcribe`. This field is not supported when using + `gpt-4o-transcribe-diarize`. + """ + + known_speaker_names: SequenceNotStr[str] + """ + Optional list of speaker names that correspond to the audio samples provided in + `known_speaker_references[]`. Each entry should be a short identifier (for + example `customer` or `agent`). Up to 4 speakers are supported. + """ + + known_speaker_references: SequenceNotStr[str] + """ + Optional list of audio samples (as + [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) + that contain known speaker references matching `known_speaker_names[]`. Each + sample must be between 2 and 10 seconds, and can use any of the same input audio + formats supported by `file`. """ language: str @@ -64,14 +83,17 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): segment. The [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. + should match the audio language. This field is not supported when using + `gpt-4o-transcribe-diarize`. """ response_format: AudioResponseFormat """ The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, or `vtt`. For `gpt-4o-transcribe` and `gpt-4o-mini-transcribe`, - the only supported format is `json`. + `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and + `gpt-4o-mini-transcribe`, the only supported format is `json`. For + `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and + `diarized_json`, with `diarized_json` required to receive speaker annotations. """ temperature: float @@ -89,7 +111,8 @@ class TranscriptionCreateParamsBase(TypedDict, total=False): `response_format` must be set `verbose_json` to use timestamp granularities. Either or both of these options are supported: `word`, or `segment`. Note: There is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. + incurs additional latency. This option is not available for + `gpt-4o-transcribe-diarize`. """ diff --git a/src/openai/types/audio/transcription_create_response.py b/src/openai/types/audio/transcription_create_response.py index 2f7bed8114..5717a3e701 100644 --- a/src/openai/types/audio/transcription_create_response.py +++ b/src/openai/types/audio/transcription_create_response.py @@ -5,7 +5,8 @@ from .transcription import Transcription from .transcription_verbose import TranscriptionVerbose +from .transcription_diarized import TranscriptionDiarized __all__ = ["TranscriptionCreateResponse"] -TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionVerbose] +TranscriptionCreateResponse: TypeAlias = Union[Transcription, TranscriptionDiarized, TranscriptionVerbose] diff --git a/src/openai/types/audio/transcription_diarized.py b/src/openai/types/audio/transcription_diarized.py new file mode 100644 index 0000000000..b7dd2b8ebb --- /dev/null +++ b/src/openai/types/audio/transcription_diarized.py @@ -0,0 +1,63 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. 
+ +from typing import List, Union, Optional +from typing_extensions import Literal, Annotated, TypeAlias + +from ..._utils import PropertyInfo +from ..._models import BaseModel +from .transcription_diarized_segment import TranscriptionDiarizedSegment + +__all__ = ["TranscriptionDiarized", "Usage", "UsageTokens", "UsageTokensInputTokenDetails", "UsageDuration"] + + +class UsageTokensInputTokenDetails(BaseModel): + audio_tokens: Optional[int] = None + """Number of audio tokens billed for this request.""" + + text_tokens: Optional[int] = None + """Number of text tokens billed for this request.""" + + +class UsageTokens(BaseModel): + input_tokens: int + """Number of input tokens billed for this request.""" + + output_tokens: int + """Number of output tokens generated.""" + + total_tokens: int + """Total number of tokens used (input + output).""" + + type: Literal["tokens"] + """The type of the usage object. Always `tokens` for this variant.""" + + input_token_details: Optional[UsageTokensInputTokenDetails] = None + """Details about the input tokens billed for this request.""" + + +class UsageDuration(BaseModel): + seconds: float + """Duration of the input audio in seconds.""" + + type: Literal["duration"] + """The type of the usage object. Always `duration` for this variant.""" + + +Usage: TypeAlias = Annotated[Union[UsageTokens, UsageDuration], PropertyInfo(discriminator="type")] + + +class TranscriptionDiarized(BaseModel): + duration: float + """Duration of the input audio in seconds.""" + + segments: List[TranscriptionDiarizedSegment] + """Segments of the transcript annotated with timestamps and speaker labels.""" + + task: Literal["transcribe"] + """The type of task that was run. Always `transcribe`.""" + + text: str + """The concatenated transcript text for the entire audio input.""" + + usage: Optional[Usage] = None + """Token or duration usage statistics for the request.""" diff --git a/src/openai/types/audio/transcription_diarized_segment.py b/src/openai/types/audio/transcription_diarized_segment.py new file mode 100644 index 0000000000..fe87bb4fb8 --- /dev/null +++ b/src/openai/types/audio/transcription_diarized_segment.py @@ -0,0 +1,32 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TranscriptionDiarizedSegment"] + + +class TranscriptionDiarizedSegment(BaseModel): + id: str + """Unique identifier for the segment.""" + + end: float + """End timestamp of the segment in seconds.""" + + speaker: str + """Speaker label for this segment. + + When known speakers are provided, the label matches `known_speaker_names[]`. + Otherwise speakers are labeled sequentially using capital letters (`A`, `B`, + ...). + """ + + start: float + """Start timestamp of the segment in seconds.""" + + text: str + """Transcript text for this segment.""" + + type: Literal["transcript.text.segment"] + """The type of the segment. 
Always `transcript.text.segment`.""" diff --git a/src/openai/types/audio/transcription_stream_event.py b/src/openai/types/audio/transcription_stream_event.py index 757077a280..77d3a3aeec 100644 --- a/src/openai/types/audio/transcription_stream_event.py +++ b/src/openai/types/audio/transcription_stream_event.py @@ -6,9 +6,11 @@ from ..._utils import PropertyInfo from .transcription_text_done_event import TranscriptionTextDoneEvent from .transcription_text_delta_event import TranscriptionTextDeltaEvent +from .transcription_text_segment_event import TranscriptionTextSegmentEvent __all__ = ["TranscriptionStreamEvent"] TranscriptionStreamEvent: TypeAlias = Annotated[ - Union[TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], PropertyInfo(discriminator="type") + Union[TranscriptionTextSegmentEvent, TranscriptionTextDeltaEvent, TranscriptionTextDoneEvent], + PropertyInfo(discriminator="type"), ] diff --git a/src/openai/types/audio/transcription_text_delta_event.py b/src/openai/types/audio/transcription_text_delta_event.py index 36c52f0623..363b6a6335 100644 --- a/src/openai/types/audio/transcription_text_delta_event.py +++ b/src/openai/types/audio/transcription_text_delta_event.py @@ -33,3 +33,9 @@ class TranscriptionTextDeltaEvent(BaseModel): [create a transcription](https://platform.openai.com/docs/api-reference/audio/create-transcription) with the `include[]` parameter set to `logprobs`. """ + + segment_id: Optional[str] = None + """Identifier of the diarized segment that this delta belongs to. + + Only present when using `gpt-4o-transcribe-diarize`. + """ diff --git a/src/openai/types/audio/transcription_text_segment_event.py b/src/openai/types/audio/transcription_text_segment_event.py new file mode 100644 index 0000000000..d4f7664578 --- /dev/null +++ b/src/openai/types/audio/transcription_text_segment_event.py @@ -0,0 +1,27 @@ +# File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. + +from typing_extensions import Literal + +from ..._models import BaseModel + +__all__ = ["TranscriptionTextSegmentEvent"] + + +class TranscriptionTextSegmentEvent(BaseModel): + id: str + """Unique identifier for the segment.""" + + end: float + """End timestamp of the segment in seconds.""" + + speaker: str + """Speaker label for this segment.""" + + start: float + """Start timestamp of the segment in seconds.""" + + text: str + """Transcript text for this segment.""" + + type: Literal["transcript.text.segment"] + """The type of the event. 
Always `transcript.text.segment`.""" diff --git a/src/openai/types/audio_model.py b/src/openai/types/audio_model.py index 4d14d60181..68031a2198 100644 --- a/src/openai/types/audio_model.py +++ b/src/openai/types/audio_model.py @@ -4,4 +4,4 @@ __all__ = ["AudioModel"] -AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe"] +AudioModel: TypeAlias = Literal["whisper-1", "gpt-4o-transcribe", "gpt-4o-mini-transcribe", "gpt-4o-transcribe-diarize"] diff --git a/src/openai/types/audio_response_format.py b/src/openai/types/audio_response_format.py index f8c8d45945..1897aaf6ed 100644 --- a/src/openai/types/audio_response_format.py +++ b/src/openai/types/audio_response_format.py @@ -4,4 +4,4 @@ __all__ = ["AudioResponseFormat"] -AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt"] +AudioResponseFormat: TypeAlias = Literal["json", "text", "srt", "verbose_json", "vtt", "diarized_json"] diff --git a/src/openai/types/realtime/audio_transcription.py b/src/openai/types/realtime/audio_transcription.py index cf662b3aa2..3e5c8e0cb4 100644 --- a/src/openai/types/realtime/audio_transcription.py +++ b/src/openai/types/realtime/audio_transcription.py @@ -17,13 +17,14 @@ class AudioTranscription(BaseModel): format will improve accuracy and latency. """ - model: Optional[Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"]] = ( - None - ) + model: Optional[ + Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"] + ] = None """The model to use for transcription. - Current options are `whisper-1`, `gpt-4o-transcribe-latest`, - `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, + and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need + diarization with speaker labels. """ prompt: Optional[str] = None @@ -31,6 +32,6 @@ class AudioTranscription(BaseModel): An optional text to guide the model's style or continue a previous audio segment. For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". + For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + prompt is a free text string, for example "expect words related to technology". """ diff --git a/src/openai/types/realtime/audio_transcription_param.py b/src/openai/types/realtime/audio_transcription_param.py index fb09f105b8..3b65e42c8f 100644 --- a/src/openai/types/realtime/audio_transcription_param.py +++ b/src/openai/types/realtime/audio_transcription_param.py @@ -16,11 +16,12 @@ class AudioTranscriptionParam(TypedDict, total=False): format will improve accuracy and latency. """ - model: Literal["whisper-1", "gpt-4o-transcribe-latest", "gpt-4o-mini-transcribe", "gpt-4o-transcribe"] + model: Literal["whisper-1", "gpt-4o-mini-transcribe", "gpt-4o-transcribe", "gpt-4o-transcribe-diarize"] """The model to use for transcription. - Current options are `whisper-1`, `gpt-4o-transcribe-latest`, - `gpt-4o-mini-transcribe`, and `gpt-4o-transcribe`. + Current options are `whisper-1`, `gpt-4o-mini-transcribe`, `gpt-4o-transcribe`, + and `gpt-4o-transcribe-diarize`. Use `gpt-4o-transcribe-diarize` when you need + diarization with speaker labels. 
""" prompt: str @@ -28,6 +29,6 @@ class AudioTranscriptionParam(TypedDict, total=False): An optional text to guide the model's style or continue a previous audio segment. For `whisper-1`, the [prompt is a list of keywords](https://platform.openai.com/docs/guides/speech-to-text#prompting). - For `gpt-4o-transcribe` models, the prompt is a free text string, for example - "expect words related to technology". + For `gpt-4o-transcribe` models (excluding `gpt-4o-transcribe-diarize`), the + prompt is a free text string, for example "expect words related to technology". """ diff --git a/src/openai/types/vector_store_create_params.py b/src/openai/types/vector_store_create_params.py index 945a9886a3..f373a6ed28 100644 --- a/src/openai/types/vector_store_create_params.py +++ b/src/openai/types/vector_store_create_params.py @@ -20,6 +20,12 @@ class VectorStoreCreateParams(TypedDict, total=False): non-empty. """ + description: str + """A description for the vector store. + + Can be used to describe the vector store's purpose. + """ + expires_after: ExpiresAfter """The expiration policy for a vector store.""" diff --git a/tests/api_resources/audio/test_transcriptions.py b/tests/api_resources/audio/test_transcriptions.py index 11cbe2349c..b5eaa4be1f 100644 --- a/tests/api_resources/audio/test_transcriptions.py +++ b/tests/api_resources/audio/test_transcriptions.py @@ -32,6 +32,8 @@ def test_method_create_with_all_params_overload_1(self, client: OpenAI) -> None: model="gpt-4o-transcribe", chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", @@ -84,6 +86,8 @@ def test_method_create_with_all_params_overload_2(self, client: OpenAI) -> None: stream=True, chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", @@ -140,6 +144,8 @@ async def test_method_create_with_all_params_overload_1(self, async_client: Asyn model="gpt-4o-transcribe", chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", @@ -192,6 +198,8 @@ async def test_method_create_with_all_params_overload_2(self, async_client: Asyn stream=True, chunking_strategy="auto", include=["logprobs"], + known_speaker_names=["string"], + known_speaker_references=["string"], language="language", prompt="prompt", response_format="json", diff --git a/tests/api_resources/test_vector_stores.py b/tests/api_resources/test_vector_stores.py index dffd2b1d07..cce9c52cea 100644 --- a/tests/api_resources/test_vector_stores.py +++ b/tests/api_resources/test_vector_stores.py @@ -31,6 +31,7 @@ def test_method_create(self, client: OpenAI) -> None: def test_method_create_with_all_params(self, client: OpenAI) -> None: vector_store = client.vector_stores.create( chunking_strategy={"type": "auto"}, + description="description", expires_after={ "anchor": "last_active_at", "days": 1, @@ -299,6 +300,7 @@ async def test_method_create(self, async_client: AsyncOpenAI) -> None: async def test_method_create_with_all_params(self, async_client: AsyncOpenAI) -> None: vector_store = await async_client.vector_stores.create( chunking_strategy={"type": "auto"}, + description="description", expires_after={ "anchor": "last_active_at", "days": 1, diff --git a/tests/lib/test_audio.py b/tests/lib/test_audio.py index 
ff8dba4714..93ed3a33b2 100644 --- a/tests/lib/test_audio.py +++ b/tests/lib/test_audio.py @@ -44,7 +44,8 @@ def test_translation_create_overloads_in_sync(sync: bool, client: OpenAI, async_ elif is_literal_type(typ): overload_response_formats.update(get_args(typ)) - src_response_formats: set[str] = set(get_args(AudioResponseFormat)) + # 'diarized_json' applies only to transcriptions, not translations. + src_response_formats: set[str] = set(get_args(AudioResponseFormat)) - {"diarized_json"} diff = src_response_formats.difference(overload_response_formats) assert len(diff) == 0, f"some response format options don't have overloads" @@ -57,18 +58,27 @@ def test_transcription_create_overloads_in_sync(sync: bool, client: OpenAI, asyn overload_response_formats: set[str] = set() for i, overload in enumerate(typing_extensions.get_overloads(fn)): - assert_signatures_in_sync( - fn, - overload, - exclude_params={"response_format", "stream"}, - description=f" for overload {i}", - ) - sig = inspect.signature(overload) typ = evaluate_forwardref( sig.parameters["response_format"].annotation, globalns=sys.modules[fn.__module__].__dict__, ) + + exclude_params = {"response_format", "stream"} + # known_speaker_names and known_speaker_references are only supported by diarized_json + if not (is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}): + exclude_params.update({"known_speaker_names", "known_speaker_references"}) + + # diarized_json does not support these parameters + if is_literal_type(typ) and set(get_args(typ)) == {"diarized_json"}: + exclude_params.update({"include", "prompt", "timestamp_granularities"}) + + assert_signatures_in_sync( + fn, + overload, + exclude_params=exclude_params, + description=f" for overload {i}", + ) if is_union_type(typ): for arg in get_args(typ): if not is_literal_type(arg): From e043d7b164c9ee9b34f7029606f08ed60d2d47db Mon Sep 17 00:00:00 2001 From: Alex Chang Date: Thu, 16 Oct 2025 11:08:09 -0400 Subject: [PATCH 5/6] chore: fix dangling comment --- src/openai/resources/audio/transcriptions.py | 60 -------------------- 1 file changed, 60 deletions(-) diff --git a/src/openai/resources/audio/transcriptions.py b/src/openai/resources/audio/transcriptions.py index 52e44bffb7..30ef39deec 100644 --- a/src/openai/resources/audio/transcriptions.py +++ b/src/openai/resources/audio/transcriptions.py @@ -103,66 +103,6 @@ def create( timeout: float | httpx.Timeout | None | NotGiven = not_given, ) -> TranscriptionVerbose: ... - model's confidence in the transcription. `logprobs` only works with - response_format set to `json` and only with the models `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`. This field is not supported when using - `gpt-4o-transcribe-diarize`. - - known_speaker_names: Optional list of speaker names that correspond to the audio samples provided in - `known_speaker_references[]`. Each entry should be a short identifier (for - example `customer` or `agent`). Up to 4 speakers are supported. - - known_speaker_references: Optional list of audio samples (as - [data URLs](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/Data_URLs)) - that contain known speaker references matching `known_speaker_names[]`. Each - sample must be between 2 and 10 seconds, and can use any of the same input audio - formats supported by `file`. - - language: The language of the input audio. Supplying the input language in - [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) (e.g. `en`) - format will improve accuracy and latency. 
- - prompt: An optional text to guide the model's style or continue a previous audio - segment. The - [prompt](https://platform.openai.com/docs/guides/speech-to-text#prompting) - should match the audio language. This field is not supported when using - `gpt-4o-transcribe-diarize`. - - response_format: The format of the output, in one of these options: `json`, `text`, `srt`, - `verbose_json`, `vtt`, or `diarized_json`. For `gpt-4o-transcribe` and - `gpt-4o-mini-transcribe`, the only supported format is `json`. For - `gpt-4o-transcribe-diarize`, the supported formats are `json`, `text`, and - `diarized_json`, with `diarized_json` required to receive speaker annotations. - - stream: If set to true, the model response data will be streamed to the client as it is - generated using - [server-sent events](https://developer.mozilla.org/en-US/docs/Web/API/Server-sent_events/Using_server-sent_events#Event_stream_format). - See the - [Streaming section of the Speech-to-Text guide](https://platform.openai.com/docs/guides/speech-to-text?lang=curl#streaming-transcriptions) - for more information. - - Note: Streaming is not supported for the `whisper-1` model and will be ignored. - - temperature: The sampling temperature, between 0 and 1. Higher values like 0.8 will make the - output more random, while lower values like 0.2 will make it more focused and - deterministic. If set to 0, the model will use - [log probability](https://en.wikipedia.org/wiki/Log_probability) to - automatically increase the temperature until certain thresholds are hit. - - timestamp_granularities: The timestamp granularities to populate for this transcription. - `response_format` must be set `verbose_json` to use timestamp granularities. - Either or both of these options are supported: `word`, or `segment`. Note: There - is no additional latency for segment timestamps, but generating word timestamps - incurs additional latency. This option is not available for - `gpt-4o-transcribe-diarize`. - - extra_headers: Send extra headers - - extra_query: Add additional query parameters to the request - - extra_body: Add additional JSON properties to the request - ) -> Transcription: ... 
- @overload def create( self, From ebf32212f7bf5bec6b24cc2276ac0d9a28dd63bb Mon Sep 17 00:00:00 2001 From: "stainless-app[bot]" <142633134+stainless-app[bot]@users.noreply.github.com> Date: Thu, 16 Oct 2025 15:08:49 +0000 Subject: [PATCH 6/6] release: 2.4.0 --- .release-please-manifest.json | 2 +- CHANGELOG.md | 14 ++++++++++++++ pyproject.toml | 2 +- src/openai/_version.py | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/.release-please-manifest.json b/.release-please-manifest.json index 75ec52fc91..b44b287037 100644 --- a/.release-please-manifest.json +++ b/.release-please-manifest.json @@ -1,3 +1,3 @@ { - ".": "2.3.0" + ".": "2.4.0" } \ No newline at end of file diff --git a/CHANGELOG.md b/CHANGELOG.md index 66eec00fea..30f898c23b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,19 @@ # Changelog +## 2.4.0 (2025-10-16) + +Full Changelog: [v2.3.0...v2.4.0](https://github.com/openai/openai-python/compare/v2.3.0...v2.4.0) + +### Features + +* **api:** Add support for gpt-4o-transcribe-diarize on audio/transcriptions endpoint ([bdbe9b8](https://github.com/openai/openai-python/commit/bdbe9b8f440209afa2979db4a9eda9579b3d2550)) + + +### Chores + +* fix dangling comment ([da14e99](https://github.com/openai/openai-python/commit/da14e9960608f7ade6f5cdf91967830c8a6c1657)) +* **internal:** detect missing future annotations with ruff ([2672b8f](https://github.com/openai/openai-python/commit/2672b8f0726300f7c62c356f25545ef0b3c0bb2e)) + ## 2.3.0 (2025-10-10) Full Changelog: [v2.2.0...v2.3.0](https://github.com/openai/openai-python/compare/v2.2.0...v2.3.0) diff --git a/pyproject.toml b/pyproject.toml index 0f773e5fa4..43de9882f2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [project] name = "openai" -version = "2.3.0" +version = "2.4.0" description = "The official Python library for the openai API" dynamic = ["readme"] license = "Apache-2.0" diff --git a/src/openai/_version.py b/src/openai/_version.py index f202a6d61c..e09654e09d 100644 --- a/src/openai/_version.py +++ b/src/openai/_version.py @@ -1,4 +1,4 @@ # File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details. __title__ = "openai" -__version__ = "2.3.0" # x-release-please-version +__version__ = "2.4.0" # x-release-please-version
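
Editor's note, not part of the patch series: the diarization feature added in PATCH 4/6 is easiest to understand from a call site. Below is a minimal usage sketch of the new `gpt-4o-transcribe-diarize` support on `client.audio.transcriptions.create()`. The parameter names (`known_speaker_names`, `known_speaker_references`), the `diarized_json` response format, and the data-URL requirement for reference samples all come from the diff above; the fields read off the result (`segments`, `speaker`, `text`) are assumptions inferred from the new `TranscriptionDiarized` / `TranscriptionDiarizedSegment` types, whose fields the diff does not show, so treat them as illustrative only. File names are placeholders.

import base64

from openai import OpenAI

client = OpenAI()


def to_data_url(path: str) -> str:
    # known_speaker_references[] entries are data URLs of 2-10 second audio
    # samples; any input format accepted by `file` can be used (wav assumed here).
    with open(path, "rb") as f:
        encoded = base64.b64encode(f.read()).decode("ascii")
    return f"data:audio/wav;base64,{encoded}"


with open("meeting.wav", "rb") as audio_file:
    transcription = client.audio.transcriptions.create(
        file=audio_file,
        model="gpt-4o-transcribe-diarize",
        # `diarized_json` is required to receive speaker annotations; `prompt`,
        # `include=["logprobs"]`, and `timestamp_granularities` are not
        # supported with this model per the docstrings above.
        response_format="diarized_json",
        known_speaker_names=["agent", "customer"],  # up to 4 speakers
        known_speaker_references=[
            to_data_url("agent_sample.wav"),
            to_data_url("customer_sample.wav"),
        ],
    )

# Assumed shape of the diarized result; the actual generated model may differ.
for segment in transcription.segments:
    print(segment.speaker, segment.text)

PATCH 4/6 also adds an optional `description` field to `client.vector_stores.create()`; it is a plain string and needs no special handling beyond passing it alongside the existing parameters.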