98 changes: 98 additions & 0 deletions example/lib/create_audio_transcription_with_chunking_strategy.dart
@@ -0,0 +1,98 @@
import 'dart:io';

import 'package:dart_openai/dart_openai.dart';

import 'env/env.dart';

// ignore: depend_on_referenced_packages
import 'package:http/http.dart' as http;

Future<void> main() async {
  // Set the OpenAI API key from the .env file.
  OpenAI.apiKey = Env.apiKey;

  print("Demonstrating chunking_strategy in audio transcription...\n");

  // Example 1: Using the auto chunking strategy.
  print("1. Using auto chunking strategy:");
  final transcriptionAuto = await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    responseFormat: OpenAIAudioResponseFormat.json,
    chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  );
  print(
      "Auto chunking result: ${transcriptionAuto.text.substring(0, 100)}...\n");

  // Example 2: Using the server VAD chunking strategy with custom parameters.
  print("2. Using server VAD chunking strategy:");
  final transcriptionServerVad =
      await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    responseFormat: OpenAIAudioResponseFormat.json,
    chunkingStrategy: OpenAIAudioChunkingConfig.serverVad(
      // Add 200ms of audio before detected speech.
      prefixPaddingMs: 200,
      // Wait 500ms of silence before considering speech ended.
      silenceDurationMs: 500,
      // Voice activity detection threshold (lower = more sensitive).
      threshold: 0.1,
    ),
  );
  print(
      "Server VAD chunking result: ${transcriptionServerVad.text.substring(0, 100)}...\n");

  // Example 3: Using a chunking strategy with translation.
  print("3. Using auto chunking strategy for translation:");
  final translation = await OpenAI.instance.audio.createTranslation(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  );
  print(
      "Translation with chunking: ${translation.text.substring(0, 100)}...\n");

  // Example 4: Different server VAD configurations.
  print("4. Different server VAD configurations:");

  // More sensitive VAD (lower threshold).
  final sensitiveTrans = await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    chunkingStrategy: OpenAIAudioChunkingConfig.serverVad(threshold: 0.05),
  );
  print("Sensitive VAD result: ${sensitiveTrans.text.substring(0, 100)}...\n");

  // Less sensitive VAD (higher threshold) with longer silence detection.
  final conservativeTrans = await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    chunkingStrategy: OpenAIAudioChunkingConfig.serverVad(
      threshold: 0.3,
      silenceDurationMs: 1000,
    ),
  );
  print(
      "Conservative VAD result: ${conservativeTrans.text.substring(0, 100)}...\n");

  print("Chunking strategy demonstration completed!");
}

/// Downloads a file from the given URL and returns it as a [File] object.
Future<File> getFileFromUrl(String networkUrl) async {
  final response = await http.get(Uri.parse(networkUrl));
  // Use a timestamp to generate a unique local file name for the audio.
  final uniqueFileName = DateTime.now().microsecondsSinceEpoch;
  final file = File("$uniqueFileName.mp3");
  await file.writeAsBytes(response.bodyBytes);
  return file;
}
2 changes: 2 additions & 0 deletions lib/src/core/base/audio/interfaces.dart
@@ -21,6 +21,7 @@ abstract class CreateInterface {
    double? temperature,
    String? language,
    List<OpenAIAudioTimestampGranularity>? timestamp_granularities,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  });

  Future<OpenAIAudioModel> createTranslation({
@@ -29,5 +30,6 @@
    String? prompt,
    OpenAIAudioResponseFormat? responseFormat,
    double? temperature,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  });
}
2 changes: 2 additions & 0 deletions lib/src/core/enum.dart
@@ -18,4 +18,6 @@ enum OpenAIAudioResponseFormat { json, text, srt, verbose_json, vtt }

enum OpenAIAudioSpeechResponseFormat { mp3, opus, aac, flac }

enum OpenAIAudioChunkingStrategy { auto, server_vad }

enum OpenAIChatMessageRole { system, user, assistant, function, tool }
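
The snake_case `server_vad` identifier is deliberate: the serialization code in chunking_strategy.dart below relies on Dart's `Enum.name` to produce the exact wire strings the API expects. A quick standalone check (not part of the PR; assumes the `dart_openai` import exposes the enum):

import 'package:dart_openai/dart_openai.dart';

void main() {
  // Enum.name returns the identifier verbatim, so these match the "type"
  // values produced by toMap() and consumed by fromMap() below.
  assert(OpenAIAudioChunkingStrategy.auto.name == 'auto');
  assert(OpenAIAudioChunkingStrategy.server_vad.name == 'server_vad');
}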
137 changes: 137 additions & 0 deletions lib/src/core/models/audio/chunking_strategy.dart
@@ -0,0 +1,137 @@
import 'package:meta/meta.dart';

import '../../enum.dart';

/// {@template openai_audio_chunking_config}
/// This class represents the chunking strategy configuration for audio transcription.
/// It can be either "auto" or a server VAD configuration with custom parameters.
/// {@endtemplate}
@immutable
abstract class OpenAIAudioChunkingConfig {
  /// The type of chunking strategy to use.
  final OpenAIAudioChunkingStrategy type;

  /// {@macro openai_audio_chunking_config}
  const OpenAIAudioChunkingConfig({required this.type});

  /// Factory constructor for creating the auto chunking strategy.
  factory OpenAIAudioChunkingConfig.auto() {
    return const OpenAIAudioChunkingConfigAuto();
  }

  /// Factory constructor for creating a server VAD chunking strategy.
  factory OpenAIAudioChunkingConfig.serverVad({
    int? prefixPaddingMs,
    int? silenceDurationMs,
    double? threshold,
  }) {
    return OpenAIAudioChunkingConfigServerVad(
      prefixPaddingMs: prefixPaddingMs,
      silenceDurationMs: silenceDurationMs,
      threshold: threshold,
    );
  }

  /// This is used to convert a [Map<String, dynamic>] object to an [OpenAIAudioChunkingConfig] object.
  factory OpenAIAudioChunkingConfig.fromMap(Map<String, dynamic> json) {
    final type = OpenAIAudioChunkingStrategy.values.firstWhere(
      (e) => e.name == json['type'],
    );

    switch (type) {
      case OpenAIAudioChunkingStrategy.auto:
        return OpenAIAudioChunkingConfig.auto();
      case OpenAIAudioChunkingStrategy.server_vad:
        return OpenAIAudioChunkingConfig.serverVad(
          prefixPaddingMs: json['prefix_padding_ms'],
          silenceDurationMs: json['silence_duration_ms'],
          threshold: json['threshold'],
        );
    }
  }

  /// This method is used to convert the [OpenAIAudioChunkingConfig] to a [Map<String, dynamic>] object.
  Map<String, dynamic> toMap();

  @override
  String toString() => 'OpenAIAudioChunkingConfig(type: $type)';

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is OpenAIAudioChunkingConfig && other.type == type;
  }

  @override
  int get hashCode => type.hashCode;
}

/// {@template openai_audio_chunking_config_auto}
/// Auto chunking strategy configuration.
/// {@endtemplate}
@immutable
final class OpenAIAudioChunkingConfigAuto extends OpenAIAudioChunkingConfig {
  /// {@macro openai_audio_chunking_config_auto}
  const OpenAIAudioChunkingConfigAuto()
      : super(type: OpenAIAudioChunkingStrategy.auto);

  @override
  Map<String, dynamic> toMap() {
    return {'type': type.name};
  }

  @override
  String toString() => 'OpenAIAudioChunkingConfigAuto()';
}

/// {@template openai_audio_chunking_config_server_vad}
/// Server VAD chunking strategy configuration with custom parameters.
/// {@endtemplate}
@immutable
final class OpenAIAudioChunkingConfigServerVad
    extends OpenAIAudioChunkingConfig {
  /// The amount of audio to include before the VAD-detected speech starts, in milliseconds.
  final int? prefixPaddingMs;

  /// The amount of silence to wait before considering speech to have ended, in milliseconds.
  final int? silenceDurationMs;

  /// The threshold for voice activity detection. Lower values are more sensitive.
  final double? threshold;

  /// {@macro openai_audio_chunking_config_server_vad}
  const OpenAIAudioChunkingConfigServerVad({
    this.prefixPaddingMs,
    this.silenceDurationMs,
    this.threshold,
  }) : super(type: OpenAIAudioChunkingStrategy.server_vad);

  @override
  Map<String, dynamic> toMap() {
    return {
      'type': type.name,
      if (prefixPaddingMs != null) 'prefix_padding_ms': prefixPaddingMs,
      if (silenceDurationMs != null) 'silence_duration_ms': silenceDurationMs,
      if (threshold != null) 'threshold': threshold,
    };
  }

  @override
  String toString() {
    return 'OpenAIAudioChunkingConfigServerVad(prefixPaddingMs: $prefixPaddingMs, silenceDurationMs: $silenceDurationMs, threshold: $threshold)';
  }

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is OpenAIAudioChunkingConfigServerVad &&
        other.type == type &&
        other.prefixPaddingMs == prefixPaddingMs &&
        other.silenceDurationMs == silenceDurationMs &&
        other.threshold == threshold;
  }

  @override
  int get hashCode =>
      Object.hash(type, prefixPaddingMs, silenceDurationMs, threshold);
}
1 change: 1 addition & 0 deletions lib/src/core/models/export.dart
@@ -11,3 +11,4 @@ export 'moderation/moderation.dart';
export '../enum.dart';
export 'chat/chat.dart';
export 'audio/audio.dart';
export 'audio/chunking_strategy.dart';
19 changes: 19 additions & 0 deletions lib/src/instance/audio/audio.dart
@@ -1,6 +1,7 @@
import 'package:dart_openai/src/core/builder/base_api_url.dart';
import 'package:dart_openai/src/core/networking/client.dart';

import 'dart:convert';
import 'dart:io';

import '../../../dart_openai.dart';
@@ -36,6 +37,8 @@ interface class OpenAIAudio implements OpenAIAudioBase {
  ///
  /// [timestamp_granularities] The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Either word or segment; using both does not work.
  ///
  /// [chunkingStrategy] The chunking strategy to use for processing the audio file. Can be "auto" or a server VAD configuration.
  ///
  /// Example:
  /// ```dart
  /// final transcription = await openai.audio.createTranscription(
@@ -44,6 +47,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
  ///   prompt: "This is a prompt",
  ///   responseFormat: OpenAIAudioResponseFormat.srt,
  ///   temperature: 0.5,
  ///   chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  /// );
  /// ```
  @override
@@ -55,6 +59,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
    double? temperature,
    String? language,
    List<OpenAIAudioTimestampGranularity>? timestamp_granularities,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  }) async {
    return await OpenAINetworkingClient.fileUpload(
      file: file,
@@ -68,6 +73,11 @@ interface class OpenAIAudio implements OpenAIAudioBase {
        if (timestamp_granularities != null)
          "timestamp_granularities[]":
              timestamp_granularities.map((e) => e.name).join(","),
        if (chunkingStrategy != null)
          "chunking_strategy":
              chunkingStrategy.type == OpenAIAudioChunkingStrategy.auto
                  ? "auto"
                  : jsonEncode(chunkingStrategy.toMap()),
      },
      onSuccess: (Map<String, dynamic> response) {
        return OpenAIAudioModel.fromMap(response);
@@ -90,13 +100,16 @@ interface class OpenAIAudio implements OpenAIAudioBase {
  ///
  /// [temperature] is the sampling temperature for the request.
  ///
  /// [chunkingStrategy] The chunking strategy to use for processing the audio file. Can be "auto" or a server VAD configuration.
  ///
  /// Example:
  /// ```dart
  /// final translation = await openai.audio.createTranslation(
  ///   file: File("audio.mp3"),
  ///   model: "whisper-1",
  ///   prompt: "This is a prompt",
  ///   responseFormat: OpenAIAudioResponseFormat.text,
  ///   chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  /// );
  /// ```
  @override
@@ -106,6 +119,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
    String? prompt,
    OpenAIAudioResponseFormat? responseFormat,
    double? temperature,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  }) async {
    return await OpenAINetworkingClient.fileUpload(
      file: file,
@@ -115,6 +129,11 @@ interface class OpenAIAudio implements OpenAIAudioBase {
        if (prompt != null) "prompt": prompt,
        if (responseFormat != null) "response_format": responseFormat.name,
        if (temperature != null) "temperature": temperature.toString(),
        if (chunkingStrategy != null)
          "chunking_strategy":
              chunkingStrategy.type == OpenAIAudioChunkingStrategy.auto
                  ? "auto"
                  : jsonEncode(chunkingStrategy.toMap()),
      },
      onSuccess: (Map<String, dynamic> response) {
        return OpenAIAudioModel.fromMap(response);
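
Taken together, the request encoding above means the `chunking_strategy` form field is either the literal string `auto` or a JSON-encoded object. A standalone sketch (assumes the `dart_openai` import; the helper name is illustrative, not library code) of the same branch the client performs:

import 'dart:convert';

import 'package:dart_openai/dart_openai.dart';

String chunkingStrategyField(OpenAIAudioChunkingConfig config) {
  // Mirrors the ternary in audio.dart: plain "auto", else a JSON-encoded map.
  return config.type == OpenAIAudioChunkingStrategy.auto
      ? "auto"
      : jsonEncode(config.toMap());
}

void main() {
  print(chunkingStrategyField(OpenAIAudioChunkingConfig.auto()));
  // auto
  print(chunkingStrategyField(
    OpenAIAudioChunkingConfig.serverVad(silenceDurationMs: 500, threshold: 0.2),
  ));
  // {"type":"server_vad","silence_duration_ms":500,"threshold":0.2}
}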