98 changes: 98 additions & 0 deletions example/lib/create_audio_transcription_with_chunking_strategy.dart
@@ -0,0 +1,98 @@
import 'dart:io';

import 'package:dart_openai/dart_openai.dart';

import 'env/env.dart';

// ignore: depend_on_referenced_packages
import 'package:http/http.dart' as http;

Future<void> main() async {
  // Set the OpenAI API key from the .env file.
  OpenAI.apiKey = Env.apiKey;

  print("Demonstrating chunking_strategy in audio transcription...\n");

  // Example 1: Using the auto chunking strategy.
  print("1. Using auto chunking strategy:");
  final transcriptionAuto = await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    responseFormat: OpenAIAudioResponseFormat.json,
    chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  );
  print(
      "Auto chunking result: ${transcriptionAuto.text.substring(0, 100)}...\n");

  // Example 2: Using the server VAD chunking strategy with custom parameters.
  print("2. Using server VAD chunking strategy:");
  final transcriptionServerVad =
      await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    responseFormat: OpenAIAudioResponseFormat.json,
    chunkingStrategy: OpenAIAudioChunkingConfig.serverVad(
      // Add 200ms of audio before detected speech.
      prefixPaddingMs: 200,
      // Wait 500ms of silence before considering speech ended.
      silenceDurationMs: 500,
      // Voice activity detection threshold (lower = more sensitive).
      threshold: 0.1,
    ),
  );
  print(
      "Server VAD chunking result: ${transcriptionServerVad.text.substring(0, 100)}...\n");

  // Example 3: Using a chunking strategy with translation.
  print("3. Using auto chunking strategy for translation:");
  final translation = await OpenAI.instance.audio.createTranslation(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  );
  print(
      "Translation with chunking: ${translation.text.substring(0, 100)}...\n");

  // Example 4: Different server VAD configurations.
  print("4. Different server VAD configurations:");

  // More sensitive VAD (lower threshold).
  final sensitiveTrans = await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    chunkingStrategy: OpenAIAudioChunkingConfig.serverVad(threshold: 0.05),
  );
  print("Sensitive VAD result: ${sensitiveTrans.text.substring(0, 100)}...\n");

  // Less sensitive VAD (higher threshold) with longer silence detection.
  final conservativeTrans = await OpenAI.instance.audio.createTranscription(
    file: await getFileFromUrl(
      'https://www.cbvoiceovers.com/wp-content/uploads/2017/05/Commercial-showreel.mp3',
    ),
    model: "whisper-1",
    chunkingStrategy: OpenAIAudioChunkingConfig.serverVad(
      threshold: 0.3,
      silenceDurationMs: 1000,
    ),
  );
  print(
      "Conservative VAD result: ${conservativeTrans.text.substring(0, 100)}...\n");

  print("Chunking strategy demonstration completed!");
}

/// Downloads a file from the given URL and returns it as a [File] object.
Future<File> getFileFromUrl(String networkUrl) async {
  final response = await http.get(Uri.parse(networkUrl));
  // Use a timestamp to generate a unique local file name for the audio.
  final uniqueFileName = DateTime.now().microsecondsSinceEpoch;
  final file = File("$uniqueFileName.mp3");
  await file.writeAsBytes(response.bodyBytes);
  return file;
}
2 changes: 2 additions & 0 deletions lib/src/core/base/audio/interfaces.dart
@@ -21,6 +21,7 @@ abstract class CreateInterface {
    double? temperature,
    String? language,
    List<OpenAIAudioTimestampGranularity>? timestamp_granularities,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  });

  Future<OpenAIAudioModel> createTranslation({
@@ -29,5 +30,6 @@
    String? prompt,
    OpenAIAudioResponseFormat? responseFormat,
    double? temperature,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  });
}
2 changes: 2 additions & 0 deletions lib/src/core/enum.dart
@@ -18,4 +18,6 @@ enum OpenAIAudioResponseFormat { json, text, srt, verbose_json, vtt }

enum OpenAIAudioSpeechResponseFormat { mp3, opus, aac, flac }

enum OpenAIAudioChunkingStrategy { auto, server_vad }

enum OpenAIChatMessageRole { system, user, assistant, function, tool }
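
The snake_case `server_vad` identifier is deliberate: the serialization code in chunking_strategy.dart below relies on Dart's `Enum.name` to produce the exact wire strings the API expects. A quick standalone check (not part of the PR; assumes the `dart_openai` import exposes the enum):

import 'package:dart_openai/dart_openai.dart';

void main() {
  // Enum.name returns the identifier verbatim, so these match the "type"
  // values produced by toMap() and consumed by fromMap() below.
  assert(OpenAIAudioChunkingStrategy.auto.name == 'auto');
  assert(OpenAIAudioChunkingStrategy.server_vad.name == 'server_vad');
}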
137 changes: 137 additions & 0 deletions lib/src/core/models/audio/chunking_strategy.dart
@@ -0,0 +1,137 @@
import 'package:meta/meta.dart';

import '../../enum.dart';

/// {@template openai_audio_chunking_config}
/// This class represents the chunking strategy configuration for audio transcription.
/// It can be either "auto" or a server VAD configuration with custom parameters.
/// {@endtemplate}
@immutable
abstract class OpenAIAudioChunkingConfig {
  /// The type of chunking strategy to use.
  final OpenAIAudioChunkingStrategy type;

  /// {@macro openai_audio_chunking_config}
  const OpenAIAudioChunkingConfig({required this.type});

  /// Factory constructor for creating the auto chunking strategy.
  factory OpenAIAudioChunkingConfig.auto() {
    return const OpenAIAudioChunkingConfigAuto();
  }

  /// Factory constructor for creating a server VAD chunking strategy.
  factory OpenAIAudioChunkingConfig.serverVad({
    int? prefixPaddingMs,
    int? silenceDurationMs,
    double? threshold,
  }) {
    return OpenAIAudioChunkingConfigServerVad(
      prefixPaddingMs: prefixPaddingMs,
      silenceDurationMs: silenceDurationMs,
      threshold: threshold,
    );
  }

  /// This is used to convert a [Map<String, dynamic>] object to an [OpenAIAudioChunkingConfig] object.
  factory OpenAIAudioChunkingConfig.fromMap(Map<String, dynamic> json) {
    final type = OpenAIAudioChunkingStrategy.values.firstWhere(
      (e) => e.name == json['type'],
    );

    switch (type) {
      case OpenAIAudioChunkingStrategy.auto:
        return OpenAIAudioChunkingConfig.auto();
      case OpenAIAudioChunkingStrategy.server_vad:
        return OpenAIAudioChunkingConfig.serverVad(
          prefixPaddingMs: json['prefix_padding_ms'],
          silenceDurationMs: json['silence_duration_ms'],
          threshold: json['threshold'],
        );
    }
  }

  /// This method is used to convert the [OpenAIAudioChunkingConfig] to a [Map<String, dynamic>] object.
  Map<String, dynamic> toMap();

  @override
  String toString() => 'OpenAIAudioChunkingConfig(type: $type)';

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is OpenAIAudioChunkingConfig && other.type == type;
  }

  @override
  int get hashCode => type.hashCode;
}

/// {@template openai_audio_chunking_config_auto}
/// Auto chunking strategy configuration.
/// {@endtemplate}
@immutable
final class OpenAIAudioChunkingConfigAuto extends OpenAIAudioChunkingConfig {
  /// {@macro openai_audio_chunking_config_auto}
  const OpenAIAudioChunkingConfigAuto()
      : super(type: OpenAIAudioChunkingStrategy.auto);

  @override
  Map<String, dynamic> toMap() {
    return {'type': type.name};
  }

  @override
  String toString() => 'OpenAIAudioChunkingConfigAuto()';
}

/// {@template openai_audio_chunking_config_server_vad}
/// Server VAD chunking strategy configuration with custom parameters.
/// {@endtemplate}
@immutable
final class OpenAIAudioChunkingConfigServerVad
    extends OpenAIAudioChunkingConfig {
  /// The amount of audio to include before the VAD-detected speech starts, in milliseconds.
  final int? prefixPaddingMs;

  /// The amount of silence to wait before considering speech to have ended, in milliseconds.
  final int? silenceDurationMs;

  /// The threshold for voice activity detection. Lower values are more sensitive.
  final double? threshold;

  /// {@macro openai_audio_chunking_config_server_vad}
  const OpenAIAudioChunkingConfigServerVad({
    this.prefixPaddingMs,
    this.silenceDurationMs,
    this.threshold,
  }) : super(type: OpenAIAudioChunkingStrategy.server_vad);

  @override
  Map<String, dynamic> toMap() {
    return {
      'type': type.name,
      if (prefixPaddingMs != null) 'prefix_padding_ms': prefixPaddingMs,
      if (silenceDurationMs != null) 'silence_duration_ms': silenceDurationMs,
      if (threshold != null) 'threshold': threshold,
    };
  }

  @override
  String toString() {
    return 'OpenAIAudioChunkingConfigServerVad(prefixPaddingMs: $prefixPaddingMs, silenceDurationMs: $silenceDurationMs, threshold: $threshold)';
  }

  @override
  bool operator ==(Object other) {
    if (identical(this, other)) return true;
    return other is OpenAIAudioChunkingConfigServerVad &&
        other.type == type &&
        other.prefixPaddingMs == prefixPaddingMs &&
        other.silenceDurationMs == silenceDurationMs &&
        other.threshold == threshold;
  }

  @override
  int get hashCode =>
      Object.hash(type, prefixPaddingMs, silenceDurationMs, threshold);
}
1 change: 1 addition & 0 deletions lib/src/core/models/export.dart
@@ -11,3 +11,4 @@ export 'moderation/moderation.dart';
export '../enum.dart';
export 'chat/chat.dart';
export 'audio/audio.dart';
export 'audio/chunking_strategy.dart';
19 changes: 19 additions & 0 deletions lib/src/instance/audio/audio.dart
@@ -1,6 +1,7 @@
import 'package:dart_openai/src/core/builder/base_api_url.dart';
import 'package:dart_openai/src/core/networking/client.dart';

import 'dart:convert';
import 'dart:io';

import '../../../dart_openai.dart';
@@ -36,6 +37,8 @@ interface class OpenAIAudio implements OpenAIAudioBase {
  ///
  /// [timestamp_granularities] The timestamp granularities to populate for this transcription. response_format must be set to verbose_json to use timestamp granularities. Either word or segment; using both does not work.
  ///
  /// [chunkingStrategy] The chunking strategy to use for processing the audio file. Can be "auto" or a server VAD configuration.
  ///
  /// Example:
  /// ```dart
  /// final transcription = await openai.audio.createTranscription(
@@ -44,6 +47,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
  ///   prompt: "This is a prompt",
  ///   responseFormat: OpenAIAudioResponseFormat.srt,
  ///   temperature: 0.5,
  ///   chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  /// );
  /// ```
  @override
@@ -55,6 +59,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
    double? temperature,
    String? language,
    List<OpenAIAudioTimestampGranularity>? timestamp_granularities,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  }) async {
    return await OpenAINetworkingClient.fileUpload(
      file: file,
@@ -68,6 +73,11 @@ interface class OpenAIAudio implements OpenAIAudioBase {
        if (timestamp_granularities != null)
          "timestamp_granularities[]":
              timestamp_granularities.map((e) => e.name).join(","),
        if (chunkingStrategy != null)
          "chunking_strategy":
              chunkingStrategy.type == OpenAIAudioChunkingStrategy.auto
                  ? "auto"
                  : jsonEncode(chunkingStrategy.toMap()),
      },
      onSuccess: (Map<String, dynamic> response) {
        return OpenAIAudioModel.fromMap(response);
@@ -90,13 +100,16 @@ interface class OpenAIAudio implements OpenAIAudioBase {
  ///
  /// [temperature] is the sampling temperature for the request.
  ///
  /// [chunkingStrategy] The chunking strategy to use for processing the audio file. Can be "auto" or a server VAD configuration.
  ///
  /// Example:
  /// ```dart
  /// final translation = await openai.audio.createTranslation(
  ///   file: File("audio.mp3"),
  ///   model: "whisper-1",
  ///   prompt: "This is a prompt",
  ///   responseFormat: OpenAIAudioResponseFormat.text,
  ///   chunkingStrategy: OpenAIAudioChunkingConfig.auto(),
  /// );
  /// ```
  @override
@@ -106,6 +119,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
    String? prompt,
    OpenAIAudioResponseFormat? responseFormat,
    double? temperature,
    OpenAIAudioChunkingConfig? chunkingStrategy,
  }) async {
    return await OpenAINetworkingClient.fileUpload(
      file: file,
@@ -115,6 +129,11 @@ interface class OpenAIAudio implements OpenAIAudioBase {
        if (prompt != null) "prompt": prompt,
        if (responseFormat != null) "response_format": responseFormat.name,
        if (temperature != null) "temperature": temperature.toString(),
        if (chunkingStrategy != null)
          "chunking_strategy":
              chunkingStrategy.type == OpenAIAudioChunkingStrategy.auto
                  ? "auto"
                  : jsonEncode(chunkingStrategy.toMap()),
      },
      onSuccess: (Map<String, dynamic> response) {
        return OpenAIAudioModel.fromMap(response);
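
Taken together, the request encoding above means the `chunking_strategy` form field is either the literal string `auto` or a JSON-encoded object. A standalone sketch (assumes the `dart_openai` import; the helper name is illustrative, not library code) of the same branch the client performs:

import 'dart:convert';

import 'package:dart_openai/dart_openai.dart';

String chunkingStrategyField(OpenAIAudioChunkingConfig config) {
  // Mirrors the ternary in audio.dart: plain "auto", else a JSON-encoded map.
  return config.type == OpenAIAudioChunkingStrategy.auto
      ? "auto"
      : jsonEncode(config.toMap());
}

void main() {
  print(chunkingStrategyField(OpenAIAudioChunkingConfig.auto()));
  // auto
  print(chunkingStrategyField(
    OpenAIAudioChunkingConfig.serverVad(silenceDurationMs: 500, threshold: 0.2),
  ));
  // {"type":"server_vad","silence_duration_ms":500,"threshold":0.2}
}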