Merge pull request #157 from mihirshahxenlabs/main
[Audio] createTranscription: Added `timestamp_granularities[]` request parameter and updated the response object
anasfik authored Feb 21, 2024
2 parents e4b3eb5 + 8416206 commit 878b602
Showing 6 changed files with 173 additions and 3 deletions.
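
A minimal sketch of the new parameter in use, assuming `OpenAI.apiKey` is already configured and a local `speech.mp3` exists (both hypothetical here):

```dart
import 'dart:io';

import 'package:dart_openai/dart_openai.dart';

Future<void> main() async {
  // Assumes OpenAI.apiKey has already been set elsewhere.
  final transcription = await OpenAI.instance.audio.createTranscription(
    file: File('speech.mp3'), // hypothetical local audio file
    model: 'whisper-1',
    // Timestamp granularities require the verbose_json response format.
    responseFormat: OpenAIAudioResponseFormat.verbose_json,
    timestamp_granularities: [OpenAIAudioTimestampGranularity.word],
  );

  // The new response fields added by this commit.
  print(transcription.text);
  print(transcription.duration); // populated for verbose_json responses
  for (final word in transcription.words ?? const <Word>[]) {
    print('${word.word}: ${word.start} -> ${word.end}');
  }
}
```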
1 change: 1 addition & 0 deletions lib/src/core/base/audio/interfaces.dart
@@ -20,6 +20,7 @@ abstract class CreateInterface {
OpenAIAudioResponseFormat? responseFormat,
double? temperature,
String? language,
List<OpenAIAudioTimestampGranularity>? timestamp_granularities,
});

Future<OpenAIAudioModel> createTranslation({
2 changes: 2 additions & 0 deletions lib/src/core/enum.dart
@@ -12,6 +12,8 @@ enum OpenAIImageQuality { hd }

enum OpenAIImageResponseFormat { url, b64Json }

enum OpenAIAudioTimestampGranularity { word, segment }

enum OpenAIAudioResponseFormat { json, text, srt, verbose_json, vtt }

enum OpenAIAudioSpeechResponseFormat { mp3, opus, aac, flac }
153 changes: 151 additions & 2 deletions lib/src/core/models/audio/audio.dart
@@ -8,19 +8,39 @@ final class OpenAIAudioModel {
/// The text response from the audio requests.
/// This is the only field guaranteed to be returned for every response format.
final String text;
/// Extra metadata returned for `verbose_json` responses.
final String? task;
final String? language;
final double? duration;

/// Word- and segment-level timestamps, populated when the matching
/// `timestamp_granularities` value is requested alongside `verbose_json`.
final List<Word>? words;
final List<Segment>? segments;

@override
int get hashCode => text.hashCode;

/// {@macro openai_audio_model}
const OpenAIAudioModel({
required this.text,
this.task,
this.language,
this.duration,
this.words,
this.segments,
});

/// This is used to convert a [Map<String, dynamic>] object to a [OpenAIAudioModel] object.
factory OpenAIAudioModel.fromMap(Map<String, dynamic> json) {
return OpenAIAudioModel(
text: json['text'],
task: json['task'],
language: json['language'],
duration: json['duration'],
words: json['words'] != null
? List<Word>.from(json['words'].map((x) => Word.fromMap(x)))
: null,
segments: json['segments'] != null
? List<Segment>.from(json['segments'].map((x) => Segment.fromMap(x)))
: null,
);
}

@@ -30,18 +50,147 @@ final class OpenAIAudioModel {
Map<String, dynamic> toMap() {
return {
'text': text,
if (task != null) 'task': task,
if (language != null) 'language': language,
if (duration != null) 'duration': duration,
if (words != null) 'words': words,
if (segments != null) 'segments': segments,
};
}

@override
String toString() {
- return 'OpenAIAudioModel(text: $text)';
return 'OpenAIAudioModel(text: $text, task: $task, language: $language, duration: $duration, words: $words, segments: $segments)';
}

@override
bool operator ==(Object other) {
if (identical(this, other)) return true;

return other is OpenAIAudioModel &&
other.text == text &&
other.task == task &&
other.language == language &&
other.duration == duration &&
other.words == words &&
other.segments == segments;
}
}

/// A single transcribed word with its start and end times, in seconds.
final class Word {
final String word;
final double start;
final double end;

const Word({
required this.word,
required this.start,
required this.end,
});

factory Word.fromMap(Map<String, dynamic> json) {
return Word(
word: json['word'],
start: json['start'],
end: json['end'],
);
}

Map<String, dynamic> toMap() {
return {
'word': word,
'start': start,
'end': end,
};
}

@override
String toString() => 'Word(word: $word, start: $start, end: $end)';

@override
bool operator ==(Object other) {
if (identical(this, other)) return true;

return other is Word &&
other.word == word &&
other.start == start &&
other.end == end;
}
}

/// A transcribed segment with timing, token, and confidence metadata.
final class Segment {
final int id;
final int seek;
final double start;
final double end;
final String text;
final List<int> tokens;
final double temperature;
final double avg_logprob;
final double compression_ratio;
final double no_speech_prob;

const Segment({
required this.id,
required this.seek,
required this.start,
required this.end,
required this.text,
required this.tokens,
required this.temperature,
required this.avg_logprob,
required this.compression_ratio,
required this.no_speech_prob,
});

factory Segment.fromMap(Map<String, dynamic> json) {
return Segment(
id: json['id'],
seek: json['seek'],
start: json['start'],
end: json['end'],
text: json['text'],
tokens: List<int>.from(json['tokens']),
temperature: json['temperature'],
avg_logprob: json['avg_logprob'],
compression_ratio: json['compression_ratio'],
no_speech_prob: json['no_speech_prob'],
);
}

Map<String, dynamic> toMap() {
return {
'id': id,
'seek': seek,
'start': start,
'end': end,
'text': text,
'tokens': tokens,
'temperature': temperature,
'avg_logprob': avg_logprob,
'compression_ratio': compression_ratio,
'no_speech_prob': no_speech_prob,
};
}

@override
String toString() =>
'Segment(id: $id, seek: $seek, start: $start, end: $end, text: $text, tokens: $tokens, temperature: $temperature, avg_logprob: $avg_logprob, compression_ratio: $compression_ratio, no_speech_prob: $no_speech_prob)';

@override
bool operator ==(Object other) {
if (identical(this, other)) return true;

- return other is OpenAIAudioModel && other.text == text;
return other is Segment &&
other.id == id &&
other.seek == seek &&
other.start == start &&
other.end == end &&
other.text == text &&
other.tokens == tokens &&
other.temperature == temperature &&
other.avg_logprob == avg_logprob &&
other.compression_ratio == compression_ratio &&
other.no_speech_prob == no_speech_prob;
}
}
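
For reference, a small sketch of how the new `fromMap` factory consumes a `verbose_json`-shaped map; the payload below is illustrative, not a captured API response, and assumes `OpenAIAudioModel` and `Word` are exported by the package:

```dart
import 'package:dart_openai/dart_openai.dart';

void main() {
  // Illustrative verbose_json-shaped payload (not real API output).
  final response = <String, dynamic>{
    'text': 'Hello world',
    'task': 'transcribe',
    'language': 'english',
    'duration': 1.25,
    'words': [
      {'word': 'Hello', 'start': 0.0, 'end': 0.5},
      {'word': 'world', 'start': 0.6, 'end': 1.1},
    ],
  };

  final model = OpenAIAudioModel.fromMap(response);
  print(model.language); // english
  print(model.words?.first.word); // Hello
}
```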
6 changes: 6 additions & 0 deletions lib/src/instance/audio/audio.dart
@@ -34,6 +34,8 @@ interface class OpenAIAudio implements OpenAIAudioBase {
///
/// [language] is the language of the input audio. Supplying the input language in **ISO-639-1** format will improve accuracy and latency.
///
/// [timestamp_granularities] is the list of timestamp granularities to populate for this transcription. [responseFormat] must be set to `verbose_json` to use timestamp granularities. Pass either `word` or `segment`; passing both at once does not work.
///
/// Example:
/// ```dart
/// final transcription = await openai.audio.createTranscription(
@@ -52,6 +54,7 @@ interface class OpenAIAudio implements OpenAIAudioBase {
OpenAIAudioResponseFormat? responseFormat,
double? temperature,
String? language,
List<OpenAIAudioTimestampGranularity>? timestamp_granularities,
}) async {
return await OpenAINetworkingClient.fileUpload(
file: file,
@@ -62,6 +65,9 @@ interface class OpenAIAudio implements OpenAIAudioBase {
if (responseFormat != null) "response_format": responseFormat.name,
if (temperature != null) "temperature": temperature.toString(),
if (language != null) "language": language,
if (timestamp_granularities != null)
"timestamp_granularities[]":
timestamp_granularities.map((e) => e.name).join(","),
},
onSuccess: (Map<String, dynamic> response) {
return OpenAIAudioModel.fromMap(response);
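
Note the serialization choice above: the enum names are comma-joined into a single multipart field keyed `timestamp_granularities[]`. A standalone sketch of that mapping (the enum is redeclared locally just to keep the snippet self-contained):

```dart
// Redeclared locally so the snippet runs on its own.
enum OpenAIAudioTimestampGranularity { word, segment }

void main() {
  // The library's doc comment advises passing a single granularity;
  // both are joined here purely to show the comma-join serialization.
  final granularities = [
    OpenAIAudioTimestampGranularity.word,
    OpenAIAudioTimestampGranularity.segment,
  ];

  final fields = <String, String>{
    'timestamp_granularities[]': granularities.map((e) => e.name).join(','),
  };

  print(fields); // {timestamp_granularities[]: word,segment}
}
```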
2 changes: 1 addition & 1 deletion pubspec.yaml
@@ -1,6 +1,6 @@
name: dart_openai
description: Dart SDK for the OpenAI APIs (GPT-3 & DALL-E); easily integrate the power of OpenAI's state-of-the-art AI models into your Dart applications.
- version: 5.0.0
version: 5.0.1
homepage: https://github.com/anasfik/openai
repository: https://github.com/anasfik/openai
documentation: https://github.com/anasfik/openai/blob/main/README.md
12 changes: 12 additions & 0 deletions test/openai_test.dart
@@ -388,9 +388,21 @@ void main() async {
model: "whisper-1",
responseFormat: OpenAIAudioResponseFormat.json,
);
expect(transcription, isA<OpenAIAudioModel>());
expect(transcription.text, isA<String>());
});

test("create transcription with timestamp granularity", () async {
final transcription = await OpenAI.instance.audio.createTranscription(
file: audioExampleFile,
model: "whisper-1",
responseFormat: OpenAIAudioResponseFormat.verbose_json,
timestamp_granularities: [OpenAIAudioTimestampGranularity.word],
);

expect(transcription, isA<OpenAIAudioModel>());
expect(transcription.text, isA<String>());
expect(transcription.words, isA<List>());
});
test("create translation", () async {
final audioExampleFile = await getFileFromUrl(
