forked from openai/openai-go
-
Notifications
You must be signed in to change notification settings - Fork 0
/
audiotranscription.go
137 lines (122 loc) · 5.52 KB
/
audiotranscription.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
// File generated from our OpenAPI spec by Stainless. See CONTRIBUTING.md for details.
package openai
import (
"bytes"
"context"
"io"
"mime/multipart"
"net/http"
"github.com/joschahenningsen/openai-go/internal/apiform"
"github.com/joschahenningsen/openai-go/internal/apijson"
"github.com/joschahenningsen/openai-go/internal/param"
"github.com/joschahenningsen/openai-go/internal/requestconfig"
"github.com/joschahenningsen/openai-go/option"
)
// AudioTranscriptionService contains methods and other services that help with
// interacting with the openai API.
//
// Note, unlike clients, this service does not read variables from the environment
// automatically. You should not instantiate this service directly; use the
// [NewAudioTranscriptionService] function instead.
type AudioTranscriptionService struct {
// Options holds the request options stored by [NewAudioTranscriptionService];
// New places them ahead of the per-call options it receives.
Options []option.RequestOption
}
// NewAudioTranscriptionService builds an [AudioTranscriptionService] that keeps
// the given options and applies them to each request. These options are applied
// after the parent client's options (if there is one), and before any
// request-specific options.
func NewAudioTranscriptionService(opts ...option.RequestOption) (r *AudioTranscriptionService) {
	return &AudioTranscriptionService{Options: opts}
}
// New transcribes audio into the input language.
//
// It sends body in a POST request to the "audio/transcriptions" path through
// requestconfig.ExecuteNewRequest, passing the service-level options followed
// by the per-call opts, and hands &res to that call to receive the result. The
// error from that call is returned unchanged.
func (r *AudioTranscriptionService) New(ctx context.Context, body AudioTranscriptionNewParams, opts ...option.RequestOption) (res *Transcription, err error) {
	// Cap the capacity at the length so append always copies into a fresh
	// backing array. A plain append(r.Options[:], ...) may write the per-call
	// options into spare capacity shared by every call on this service,
	// letting concurrent or later requests overwrite each other's options.
	opts = append(r.Options[:len(r.Options):len(r.Options)], opts...)
	path := "audio/transcriptions"
	err = requestconfig.ExecuteNewRequest(ctx, http.MethodPost, path, body, &res, opts...)
	return res, err
}
// Transcription represents a transcription response returned by the model,
// based on the provided input.
type Transcription struct {
// The transcribed text.
Text string `json:"text,required"`
// JSON holds the JSON metadata for this struct; see transcriptionJSON.
JSON transcriptionJSON `json:"-"`
}
// transcriptionJSON contains the JSON metadata for the struct [Transcription]
type transcriptionJSON struct {
// NOTE(review): presumably the metadata for the `text` property that backs
// Transcription.Text — confirm against the apijson decoder.
Text apijson.Field
// raw is the string handed back by RawJSON.
raw string
// NOTE(review): presumably metadata for response properties that have no
// dedicated field here, keyed by property name — confirm against apijson.
ExtraFields map[string]apijson.Field
}
// UnmarshalJSON decodes data into r by delegating to apijson.UnmarshalRoot and
// returns whatever error that call reports.
func (r *Transcription) UnmarshalJSON(data []byte) (err error) {
	err = apijson.UnmarshalRoot(data, r)
	return err
}
// RawJSON returns the value stored in the unexported raw field.
func (r transcriptionJSON) RawJSON() string {
return r.raw
}
// AudioTranscriptionNewParams is the request body accepted by
// [AudioTranscriptionService.New]. Its MarshalMultipart method encodes it as a
// multipart form.
type AudioTranscriptionNewParams struct {
// The audio file object (not file name) to transcribe, in one of these formats:
// flac, mp3, mp4, mpeg, mpga, m4a, ogg, wav, or webm.
File param.Field[io.Reader] `json:"file,required" format:"binary"`
// ID of the model to use. Only `whisper-1` (which is powered by our open source
// Whisper V2 model) is currently available.
Model param.Field[AudioModel] `json:"model,required"`
// The language of the input audio. Supplying the input language in
// [ISO-639-1](https://en.wikipedia.org/wiki/List_of_ISO_639-1_codes) format will
// improve accuracy and latency.
Language param.Field[string] `json:"language"`
// An optional text to guide the model's style or continue a previous audio
// segment. The
// [prompt](https://platform.openai.com/docs/guides/speech-to-text/prompting)
// should match the audio language.
Prompt param.Field[string] `json:"prompt"`
// If True, the previous output of the model is provided
// as a prompt for the next window; disabling may make the text inconsistent across
// windows, but the model becomes less prone to getting stuck in a failure loop,
// such as repetition looping or timestamps going out of sync.
ConditionOnPreviousText param.Field[bool] `json:"condition_on_previous_text"`
// Adds support for faster-whisper's voice activity detection. Enabled if True.
VadFilter param.Field[bool] `json:"vad_filter"`
// The format of the output, in one of these options: `json`, `text`, `srt`,
// `verbose_json`, or `vtt`.
ResponseFormat param.Field[AudioResponseFormat] `json:"response_format"`
// The sampling temperature, between 0 and 1. Higher values like 0.8 will make the
// output more random, while lower values like 0.2 will make it more focused and
// deterministic. If set to 0, the model will use
// [log probability](https://en.wikipedia.org/wiki/Log_probability) to
// automatically increase the temperature until certain thresholds are hit.
Temperature param.Field[float64] `json:"temperature"`
// The timestamp granularities to populate for this transcription.
// `response_format` must be set `verbose_json` to use timestamp granularities.
// Either or both of these options are supported: `word`, or `segment`. Note: There
// is no additional latency for segment timestamps, but generating word timestamps
// incurs additional latency.
TimestampGranularities param.Field[[]AudioTranscriptionNewParamsTimestampGranularity] `json:"timestamp_granularities"`
}
// MarshalMultipart encodes r as a multipart form using apiform.MarshalRoot.
//
// On success it returns the encoded bytes together with the form-data content
// type reported by the multipart writer. If encoding the fields or closing the
// writer fails, it returns nil data, an empty content type, and that error.
func (r AudioTranscriptionNewParams) MarshalMultipart() (data []byte, contentType string, err error) {
	var body bytes.Buffer
	form := multipart.NewWriter(&body)

	if err = apiform.MarshalRoot(r, form); err != nil {
		// The encoding error is the one reported; any error from closing the
		// partially written form is intentionally dropped.
		form.Close()
		return nil, "", err
	}
	if err = form.Close(); err != nil {
		return nil, "", err
	}
	return body.Bytes(), form.FormDataContentType(), nil
}
// AudioTranscriptionNewParamsTimestampGranularity is the string type used for
// the entries of AudioTranscriptionNewParams.TimestampGranularities.
type AudioTranscriptionNewParamsTimestampGranularity string

// Granularity values recognized by IsKnown.
const (
	AudioTranscriptionNewParamsTimestampGranularityWord    AudioTranscriptionNewParamsTimestampGranularity = "word"
	AudioTranscriptionNewParamsTimestampGranularitySegment AudioTranscriptionNewParamsTimestampGranularity = "segment"
)

// IsKnown reports whether r equals one of the granularity constants declared
// above; every other value, including the empty string, yields false.
func (r AudioTranscriptionNewParamsTimestampGranularity) IsKnown() bool {
	return r == AudioTranscriptionNewParamsTimestampGranularityWord ||
		r == AudioTranscriptionNewParamsTimestampGranularitySegment
}