Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
db6b7e3
Initial speech to text abstractions
rogerbarreto Mar 18, 2025
4bdb7b9
Address some feedback (still more things to address)
stephentoub Mar 18, 2025
14c37c2
Merge branch 'main' of https://github.com/rogerbarreto/extensions int…
rogerbarreto Mar 29, 2025
fad8017
Resolve conflict
rogerbarreto Mar 29, 2025
3448daa
Ensure UT are working before further changes
rogerbarreto Mar 29, 2025
ef93211
Update method names Transcribe / Response to GetText
rogerbarreto Mar 29, 2025
43d610c
Update Test Names to new Method names
rogerbarreto Mar 29, 2025
ff4ae4a
Change interface from IList<IAsyncEnumerable> to one stream item at a…
rogerbarreto Mar 29, 2025
0831000
Update XmlDocs with correct definition, ensure correct naming
rogerbarreto Mar 29, 2025
8c893a9
Dropping the Choice / Message concept, flattening the Message with th…
rogerbarreto Mar 29, 2025
3d91982
Remove CultureInfo complexity from language properties
rogerbarreto Mar 30, 2025
009eeca
Adding Prompt property to options + UT
rogerbarreto Mar 30, 2025
305e7e4
Revert global.json changes
rogerbarreto Mar 30, 2025
1feac6d
Add missing experimental
rogerbarreto Mar 30, 2025
956097d
Fix UT
rogerbarreto Mar 30, 2025
0830a51
Address PR comments
rogerbarreto Mar 31, 2025
72407f2
Fix unit tests
rogerbarreto Mar 31, 2025
3c7e4ae
Fix UT
rogerbarreto Apr 1, 2025
8763c8c
Merge branch 'main' into audio-transcription-abstraction
rogerbarreto Apr 1, 2025
8d473cb
Merge branch 'audio-transcription-abstraction' of https://github.com/…
rogerbarreto Apr 1, 2025
c6c016e
Address PR comments
rogerbarreto Apr 1, 2025
b3d7819
Merge branch 'main' into audio-transcription-abstraction
rogerbarreto Apr 2, 2025
ca1338b
Remove async wrapping
rogerbarreto Apr 2, 2025
d3a14c9
Adjusting concat / text fields
rogerbarreto Apr 2, 2025
263f0e0
Start time and end time added to update + UT covering
rogerbarreto Apr 2, 2025
dd5ec14
AsISpeechToText renaming
rogerbarreto Apr 2, 2025
9eabb98
Remove OpenAIClient ctor + small fixes
rogerbarreto Apr 2, 2025
78e4ebb
Removing rawrepresentation impl from Update -> Response
rogerbarreto Apr 2, 2025
46acd1c
Merge branch 'main' into audio-transcription-abstraction
rogerbarreto Apr 2, 2025
8bf3389
Add missing AsISpeechToText UT
rogerbarreto Apr 2, 2025
c5c6e89
Add GetService UT
rogerbarreto Apr 2, 2025
977a0e5
Warning fix
rogerbarreto Apr 2, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Binary file modified eng/spellchecking_exclusions.dic
Binary file not shown.
Original file line number Diff line number Diff line change
Expand Up @@ -180,6 +180,55 @@ static async Task<ChatResponse> ToChatResponseAsync(
}
}

/// <summary>Merges every run of adjacent <see cref="TextContent"/> items into a single <see cref="TextContent"/>.</summary>
/// <param name="contents">The list to coalesce; it is modified in place.</param>
internal static void CoalesceTextContent(List<AIContent> contents)
{
    // Created lazily on the first merge and then reused (after clearing) for every later run.
    StringBuilder? buffer = null;

    int index = 0;
    while (index < contents.Count - 1)
    {
        if (contents[index] is not TextContent head)
        {
            // Not a text item, so it cannot begin a run.
            index++;
        }
        else if (contents[index + 1] is not TextContent)
        {
            // A lone text item followed by a non-text item: neither can begin a run, skip both.
            index += 2;
        }
        else
        {
            // At least two text items in a row. Seed the buffer with the head's text, then
            // consume every following text item, leaving null in each consumed slot so that
            // all of them can be removed in a single pass at the end.
            buffer ??= new StringBuilder();
            _ = buffer.Clear().Append(head.Text);

            int cursor = index + 1;
            while (cursor < contents.Count && contents[cursor] is TextContent follower)
            {
                _ = buffer.Append(follower.Text);
                contents[cursor] = null!;
                cursor++;
            }

            // The merged item takes a clone of the head's additional properties only;
            // properties of the consumed items are not carried over.
            contents[index] = new TextContent(buffer.ToString())
            {
                AdditionalProperties = head.AdditionalProperties?.Clone(),
            };

            index = cursor;
        }
    }

    // Drop the null placeholders left behind by merged items.
    _ = contents.RemoveAll(item => item is null);
}

/// <summary>Finalizes the <paramref name="response"/> object.</summary>
private static void FinalizeResponse(ChatResponse response)
{
Expand Down Expand Up @@ -296,53 +345,4 @@ private static void ProcessUpdate(ChatResponseUpdate update, ChatResponse respon
}
}
}

/// <summary>Merges every run of adjacent <see cref="TextContent"/> items into a single <see cref="TextContent"/>.</summary>
/// <param name="contents">The list to coalesce; it is modified in place.</param>
private static void CoalesceTextContent(List<AIContent> contents)
{
    // Created lazily on the first merge and then reused (after clearing) for every later run.
    StringBuilder? buffer = null;

    int index = 0;
    while (index < contents.Count - 1)
    {
        if (contents[index] is not TextContent head)
        {
            // Not a text item, so it cannot begin a run.
            index++;
        }
        else if (contents[index + 1] is not TextContent)
        {
            // A lone text item followed by a non-text item: neither can begin a run, skip both.
            index += 2;
        }
        else
        {
            // At least two text items in a row. Seed the buffer with the head's text, then
            // consume every following text item, leaving null in each consumed slot so that
            // all of them can be removed in a single pass at the end.
            buffer ??= new StringBuilder();
            _ = buffer.Clear().Append(head.Text);

            int cursor = index + 1;
            while (cursor < contents.Count && contents[cursor] is TextContent follower)
            {
                _ = buffer.Append(follower.Text);
                contents[cursor] = null!;
                cursor++;
            }

            // The merged item takes a clone of the head's additional properties only;
            // properties of the consumed items are not carried over.
            contents[index] = new TextContent(buffer.ToString())
            {
                AdditionalProperties = head.AdditionalProperties?.Clone(),
            };

            index = cursor;
        }
    }

    // Drop the null placeholders left behind by merged items.
    _ = contents.RemoveAll(item => item is null);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ namespace Microsoft.Extensions.AI;
/// <summary>Provides a base class for all content used with AI services.</summary>
[JsonPolymorphic(TypeDiscriminatorPropertyName = "$type")]
[JsonDerivedType(typeof(DataContent), typeDiscriminator: "data")]
[JsonDerivedType(typeof(ErrorContent), typeDiscriminator: "error")]
[JsonDerivedType(typeof(FunctionCallContent), typeDiscriminator: "functionCall")]
[JsonDerivedType(typeof(FunctionResultContent), typeDiscriminator: "functionResult")]
[JsonDerivedType(typeof(TextContent), typeDiscriminator: "text")]
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System.Diagnostics;
using System.Text.Json.Serialization;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI;

/// <summary>Content that describes an error.</summary>
/// <remarks>
/// <see cref="ErrorContent"/> is typically used to report non-fatal errors: something went wrong
/// during the operation, yet the operation was still able to continue.
/// </remarks>
[DebuggerDisplay("{DebuggerDisplay,nq}")]
public class ErrorContent : AIContent
{
    /// <summary>Backing field for <see cref="Message"/>.</summary>
    private string _message;

    /// <summary>Initializes a new instance of the <see cref="ErrorContent"/> class with the specified message.</summary>
    /// <param name="message">The error message to store in this content.</param>
    [JsonConstructor]
    public ErrorContent(string message) => _message = Throw.IfNull(message);

    /// <summary>Gets or sets the error message.</summary>
    public string Message
    {
        get
        {
            return _message;
        }

        set
        {
            // The value is passed through the null check before it is stored.
            _message = Throw.IfNull(value);
        }
    }

    /// <summary>Gets or sets the error code.</summary>
    public string? ErrorCode { get; set; }

    /// <summary>Gets or sets the error details.</summary>
    public string? Details { get; set; }

    /// <summary>Gets the text shown for this instance in the debugger.</summary>
    [DebuggerBrowsable(DebuggerBrowsableState.Never)]
    private string DebuggerDisplay
    {
        get
        {
            // Always starts with the message; the code and the details are appended only when set.
            string display = $"Error = {Message}";

            if (ErrorCode is not null)
            {
                display += $" ({ErrorCode})";
            }

            if (Details is not null)
            {
                display += $" - {Details}";
            }

            return display;
        }
    }
}
Original file line number Diff line number Diff line change
Expand Up @@ -16,16 +16,18 @@
<PropertyGroup>
<TargetFrameworks>$(TargetFrameworks);netstandard2.0</TargetFrameworks>
<NoWarn>$(NoWarn);CA2227;CA1034;SA1316;S3253</NoWarn>
<NoWarn>$(NoWarn);MEAI001</NoWarn>
<TreatWarningsAsErrors>true</TreatWarningsAsErrors>
<DisableNETStandardCompatErrors>true</DisableNETStandardCompatErrors>
</PropertyGroup>

<PropertyGroup>
<InjectExperimentalAttributeOnLegacy>true</InjectExperimentalAttributeOnLegacy>
<InjectJsonSchemaExporterOnLegacy>true</InjectJsonSchemaExporterOnLegacy>
<InjectRequiredMemberOnLegacy>true</InjectRequiredMemberOnLegacy>
<InjectSharedEmptyCollections>true</InjectSharedEmptyCollections>
<InjectStringHashOnLegacy>true</InjectStringHashOnLegacy>
<InjectStringSyntaxAttributeOnLegacy>true</InjectStringSyntaxAttributeOnLegacy>
<InjectRequiredMemberOnLegacy>true</InjectRequiredMemberOnLegacy>
</PropertyGroup>

<ItemGroup Condition="'$(TargetFrameworkIdentifier)' != '.NETCoreApp'">
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Threading;
using System.Threading.Tasks;
using Microsoft.Shared.Diagnostics;

namespace Microsoft.Extensions.AI;

/// <summary>
/// An optional base class for <see cref="ISpeechToTextClient"/> implementations that forward their calls to another instance.
/// </summary>
/// <remarks>
/// Recommended as the base type for clients meant to be chained, in any order, around an underlying <see cref="ISpeechToTextClient"/>.
/// Unless overridden, every member simply forwards to the inner client instance.
/// </remarks>
[Experimental("MEAI001")]
public class DelegatingSpeechToTextClient : ISpeechToTextClient
{
    /// <summary>
    /// Initializes a new instance of the <see cref="DelegatingSpeechToTextClient"/> class.
    /// </summary>
    /// <param name="innerClient">The wrapped client instance.</param>
    protected DelegatingSpeechToTextClient(ISpeechToTextClient innerClient)
        => InnerClient = Throw.IfNull(innerClient);

    /// <summary>Gets the inner <see cref="ISpeechToTextClient" />.</summary>
    protected ISpeechToTextClient InnerClient { get; }

    /// <inheritdoc />
    public void Dispose()
    {
        Dispose(disposing: true);
        GC.SuppressFinalize(this);
    }

    /// <inheritdoc />
    public virtual Task<SpeechToTextResponse> GetTextAsync(
        Stream audioSpeechStream, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default)
        => InnerClient.GetTextAsync(audioSpeechStream, options, cancellationToken);

    /// <inheritdoc />
    public virtual IAsyncEnumerable<SpeechToTextResponseUpdate> GetStreamingTextAsync(
        Stream audioSpeechStream, SpeechToTextOptions? options = null, CancellationToken cancellationToken = default)
        => InnerClient.GetStreamingTextAsync(audioSpeechStream, options, cancellationToken);

    /// <inheritdoc />
    public virtual object? GetService(Type serviceType, object? serviceKey = null)
    {
        _ = Throw.IfNull(serviceType);

        // Only a key-less request can be answered by this instance; the meaning of a non-null key
        // is unknown here, so keyed requests always go to the inner client.
        if (serviceKey is null && serviceType.IsInstanceOfType(this))
        {
            return this;
        }

        return InnerClient.GetService(serviceType, serviceKey);
    }

    /// <summary>Provides a mechanism for releasing unmanaged resources.</summary>
    /// <param name="disposing"><see langword="true"/> if being called from <see cref="Dispose()"/>; otherwise, <see langword="false"/>.</param>
    protected virtual void Dispose(bool disposing)
    {
        if (!disposing)
        {
            return;
        }

        // The inner client is disposed together with this wrapper.
        InnerClient.Dispose();
    }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.

using System;
using System.Collections.Generic;
using System.Diagnostics.CodeAnalysis;
using System.IO;
using System.Threading;
using System.Threading.Tasks;

namespace Microsoft.Extensions.AI;

/// <summary>Represents a speech to text client.</summary>
/// <remarks>
/// <para>
/// Unless otherwise specified, all members of <see cref="ISpeechToTextClient"/> are thread-safe for concurrent use.
/// It is expected that all implementations of <see cref="ISpeechToTextClient"/> support being used by multiple requests concurrently.
/// </para>
/// <para>
/// However, implementations of <see cref="ISpeechToTextClient"/> might mutate the arguments supplied to <see cref="GetTextAsync"/> and
/// <see cref="GetStreamingTextAsync"/>, such as by configuring the options instance. Thus, consumers of the interface either should avoid
/// using shared instances of these arguments for concurrent invocations or should otherwise ensure by construction that no
/// <see cref="ISpeechToTextClient"/> instances are used which might employ such mutation. For example, the ConfigureOptions method is
/// provided with a callback that could mutate the supplied options argument, and that should be avoided if using a singleton options instance.
/// </para>
/// <para>
/// The audio speech stream passed to these methods will not be closed or disposed by the implementation.
/// </para>
/// </remarks>
[Experimental("MEAI001")]
public interface ISpeechToTextClient : IDisposable
{
/// <summary>Sends audio speech content to the model and returns the generated text.</summary>
/// <param name="audioSpeechStream">The audio speech stream to send.</param>
/// <param name="options">The speech to text options to configure the request.</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The text generated.</returns>
Task<SpeechToTextResponse> GetTextAsync(
Stream audioSpeechStream,
SpeechToTextOptions? options = null,
CancellationToken cancellationToken = default);

/// <summary>Sends audio speech content to the model and streams back the generated text.</summary>
/// <param name="audioSpeechStream">The audio speech stream to send.</param>
/// <param name="options">The speech to text options to configure the request.</param>
/// <param name="cancellationToken">The <see cref="CancellationToken"/> to monitor for cancellation requests. The default is <see cref="CancellationToken.None"/>.</param>
/// <returns>The text updates representing the streamed output.</returns>
IAsyncEnumerable<SpeechToTextResponseUpdate> GetStreamingTextAsync(
Stream audioSpeechStream,
SpeechToTextOptions? options = null,
CancellationToken cancellationToken = default);

/// <summary>Asks the <see cref="ISpeechToTextClient"/> for an object of the specified type <paramref name="serviceType"/>.</summary>
/// <param name="serviceType">The type of object being requested.</param>
/// <param name="serviceKey">An optional key that can be used to help identify the target service.</param>
/// <returns>The found object, otherwise <see langword="null"/>.</returns>
/// <exception cref="ArgumentNullException"><paramref name="serviceType"/> is <see langword="null"/>.</exception>
/// <remarks>
/// The purpose of this method is to allow for the retrieval of strongly typed services that might be provided by the <see cref="ISpeechToTextClient"/>,
/// including itself or any services it might be wrapping.
/// </remarks>
object? GetService(Type serviceType, object? serviceKey = null);
}
Loading
Loading