-
Notifications
You must be signed in to change notification settings - Fork 3.5k
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
### Motivation and Context <!-- Thank you for your contribution to the semantic-kernel repo! Please help reviewers and future users, providing the following information: 1. Why is this change required? 2. What problem does it solve? 3. What scenario does it contribute to? 4. If it fixes an open issue, please link to the issue here. --> Addresses: #7434 ### Description <!-- Describe your changes, the overall approach, the underlying design. These notes will help understanding how your code works. Thanks! --> Add audio to text to SK Python. ### Contribution Checklist 1. Add audio to text to SK Python. 2. Add text to image integration tests. 3. Refactor Azure AI connector AD auth. <!-- Before submitting this PR, please make sure: --> - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone 😄
- Loading branch information
1 parent
1cb0922
commit e9a755d
Showing
45 changed files
with
1,171 additions
and
122 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -33,6 +33,7 @@ | |
"datamodel", | ||
"dotenv", | ||
"endregion", | ||
"entra", | ||
"genai", | ||
"generativeai", | ||
"hnsw", | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
# Copyright (c) Microsoft. All rights reserved. | ||
|
||
import os | ||
import wave | ||
from typing import ClassVar | ||
|
||
import keyboard | ||
import pyaudio | ||
|
||
from semantic_kernel.kernel_pydantic import KernelBaseModel | ||
|
||
|
||
class AudioRecorder(KernelBaseModel):
    """Record audio from the microphone and save it to a WAV file.

    Recording is push-to-talk: `start_recording` blocks until the spacebar is
    pressed, captures audio for as long as it is held down, and writes the
    captured frames to `output_filepath` once it is released.

    When used as a context manager, the output file is removed on exit:

    ```
    with AudioRecorder(output_filepath="output.wav") as recorder:
        recorder.start_recording()
        # Do something with the recorded audio
        ...
    ```
    """

    # Audio recording parameters
    FORMAT: ClassVar[int] = pyaudio.paInt16
    CHANNELS: ClassVar[int] = 1
    RATE: ClassVar[int] = 44100
    CHUNK: ClassVar[int] = 1024

    # Path of the WAV file written by `start_recording`.
    output_filepath: str

    def start_recording(self) -> None:
        """Block until the spacebar is pressed, then record while it is held.

        The captured audio is written to `output_filepath` as a WAV file after
        the spacebar is released. The audio stream and the PyAudio instance are
        released even if reading from the device or writing the file fails.
        """
        # Wait for the spacebar to be pressed to start recording
        keyboard.wait("space")

        audio = pyaudio.PyAudio()
        try:
            stream = audio.open(
                format=self.FORMAT,
                channels=self.CHANNELS,
                rate=self.RATE,
                input=True,
                frames_per_buffer=self.CHUNK,
            )
            frames = []
            try:
                while keyboard.is_pressed("space"):
                    frames.append(stream.read(self.CHUNK))
            finally:
                # Runs both when the spacebar is released and when reading
                # fails, so the input device is never left open.
                stream.stop_stream()
                stream.close()

            # Save the recorded data as a WAV file
            with wave.open(self.output_filepath, "wb") as wf:
                wf.setnchannels(self.CHANNELS)
                wf.setsampwidth(audio.get_sample_size(self.FORMAT))
                wf.setframerate(self.RATE)
                wf.writeframes(b"".join(frames))
        finally:
            audio.terminate()

    def remove_output_file(self) -> None:
        """Delete the recorded WAV file.

        Raises:
            FileNotFoundError: If no file exists at `output_filepath`.
        """
        os.remove(self.output_filepath)

    def __enter__(self) -> "AudioRecorder":
        """Enter the context; the recorder itself is the context value."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Remove the output file on exit, if one was written.

        Exceptions raised inside the `with` body are not suppressed.
        """
        try:
            self.remove_output_file()
        except FileNotFoundError:
            # Nothing was recorded (for example the user interrupted before
            # the file was written). Raising here would replace the exception
            # that is already propagating out of the `with` body.
            pass
98 changes: 98 additions & 0 deletions
98
python/samples/concepts/audio_to_text/chat_with_audio_input.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,98 @@ | ||
# Copyright (c) Microsoft. All rights reserved. | ||
|
||
import asyncio | ||
import logging | ||
import os | ||
|
||
from samples.concepts.audio_to_text.audio_recorder import AudioRecorder | ||
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion | ||
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import ( | ||
OpenAIChatPromptExecutionSettings, | ||
) | ||
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText | ||
from semantic_kernel.contents import ChatHistory | ||
from semantic_kernel.contents.audio_content import AudioContent | ||
|
||
# This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services | ||
# to create a chat bot that can communicate with the user using audio input. | ||
# The user can engage in a long conversation with the chat bot by speaking to it. | ||
|
||
# Additional dependencies required for this sample: | ||
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated. | ||
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated. | ||
|
||
|
||
logging.basicConfig(level=logging.WARNING)

# The recording is written next to this script.
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")

# Persona prompt for the assistant.
system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""

# Both services are constructed without arguments.
# NOTE(review): presumably they pick up their Azure OpenAI configuration from
# the environment / .env file - confirm against the connector documentation.
chat_service = AzureChatCompletion()
audio_to_text_service = AzureAudioToText()

# Seed the conversation. The persona prompt must be part of the history,
# otherwise it is never sent to the chat service.
history = ChatHistory()
history.add_system_message(system_message)
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")
|
||
|
||
async def chat() -> bool:
    """Run a single turn of the conversation.

    Records the user's speech, transcribes it, streams the assistant's reply
    to stdout and appends both messages to the shared chat history.

    Returns:
        bool: True to keep chatting; False when the transcription contains
        "exit" or the turn was interrupted (KeyboardInterrupt / EOFError).
    """
    try:
        print("User:> ", end="", flush=True)
        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
            recorder.start_recording()
            # Transcribe inside the `with` block: the recording is deleted
            # as soon as the context exits.
            user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
            print(user_input.text)
    except (KeyboardInterrupt, EOFError):
        print("\n\nExiting chat...")
        return False

    if "exit" in user_input.text.lower():
        print("\n\nExiting chat...")
        return False

    history.add_user_message(user_input.text)

    execution_settings = OpenAIChatPromptExecutionSettings(
        max_tokens=2000,
        temperature=0.7,
        top_p=0.8,
    )
    chunks = chat_service.get_streaming_chat_message_content(
        chat_history=history,
        settings=execution_settings,
    )

    print("Mosscap:> ", end="")
    # Echo each streamed chunk as it arrives and keep it for the history.
    pieces = []
    async for message in chunks:
        piece = str(message)
        print(piece, end="")
        pieces.append(piece)
    print("\n")

    history.add_assistant_message("".join(pieces))

    return True
|
||
|
||
async def main() -> None:
    """Print the usage instruction, then run chat turns until one returns False."""
    print(
        "Instruction: when it's your turn to speak, press the spacebar to start recording."
        " Release the spacebar to stop recording."
    )
    # Each call to chat() is one full turn; a falsy result ends the session.
    while await chat():
        pass


if __name__ == "__main__":
    asyncio.run(main())
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
python/semantic_kernel/connectors/ai/audio_to_text_client_base.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
# Copyright (c) Microsoft. All rights reserved. | ||
|
||
|
||
from abc import ABC, abstractmethod | ||
from typing import Any | ||
|
||
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings | ||
from semantic_kernel.contents.audio_content import AudioContent | ||
from semantic_kernel.contents.text_content import TextContent | ||
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase | ||
|
||
|
||
class AudioToTextClientBase(AIServiceClientBase, ABC):
    """Base class for audio to text client."""

    @abstractmethod
    async def get_text_contents(
        self,
        audio_content: AudioContent,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> list[TextContent]:
        """Get text contents from audio.

        Subclasses must override this method.

        Args:
            audio_content: Audio content.
            settings: Prompt execution settings.
            kwargs: Additional arguments.

        Returns:
            list[TextContent]: Text contents.
        """
        raise NotImplementedError

    async def get_text_content(
        self,
        audio_content: AudioContent,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> TextContent:
        """Get text content from audio.

        Delegates to `get_text_contents` and returns its first element.

        Args:
            audio_content: Audio content.
            settings: Prompt execution settings.
            kwargs: Additional arguments.

        Returns:
            TextContent: Text content.

        Raises:
            IndexError: If `get_text_contents` returns an empty list.
        """
        contents = await self.get_text_contents(audio_content, settings, **kwargs)
        return contents[0]
30 changes: 30 additions & 0 deletions
30
...nnectors/ai/open_ai/prompt_execution_settings/open_ai_audio_to_text_execution_settings.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Copyright (c) Microsoft. All rights reserved. | ||
|
||
import logging | ||
from typing import Any | ||
|
||
from pydantic import Field | ||
|
||
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
class OpenAIAudioToTextExecutionSettings(PromptExecutionSettings):
    """Request settings for OpenAI audio to text services."""

    # Serialized under the key "model".
    ai_model_id: str | None = Field(default=None, serialization_alias="model")
    # Kept on the settings object only; stripped from the prepared dict below.
    filename: str | None = None
    language: str | None = None
    prompt: str | None = None
    response_format: str | None = None
    temperature: float | None = None

    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
        """Prepare the settings dictionary for the OpenAI API.

        Returns:
            dict[str, Any]: The base class' settings dictionary without the
            "filename" entry.
        """
        prepared = super().prepare_settings_dict(**kwargs)

        # The file name is not sent as a setting: the file is opened and
        # passed as a file object instead.
        if "filename" in prepared:
            del prepared["filename"]

        return prepared
71 changes: 71 additions & 0 deletions
71
...nnectors/ai/open_ai/prompt_execution_settings/open_ai_text_to_image_execution_settings.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,71 @@ | ||
# Copyright (c) Microsoft. All rights reserved. | ||
|
||
import logging | ||
from typing import Any | ||
|
||
from pydantic import Field, model_validator | ||
|
||
from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings | ||
from semantic_kernel.exceptions.service_exceptions import ServiceInvalidExecutionSettingsError | ||
from semantic_kernel.kernel_pydantic import KernelBaseModel | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
# (width, height) pairs accepted by the image size validation.
VALID_IMAGE_SIZES = [(256, 256), (512, 512), (1024, 1024), (1792, 1024), (1024, 1792)]
|
||
|
||
class ImageSize(KernelBaseModel):
    """Image size."""

    width: int
    height: int

    def __str__(self) -> str:
        """Return the size formatted as "<width>x<height>"."""
        return "{}x{}".format(self.width, self.height)
|
||
|
||
class OpenAITextToImageExecutionSettings(PromptExecutionSettings):
    """Request settings for OpenAI text to image services."""

    prompt: str | None = None
    # Serialized under the key "model".
    ai_model_id: str | None = Field(default=None, serialization_alias="model")
    size: ImageSize | None = None
    quality: str | None = None
    style: str | None = None

    @model_validator(mode="after")
    def check_size(self) -> "OpenAITextToImageExecutionSettings":
        """Check that the requested image size is valid.

        The size is taken from the `size` field, falling back to the "size"
        entry of `extension_data`. A missing size is accepted.

        Raises:
            ServiceInvalidExecutionSettingsError: If the size is not one of
                VALID_IMAGE_SIZES.
        """
        requested = self.size if self.size else self.extension_data.get("size")
        if requested is None:
            return self

        if (requested.width, requested.height) in VALID_IMAGE_SIZES:
            return self

        raise ServiceInvalidExecutionSettingsError(f"Invalid image size: {requested.width}x{requested.height}.")

    @model_validator(mode="after")
    def check_prompt(self) -> "OpenAITextToImageExecutionSettings":
        """Check that the prompt is not empty.

        The prompt is taken from the `prompt` field, falling back to the
        "prompt" entry of `extension_data`.

        Raises:
            ServiceInvalidExecutionSettingsError: If no non-empty prompt is found.
        """
        if self.prompt or self.extension_data.get("prompt"):
            return self

        raise ServiceInvalidExecutionSettingsError("The prompt is required.")

    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
        """Prepare the settings dictionary for the OpenAI API.

        Returns:
            dict[str, Any]: The base class' settings dictionary, with "size"
            replaced by its "<width>x<height>" string form when a size is set.
        """
        prepared = super().prepare_settings_dict(**kwargs)

        if self.size is None:
            return prepared

        prepared["size"] = str(self.size)
        return prepared
Oops, something went wrong.