Skip to content

Commit

Permalink
Python: Audio to text (#9505)
Browse files Browse the repository at this point in the history
### Motivation and Context

<!-- Thank you for your contribution to the semantic-kernel repo!
Please help reviewers and future users, providing the following
information:
  1. Why is this change required?
  2. What problem does it solve?
  3. What scenario does it contribute to?
  4. If it fixes an open issue, please link to the issue here.
-->
Addresses: #7434


### Description

<!-- Describe your changes, the overall approach, the underlying design.
These notes will help understanding how your code works. Thanks! -->
Add audio to text to SK Python.

### Contribution Checklist
1. Add audio to text to SK Python.
2. Add text to image integration tests.
3. Refactor Azure AI connector AD auth.

<!-- Before submitting this PR, please make sure: -->

- [x] The code builds clean without any errors or warnings
- [x] The PR follows the [SK Contribution
Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md)
and the [pre-submission formatting
script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts)
raises no violations
- [x] All unit tests pass, and I have added new tests where possible
- [x] I didn't break anyone 😄
  • Loading branch information
TaoChenOSU authored Nov 8, 2024
1 parent 1cb0922 commit e9a755d
Show file tree
Hide file tree
Showing 45 changed files with 1,171 additions and 122 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/python-integration-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,7 @@ jobs:
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
BING_API_KEY: ${{ secrets.BING_API_KEY }}
Expand Down Expand Up @@ -218,6 +219,7 @@ jobs:
AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
BING_API_KEY: ${{ secrets.BING_API_KEY }}
Expand Down
1 change: 1 addition & 0 deletions python/.cspell.json
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
"datamodel",
"dotenv",
"endregion",
"entra",
"genai",
"generativeai",
"hnsw",
Expand Down
74 changes: 74 additions & 0 deletions python/samples/concepts/audio_to_text/audio_recorder.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
# Copyright (c) Microsoft. All rights reserved.

import os
import wave
from typing import ClassVar

import keyboard
import pyaudio

from semantic_kernel.kernel_pydantic import KernelBaseModel


class AudioRecorder(KernelBaseModel):
    """Record audio from the microphone and save it to a WAV file.

    Recording starts when the spacebar is pressed and stops when it is released.

    The recorder can be used as a context manager, which removes the output
    file when the context exits:

    ```
    with AudioRecorder(output_filepath="output.wav") as recorder:
        recorder.start_recording()
        # Do something with the recorded audio
        ...
    ```
    """

    # Audio recording parameters
    FORMAT: ClassVar[int] = pyaudio.paInt16  # sample format passed to PyAudio.open
    CHANNELS: ClassVar[int] = 1  # number of input channels
    RATE: ClassVar[int] = 44100  # sampling rate passed to PyAudio.open and the WAV header
    CHUNK: ClassVar[int] = 1024  # frames per buffer, also the size of each read

    # Path of the WAV file the recording is written to.
    output_filepath: str

    def start_recording(self) -> None:
        """Block until the spacebar is pressed, record while it is held, then write the WAV file.

        The audio stream and the PyAudio instance are always released, even when
        recording is interrupted by an exception. In that case no file is written.
        """
        # Wait for the spacebar to be pressed to start recording
        keyboard.wait("space")

        audio = pyaudio.PyAudio()
        try:
            # Query the sample width while the PyAudio instance is still alive.
            sample_width = audio.get_sample_size(self.FORMAT)
            stream = audio.open(
                format=self.FORMAT,
                channels=self.CHANNELS,
                rate=self.RATE,
                input=True,
                frames_per_buffer=self.CHUNK,
            )
            frames: list[bytes] = []
            try:
                while keyboard.is_pressed("space"):
                    frames.append(stream.read(self.CHUNK))
            finally:
                # Recording stopped (spacebar released or an error occurred)
                stream.stop_stream()
                stream.close()
        finally:
            audio.terminate()

        # Save the recorded data as a WAV file
        with wave.open(self.output_filepath, "wb") as wf:
            wf.setnchannels(self.CHANNELS)
            wf.setsampwidth(sample_width)
            wf.setframerate(self.RATE)
            wf.writeframes(b"".join(frames))

    def remove_output_file(self) -> None:
        """Delete the output file.

        Raises:
            FileNotFoundError: If the output file does not exist.
        """
        os.remove(self.output_filepath)

    def __enter__(self) -> "AudioRecorder":
        """Enter the context and return the recorder itself."""
        return self

    def __exit__(self, exc_type, exc_value, traceback) -> None:
        """Remove the output file on exit without suppressing any in-flight exception."""
        try:
            self.remove_output_file()
        except FileNotFoundError:
            # Nothing was recorded (e.g. the user interrupted before the file was
            # written). Raising here would mask the exception that ended the context.
            pass
98 changes: 98 additions & 0 deletions python/samples/concepts/audio_to_text/chat_with_audio_input.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
# Copyright (c) Microsoft. All rights reserved.

import asyncio
import logging
import os

from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
OpenAIChatPromptExecutionSettings,
)
from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
from semantic_kernel.contents import ChatHistory
from semantic_kernel.contents.audio_content import AudioContent

# This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
# to create a chat bot that can communicate with the user using audio input.
# The user can engage in a long conversation with the chat bot by speaking to it.

# Additional dependencies required for this sample:
# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.


logging.basicConfig(level=logging.WARNING)
# WAV file that holds the most recent recording; it lives next to this script.
AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")

# Persona prompt for the assistant.
system_message = """
You are a chat bot. Your name is Mosscap and
you have one goal: figure out what people need.
Your full name, should you need to know it, is
Splendid Speckled Mosscap. You communicate
effectively, but you tend to answer with long
flowery prose.
"""


chat_service = AzureChatCompletion()
audio_to_text_service = AzureAudioToText()

history = ChatHistory()
# Register the persona first; otherwise the system message is defined but never sent to the model.
history.add_system_message(system_message)
history.add_user_message("Hi there, who are you?")
history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")


async def chat() -> bool:
    """Run one turn of the conversation: record, transcribe, and stream the reply.

    Returns:
        bool: True to keep chatting; False when the user interrupts the program
        or the transcribed input contains the word "exit".
    """
    try:
        print("User:> ", end="", flush=True)
        # The recording is transcribed inside the context because the recorder
        # deletes the audio file when the context exits.
        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
            recorder.start_recording()
            recorded_audio = AudioContent.from_audio_file(AUDIO_FILEPATH)
            user_input = await audio_to_text_service.get_text_content(recorded_audio)
            print(user_input.text)
    except (KeyboardInterrupt, EOFError):
        print("\n\nExiting chat...")
        return False

    spoken_text = user_input.text
    if "exit" in spoken_text.lower():
        print("\n\nExiting chat...")
        return False

    history.add_user_message(spoken_text)

    execution_settings = OpenAIChatPromptExecutionSettings(
        max_tokens=2000,
        temperature=0.7,
        top_p=0.8,
    )
    response_stream = chat_service.get_streaming_chat_message_content(
        chat_history=history,
        settings=execution_settings,
    )

    print("Mosscap:> ", end="")
    # Echo each streamed piece as it arrives and keep it for the history.
    pieces = []
    async for message in response_stream:
        piece = str(message)
        print(piece, end="")
        pieces.append(piece)
    print("\n")

    history.add_assistant_message("".join(pieces))

    return True


async def main() -> None:
    """Print the usage instructions, then run chat turns until one asks to stop."""
    print(
        "Instruction: when it's your turn to speak, press the spacebar to start recording."
        " Release the spacebar to stop recording."
    )
    # chat() returns False once the user exits or interrupts the program.
    while await chat():
        pass


if __name__ == "__main__":
    # Start the chat loop only when this file is executed as a script.
    asyncio.run(main())
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from semantic_kernel.const import DEFAULT_SERVICE_NAME
from semantic_kernel.exceptions.agent_exceptions import AgentInitializationException
from semantic_kernel.kernel_pydantic import HttpsUrl
from semantic_kernel.utils.authentication.entra_id_authentication import get_entra_auth_token
from semantic_kernel.utils.experimental_decorator import experimental_class
from semantic_kernel.utils.telemetry.user_agent import APP_INFO, prepend_semantic_kernel_to_user_agent

Expand Down Expand Up @@ -122,9 +123,7 @@ def __init__(
and ad_token is None
and azure_openai_settings.token_endpoint
):
ad_token = azure_openai_settings.get_azure_openai_auth_token(
token_endpoint=azure_openai_settings.token_endpoint
)
ad_token = get_entra_auth_token(azure_openai_settings.token_endpoint)

if not client and not azure_openai_settings.api_key and not ad_token and not ad_token_provider:
raise AgentInitializationException("Please provide either api_key, ad_token or ad_token_provider.")
Expand Down
51 changes: 51 additions & 0 deletions python/semantic_kernel/connectors/ai/audio_to_text_client_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
# Copyright (c) Microsoft. All rights reserved.


from abc import ABC, abstractmethod
from typing import Any

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.contents.audio_content import AudioContent
from semantic_kernel.contents.text_content import TextContent
from semantic_kernel.services.ai_service_client_base import AIServiceClientBase


class AudioToTextClientBase(AIServiceClientBase, ABC):
    """Abstract base for services that transcribe audio into text."""

    @abstractmethod
    async def get_text_contents(
        self,
        audio_content: AudioContent,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> list[TextContent]:
        """Get text contents from audio.

        Args:
            audio_content: Audio content.
            settings: Prompt execution settings.
            kwargs: Additional arguments.

        Returns:
            list[TextContent]: Text contents.
        """
        raise NotImplementedError

    async def get_text_content(
        self,
        audio_content: AudioContent,
        settings: PromptExecutionSettings | None = None,
        **kwargs: Any,
    ) -> TextContent:
        """Get a single text content from audio.

        This delegates to `get_text_contents` and returns its first element.

        Args:
            audio_content: Audio content.
            settings: Prompt execution settings.
            kwargs: Additional arguments.

        Returns:
            TextContent: Text content.
        """
        text_contents = await self.get_text_contents(audio_content, settings, **kwargs)
        return text_contents[0]
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Copyright (c) Microsoft. All rights reserved.

import logging
from typing import Any

from pydantic import Field

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings

logger = logging.getLogger(__name__)


class OpenAIAudioToTextExecutionSettings(PromptExecutionSettings):
    """Execution settings for OpenAI audio to text requests."""

    # Serialized under the key "model".
    ai_model_id: str | None = Field(default=None, serialization_alias="model")
    filename: str | None = None
    language: str | None = None
    prompt: str | None = None
    response_format: str | None = None
    temperature: float | None = None

    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
        """Build the settings dictionary for the OpenAI API, leaving out the file name."""
        prepared = super().prepare_settings_dict(**kwargs)

        # Drop the file name since it will be opened as a file object
        prepared.pop("filename", None)

        return prepared
Original file line number Diff line number Diff line change
@@ -0,0 +1,71 @@
# Copyright (c) Microsoft. All rights reserved.

import logging
from typing import Any

from pydantic import Field, model_validator

from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
from semantic_kernel.exceptions.service_exceptions import ServiceInvalidExecutionSettingsError
from semantic_kernel.kernel_pydantic import KernelBaseModel

logger = logging.getLogger(__name__)


# (width, height) pairs accepted by OpenAITextToImageExecutionSettings.check_size.
VALID_IMAGE_SIZES = [
    # Square
    (256, 256),
    (512, 512),
    (1024, 1024),
    # Landscape
    (1792, 1024),
    # Portrait
    (1024, 1792),
]


class ImageSize(KernelBaseModel):
    """Width and height of an image."""

    width: int
    height: int

    def __str__(self) -> str:
        """Format the size as "<width>x<height>"."""
        return "x".join((str(self.width), str(self.height)))


class OpenAITextToImageExecutionSettings(PromptExecutionSettings):
    """Execution settings for OpenAI text to image requests."""

    prompt: str | None = None
    # Serialized under the key "model".
    ai_model_id: str | None = Field(default=None, serialization_alias="model")
    size: ImageSize | None = None
    quality: str | None = None
    style: str | None = None

    @model_validator(mode="after")
    def check_size(self) -> "OpenAITextToImageExecutionSettings":
        """Reject a requested image size that is not listed in VALID_IMAGE_SIZES.

        The size is taken from the `size` field, falling back to the "size"
        entry of `extension_data`. A missing size is accepted.

        Raises:
            ServiceInvalidExecutionSettingsError: If the size is not valid.
        """
        size = self.size or self.extension_data.get("size")

        if size is None:
            return self
        if (size.width, size.height) in VALID_IMAGE_SIZES:
            return self

        raise ServiceInvalidExecutionSettingsError(f"Invalid image size: {size.width}x{size.height}.")

    @model_validator(mode="after")
    def check_prompt(self) -> "OpenAITextToImageExecutionSettings":
        """Require a non-empty prompt, from the `prompt` field or from `extension_data`.

        Raises:
            ServiceInvalidExecutionSettingsError: If no prompt is provided.
        """
        if self.prompt or self.extension_data.get("prompt"):
            return self

        raise ServiceInvalidExecutionSettingsError("The prompt is required.")

    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
        """Build the settings dictionary for the OpenAI API, with the size rendered as a string."""
        prepared = super().prepare_settings_dict(**kwargs)

        if self.size is None:
            return prepared

        # Replace the size entry with the string form of the ImageSize.
        prepared["size"] = str(self.size)
        return prepared
Loading

0 comments on commit e9a755d

Please sign in to comment.