Python: Audio to text (#9505)

### Motivation and Context  Addresses: #7434 ### Description  Add audio to text to SK Python. ### Contribution Checklist 1. Add audio to text to SK Python. 2. Add text to image integration tests. 3. Refactor Azure AI connector AD auth.  - [x] The code builds clean without any errors or warnings - [x] The PR follows the [SK Contribution Guidelines](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md) and the [pre-submission formatting script](https://github.com/microsoft/semantic-kernel/blob/main/CONTRIBUTING.md#development-scripts) raises no violations - [x] All unit tests pass, and I have added new tests where possible - [x] I didn't break anyone 😄
microsoft · Nov 8, 2024 · e9a755d · e9a755d
1 parent 1cb0922
commit e9a755d
Show file tree

Hide file tree

Showing 45 changed files with 1,171 additions and 122 deletions.
diff --git a/.github/workflows/python-integration-tests.yml b/.github/workflows/python-integration-tests.yml
@@ -64,6 +64,7 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}
@@ -218,6 +219,7 @@ jobs:
       AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_EMBEDDING_DEPLOYMENT_NAME }} # azure-text-embedding-ada-002
       AZURE_OPENAI_CHAT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_CHAT_DEPLOYMENT_NAME }}
       AZURE_OPENAI_TEXT_DEPLOYMENT_NAME: ${{ vars.AZURE_OPENAI_TEXT_DEPLOYMENT_NAME }}
+      AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT: ${{ secrets.AZURE_OPENAI_AUDIO_TO_TEXT_ENDPOINT }}
       AZURE_OPENAI_API_VERSION: ${{ vars.AZURE_OPENAI_API_VERSION }}
       AZURE_OPENAI_ENDPOINT: ${{ secrets.AZURE_OPENAI_ENDPOINT }}
       BING_API_KEY: ${{ secrets.BING_API_KEY }}

diff --git a/python/.cspell.json b/python/.cspell.json
@@ -33,6 +33,7 @@
         "datamodel",
         "dotenv",
         "endregion",
+        "entra",
         "genai",
         "generativeai",
         "hnsw",

diff --git a/python/samples/concepts/audio_to_text/audio_recorder.py b/python/samples/concepts/audio_to_text/audio_recorder.py
@@ -0,0 +1,74 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import os
+import wave
+from typing import ClassVar
+
+import keyboard
+import pyaudio
+
+from semantic_kernel.kernel_pydantic import KernelBaseModel
+
+
+class AudioRecorder(KernelBaseModel):
+    """A class to record audio from the microphone and save it to a WAV file.
+
+    To start recording, press the spacebar. To stop recording, release the spacebar.
+
+    To use as a context manager, that automatically removes the output file after exiting the context:
+    ```
+    with AudioRecorder(output_filepath="output.wav") as recorder:
+        recorder.start_recording()
+        # Do something with the recorded audio
+        ...
+    ```
+    """
+
+    # Audio recording parameters
+    FORMAT: ClassVar[int] = pyaudio.paInt16
+    CHANNELS: ClassVar[int] = 1
+    RATE: ClassVar[int] = 44100
+    CHUNK: ClassVar[int] = 1024
+
+    output_filepath: str
+
+    def start_recording(self) -> None:
+        # Wait for the spacebar to be pressed to start recording
+        keyboard.wait("space")
+
+        # Start recording
+        audio = pyaudio.PyAudio()
+        stream = audio.open(
+            format=self.FORMAT,
+            channels=self.CHANNELS,
+            rate=self.RATE,
+            input=True,
+            frames_per_buffer=self.CHUNK,
+        )
+        frames = []
+
+        while keyboard.is_pressed("space"):
+            data = stream.read(self.CHUNK)
+            frames.append(data)
+
+        # Recording stopped as the spacebar is released
+        stream.stop_stream()
+        stream.close()
+
+        # Save the recorded data as a WAV file
+        with wave.open(self.output_filepath, "wb") as wf:
+            wf.setnchannels(self.CHANNELS)
+            wf.setsampwidth(audio.get_sample_size(self.FORMAT))
+            wf.setframerate(self.RATE)
+            wf.writeframes(b"".join(frames))
+
+        audio.terminate()
+
+    def remove_output_file(self) -> None:
+        os.remove(self.output_filepath)
+
+    def __enter__(self) -> "AudioRecorder":
+        return self
+
+    def __exit__(self, exc_type, exc_value, traceback) -> None:
+        self.remove_output_file()
diff --git a/python/samples/concepts/audio_to_text/chat_with_audio_input.py b/python/samples/concepts/audio_to_text/chat_with_audio_input.py
@@ -0,0 +1,98 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import asyncio
+import logging
+import os
+
+from samples.concepts.audio_to_text.audio_recorder import AudioRecorder
+from semantic_kernel.connectors.ai.open_ai import AzureChatCompletion
+from semantic_kernel.connectors.ai.open_ai.prompt_execution_settings.open_ai_prompt_execution_settings import (
+    OpenAIChatPromptExecutionSettings,
+)
+from semantic_kernel.connectors.ai.open_ai.services.azure_audio_to_text import AzureAudioToText
+from semantic_kernel.contents import ChatHistory
+from semantic_kernel.contents.audio_content import AudioContent
+
+# This simple sample demonstrates how to use the AzureChatCompletion and AzureAudioToText services
+# to create a chat bot that can communicate with the user using audio input.
+# The user can engage in a long conversation with the chat bot by speaking to it.
+
+# Additional dependencies required for this sample:
+# - pyaudio: `pip install pyaudio` or `uv pip install pyaudio` if you are using uv and have a virtual env activated.
+# - keyboard: `pip install keyboard` or `uv pip install keyboard` if you are using uv and have a virtual env activated.
+
+
+logging.basicConfig(level=logging.WARNING)
+AUDIO_FILEPATH = os.path.join(os.path.dirname(__file__), "output.wav")
+
+system_message = """
+You are a chat bot. Your name is Mosscap and
+you have one goal: figure out what people need.
+Your full name, should you need to know it, is
+Splendid Speckled Mosscap. You communicate
+effectively, but you tend to answer with long
+flowery prose.
+"""
+
+
+chat_service = AzureChatCompletion()
+audio_to_text_service = AzureAudioToText()
+
+history = ChatHistory()
+history.add_user_message("Hi there, who are you?")
+history.add_assistant_message("I am Mosscap, a chat bot. I'm trying to figure out what people need.")
+
+
+async def chat() -> bool:
+    try:
+        print("User:> ", end="", flush=True)
+        with AudioRecorder(output_filepath=AUDIO_FILEPATH) as recorder:
+            recorder.start_recording()
+            user_input = await audio_to_text_service.get_text_content(AudioContent.from_audio_file(AUDIO_FILEPATH))
+            print(user_input.text)
+    except KeyboardInterrupt:
+        print("\n\nExiting chat...")
+        return False
+    except EOFError:
+        print("\n\nExiting chat...")
+        return False
+
+    if "exit" in user_input.text.lower():
+        print("\n\nExiting chat...")
+        return False
+
+    history.add_user_message(user_input.text)
+
+    chunks = chat_service.get_streaming_chat_message_content(
+        chat_history=history,
+        settings=OpenAIChatPromptExecutionSettings(
+            max_tokens=2000,
+            temperature=0.7,
+            top_p=0.8,
+        ),
+    )
+
+    print("Mosscap:> ", end="")
+    answer = ""
+    async for message in chunks:
+        print(str(message), end="")
+        answer += str(message)
+    print("\n")
+
+    history.add_assistant_message(str(answer))
+
+    return True
+
+
+async def main() -> None:
+    print(
+        "Instruction: when it's your turn to speak, press the spacebar to start recording."
+        " Release the spacebar to stop recording."
+    )
+    chatting = True
+    while chatting:
+        chatting = await chat()
+
+
+# Start the chat loop only when this file is executed as a script.
+if __name__ == "__main__":
+    asyncio.run(main())
diff --git a/python/semantic_kernel/agents/open_ai/azure_assistant_agent.py b/python/semantic_kernel/agents/open_ai/azure_assistant_agent.py
@@ -13,6 +13,7 @@
 from semantic_kernel.const import DEFAULT_SERVICE_NAME
 from semantic_kernel.exceptions.agent_exceptions import AgentInitializationException
 from semantic_kernel.kernel_pydantic import HttpsUrl
+from semantic_kernel.utils.authentication.entra_id_authentication import get_entra_auth_token
 from semantic_kernel.utils.experimental_decorator import experimental_class
 from semantic_kernel.utils.telemetry.user_agent import APP_INFO, prepend_semantic_kernel_to_user_agent
 
@@ -122,9 +123,7 @@ def __init__(
             and ad_token is None
             and azure_openai_settings.token_endpoint
         ):
-            ad_token = azure_openai_settings.get_azure_openai_auth_token(
-                token_endpoint=azure_openai_settings.token_endpoint
-            )
+            ad_token = get_entra_auth_token(azure_openai_settings.token_endpoint)
 
         if not client and not azure_openai_settings.api_key and not ad_token and not ad_token_provider:
             raise AgentInitializationException("Please provide either api_key, ad_token or ad_token_provider.")

diff --git a/python/semantic_kernel/connectors/ai/audio_to_text_client_base.py b/python/semantic_kernel/connectors/ai/audio_to_text_client_base.py
@@ -0,0 +1,51 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+
+from abc import ABC, abstractmethod
+from typing import Any
+
+from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
+from semantic_kernel.contents.audio_content import AudioContent
+from semantic_kernel.contents.text_content import TextContent
+from semantic_kernel.services.ai_service_client_base import AIServiceClientBase
+
+
+class AudioToTextClientBase(AIServiceClientBase, ABC):
+    """Base class for audio to text client."""
+
+    @abstractmethod
+    async def get_text_contents(
+        self,
+        audio_content: AudioContent,
+        settings: PromptExecutionSettings | None = None,
+        **kwargs: Any,
+    ) -> list[TextContent]:
+        """Get text contents from audio.
+
+        Args:
+            audio_content: Audio content.
+            settings: Prompt execution settings.
+            kwargs: Additional arguments.
+
+        Returns:
+            list[TextContent]: Text contents.
+        """
+        raise NotImplementedError
+
+    async def get_text_content(
+        self,
+        audio_content: AudioContent,
+        settings: PromptExecutionSettings | None = None,
+        **kwargs: Any,
+    ) -> TextContent:
+        """Get text content from audio.
+
+        Args:
+            audio_content: Audio content.
+            settings: Prompt execution settings.
+            kwargs: Additional arguments.
+
+        Returns:
+            TextContent: Text content.
+        """
+        return (await self.get_text_contents(audio_content, settings, **kwargs))[0]
diff --git a/...nnectors/ai/open_ai/prompt_execution_settings/open_ai_audio_to_text_execution_settings.py b/...nnectors/ai/open_ai/prompt_execution_settings/open_ai_audio_to_text_execution_settings.py
@@ -0,0 +1,30 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import logging
+from typing import Any
+
+from pydantic import Field
+
+from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
+
+logger = logging.getLogger(__name__)
+
+
+class OpenAIAudioToTextExecutionSettings(PromptExecutionSettings):
+    """Request settings for OpenAI audio to text services."""
+
+    ai_model_id: str | None = Field(None, serialization_alias="model")
+    filename: str | None = None
+    language: str | None = None
+    prompt: str | None = None
+    response_format: str | None = None
+    temperature: float | None = None
+
+    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
+        """Prepare the settings dictionary for the OpenAI API."""
+        settings_dict = super().prepare_settings_dict(**kwargs)
+
+        # Remove the file name since it will be open as a file object
+        settings_dict.pop("filename", None)
+
+        return settings_dict
diff --git a/...nnectors/ai/open_ai/prompt_execution_settings/open_ai_text_to_image_execution_settings.py b/...nnectors/ai/open_ai/prompt_execution_settings/open_ai_text_to_image_execution_settings.py
@@ -0,0 +1,71 @@
+# Copyright (c) Microsoft. All rights reserved.
+
+import logging
+from typing import Any
+
+from pydantic import Field, model_validator
+
+from semantic_kernel.connectors.ai.prompt_execution_settings import PromptExecutionSettings
+from semantic_kernel.exceptions.service_exceptions import ServiceInvalidExecutionSettingsError
+from semantic_kernel.kernel_pydantic import KernelBaseModel
+
+logger = logging.getLogger(__name__)
+
+
+# Accepted (width, height) pairs; a requested image size is validated against this list.
+VALID_IMAGE_SIZES = [
+    (256, 256),
+    (512, 512),
+    (1024, 1024),
+    (1792, 1024),
+    (1024, 1792),
+]
+
+
+class ImageSize(KernelBaseModel):
+    """Image size."""
+
+    width: int
+    height: int
+
+    def __str__(self) -> str:
+        """Return the string representation of the image size."""
+        return f"{self.width}x{self.height}"
+
+
+class OpenAITextToImageExecutionSettings(PromptExecutionSettings):
+    """Request settings for OpenAI text to image services."""
+
+    prompt: str | None = None
+    ai_model_id: str | None = Field(None, serialization_alias="model")
+    size: ImageSize | None = None
+    quality: str | None = None
+    style: str | None = None
+
+    @model_validator(mode="after")
+    def check_size(self) -> "OpenAITextToImageExecutionSettings":
+        """Check that the requested image size is valid."""
+        size = self.size or self.extension_data.get("size")
+
+        if size is not None and (size.width, size.height) not in VALID_IMAGE_SIZES:
+            raise ServiceInvalidExecutionSettingsError(f"Invalid image size: {size.width}x{size.height}.")
+
+        return self
+
+    @model_validator(mode="after")
+    def check_prompt(self) -> "OpenAITextToImageExecutionSettings":
+        """Check that the prompt is not empty."""
+        prompt = self.prompt or self.extension_data.get("prompt")
+
+        if not prompt:
+            raise ServiceInvalidExecutionSettingsError("The prompt is required.")
+
+        return self
+
+    def prepare_settings_dict(self, **kwargs) -> dict[str, Any]:
+        """Prepare the settings dictionary for the OpenAI API."""
+        settings_dict = super().prepare_settings_dict(**kwargs)
+
+        if self.size is not None:
+            settings_dict["size"] = str(self.size)
+
+        return settings_dict