Skip to content

Commit

Permalink
Add Google Cloud Speech-to-Text (STT) (home-assistant#120854)
Browse files Browse the repository at this point in the history
* Google Cloud

* .

* fix

* mypy

* add tests

* Update .coveragerc

* Update const.py

* upload file, reconfigure and import flow

* fixes

* default to latest_short

* mypy

* update

* Allow clearing options in the UI

* update

* update

* update
  • Loading branch information
tronikos authored and iloveicedgreentea committed Sep 4, 2024
1 parent 0817474 commit 8b28334
Show file tree
Hide file tree
Showing 9 changed files with 345 additions and 4 deletions.
2 changes: 1 addition & 1 deletion homeassistant/components/google_cloud/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from homeassistant.const import Platform
from homeassistant.core import HomeAssistant

PLATFORMS = [Platform.TTS]
PLATFORMS = [Platform.STT, Platform.TTS]


async def async_setup_entry(hass: HomeAssistant, entry: ConfigEntry) -> bool:
Expand Down
20 changes: 19 additions & 1 deletion homeassistant/components/google_cloud/config_flow.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,16 @@
SelectSelectorMode,
)

from .const import CONF_KEY_FILE, CONF_SERVICE_ACCOUNT_INFO, DEFAULT_LANG, DOMAIN, TITLE
from .const import (
CONF_KEY_FILE,
CONF_SERVICE_ACCOUNT_INFO,
CONF_STT_MODEL,
DEFAULT_LANG,
DEFAULT_STT_MODEL,
DOMAIN,
SUPPORTED_STT_MODELS,
TITLE,
)
from .helpers import (
async_tts_voices,
tts_options_schema,
Expand Down Expand Up @@ -162,6 +171,15 @@ async def async_step_init(
**tts_options_schema(
self.options, voices, from_config_flow=True
).schema,
vol.Optional(
CONF_STT_MODEL,
default=DEFAULT_STT_MODEL,
): SelectSelector(
SelectSelectorConfig(
mode=SelectSelectorMode.DROPDOWN,
options=SUPPORTED_STT_MODELS,
)
),
}
),
self.options,
Expand Down
164 changes: 164 additions & 0 deletions homeassistant/components/google_cloud/const.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@

DEFAULT_LANG = "en-US"

# TTS constants
CONF_GENDER = "gender"
CONF_VOICE = "voice"
CONF_ENCODING = "encoding"
Expand All @@ -18,3 +19,166 @@
CONF_GAIN = "gain"
CONF_PROFILES = "profiles"
CONF_TEXT_TYPE = "text_type"

# Speech-to-Text (STT) constants

# Options key under which the selected recognition model is stored.
CONF_STT_MODEL = "stt_model"

# Recognition model used when no model has been chosen in the options.
DEFAULT_STT_MODEL = "latest_short"

# Recognition models offered for selection; see
# https://cloud.google.com/speech-to-text/docs/transcription-model
SUPPORTED_STT_MODELS = [
    "latest_long",
    "latest_short",
    "telephony",
    "telephony_short",
    "medical_dictation",
    "medical_conversation",
    "command_and_search",
    "default",
    "phone_call",
    "video",
]

# Language codes reported as supported by the STT entity; see
# https://cloud.google.com/speech-to-text/docs/speech-to-text-supported-languages
STT_LANGUAGES = [
    "af-ZA",
    "am-ET",
    "ar-AE",
    "ar-BH",
    "ar-DZ",
    "ar-EG",
    "ar-IL",
    "ar-IQ",
    "ar-JO",
    "ar-KW",
    "ar-LB",
    "ar-MA",
    "ar-MR",
    "ar-OM",
    "ar-PS",
    "ar-QA",
    "ar-SA",
    "ar-SY",
    "ar-TN",
    "ar-YE",
    "az-AZ",
    "bg-BG",
    "bn-BD",
    "bn-IN",
    "bs-BA",
    "ca-ES",
    "cmn-Hans-CN",
    "cmn-Hans-HK",
    "cmn-Hant-TW",
    "cs-CZ",
    "da-DK",
    "de-AT",
    "de-CH",
    "de-DE",
    "el-GR",
    "en-AU",
    "en-CA",
    "en-GB",
    "en-GH",
    "en-HK",
    "en-IE",
    "en-IN",
    "en-KE",
    "en-NG",
    "en-NZ",
    "en-PH",
    "en-PK",
    "en-SG",
    "en-TZ",
    "en-US",
    "en-ZA",
    "es-AR",
    "es-BO",
    "es-CL",
    "es-CO",
    "es-CR",
    "es-DO",
    "es-EC",
    "es-ES",
    "es-GT",
    "es-HN",
    "es-MX",
    "es-NI",
    "es-PA",
    "es-PE",
    "es-PR",
    "es-PY",
    "es-SV",
    "es-US",
    "es-UY",
    "es-VE",
    "et-EE",
    "eu-ES",
    "fa-IR",
    "fi-FI",
    "fil-PH",
    "fr-BE",
    "fr-CA",
    "fr-CH",
    "fr-FR",
    "gl-ES",
    "gu-IN",
    "hi-IN",
    "hr-HR",
    "hu-HU",
    "hy-AM",
    "id-ID",
    "is-IS",
    "it-CH",
    "it-IT",
    "iw-IL",
    "ja-JP",
    "jv-ID",
    "ka-GE",
    "kk-KZ",
    "km-KH",
    "kn-IN",
    "ko-KR",
    "lo-LA",
    "lt-LT",
    "lv-LV",
    "mk-MK",
    "ml-IN",
    "mn-MN",
    "mr-IN",
    "ms-MY",
    "my-MM",
    "ne-NP",
    "nl-BE",
    "nl-NL",
    "no-NO",
    "pa-Guru-IN",
    "pl-PL",
    "pt-BR",
    "pt-PT",
    "ro-RO",
    "ru-RU",
    "si-LK",
    "sk-SK",
    "sl-SI",
    "sq-AL",
    "sr-RS",
    "su-ID",
    "sv-SE",
    "sw-KE",
    "sw-TZ",
    "ta-IN",
    "ta-LK",
    "ta-MY",
    "ta-SG",
    "te-IN",
    "th-TH",
    "tr-TR",
    "uk-UA",
    "ur-IN",
    "ur-PK",
    "uz-UZ",
    "vi-VN",
    "yue-Hant-HK",
    "zu-ZA",
]
5 changes: 4 additions & 1 deletion homeassistant/components/google_cloud/manifest.json
Original file line number Diff line number Diff line change
Expand Up @@ -7,5 +7,8 @@
"documentation": "https://www.home-assistant.io/integrations/google_cloud",
"integration_type": "service",
"iot_class": "cloud_push",
"requirements": ["google-cloud-texttospeech==2.17.2"]
"requirements": [
"google-cloud-texttospeech==2.17.2",
"google-cloud-speech==2.27.0"
]
}
3 changes: 2 additions & 1 deletion homeassistant/components/google_cloud/strings.json
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@
"pitch": "Default pitch of the voice",
"gain": "Default volume gain (in dB) of the voice",
"profiles": "Default audio profiles",
"text_type": "Default text type"
"text_type": "Default text type",
"stt_model": "STT model"
}
}
}
Expand Down
147 changes: 147 additions & 0 deletions homeassistant/components/google_cloud/stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
"""Support for the Google Cloud STT service."""

from __future__ import annotations

from collections.abc import AsyncGenerator, AsyncIterable
import logging

from google.api_core.exceptions import GoogleAPIError, Unauthenticated
from google.cloud import speech_v1

from homeassistant.components.stt import (
AudioBitRates,
AudioChannels,
AudioCodecs,
AudioFormats,
AudioSampleRates,
SpeechMetadata,
SpeechResult,
SpeechResultState,
SpeechToTextEntity,
)
from homeassistant.config_entries import ConfigEntry
from homeassistant.core import HomeAssistant
from homeassistant.helpers import device_registry as dr
from homeassistant.helpers.entity_platform import AddEntitiesCallback

from .const import (
CONF_SERVICE_ACCOUNT_INFO,
CONF_STT_MODEL,
DEFAULT_STT_MODEL,
DOMAIN,
STT_LANGUAGES,
)

_LOGGER = logging.getLogger(__name__)


async def async_setup_entry(
    hass: HomeAssistant,
    config_entry: ConfigEntry,
    async_add_entities: AddEntitiesCallback,
) -> None:
    """Create the Google Cloud STT entity for a config entry.

    An async Speech client is built from the service account info stored
    in the entry's data, and a single entity backed by that client is
    handed to ``async_add_entities``.
    """
    stt_client = speech_v1.SpeechAsyncClient.from_service_account_info(
        config_entry.data[CONF_SERVICE_ACCOUNT_INFO]
    )
    stt_entity = GoogleCloudSpeechToTextEntity(config_entry, stt_client)
    async_add_entities([stt_entity])


class GoogleCloudSpeechToTextEntity(SpeechToTextEntity):
    """Speech-to-text entity that transcribes audio with Google Cloud Speech."""

    def __init__(
        self,
        entry: ConfigEntry,
        client: speech_v1.SpeechAsyncClient,
    ) -> None:
        """Store the config entry and client and set the entity attributes.

        The recognition model is read once from the entry options here,
        falling back to DEFAULT_STT_MODEL when the option is not set.
        """
        self._entry = entry
        self._client = client
        self._model = entry.options.get(CONF_STT_MODEL, DEFAULT_STT_MODEL)

        entry_id = entry.entry_id
        self._attr_name = entry.title
        self._attr_unique_id = f"{entry_id}-stt"
        self._attr_device_info = dr.DeviceInfo(
            identifiers={(DOMAIN, entry_id)},
            manufacturer="Google",
            model="Cloud",
            entry_type=dr.DeviceEntryType.SERVICE,
        )

    @property
    def supported_languages(self) -> list[str]:
        """Return the language codes this entity accepts."""
        return STT_LANGUAGES

    @property
    def supported_formats(self) -> list[AudioFormats]:
        """Return the accepted audio container formats (WAV and OGG)."""
        return [AudioFormats.WAV, AudioFormats.OGG]

    @property
    def supported_codecs(self) -> list[AudioCodecs]:
        """Return the accepted audio codecs (PCM and Opus)."""
        return [AudioCodecs.PCM, AudioCodecs.OPUS]

    @property
    def supported_bit_rates(self) -> list[AudioBitRates]:
        """Return the accepted bit rates (16 only)."""
        return [AudioBitRates.BITRATE_16]

    @property
    def supported_sample_rates(self) -> list[AudioSampleRates]:
        """Return the accepted sample rates (16000 only)."""
        return [AudioSampleRates.SAMPLERATE_16000]

    @property
    def supported_channels(self) -> list[AudioChannels]:
        """Return the accepted channel layouts (mono only)."""
        return [AudioChannels.CHANNEL_MONO]

    async def async_process_audio_stream(
        self, metadata: SpeechMetadata, stream: AsyncIterable[bytes]
    ) -> SpeechResult:
        """Stream audio to Google Cloud Speech and return the transcript.

        The transcript is the concatenation, in arrival order, of the first
        alternative of the first result of every response that carries one.
        On a GoogleAPIError the error is logged and an ERROR result with no
        text is returned; if the error is Unauthenticated, a reauth flow is
        also started for the config entry.
        """
        audio_encoding = speech_v1.RecognitionConfig.AudioEncoding
        # Opus audio is sent as OGG_OPUS; everything else as LINEAR16.
        if metadata.codec == AudioCodecs.OPUS:
            encoding = audio_encoding.OGG_OPUS
        else:
            encoding = audio_encoding.LINEAR16

        recognition_config = speech_v1.RecognitionConfig(
            encoding=encoding,
            sample_rate_hertz=metadata.sample_rate,
            language_code=metadata.language,
            model=self._model,
        )
        streaming_config = speech_v1.StreamingRecognitionConfig(
            config=recognition_config
        )

        async def request_generator() -> (
            AsyncGenerator[speech_v1.StreamingRecognizeRequest]
        ):
            # The opening request carries only the streaming configuration ...
            yield speech_v1.StreamingRecognizeRequest(
                streaming_config=streaming_config
            )
            # ... and every later request carries only a chunk of audio.
            async for chunk in stream:
                yield speech_v1.StreamingRecognizeRequest(audio_content=chunk)

        pieces: list[str] = []
        try:
            responses = await self._client.streaming_recognize(
                requests=request_generator(),
                timeout=10,
            )
            async for response in responses:
                _LOGGER.debug("response: %s", response)
                # Skip responses without a usable first result/alternative.
                if response.results and response.results[0].alternatives:
                    pieces.append(response.results[0].alternatives[0].transcript)
        except GoogleAPIError as err:
            _LOGGER.error("Error occurred during Google Cloud STT call: %s", err)
            if isinstance(err, Unauthenticated):
                self._entry.async_start_reauth(self.hass)
            return SpeechResult(None, SpeechResultState.ERROR)

        return SpeechResult("".join(pieces), SpeechResultState.SUCCESS)
3 changes: 3 additions & 0 deletions requirements_all.txt
Original file line number Diff line number Diff line change
Expand Up @@ -985,6 +985,9 @@ google-api-python-client==2.71.0
# homeassistant.components.google_pubsub
google-cloud-pubsub==2.23.0

# homeassistant.components.google_cloud
google-cloud-speech==2.27.0

# homeassistant.components.google_cloud
google-cloud-texttospeech==2.17.2

Expand Down
Loading

0 comments on commit 8b28334

Please sign in to comment.