Skip to content

Commit 4b91b8e

Browse files
committed
feat: Add Google Speech to Text API Document Loader
1 parent 7372156 commit 4b91b8e

File tree

2 files changed

+34
-9
lines changed

2 files changed

+34
-9
lines changed

libs/langchain/langchain/document_loaders/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,7 @@
8787
from langchain.document_loaders.git import GitLoader
8888
from langchain.document_loaders.gitbook import GitbookLoader
8989
from langchain.document_loaders.github import GitHubIssuesLoader
90+
from langchain.document_loaders.google_speech_to_text import GoogleSpeechToTextLoader
9091
from langchain.document_loaders.googledrive import GoogleDriveLoader
9192
from langchain.document_loaders.gutenberg import GutenbergLoader
9293
from langchain.document_loaders.hn import HNLoader
@@ -267,6 +268,7 @@
267268
"GitbookLoader",
268269
"GoogleApiClient",
269270
"GoogleApiYoutubeLoader",
271+
"GoogleSpeechToTextLoader",
270272
"GoogleDriveLoader",
271273
"GutenbergLoader",
272274
"HNLoader",

libs/langchain/langchain/document_loaders/google_speech_to_text.py

Lines changed: 32 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,11 @@
77
from langchain.utilities.vertexai import get_client_info
88

99
if TYPE_CHECKING:
10-
from google.api_core.client_options import ClientOptions
1110
from google.cloud.speech_v2 import (
1211
RecognitionConfig,
1312
SpeechClient,
1413
)
14+
from google.protobuf.field_mask_pb2 import FieldMask
1515

1616

1717
class GoogleSpeechToTextLoader(BaseLoader):
@@ -26,7 +26,8 @@ class GoogleSpeechToTextLoader(BaseLoader):
2626
2727
Audio files can be specified via a Google Cloud Storage URI or a local file path.
2828
29-
For a detailed explanation of Google Cloud Speech-to-Text, refer to the product documentation.
29+
For a detailed explanation of Google Cloud Speech-to-Text, refer to the product
30+
documentation.
3031
https://cloud.google.com/speech-to-text
3132
"""
3233

@@ -39,8 +40,8 @@ def __init__(
3940
file_path: str,
4041
location: str = "global",
4142
recognizer_id: str = "_",
42-
*,
4343
config: Optional[RecognitionConfig] = None,
44+
config_mask: Optional[FieldMask] = None,
4445
):
4546
"""
4647
Initializes the GoogleSpeechToTextLoader.
@@ -53,10 +54,20 @@ def __init__(
5354
config: Recognition options and features.
5455
For more information:
5556
https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v2.types.RecognitionConfig
57+
config_mask: The list of fields in config that override the values in the
58+
``default_recognition_config`` of the recognizer during this
59+
recognition request.
60+
For more information:
61+
https://cloud.google.com/python/docs/reference/speech/latest/google.cloud.speech_v2.types.RecognizeRequest
5662
"""
5763
try:
5864
from google.api_core.client_options import ClientOptions
59-
from google.cloud.speech_v2 import SpeechClient
65+
from google.cloud.speech_v2 import (
66+
AutoDetectDecodingConfig,
67+
RecognitionConfig,
68+
RecognitionFeatures,
69+
SpeechClient,
70+
)
6071
except ImportError as exc:
6172
raise ImportError(
6273
"Could not import google-cloud-speech python package. "
@@ -67,7 +78,17 @@ def __init__(
6778
self.file_path = file_path
6879
self.location = location
6980
self.recognizer_id = recognizer_id
70-
self.config = config
81+
# Config must be set in speech recognition request.
82+
self.config = config or RecognitionConfig(
83+
auto_decoding_config=AutoDetectDecodingConfig(),
84+
language_codes=["en-US"],
85+
model="long",
86+
features=RecognitionFeatures(
87+
# Automatic punctuation could be useful for language applications
88+
enable_automatic_punctuation=True,
89+
),
90+
)
91+
self.config_mask = config_mask
7192

7293
self._client = SpeechClient(
7394
client_info=get_client_info(module="speech-to-text"),
@@ -95,16 +116,18 @@ def load(self) -> List[Document]:
95116
"Please install it with `pip install google-cloud-speech`."
96117
) from exc
97118

98-
request = RecognizeRequest(recognizer=self._recognizer_path, config=self.config)
119+
request = RecognizeRequest(
120+
recognizer=self._recognizer_path,
121+
config=self.config,
122+
config_mask=self.config_mask,
123+
)
99124

100125
if "gs://" in self.file_path:
101-
request.gcs_uri = self.file_path
126+
request.uri = self.file_path
102127
else:
103-
# Reads a file as bytes
104128
with open(self.file_path, "rb") as f:
105129
request.content = f.read()
106130

107-
# Transcribes the audio into text
108131
response = self._client.recognize(request=request)
109132

110133
return [

0 commit comments

Comments
 (0)