|
| 1 | +# Copyright 2016 Google Inc. All Rights Reserved. |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software |
| 10 | +# distributed under the License is distributed on an "AS IS" BASIS, |
| 11 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 12 | +# See the License for the specific language governing permissions and |
| 13 | +# limitations under the License. |
| 14 | + |
| 15 | +"""Basic client for Google Cloud Speech API.""" |
| 16 | + |
| 17 | +from base64 import b64encode |
| 18 | + |
| 19 | +from google.cloud._helpers import _to_bytes |
| 20 | +from google.cloud import client as client_module |
| 21 | +from google.cloud.speech.connection import Connection |
| 22 | + |
| 23 | + |
class Encoding(object):
    """Audio encoding types accepted by the Google Cloud Speech API.

    Each attribute is the exact string value expected in the
    ``RecognitionConfig.encoding`` field of an API request.

    See:
    https://cloud.google.com/speech/reference/rest/v1beta1/\
    RecognitionConfig#AudioEncoding
    """

    LINEAR16 = 'LINEAR16'
    """LINEAR16 encoding type."""

    FLAC = 'FLAC'
    """FLAC encoding type."""

    MULAW = 'MULAW'
    """MULAW encoding type."""

    AMR = 'AMR'
    """AMR encoding type."""

    AMR_WB = 'AMR_WB'
    """AMR_WB encoding type."""
| 46 | + |
| 47 | + |
class Client(client_module.Client):
    """Client to bundle configuration needed for API requests.

    :type project: str
    :param project: The project which the client acts on behalf of. Will be
                    passed when creating a dataset / job. If not passed,
                    falls back to the default inferred from the environment.

    :type credentials: :class:`oauth2client.client.OAuth2Credentials` or
                       :class:`NoneType`
    :param credentials: The OAuth2 Credentials to use for the connection
                        owned by this client. If not passed (and if no ``http``
                        object is passed), falls back to the default inferred
                        from the environment.

    :type http: :class:`httplib2.Http` or class that defines ``request()``.
    :param http: An optional HTTP object to make requests. If not passed, an
                 ``http`` object is created that is bound to the
                 ``credentials`` for the current object.
    """

    _connection_class = Connection

    def sync_recognize(self, content, source_uri, encoding, sample_rate,
                       language_code=None, max_alternatives=None,
                       profanity_filter=None,
                       speech_context=None):
        """Synchronous Speech Recognition.

        .. _sync_recognize: https://cloud.google.com/speech/reference/\
                            rest/v1beta1/speech/syncrecognize

        See `sync_recognize`_.

        :type content: bytes
        :param content: Byte stream of audio. Exactly one of ``content`` or
                        ``source_uri`` must be given.

        :type source_uri: str
        :param source_uri: URI that points to a file that contains audio
                           data bytes as specified in RecognitionConfig.
                           Currently, only Google Cloud Storage URIs are
                           supported, which must be specified in the following
                           format: gs://bucket_name/object_name

        :type encoding: str
        :param encoding: encoding of audio data sent in all RecognitionAudio
                         messages, can be one of: :attr:`~.Encoding.LINEAR16`,
                         :attr:`~.Encoding.FLAC`, :attr:`~.Encoding.MULAW`,
                         :attr:`~.Encoding.AMR`, :attr:`~.Encoding.AMR_WB`

        :type sample_rate: int
        :param sample_rate: Sample rate in Hertz of the audio data sent in all
                            requests. Valid values are: 8000-48000. 16000 is
                            optimal. For best results, set the sampling rate
                            of the audio source to 16000 Hz. If that's not
                            possible, use the native sample rate of the audio
                            source (instead of re-sampling).

        :type language_code: str
        :param language_code: (Optional) The language of the supplied audio as
                              BCP-47 language tag. Example: "en-GB".
                              If omitted, defaults to "en-US".

        :type max_alternatives: int
        :param max_alternatives: (Optional) Maximum number of recognition
                                 hypotheses to be returned. The server may
                                 return fewer than maxAlternatives.
                                 Valid values are 0-30. A value of 0 or 1
                                 will return a maximum of 1. Defaults to 1

        :type profanity_filter: bool
        :param profanity_filter: (Optional) If True, the server will attempt
                                 to filter out profanities, replacing all but
                                 the initial character in each filtered word
                                 with asterisks, e.g. "f***". If False or
                                 omitted, profanities won't be filtered out.

        :type speech_context: list
        :param speech_context: (Optional) A list of strings (max 50)
                               containing words and phrases "hints" so that
                               the speech recognition is more likely to
                               recognize them. This can be used to improve
                               the accuracy for specific words and phrases.
                               This can also be used to add new words to the
                               vocabulary of the recognizer.

        :rtype: list
        :returns: A list of dictionaries. One dict for each alternative. Each
                  dictionary typically contains two keys (though not
                  all will be present in all cases)

                  * ``transcript``: The detected text from the audio
                    recording.
                  * ``confidence``: The confidence in language detection,
                    float between 0 and 1.

        :raises ValueError: if neither or both of ``content``/``source_uri``
                            are given, if ``encoding`` or ``sample_rate`` is
                            None, or if the API response does not contain
                            exactly one result.
        """
        # Exactly one audio source must be supplied.
        if content is None and source_uri is None:
            raise ValueError('content and source_uri cannot be both '
                             'equal to None')
        if content is not None and source_uri is not None:
            raise ValueError('content and source_uri cannot be both '
                             'different from None')
        if encoding is None:
            raise ValueError('encoding cannot be None')
        if sample_rate is None:
            raise ValueError('sample_rate cannot be None')

        if content is not None:
            # b64encode() returns bytes, which the JSON encoder rejects on
            # Python 3; decode to a native str (base64 output is pure ASCII).
            encoded = b64encode(_to_bytes(content))
            audio = {'content': encoded.decode('ascii')}
        else:
            audio = {'uri': source_uri}

        config = {'encoding': encoding, 'sampleRate': sample_rate}

        # Optional fields are only sent when explicitly provided, so the
        # server-side defaults apply otherwise.
        if language_code is not None:
            config['languageCode'] = language_code
        if max_alternatives is not None:
            config['maxAlternatives'] = max_alternatives
        if profanity_filter is not None:
            config['profanityFilter'] = profanity_filter
        if speech_context is not None:
            config['speechContext'] = {'phrases': speech_context}

        data = {
            'audio': audio,
            'config': config,
        }

        api_response = self.connection.api_request(
            method='POST', path='syncrecognize', data=data)

        # Guard clause: a sync request is expected to yield exactly one
        # result; anything else indicates an unexpected API response.
        if len(api_response['results']) != 1:
            raise ValueError('result in api should have length 1')
        return api_response['results'][0]['alternatives']
0 commit comments