
Commit fadc226

Adding Document.annotate_text() in language.
1 parent 0e6850f commit fadc226

2 files changed: 274 additions, 21 deletions


gcloud/language/document.py

Lines changed: 74 additions & 0 deletions
@@ -21,6 +21,8 @@

 from gcloud.language.entity import Entity
 from gcloud.language.sentiment import Sentiment
+from gcloud.language.token import Sentence
+from gcloud.language.token import Token


 DEFAULT_LANGUAGE = 'en-US'
@@ -185,3 +187,75 @@ def analyze_sentiment(self):
         api_response = self.client.connection.api_request(
             method='POST', path='analyzeSentiment', data=data)
         return Sentiment.from_api_repr(api_response['documentSentiment'])
+
+    def annotate_text(self, include_syntax=True, include_entities=True,
+                      include_sentiment=True):
+        """Advanced natural language API: document syntax and other features.
+
+        Includes the full functionality of :meth:`analyze_entities` and
+        :meth:`analyze_sentiment`, enabled by the flags
+        ``include_entities`` and ``include_sentiment`` respectively.
+
+        In addition ``include_syntax`` adds a new feature that analyzes
+        the document for semantic and syntactic information.
+
+        .. note::
+
+            This API is intended for users who are familiar with machine
+            learning and need in-depth text features to build upon.
+
+        .. _annotateText: https://cloud.google.com/natural-language/\
+                          reference/rest/v1beta1/documents/annotateText
+
+        See `annotateText`_.
+
+        :type include_syntax: bool
+        :param include_syntax: (Optional) Flag to enable syntax analysis
+                               of the current document.
+
+        :type include_entities: bool
+        :param include_entities: (Optional) Flag to enable entity extraction
+                                 from the current document.
+
+        :type include_sentiment: bool
+        :param include_sentiment: (Optional) Flag to enable sentiment
+                                  analysis of the current document.
+
+        :rtype: :class:`Annotations`
+        :returns: A tuple of each of the four values returned from the API:
+                  sentences, tokens, sentiment and entities.
+        """
+        features = {}
+        if include_syntax:
+            features['extractSyntax'] = True
+        if include_entities:
+            features['extractEntities'] = True
+        if include_sentiment:
+            features['extractDocumentSentiment'] = True
+
+        data = {
+            'document': self._to_dict(),
+            'features': features,
+            'encodingType': self.encoding,
+        }
+        api_response = self.client.connection.api_request(
+            method='POST', path='annotateText', data=data)
+
+        sentences = [Sentence.from_api_repr(sentence)
+                     for sentence in api_response['sentences']]
+        tokens = [Token.from_api_repr(token)
+                  for token in api_response['tokens']]
+        sentiment_info = api_response.get('documentSentiment')
+        if sentiment_info is None:
+            sentiment = None
+        else:
+            sentiment = Sentiment.from_api_repr(sentiment_info)
+        entities = [Entity.from_api_repr(entity)
+                    for entity in api_response['entities']]
+        annotations = Annotations(
+            sentences=sentences,
+            tokens=tokens,
+            sentiment=sentiment,
+            entities=entities,
+        )
+        return annotations
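
The diff above is the whole of the new method, so a short, hedged usage sketch may help readers of this commit. It is not part of the diff: the Client import path and construction are assumed to match the rest of the gcloud.language package, the Document constructor is assumed to take (client, content) as in the tests below, and Annotations (imported by the new tests but not defined in this excerpt) is assumed to be a simple container exposing the four fields passed to it above: sentences, tokens, sentiment and entities. The attribute names used below are the ones the new tests assert on.

    # Hypothetical usage sketch for Document.annotate_text(); not part of this
    # commit. Client construction and credentials are assumed to work as in
    # the rest of the gcloud.language package at this point in its history.
    from gcloud.language.client import Client
    from gcloud.language.document import Document

    client = Client()
    document = Document(client, 'A cow jumped over the Moon.')

    # Request syntax and sentiment only; entity extraction is skipped, so the
    # 'features' dict sent to the API will omit 'extractEntities'.
    annotations = document.annotate_text(include_syntax=True,
                                         include_entities=False,
                                         include_sentiment=True)

    for sentence in annotations.sentences:    # gcloud.language.token.Sentence
        print(sentence.content)
    for token in annotations.tokens:          # gcloud.language.token.Token
        print(token.text_content, token.part_of_speech, token.lemma)
    if annotations.sentiment is not None:     # None when sentiment is excluded
        print(annotations.sentiment.polarity, annotations.sentiment.magnitude)
    print(annotations.entities)               # expected to be empty here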

gcloud/language/test_document.py

Lines changed: 200 additions & 21 deletions
@@ -15,6 +15,86 @@
 import unittest


+ANNOTATE_NAME = 'Moon'
+ANNOTATE_CONTENT = 'A cow jumped over the %s.' % (ANNOTATE_NAME,)
+ANNOTATE_POLARITY = 1
+ANNOTATE_MAGNITUDE = 0.2
+ANNOTATE_SALIENCE = 0.11793101
+ANNOTATE_WIKI_URL = 'http://en.wikipedia.org/wiki/Natural_satellite'
+
+
+def _make_token_json(name, part_of_speech, head, edge_label):
+    token_dict = {
+        'text': {
+            'content': name,
+            'beginOffset': -1,
+        },
+        'partOfSpeech': {'tag': part_of_speech},
+        'dependencyEdge': {
+            'headTokenIndex': head,
+            'label': edge_label,
+        },
+        'lemma': name,
+    }
+    return token_dict
+
+
+def _get_token_and_sentences(include_syntax):
+    from gcloud.language.token import PartOfSpeech
+
+    if include_syntax:
+        token_info = [
+            ('A', PartOfSpeech.DETERMINER, 1, 'DET'),
+            ('cow', PartOfSpeech.NOUN, 2, 'NSUBJ'),
+            ('jumped', PartOfSpeech.VERB, 2, 'ROOT'),
+            ('over', PartOfSpeech.ADPOSITION, 2, 'PREP'),
+            ('the', PartOfSpeech.DETERMINER, 5, 'DET'),
+            (ANNOTATE_NAME, PartOfSpeech.NOUN, 3, 'POBJ'),
+            ('.', PartOfSpeech.PUNCTUATION, 2, 'P'),
+        ]
+        sentences = [
+            {
+                'text': {
+                    'content': ANNOTATE_CONTENT,
+                    'beginOffset': -1,
+                },
+            },
+        ]
+    else:
+        token_info = []
+        sentences = []
+
+    return token_info, sentences
+
+
+def _get_entities(include_entities):
+    from gcloud.language.entity import EntityType
+
+    if include_entities:
+        entities = [
+            {
+                'name': ANNOTATE_NAME,
+                'type': EntityType.LOCATION,
+                'metadata': {
+                    'wikipedia_url': ANNOTATE_WIKI_URL,
+                },
+                'salience': ANNOTATE_SALIENCE,
+                'mentions': [
+                    {
+                        'text': {
+                            'content': ANNOTATE_NAME,
+                            'beginOffset': -1
+                        }
+                    }
+                ]
+            },
+        ]
+    else:
+        entities = []
+
+    return entities
+
+
 class TestDocument(unittest.TestCase):

     def _getTargetClass(self):
@@ -95,8 +175,18 @@ def test__to_dict_with_no_content(self):
             'type': klass.PLAIN_TEXT,
         })

-    def test_analyze_entities(self):
+    def _verify_entity(self, entity, name, entity_type, wiki_url, salience):
         from gcloud.language.entity import Entity
+
+        self.assertIsInstance(entity, Entity)
+        self.assertEqual(entity.name, name)
+        self.assertEqual(entity.entity_type, entity_type)
+        self.assertEqual(entity.wikipedia_url, wiki_url)
+        self.assertEqual(entity.metadata, {})
+        self.assertEqual(entity.salience, salience)
+        self.assertEqual(entity.mentions, [name])
+
+    def test_analyze_entities(self):
         from gcloud.language.entity import EntityType

         name1 = 'R-O-C-K'
@@ -136,7 +226,7 @@ def test_analyze_entities(self):
                     ],
                 },
             ],
-            'language': 'en',
+            'language': 'en-US',
         }
         connection = _Connection(response)
         client = _Client(connection=connection)
@@ -145,31 +235,26 @@ def test_analyze_entities(self):
         entities = document.analyze_entities()
         self.assertEqual(len(entities), 2)
         entity1 = entities[0]
-        self.assertIsInstance(entity1, Entity)
-        self.assertEqual(entity1.name, name1)
-        self.assertEqual(entity1.entity_type, EntityType.OTHER)
-        self.assertEqual(entity1.wikipedia_url, None)
-        self.assertEqual(entity1.metadata, {})
-        self.assertEqual(entity1.salience, salience1)
-        self.assertEqual(entity1.mentions, [name1])
+        self._verify_entity(entity1, name1, EntityType.OTHER,
+                            None, salience1)
         entity2 = entities[1]
-        self.assertIsInstance(entity2, Entity)
-        self.assertEqual(entity2.name, name2)
-        self.assertEqual(entity2.entity_type, EntityType.LOCATION)
-        self.assertEqual(entity2.wikipedia_url, wiki2)
-        self.assertEqual(entity2.metadata, {})
-        self.assertEqual(entity2.salience, salience2)
-        self.assertEqual(entity2.mentions, [name2])
+        self._verify_entity(entity2, name2, EntityType.LOCATION,
+                            wiki2, salience2)

         # Verify the request.
         self.assertEqual(len(connection._requested), 1)
         req = connection._requested[0]
         self.assertEqual(req['path'], 'analyzeEntities')
         self.assertEqual(req['method'], 'POST')

-    def test_analyze_sentiment(self):
+    def _verify_sentiment(self, sentiment, polarity, magnitude):
         from gcloud.language.sentiment import Sentiment

+        self.assertIsInstance(sentiment, Sentiment)
+        self.assertEqual(sentiment.polarity, polarity)
+        self.assertEqual(sentiment.magnitude, magnitude)
+
+    def test_analyze_sentiment(self):
         content = 'All the pretty horses.'
         polarity = 1
         magnitude = 0.6
@@ -178,23 +263,117 @@ def test_analyze_sentiment(self):
                 'polarity': polarity,
                 'magnitude': magnitude,
             },
-            'language': 'en',
+            'language': 'en-US',
         }
         connection = _Connection(response)
         client = _Client(connection=connection)
         document = self._makeOne(client, content)

         sentiment = document.analyze_sentiment()
-        self.assertIsInstance(sentiment, Sentiment)
-        self.assertEqual(sentiment.polarity, polarity)
-        self.assertEqual(sentiment.magnitude, magnitude)
+        self._verify_sentiment(sentiment, polarity, magnitude)

         # Verify the request.
         self.assertEqual(len(connection._requested), 1)
         req = connection._requested[0]
         self.assertEqual(req['path'], 'analyzeSentiment')
         self.assertEqual(req['method'], 'POST')

+    def _verify_sentences(self, include_syntax, annotations):
+        from gcloud.language.token import Sentence
+
+        if include_syntax:
+            self.assertEqual(len(annotations.sentences), 1)
+            sentence = annotations.sentences[0]
+            self.assertIsInstance(sentence, Sentence)
+            self.assertEqual(sentence.content, ANNOTATE_CONTENT)
+            self.assertEqual(sentence.begin, -1)
+        else:
+            self.assertEqual(annotations.sentences, [])
+
+    def _verify_tokens(self, annotations, token_info):
+        from gcloud.language.token import Token
+
+        self.assertEqual(len(annotations.tokens), len(token_info))
+        for token, info in zip(annotations.tokens, token_info):
+            self.assertIsInstance(token, Token)
+            self.assertEqual(token.text_content, info[0])
+            self.assertEqual(token.text_begin, -1)
+            self.assertEqual(token.part_of_speech, info[1])
+            self.assertEqual(token.edge_index, info[2])
+            self.assertEqual(token.edge_label, info[3])
+            self.assertEqual(token.lemma, info[0])
+
+    def _annotate_text_helper(self, include_sentiment,
+                              include_entities, include_syntax):
+        from gcloud.language.document import Annotations
+        from gcloud.language.entity import EntityType
+
+        token_info, sentences = _get_token_and_sentences(include_syntax)
+        entities = _get_entities(include_entities)
+        tokens = [_make_token_json(*info) for info in token_info]
+        response = {
+            'sentences': sentences,
+            'tokens': tokens,
+            'entities': entities,
+            'language': 'en-US',
+        }
+        if include_sentiment:
+            response['documentSentiment'] = {
+                'polarity': ANNOTATE_POLARITY,
+                'magnitude': ANNOTATE_MAGNITUDE,
+            }
+
+        connection = _Connection(response)
+        client = _Client(connection=connection)
+        document = self._makeOne(client, ANNOTATE_CONTENT)
+
+        annotations = document.annotate_text(
+            include_syntax=include_syntax, include_entities=include_entities,
+            include_sentiment=include_sentiment)
+        self.assertIsInstance(annotations, Annotations)
+        # Sentences
+        self._verify_sentences(include_syntax, annotations)
+        # Token
+        self._verify_tokens(annotations, token_info)
+        # Sentiment
+        if include_sentiment:
+            self._verify_sentiment(annotations.sentiment,
+                                   ANNOTATE_POLARITY, ANNOTATE_MAGNITUDE)
+        else:
+            self.assertIsNone(annotations.sentiment)
+        # Entity
+        if include_entities:
+            self.assertEqual(len(annotations.entities), 1)
+            entity = annotations.entities[0]
+            self._verify_entity(entity, ANNOTATE_NAME, EntityType.LOCATION,
+                                ANNOTATE_WIKI_URL, ANNOTATE_SALIENCE)
+        else:
+            self.assertEqual(annotations.entities, [])
+
+        # Verify the request.
+        self.assertEqual(len(connection._requested), 1)
+        req = connection._requested[0]
+        self.assertEqual(req['path'], 'annotateText')
+        self.assertEqual(req['method'], 'POST')
+        features = req['data']['features']
+        self.assertEqual(features.get('extractDocumentSentiment', False),
+                         include_sentiment)
+        self.assertEqual(features.get('extractEntities', False),
+                         include_entities)
+        self.assertEqual(features.get('extractSyntax', False), include_syntax)
+
+    def test_annotate_text(self):
+        self._annotate_text_helper(True, True, True)
+
+    def test_annotate_text_sentiment_only(self):
+        self._annotate_text_helper(True, False, False)
+
+    def test_annotate_text_entities_only(self):
+        self._annotate_text_helper(False, True, False)
+
+    def test_annotate_text_syntax_only(self):
+        self._annotate_text_helper(False, False, True)
+

 class _Connection(object):
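
The tail of _annotate_text_helper() pins down the request format: annotate_text() POSTs a body with 'document', 'features' and 'encodingType' keys to the annotateText path, and features that were not requested are simply omitted, which is why the assertions use features.get(name, False). Below is a small, self-contained sketch (not part of the commit) of that body for the sentiment-only case; the 'document' and 'encodingType' values are placeholders standing in for Document._to_dict() and Document.encoding, which are not shown in this diff.

    # Sketch of the body annotate_text() builds for the case exercised by
    # test_annotate_text_sentiment_only(); the 'document' and 'encodingType'
    # values are placeholders, not taken from this commit.
    ANNOTATE_CONTENT = 'A cow jumped over the Moon.'

    expected_data = {
        'document': {
            'type': 'PLAIN_TEXT',        # placeholder for Document._to_dict()
            'content': ANNOTATE_CONTENT,
        },
        # Only requested features appear; disabled flags are omitted entirely.
        'features': {'extractDocumentSentiment': True},
        'encodingType': 'UTF8',          # placeholder for Document.encoding
    }

    # Omitted flags read back as False, matching the helper's assertions.
    assert expected_data['features'].get('extractSyntax', False) is False
    assert expected_data['features'].get('extractEntities', False) is False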
