Skip to content

Commit a34109d

Browse files
committed
Adding Document.analyze_entities() in language.
1 parent 7dbdefb commit a34109d

File tree

2 files changed

+167
-0
lines changed

2 files changed

+167
-0
lines changed

gcloud/language/document.py

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
A document is used to hold text to be analyzed and annotated.
1818
"""
1919

20+
from gcloud.language.entity import Entity
21+
2022

2123
DEFAULT_LANGUAGE = 'en'
2224
"""Default document language, English."""
@@ -101,3 +103,44 @@ def __init__(self, client, content=None, gcs_url=None, doc_type=PLAIN_TEXT,
101103
self.doc_type = doc_type
102104
self.language = language
103105
self.encoding = encoding
106+
107+
def _to_dict(self):
    """Build the dictionary form of this document for API requests.

    The result always carries the document type and language. The
    inline ``content`` is added when it is set; otherwise the
    ``gcs_url`` is added (as ``gcsContentUri``) when it is set.

    :rtype: dict
    :returns: The Document value as a JSON dictionary.
    """
    payload = {'type': self.doc_type, 'language': self.language}
    # Inline content wins over a storage URL when both are present.
    if self.content is not None:
        payload['content'] = self.content
        return payload
    if self.gcs_url is not None:
        payload['gcsContentUri'] = self.gcs_url
    return payload
124+
125+
def analyze_entities(self):
    """Analyze the entities in the current document.

    Finds named entities (currently finds proper names as of August 2016)
    in the text, entity types, salience, mentions for each entity, and
    other properties.

    See:
    https://cloud.google.com/natural-language/reference/\
    rest/v1beta1/documents/analyzeEntities

    :rtype: list
    :returns: A list of :class:`Entity` returned from the API. The list
              is empty when the response carries no ``entities`` key.
    """
    data = {
        'document': self._to_dict(),
        'encodingType': self.encoding,
    }
    api_response = self.client.connection.api_request(
        method='POST', path='analyzeEntities', data=data)
    # A response without an 'entities' key is treated as "no entities"
    # instead of raising KeyError.
    return [Entity.from_api_repr(entity)
            for entity in api_response.get('entities', ())]

gcloud/language/test_document.py

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -62,3 +62,127 @@ def test_constructor_text_and_gcs(self):
6262
with self.assertRaises(ValueError):
6363
self._makeOne(None, content='abc',
6464
gcs_url='gs://some-bucket/some-obj.txt')
65+
66+
def test__to_dict_with_content(self):
    # A document built from inline text serializes it under 'content'.
    target_class = self._getTargetClass()
    text = 'Hello World'
    doc = self._makeOne(None, content=text)
    serialized = doc._to_dict()
    expected = {
        'type': target_class.PLAIN_TEXT,
        'language': doc.language,
        'content': text,
    }
    self.assertEqual(serialized, expected)
76+
77+
def test__to_dict_with_gcs(self):
    # A document built from a storage URL serializes it under
    # 'gcsContentUri'.
    target_class = self._getTargetClass()
    url = 'gs://some-bucket/some-obj.html'
    doc = self._makeOne(None, gcs_url=url)
    serialized = doc._to_dict()
    expected = {
        'type': target_class.PLAIN_TEXT,
        'language': doc.language,
        'gcsContentUri': url,
    }
    self.assertEqual(serialized, expected)
87+
88+
def test__to_dict_with_no_content(self):
    # With neither content nor a storage URL, only type and language
    # are serialized.
    target_class = self._getTargetClass()
    doc = self._makeOne(None, content='')
    # Construct with empty content, then clear it by hand.
    doc.content = None
    serialized = doc._to_dict()
    expected = {
        'type': target_class.PLAIN_TEXT,
        'language': doc.language,
    }
    self.assertEqual(serialized, expected)
97+
98+
def test_analyze_entities(self):
    # Feeds a canned two-entity API response through
    # ``Document.analyze_entities()`` and checks both the parsed
    # entities and the request that was issued (including its payload).
    from gcloud.language.entity import Entity
    from gcloud.language.entity import EntityType

    name1 = 'R-O-C-K'
    name2 = 'USA'
    content = name1 + ' in the ' + name2
    metadata1 = {
        'wikipedia_url': 'http://en.wikipedia.org/wiki/Rock_music',
    }
    metadata2 = {
        'wikipedia_url': 'http://en.wikipedia.org/wiki/United_States',
    }
    salience1 = 0.91391456
    salience2 = 0.086085409
    response = {
        'entities': [
            {
                'name': name1,
                'type': EntityType.OTHER,
                'metadata': metadata1,
                'salience': salience1,
                'mentions': [
                    {
                        'text': {
                            'content': name1,
                            'beginOffset': -1,
                        },
                    },
                ],
            },
            {
                'name': name2,
                'type': EntityType.LOCATION,
                'metadata': metadata2,
                'salience': salience2,
                'mentions': [
                    {
                        'text': {
                            'content': name2,
                            'beginOffset': -1,
                        },
                    },
                ],
            },
        ],
        'language': 'en',
    }
    connection = _Connection(response)
    client = _Client(connection=connection)
    document = self._makeOne(client, content)

    entities = document.analyze_entities()
    self.assertEqual(len(entities), 2)
    entity1 = entities[0]
    self.assertIsInstance(entity1, Entity)
    self.assertEqual(entity1.name, name1)
    self.assertEqual(entity1.entity_type, EntityType.OTHER)
    self.assertEqual(entity1.metadata, metadata1)
    self.assertEqual(entity1.salience, salience1)
    self.assertEqual(entity1.mentions, [name1])
    entity2 = entities[1]
    self.assertIsInstance(entity2, Entity)
    self.assertEqual(entity2.name, name2)
    self.assertEqual(entity2.entity_type, EntityType.LOCATION)
    self.assertEqual(entity2.metadata, metadata2)
    self.assertEqual(entity2.salience, salience2)
    self.assertEqual(entity2.mentions, [name2])

    # Verify the request.
    self.assertEqual(len(connection._requested), 1)
    req = connection._requested[0]
    self.assertEqual(req['path'], 'analyzeEntities')
    self.assertEqual(req['method'], 'POST')
    # The payload must carry the serialized document and its encoding;
    # without this check a wrong request body would go unnoticed.
    self.assertEqual(req['data'], {
        'document': {
            'type': document.doc_type,
            'language': document.language,
            'content': content,
        },
        'encodingType': document.encoding,
    })
172+
173+
174+
class _Connection(object):
    """Connection stand-in that records requests and replays one response."""

    def __init__(self, response):
        # Keyword arguments of every ``api_request`` call, in call order.
        self._requested = []
        # The object handed back by every ``api_request`` call.
        self._response = response

    def api_request(self, **request_kwargs):
        """Record the keyword arguments and return the canned response."""
        self._requested.append(request_kwargs)
        return self._response
183+
184+
185+
class _Client(object):
    """Client stand-in that only exposes a ``connection`` attribute."""

    def __init__(self, connection=None):
        # Stored as given; ``None`` when no connection is supplied.
        self.connection = connection

0 commit comments

Comments
 (0)