From e135dba5777eec9fc52afdd87f00dd00f8336fce Mon Sep 17 00:00:00 2001 From: Luke Sneeringer Date: Thu, 16 Mar 2017 11:10:27 -0700 Subject: [PATCH] Provide full Mention objects, not just the content. (#3156) Fixes #3149. --- language/google/cloud/language/entity.py | 77 +++++++++++++++- language/unit_tests/test_api_responses.py | 9 +- language/unit_tests/test_document.py | 10 ++- language/unit_tests/test_entity.py | 104 ++++++++++++++++++++-- system_tests/language.py | 6 +- 5 files changed, 190 insertions(+), 16 deletions(-) diff --git a/language/google/cloud/language/entity.py b/language/google/cloud/language/entity.py index c4a8473f9955..4bf3ed950ba8 100644 --- a/language/google/cloud/language/entity.py +++ b/language/google/cloud/language/entity.py @@ -46,6 +46,80 @@ class EntityType(object): """Other entity type (i.e. known but not classified).""" +class MentionType(object): + """List of possible mention types.""" + + TYPE_UNKNOWN = 'TYPE_UNKNOWN' + """Unknown mention type""" + + PROPER = 'PROPER' + """Proper name""" + + COMMON = 'COMMON' + """Common noun (or noun compound)""" + + +class Mention(object): + """A Google Cloud Natural Language API mention. + + Represents a mention for an entity in the text. Currently, proper noun + mentions are supported. + """ + def __init__(self, text, mention_type): + self.text = text + self.mention_type = mention_type + + def __str__(self): + return str(self.text) + + @classmethod + def from_api_repr(cls, payload): + """Convert a Mention from the JSON API into a :class:`Mention`. + + :type payload: dict + :param payload: The value from the backend. + + :rtype: :class:`Mention` + :returns: The mention parsed from the API representation. + """ + text = TextSpan.from_api_repr(payload['text']) + mention_type = payload['type'] + return cls(text, mention_type) + + +class TextSpan(object): + """A span of text from Google Cloud Natural Language API. 
+ + Represents a word or phrase of text, as well as its offset + from the original document. + """ + def __init__(self, content, begin_offset): + self.content = content + self.begin_offset = begin_offset + + def __str__(self): + """Return the string representation of this TextSpan. + + :rtype: str + :returns: The text content + """ + return self.content + + @classmethod + def from_api_repr(cls, payload): + """Convert a TextSpan from the JSON API into a :class:`TextSpan`. + + :type payload: dict + :param payload: The value from the backend. + + :rtype: :class:`TextSpan` + :returns: The text span parsed from the API representation. + """ + content = payload['content'] + begin_offset = payload['beginOffset'] + return cls(content=content, begin_offset=begin_offset) + + class Entity(object): """A Google Cloud Natural Language API entity. @@ -101,6 +175,5 @@ def from_api_repr(cls, payload): entity_type = payload['type'] metadata = payload['metadata'] salience = payload['salience'] - mentions = [value['text']['content'] - for value in payload['mentions']] + mentions = [Mention.from_api_repr(val) for val in payload['mentions']] return cls(name, entity_type, metadata, salience, mentions) diff --git a/language/unit_tests/test_api_responses.py b/language/unit_tests/test_api_responses.py index 7bdba120a14e..bc04522acb06 100644 --- a/language/unit_tests/test_api_responses.py +++ b/language/unit_tests/test_api_responses.py @@ -18,7 +18,10 @@ class TestEntityResponse(unittest.TestCase): ENTITY_DICT = { - 'mentions': [{'text': {'content': 'Italian'}}], + 'mentions': [{ + 'text': {'content': 'Italian', 'beginOffset': 0}, + 'type': 'PROPER', + }], 'metadata': {'wikipedia_url': 'http://en.wikipedia.org/wiki/Italy'}, 'name': 'Italian', 'salience': 0.15, @@ -46,12 +49,14 @@ def test_api_repr_factory(self): def _verify_entity_response(self, entity_response): from google.cloud.language.entity import EntityType + from google.cloud.language.entity import Mention 
self.assertEqual(len(entity_response.entities), 1) entity = entity_response.entities[0] self.assertEqual(entity.name, 'Italian') self.assertEqual(len(entity.mentions), 1) - self.assertEqual(entity.mentions[0], 'Italian') + self.assertIsInstance(entity.mentions[0], Mention) + self.assertEqual(str(entity.mentions[0]), 'Italian') self.assertTrue(entity.metadata['wikipedia_url'].endswith('Italy')) self.assertAlmostEqual(entity.salience, 0.15) self.assertEqual(entity.entity_type, EntityType.LOCATION) diff --git a/language/unit_tests/test_document.py b/language/unit_tests/test_document.py index d41c4ad062a4..19f747ed491b 100644 --- a/language/unit_tests/test_document.py +++ b/language/unit_tests/test_document.py @@ -84,7 +84,8 @@ def _get_entities(include_entities): 'text': { 'content': ANNOTATE_NAME, 'beginOffset': -1 - } + }, + 'type': 'TYPE_UNKNOWN', } ] }, @@ -215,7 +216,8 @@ def _verify_entity(self, entity, name, entity_type, wiki_url, salience): else: self.assertEqual(entity.metadata, {}) self.assertEqual(entity.salience, salience) - self.assertEqual(entity.mentions, [name]) + self.assertEqual(len(entity.mentions), 1) + self.assertEqual(entity.mentions[0].text.content, name) @staticmethod def _expected_data(content, encoding_type=None, @@ -265,7 +267,8 @@ def test_analyze_entities(self): 'text': { 'content': name1, 'beginOffset': -1 - } + }, + 'type': 'TYPE_UNKNOWN', } ] }, @@ -280,6 +283,7 @@ def test_analyze_entities(self): 'content': name2, 'beginOffset': -1, }, + 'type': 'PROPER', }, ], }, diff --git a/language/unit_tests/test_entity.py b/language/unit_tests/test_entity.py index cd4cb6b8f7d9..d8ff94094831 100644 --- a/language/unit_tests/test_entity.py +++ b/language/unit_tests/test_entity.py @@ -27,6 +27,10 @@ def _make_one(self, *args, **kw): return self._get_target_class()(*args, **kw) def test_constructor_defaults(self): + from google.cloud.language.entity import Mention + from google.cloud.language.entity import MentionType + from 
google.cloud.language.entity import TextSpan + name = 'Italian' entity_type = 'LOCATION' wiki_url = 'http://en.wikipedia.org/wiki/Italy' @@ -35,7 +39,10 @@ def test_constructor_defaults(self): 'wikipedia_url': wiki_url, } salience = 0.19960518 - mentions = ['Italian'] + mentions = [Mention( + mention_type=MentionType.PROPER, + text=TextSpan(content='Italian', begin_offset=0), + )] entity = self._make_one(name, entity_type, metadata, salience, mentions) self.assertEqual(entity.name, name) @@ -45,9 +52,13 @@ def test_constructor_defaults(self): self.assertEqual(entity.mentions, mentions) def test_from_api_repr(self): + from google.cloud.language.entity import EntityType + from google.cloud.language.entity import Mention + from google.cloud.language.entity import MentionType + klass = self._get_target_class() name = 'Italy' - entity_type = 'LOCATION' + entity_type = EntityType.LOCATION salience = 0.223 wiki_url = 'http://en.wikipedia.org/wiki/Italy' mention1 = 'Italy' @@ -59,9 +70,12 @@ def test_from_api_repr(self): 'salience': salience, 'metadata': {'wikipedia_url': wiki_url}, 'mentions': [ - {'text': {'content': mention1}}, - {'text': {'content': mention2}}, - {'text': {'content': mention3}}, + {'text': {'content': mention1, 'beginOffset': 3}, + 'type': 'PROPER'}, + {'text': {'content': mention2, 'beginOffset': 5}, + 'type': 'PROPER'}, + {'text': {'content': mention3, 'beginOffset': 8}, + 'type': 'PROPER'}, ], } entity = klass.from_api_repr(payload) @@ -69,4 +83,82 @@ def test_from_api_repr(self): self.assertEqual(entity.entity_type, entity_type) self.assertEqual(entity.salience, salience) self.assertEqual(entity.metadata, {'wikipedia_url': wiki_url}) - self.assertEqual(entity.mentions, [mention1, mention2, mention3]) + + # Assert that we got back Mention objects for each mention. 
+ self.assertIsInstance(entity.mentions[0], Mention) + self.assertIsInstance(entity.mentions[1], Mention) + self.assertIsInstance(entity.mentions[2], Mention) + + # Assert that the text (and string coercion) are correct. + self.assertEqual([str(i) for i in entity.mentions], + [mention1, mention2, mention3]) + + # Assert that the begin offsets are preserved. + self.assertEqual([i.text.begin_offset for i in entity.mentions], + [3, 5, 8]) + + # Assert that the mention types are preserved. + for mention in entity.mentions: + self.assertEqual(mention.mention_type, MentionType.PROPER) + + +class TestMention(unittest.TestCase): + PAYLOAD = { + 'text': {'content': 'Greece', 'beginOffset': 42}, + 'type': 'PROPER', + } + + def test_constructor(self): + from google.cloud.language.entity import Mention + from google.cloud.language.entity import MentionType + from google.cloud.language.entity import TextSpan + + mention = Mention( + text=TextSpan(content='snails', begin_offset=90), + mention_type=MentionType.COMMON, + ) + + self.assertIsInstance(mention.text, TextSpan) + self.assertEqual(mention.text.content, 'snails') + self.assertEqual(mention.text.begin_offset, 90) + self.assertEqual(mention.mention_type, MentionType.COMMON) + + def test_from_api_repr(self): + from google.cloud.language.entity import Mention + from google.cloud.language.entity import MentionType + from google.cloud.language.entity import TextSpan + + mention = Mention.from_api_repr(self.PAYLOAD) + + self.assertIsInstance(mention, Mention) + self.assertIsInstance(mention.text, TextSpan) + self.assertEqual(mention.text.content, 'Greece') + self.assertEqual(mention.text.begin_offset, 42) + self.assertEqual(mention.mention_type, MentionType.PROPER) + + def test_dunder_str(self): + from google.cloud.language.entity import Mention + + mention = Mention.from_api_repr(self.PAYLOAD) + self.assertEqual(str(mention), 'Greece') + + +class TestTextSpan(unittest.TestCase): + def test_constructor(self): + from 
google.cloud.language.entity import TextSpan + + text = TextSpan(content='Winston Churchill', begin_offset=1945) + self.assertIsInstance(text, TextSpan) + self.assertEqual(text.content, str(text), 'Winston Churchill') + self.assertEqual(text.begin_offset, 1945) + + def test_from_api_repr(self): + from google.cloud.language.entity import TextSpan + + text = TextSpan.from_api_repr({ + 'beginOffset': 1953, + 'content': 'Queen Elizabeth', + }) + self.assertIsInstance(text, TextSpan) + self.assertEqual(text.content, str(text), 'Queen Elizabeth') + self.assertEqual(text.begin_offset, 1953) diff --git a/system_tests/language.py b/system_tests/language.py index 23b76da2cf32..ac56799f36d8 100644 --- a/system_tests/language.py +++ b/system_tests/language.py @@ -75,7 +75,7 @@ def _check_analyze_entities_result(self, entities): self.assertEqual(entity1.entity_type, EntityType.PERSON) self.assertGreater(entity1.salience, 0.0) # Other mentions may occur, e.g. "painter". - self.assertIn(entity1.name, entity1.mentions) + self.assertIn(entity1.name, [str(i) for i in entity1.mentions]) self.assertEqual(entity1.metadata['wikipedia_url'], 'http://en.wikipedia.org/wiki/Caravaggio') self.assertIsInstance(entity1.metadata, dict) @@ -83,7 +83,7 @@ def _check_analyze_entities_result(self, entities): self.assertEqual(entity2.name, self.NAME2) self.assertEqual(entity2.entity_type, EntityType.LOCATION) self.assertGreater(entity2.salience, 0.0) - self.assertEqual(entity2.mentions, [entity2.name]) + self.assertEqual([str(i) for i in entity2.mentions], [entity2.name]) self.assertEqual(entity2.metadata['wikipedia_url'], 'http://en.wikipedia.org/wiki/Italy') self.assertIsInstance(entity2.metadata, dict) @@ -92,7 +92,7 @@ def _check_analyze_entities_result(self, entities): choices = (EntityType.EVENT, EntityType.WORK_OF_ART) self.assertIn(entity3.entity_type, choices) self.assertGreater(entity3.salience, 0.0) - self.assertEqual(entity3.mentions, [entity3.name]) + self.assertEqual([str(i) for i in 
entity3.mentions], [entity3.name]) wiki_url = ('http://en.wikipedia.org/wiki/' 'The_Calling_of_St_Matthew_(Caravaggio)') self.assertEqual(entity3.metadata['wikipedia_url'], wiki_url)