Skip to content

Commit

Permalink
major refactor of URIs. URL-encoding them here was wrong, as it'll ha…
Browse files Browse the repository at this point in the history
…ppen again later. We actually just need to backslash Lucene-specific characters.
  • Loading branch information
Rob Speer committed Sep 30, 2011
1 parent 28ba65c commit b4d5f3e
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 71 deletions.
112 changes: 53 additions & 59 deletions conceptnet5/graph.py
Original file line number Diff line number Diff line change
@@ -1,48 +1,43 @@
# -*- coding: utf-8 -*-
from neo4jrestclient.client import GraphDatabase, Node
import urllib
import re

def uri_is_safe(uri):
LUCENE_UNSAFE = re.compile(r'([-+&|!(){}\[\]^"~*?\\: ])')
def lucene_escape(text):
"""
Determines if this is a correctly-encoded URI, by checking for some
common problems that would cause it to be incorrect.
URIs are searchable with Lucene. This might be awesome, but it means
that when looking them up, we have to escape out special characters by
prepending a backslash to them.
The output of encode_uri() should always pass uri_is_safe().
This should only be done inside a neo4j index.query().
"""
return (isinstance(uri, str) and ',' not in uri and ':' not in uri
and ' ' not in uri and '&' not in uri)
# The first two backslashes are understood by the expression as a
# literal backslash. The final \1 refers to what the expression matched.
#
# Fun fact: if Python didn't have raw strings, the replacement string
# would have to be '\\\\\\1'.
return LUCENE_UNSAFE.sub(r'\\\1', text)

def encode_uri(uri):
"""
Takes in a URI and makes sure it follows our conventions:
- expressed as a UTF-8 string
- spaces are changed to underscores
- URL-encoded, so for example a comma becomes %2C
def normalize_uri(uri):
"""
if isinstance(uri, unicode):
uri = uri.replace(u' ', u'_').encode('utf-8', 'replace')
else:
uri = uri.replace(' ', '_')
return urllib.quote(uri)
Ensure that a URI is in Unicode, strip whitespace that may have crept
in, and change spaces to underscores, creating URIs that will be
friendlier to work with later.
def decode_uri(uri):
We don't worry about URL-quoting here; the client framework takes
care of that for us.
"""
Converts a URI to readable Unicode text.
"""
unquoted = urllib.unquote(uri).decode('utf-8', 'replace')
return unquoted.replace('_', ' ')
if isinstance(uri, str):
uri = uri.decode('utf-8')
return uri.strip().replace(u' ', u'_')

class ConceptNetGraph(object):
def __init__(self, url):
"""
initializes ConceptNetGraph,
creates GraphDatabase and node_index objects
args:
url -- url of neo4j database in use
Create a ConceptNetGraph object, backed by a Neo4j database at the
given URL.
"""

self.graph = GraphDatabase(url)
self._node_index = self.graph.nodes.indexes['node_auto_index']
self._edge_index = self.graph.relationships.indexes['relationship_auto_index']
Expand Down Expand Up @@ -71,17 +66,21 @@ def _create_node(self, uri, properties):
uri -- identifier of intended node, used in index
properties -- (optional) properties for assertions (see assertions)
"""
# Apply normalization to the URI here. All downstream functions can
# assume it's normalized.
uri = normalize_uri(uri)

if uri.count('/') < 2:
raise ValueError("""
The URI %r is too short. You can't create the root or
a type with this method.
""" % uri)

_, type, rest = uri.split('/', 2)
method = getattr(self, '_create_%s_node' % type)
if method is None:
raise ValueError("I don't know how to create type %r" % type)
return method(self, url, rest, properties)
return method(uri, rest, properties)

def _create_concept_node(self, uri, rest, properties):
"""
Expand All @@ -99,7 +98,7 @@ def _create_concept_node(self, uri, rest, properties):
return self.graph.node(
type='concept',
language=language,
name=decode_uri(name),
name=name,
uri=uri,
**properties
)
Expand All @@ -119,7 +118,7 @@ def _create_relation_node(self, uri, rest, properties):
name = rest
return self.graph.node(
type='relation',
name=decode_uri(rel),
name=rel,
uri=uri,
**properties
)
Expand All @@ -144,7 +143,7 @@ def _create_assertion_node(self, uri, rest, properties):
args = []
rel = self.get_or_create_node(rel_uri)
for arg_uri in arg_uris: args.append(self.get_or_create_node(arg_uri))
return self._create_assertion_from_components(uri, relation, args, properties)
return self._create_assertion_from_components(uri, rel, args, properties)

def _create_assertion_from_components(self, uri, relation, args, properties):
"""
Expand All @@ -153,7 +152,7 @@ def _create_assertion_from_components(self, uri, relation, args, properties):
create the assertion.
"""
assertion = self.graph.node(
type=type,
type='assertion',
uri=uri
)
self._create_edge("relation", assertion, relation)
Expand Down Expand Up @@ -203,29 +202,21 @@ def _create_frame_node(self, uri, rest, properties):
name = rest
return self.graph.node(
type='frame',
name=decode_uri(rel),
uri= uri
name=rel,
uri=uri
)

def make_assertion_uri(self, relation_uri, arg_uri_list):
"""creates assertion uri out of component uris"""

for uri in [relation_uri] + arg_uri_list:
if not uri_is_safe(uri):
raise ValueError("The URI %r has unsafe characters in it. "
"Please use encode_uri() first." % uri)
return '/assertion/_' + relation_uri + '/_' + '/_'.join(arg_uri_list)

def get_node(self, uri):
"""
searches for node in main index,
returns either single Node, None or Error (for multiple results)
"""

if not uri_is_safe(uri):
raise ValueError("This URI has unsafe characters in it. "
"Please use encode_uri() first.")
results = self._node_index.query('uri', uri)
uri = normalize_uri(uri)
results = self._node_index.query('uri', lucene_escape(uri))
if len(results) == 1:
return results[0]
elif len(results) == 0:
Expand Down Expand Up @@ -278,13 +269,13 @@ def _any_to_node(self, obj):
else:
raise TypeError

def _create_edge(self, type, source, target, props):
def _create_edge(self, type, source, target, props = {}):
"""
Create an edge and ensure that it is indexed by its nodes.
"""
source = self._any_to_node(source)
target = self._any_to_node(target)
edge = source.relationships.create(type, target, props)
edge = source.relationships.create(type, target, **props)
edge['nodes'] = '%d-%d' % (source.id, target.id)

def get_node_by_id(self, id):
Expand Down Expand Up @@ -331,7 +322,7 @@ def get_or_create_assertion(self, relation, args, properties = {}):
if isinstance(node_uri,Node):
uris.append(node_uri['uri'])
nodes.append(node_uri)
elif uri_is_safe(node_uri):
elif isinstance(node_uri, basestring):
uris.append(node_uri)
nodes.append(self.get_or_create_node(node_uri))
else:
Expand All @@ -351,7 +342,7 @@ def get_or_create_expression(self, frame, args, properties = {}):
args:
relation -- relation node in desired expression
args -- argument nodes desired in expression
properties -- properties for
properties -- properties for FIXME
"""

#uris = []
Expand All @@ -368,7 +359,7 @@ def get_or_create_concept(self, language, name):
name -- name of concept ie. 'dog','fish' etc
"""

uri = "/concept/%s/%s" % (language, uri_encode(name))
uri = "/concept/%s/%s" % (language, name)
return self.get_node(uri) or self._create_node(uri,{})

def get_or_create_relation(self, name):
Expand All @@ -380,18 +371,18 @@ def get_or_create_relation(self, name):
name -- name of relation ie. 'IsA'
"""

uri = "/concept/%s" % (uri_encode(name))
return self.get_node(uri) or self._create_node(uri,{})
uri = "/concept/%s" % name
return self.get_node(uri) or self._create_node(uri, {})

def get_or_create_frame(self, name):
"""
finds or creates frame using name of frame. convenience function.
args:
name -- name of frame, ie. "{1} is used for {2}"
name -- name of frame, ie. "$1 is used for $2"
"""

uri = "/frame/%s" % (uri_encode(name))
uri = "/frame/%s" % name
return self.get_node(uri) or self._create_node(uri,{})

#def get_args(self,assertion):
Expand All @@ -414,14 +405,17 @@ def get_or_create_frame(self, name):

if __name__ == '__main__':
g = ConceptNetGraph('http://localhost:7474/db/data')
a1 = g.get_or_create_node(encode_uri(u"/assertion/_/relation/IsA/_/concept/en/dog/_/concept/en/animal"))
a1 = g.get_or_create_node(u"/assertion/_/relation/IsA/_/concept/en/dog/_/concept/en/animal")

a2 = g.get_or_create_node(encode_uri(u"/assertion/_/relation/UsedFor/_/concept/zh_TW/枕頭/_/concept/zh_TW/睡覺"))
a2 = g.get_or_create_node(u"/assertion/_/relation/UsedFor/_/concept/zh_TW/枕頭/_/concept/zh_TW/睡覺")

a3 = g.get_or_create_node(encode_uri("/assertion/_/relation/IsA/_/concept/en/test_:D/_/concept/en/it works"))
a3 = g.get_or_create_node(u"/assertion/_/relation/IsA/_/concept/en/test_:D/_/concept/en/it works")

g.get_or_create_edge('justify', 0, a1)
g.get_or_create_edge('justify', 0, a2)
print a1['uri'], a1.id
print a2['uri'], a2.id
print a3['uri'], a3.id
print g.get_edge('justify', 0, 474).id
print g.get_edge('justify', 0, a1.id).id
print g.get_edge('justify', 0, a2.id).id

Loading

0 comments on commit b4d5f3e

Please sign in to comment.