Skip to content

Commit

Permalink
output external URI maps as tab-separated files
Browse files Browse the repository at this point in the history
  • Loading branch information
Rob Speer committed May 6, 2016
1 parent 4fc1424 commit d28aefc
Show file tree
Hide file tree
Showing 5 changed files with 21 additions and 17 deletions.
4 changes: 2 additions & 2 deletions Snakefile
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ rule read_umbel:
"data/raw/umbel/{filename}.nt"
output:
"data/edges/umbel/{filename}.msgpack",
"data/edges/umbel/{filename}.links.jsons"
"data/edges/umbel/{filename}.links.csv"
shell:
"python3 -m conceptnet5.readers.umbel data/raw/umbel/ {output}"

Expand Down Expand Up @@ -211,7 +211,7 @@ rule read_wordnet:
"data/raw/wordnet-rdf/wn31.nt"
output:
"data/edges/wordnet/wordnet.msgpack",
"data/edges/wordnet/wordnet.links.jsons"
"data/edges/wordnet/wordnet.links.csv"
shell:
"python3 -m conceptnet5.readers.wordnet {input} {output}"

Expand Down
8 changes: 5 additions & 3 deletions TODO.txt
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
Design of ConceptNet 5.5:

- [x] link the API responses (fix Pygments order)
- [ ] Don't stem things on the way in
- [x] Don't stem things on the way in
- [ ] Use the blacklist when making assertions
- [ ] Figure out how to keep more Verbosity
- [x] Figure out how to keep more Verbosity
- [ ] Coarsen Wiktionary using etymologies
- [ ] Use JSON-LD when possible
- [X] Store the index in a reasonable constant database
- [ ] Handle 'sw-maps' correctly, no proliferation of N-triples, no double-escaping
- [x] Store the index in a reasonable constant database

ConceptNet 5.6 perhaps:

Expand Down
8 changes: 4 additions & 4 deletions conceptnet5/readers/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -96,8 +96,8 @@ def run_umbel(input_dir, output, mapping):
input_dir: a directory containing N-Triples files of Umbel data
output: a msgpack file of edges
mapping: an N-Triples output file that will map external Semantic Web URIs to
ConceptNet URIs
mapping: a tab-separated output file that will map external Semantic Web
URIs to ConceptNet URIs
"""
umbel.handle_file(input_dir, output, mapping)

Expand Down Expand Up @@ -151,7 +151,7 @@ def run_wordnet(input, output, mapping):
input: an .nt file of WordNet data
output: a msgpack file of edges
mapping: an N-Triples output file that will map external Semantic Web URIs to
ConceptNet URIs
mapping: a tab-separated output file that will map external Semantic Web
URIs to ConceptNet URIs
"""
wordnet.handle_file(input, output, mapping)
8 changes: 4 additions & 4 deletions conceptnet5/readers/umbel.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ def run_umbel(input_dir, output_file, sw_map_file):
ConceptNet.
"""
out = MsgpackStreamWriter(output_file)
map_out = NTriplesWriter(sw_map_file)
map_out = open(sw_map_file, 'w', encoding='utf-8')
reader = NTriplesReader()

labels = {}
Expand Down Expand Up @@ -113,9 +113,9 @@ def run_umbel(input_dir, output_file, sw_map_file):
rel_uri, frame = REL_MAPPING[rel_name]
surface = frame % (labels[web_subj], labels[web_obj])
out.write(umbel_edge(rel_uri, subj_uri, obj_uri, surface, SOURCE))
map_out.write_link(web_rel, full_conceptnet_url(rel_uri))
map_out.write_link(web_subj, full_conceptnet_url(subj_uri))
map_out.write_link(web_obj, full_conceptnet_url(obj_uri))
map_out.write('{}\t{}\n'.format(web_rel, full_conceptnet_url(rel_uri)))
map_out.write('{}\t{}\n'.format(web_subj, full_conceptnet_url(subj_uri)))
map_out.write('{}\t{}\n'.format(web_obj, full_conceptnet_url(obj_uri)))

# altLabel relations assign different texts to the same node. We'll
# represent those in ConceptNet with Synonym relations.
Expand Down
10 changes: 6 additions & 4 deletions conceptnet5/readers/wordnet.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from conceptnet5.edges import make_edge
from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
from conceptnet5.formats.semantic_web import (
NTriplesReader, NTriplesWriter, resource_name, full_conceptnet_url
NTriplesReader, resource_name, full_conceptnet_url
)
import re
import os
Expand Down Expand Up @@ -99,7 +99,6 @@ def label_sort_key(label):
def run_wordnet(input_file, output_file, sw_map_file):
reader = NTriplesReader()
out = MsgpackStreamWriter(output_file)
map_out = NTriplesWriter(sw_map_file)

synset_senses = defaultdict(list)
sense_synsets = {}
Expand All @@ -110,7 +109,6 @@ def run_wordnet(input_file, output_file, sw_map_file):
synset_glosses = {}
synset_disambig = {}
synset_uris = {}
term_info = {}

# First pass: find data about synsets
for subj, rel, obj, objtag in reader.parse_file(input_file):
Expand Down Expand Up @@ -189,7 +187,6 @@ def run_wordnet(input_file, output_file, sw_map_file):
)
out.write(edge)


for subj, rel, obj, objtag in reader.parse_file(input_file):
relname = resource_name(rel)
if relname in REL_MAPPING:
Expand Down Expand Up @@ -230,6 +227,11 @@ def run_wordnet(input_file, output_file, sw_map_file):
)
out.write(edge)

with open(sw_map_file, 'w', encoding='utf-8') as map_out:
for wn_uri in sorted(synset_uris):
cn_uri = synset_uris[wn_uri]
print("{}\t{}".format(wn_uri, cn_uri), file=map_out)


# Entry point for testing
handle_file = run_wordnet
Expand Down

0 comments on commit d28aefc

Please sign in to comment.