diff --git a/conceptnet5/api.py b/conceptnet5/api.py index 3ee2c2ec..a58a9bfc 100644 --- a/conceptnet5/api.py +++ b/conceptnet5/api.py @@ -2,8 +2,8 @@ This file defines the ConceptNet web API responses. """ +from conceptnet5.nodes import ld_node, standardized_concept_uri from conceptnet5.vectors.query import VectorSpaceWrapper -from conceptnet5.nodes import standardized_concept_uri, ld_node VECTORS = VectorSpaceWrapper() FINDER = VECTORS.finder diff --git a/conceptnet5/builders/cli.py b/conceptnet5/builders/cli.py index d335973e..6a65055d 100644 --- a/conceptnet5/builders/cli.py +++ b/conceptnet5/builders/cli.py @@ -1,7 +1,8 @@ import click + from .combine_assertions import combine_assertions -from .reduce_assoc import reduce_assoc from .morphology import prepare_vocab_for_morphology, subwords_to_edges +from .reduce_assoc import reduce_assoc @click.group() @@ -25,12 +26,14 @@ def run_combine(input, output): @cli.command(name='reduce_assoc') @click.argument('assoc_filename', type=click.Path(readable=True, dir_okay=False)) -@click.argument('embedding_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False)) +@click.argument( + 'embedding_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False) +) @click.argument('output', type=click.Path(writable=True, dir_okay=False)) def run_reduce_assoc(assoc_filename, embedding_filenames, output): """ - Takes in a file of tab-separated simple associations, one or more - hdf5 files defining vector embeddings, and removes from the associations + Takes in a file of tab-separated simple associations, one or more + hdf5 files defining vector embeddings, and removes from the associations low-frequency terms and associations that are judged unlikely to be useful by various filters. """ diff --git a/conceptnet5/builders/combine_assertions.py b/conceptnet5/builders/combine_assertions.py index 3fffcf4c..c5accf8b 100644 --- a/conceptnet5/builders/combine_assertions.py +++ b/conceptnet5/builders/combine_assertions.py @@ -1,16 +1,18 @@ -from __future__ import unicode_literals, print_function - import itertools import json - import os from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.languages import ALL_LANGUAGES from conceptnet5.readers.wiktionary import valid_language -from conceptnet5.uri import conjunction_uri,get_uri_language, is_absolute_url, Licenses, \ - split_uri, uri_prefix +from conceptnet5.uri import ( + Licenses, + conjunction_uri, + get_uri_language, + is_absolute_url, + split_uri, +) from conceptnet5.util import get_support_data_filename N = 100 @@ -91,9 +93,14 @@ def make_assertion(line_group): license = Licenses.cc_attribution return make_edge( - rel=rel, start=start, end=end, weight=weight, - dataset=dataset, license=license, sources=sources, - surfaceText=surface_text + rel=rel, + start=start, + end=end, + weight=weight, + dataset=dataset, + license=license, + sources=sources, + surfaceText=surface_text, ) @@ -113,6 +120,7 @@ def combine_assertions(input_filename, output_filename): This process requires its input to be a sorted CSV so that all edges for the same assertion will appear consecutively. """ + def group_func(line): "Group lines by their URI (their first column)." 
return line.split('\t', 1)[0] diff --git a/conceptnet5/builders/morphology.py b/conceptnet5/builders/morphology.py index 49b12c74..e165a9cf 100644 --- a/conceptnet5/builders/morphology.py +++ b/conceptnet5/builders/morphology.py @@ -4,7 +4,7 @@ from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.languages import ATOMIC_SPACE_LANGUAGES from conceptnet5.nodes import split_uri -from conceptnet5.uri import get_uri_language, join_uri, Licenses +from conceptnet5.uri import Licenses, get_uri_language, join_uri def prepare_vocab_for_morphology(language, input, output): @@ -61,11 +61,13 @@ def subwords_to_edges(language, input, output): if chunk != '_': start = join_uri('x', language, chunk.strip('_')) edge = make_edge( - '/r/SubwordOf', start, end, + '/r/SubwordOf', + start, + end, dataset='/d/morphology', license=Licenses.cc_attribution, sources=MORPH_SOURCES, - weight=0.01 + weight=0.01, ) writer.write(edge) writer.close() diff --git a/conceptnet5/builders/reduce_assoc.py b/conceptnet5/builders/reduce_assoc.py index a2444679..3566b568 100644 --- a/conceptnet5/builders/reduce_assoc.py +++ b/conceptnet5/builders/reduce_assoc.py @@ -5,10 +5,11 @@ from collections import defaultdict +import pandas as pd + from conceptnet5.relations import is_negative_relation from conceptnet5.uri import is_concept, uri_prefix from conceptnet5.vectors.formats import load_hdf -import pandas as pd def concept_is_bad(uri): @@ -19,14 +20,19 @@ def concept_is_bad(uri): specific phrase, possibly mis-parsed. A concept with a colon is probably detritus from a wiki. """ - return (':' in uri or uri.count('_') >= 3 or - uri.startswith('/a/') or uri.count('/') <= 2) + return ( + ':' in uri + or uri.count('_') >= 3 + or uri.startswith('/a/') + or uri.count('/') <= 2 + ) class ConceptNetAssociationGraph: ''' Class to hold the concept-association edge graph. ''' + def __init__(self): '''Construct a graph with no vertices or edges.''' self.vertex_to_neighbors = defaultdict(set) @@ -43,15 +49,15 @@ def vertices(self): def find_components(self): ''' - Returns a dict mapping the vertices of the graph to labels, - such that two vertices map to the same label if and only if - they belong to the same connected component of the undirected - graph obtained by adding the reversal of every edge to the - graph. (But note that this function does not modify the graph, + Returns a dict mapping the vertices of the graph to labels, + such that two vertices map to the same label if and only if + they belong to the same connected component of the undirected + graph obtained by adding the reversal of every edge to the + graph. (But note that this function does not modify the graph, i.e. it does not add any edges.) ''' - - component_labels = {vertex : -1 for vertex in self.vertices()} + + component_labels = {vertex: -1 for vertex in self.vertices()} vertices_to_examine = set(self.vertices()) new_label = -1 while len(vertices_to_examine) > 0: @@ -72,19 +78,18 @@ def find_components(self): return component_labels @classmethod - def from_csv(cls, filename, filtered_concepts=None, - reject_negative_relations=True): + def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True): """ - Reads an association file and builds an (undirected) graph from it. + Reads an association file and builds an (undirected) graph from it. - If filtered_concepts isn't None, it should be a collection of concepts, - and only vertices from this collection and edges that link two such - vertices will be added to the graph. 
If it _is_ None (the default), - however, please note that no such filtering will be done (i.e. the - effective filter collection is then the universal set of concepts, not + If filtered_concepts isn't None, it should be a collection of concepts, + and only vertices from this collection and edges that link two such + vertices will be added to the graph. If it _is_ None (the default), + however, please note that no such filtering will be done (i.e. the + effective filter collection is then the universal set of concepts, not the empty set). - If reject_negative_relations is True (the default), only edges not + If reject_negative_relations is True (the default), only edges not corresponding to negative relations will be added to the graph. """ graph = cls() @@ -119,29 +124,29 @@ def from_csv(cls, filename, filtered_concepts=None, class ConceptNetAssociationGraphForReduction(ConceptNetAssociationGraph): """ - Subclass of ConceptNetAssociationGraph specialized for use in making + Subclass of ConceptNetAssociationGraph specialized for use in making the reduced subgraph of a full set of associations. """ + def __init__(self): super().__init__() self.edges = [] - + def add_edge(self, left, right, value, dataset, relation): """ - In addition to the superclass's handling of a new edge, + In addition to the superclass's handling of a new edge, saves the full edge data. """ super().add_edge(left, right, value, dataset, relation) self.edges.append((left, right, value, dataset, relation)) - def make_filtered_concepts(filename, cutoff=3, en_cutoff=3): """ - Takes in a file of tab-separated associations, and returns a set of - concepts from which those which are unlikely to be useful have been - removed. - + Takes in a file of tab-separated associations, and returns a set of + concepts from which those which are unlikely to be useful have been + removed. + All concepts that occur fewer than `cutoff` times will be removed. All English concepts that occur fewer than `en_cutoff` times will be removed. """ @@ -160,20 +165,18 @@ def make_filtered_concepts(filename, cutoff=3, en_cutoff=3): counts[gright] += 1 filtered_concepts = { - concept for (concept, count) in counts.items() - if ( - count >= en_cutoff or - (not is_concept(concept) and count >= cutoff) - ) + concept + for (concept, count) in counts.items() + if (count >= en_cutoff or (not is_concept(concept) and count >= cutoff)) } return filtered_concepts def read_embedding_vocabularies(filenames): """ - Reads every vector embedding file in the given collection of - filenames, and returns the union of their vocabularies. (The - files are assumed to be hdf5 files containing dataframes, and + Reads every vector embedding file in the given collection of + filenames, and returns the union of their vocabularies. (The + files are assumed to be hdf5 files containing dataframes, and the vocabularies are their indices. """ result = pd.Index([]) @@ -183,28 +186,29 @@ def read_embedding_vocabularies(filenames): return result - -def reduce_assoc(assoc_filename, embedding_filenames, output_filename, - cutoff=3, en_cutoff=3): +def reduce_assoc( + assoc_filename, embedding_filenames, output_filename, cutoff=3, en_cutoff=3 +): """ Takes in a file of tab-separated simple associations, and removes - uncommon associations and associations unlikely to be useful. 
Also - requires one or more vector embedding files (from which only the - vocabularies are used; associations involving terms that have no - connection, no matter how distant, to the union of those vocabularies + uncommon associations and associations unlikely to be useful. Also + requires one or more vector embedding files (from which only the + vocabularies are used; associations involving terms that have no + connection, no matter how distant, to the union of those vocabularies will be removed). All concepts that occur fewer than `cutoff` times will be removed. All English concepts that occur fewer than `en_cutoff` times will be removed. """ - filtered_concepts = make_filtered_concepts(assoc_filename, cutoff=cutoff, - en_cutoff=en_cutoff) + filtered_concepts = make_filtered_concepts( + assoc_filename, cutoff=cutoff, en_cutoff=en_cutoff + ) graph = ConceptNetAssociationGraphForReduction.from_csv( assoc_filename, filtered_concepts=filtered_concepts, - reject_negative_relations=True + reject_negative_relations=True, ) component_labels = graph.find_components() @@ -215,11 +219,11 @@ def reduce_assoc(assoc_filename, embedding_filenames, output_filename, # from any of the embedding vocabularies, there will be no way to assign # vectors to any of its vertices, so we remove that component from the # output. - - good_component_labels = set(label for term, label - in component_labels.items() - if term in embedding_vocab) - + + good_component_labels = set( + label for term, label in component_labels.items() if term in embedding_vocab + ) + with open(output_filename, 'w', encoding='utf-8') as out: for gleft, gright, value, dataset, rel in graph.edges: if component_labels[gleft] not in good_component_labels: diff --git a/conceptnet5/db/cli.py b/conceptnet5/db/cli.py index 2daef5f3..3e5fa23d 100644 --- a/conceptnet5/db/cli.py +++ b/conceptnet5/db/cli.py @@ -1,7 +1,8 @@ import click -from .connection import get_db_connection, check_db_connection + +from .connection import check_db_connection, get_db_connection from .prepare_data import assertions_to_sql_csv, load_sql_csv -from .schema import create_tables, create_indices +from .schema import create_indices, create_tables @click.group() @@ -11,13 +12,18 @@ def cli(): @cli.command(name='prepare_data') @click.argument('input_filename', type=click.Path(readable=True, dir_okay=False)) -@click.argument('output_dir', type=click.Path(writable=True, dir_okay=True, file_okay=False)) +@click.argument( + 'output_dir', type=click.Path(writable=True, dir_okay=True, file_okay=False) +) def prepare_data(input_filename, output_dir): assertions_to_sql_csv(input_filename, output_dir) @cli.command(name='load_data') -@click.argument('input_dir', type=click.Path(readable=True, writable=True, dir_okay=True, file_okay=False)) +@click.argument( + 'input_dir', + type=click.Path(readable=True, writable=True, dir_okay=True, file_okay=False), +) def load_data(input_dir): conn = get_db_connection() create_tables(conn) diff --git a/conceptnet5/db/connection.py b/conceptnet5/db/connection.py index ecf0433d..2b131a63 100644 --- a/conceptnet5/db/connection.py +++ b/conceptnet5/db/connection.py @@ -1,4 +1,5 @@ import psycopg2 + from conceptnet5.db import config _CONNECTIONS = {} diff --git a/conceptnet5/db/prepare_data.py b/conceptnet5/db/prepare_data.py index 78f1a02f..80aa7ab5 100644 --- a/conceptnet5/db/prepare_data.py +++ b/conceptnet5/db/prepare_data.py @@ -1,8 +1,9 @@ +import json + from conceptnet5.formats.msgpack_stream import read_msgpack_stream -from conceptnet5.uri import 
uri_prefixes from conceptnet5.relations import SYMMETRIC_RELATIONS +from conceptnet5.uri import uri_prefixes from ordered_set import OrderedSet -import json def write_row(outfile, items): diff --git a/conceptnet5/db/query.py b/conceptnet5/db/query.py index 5b085307..828e1da9 100644 --- a/conceptnet5/db/query.py +++ b/conceptnet5/db/query.py @@ -1,8 +1,9 @@ -from conceptnet5.db.connection import get_db_connection +import itertools +import json + from conceptnet5.db.config import DB_NAME +from conceptnet5.db.connection import get_db_connection from conceptnet5.edges import transform_for_linked_data -import json -import itertools from ftfy.fixes import remove_control_chars LIST_QUERIES = {} @@ -109,11 +110,10 @@ def gin_jsonb_value(criteria, node_forward=True): 'end': 'end', 'rel': 'rel', 'dataset': 'dataset', - # edges have a 'sources' element, but the query key we've historically # accepted is 'source', so let's just accept both 'source': 'sources', - 'sources': 'sources' + 'sources': 'sources', } if node_forward: criteria_map['node'] = 'start' @@ -135,6 +135,7 @@ class AssertionFinder(object): The object that interacts with the database to find ConcetNet assertions (edges) matching certain criteria. """ + def __init__(self, dbname=None): self.connection = None self.dbname = dbname @@ -192,7 +193,9 @@ def feature_data(row): cursor.execute(NODE_TO_FEATURE_QUERY, {'node': uri, 'limit': limit}) results = {} for feature, rows in itertools.groupby(cursor.fetchall(), extract_feature): - results[feature] = [transform_for_linked_data(feature_data(row)) for row in rows] + results[feature] = [ + transform_for_linked_data(feature_data(row)) for row in rows + ] return results def lookup_assertion(self, uri): @@ -217,7 +220,9 @@ def random_edges(self, limit=20): self.connection = get_db_connection(self.dbname) cursor = self.connection.cursor() cursor.execute(RANDOM_QUERY, {'limit': limit}) - results = [transform_for_linked_data(data) for uri, data, weight in cursor.fetchall()] + results = [ + transform_for_linked_data(data) for uri, data, weight in cursor.fetchall() + ] return results def query(self, criteria, limit=20, offset=0): @@ -237,18 +242,14 @@ def query(self, criteria, limit=20, offset=0): 'query_forward': jsonify(query_forward), 'query_backward': jsonify(query_backward), 'limit': limit, - 'offset': offset - } + 'offset': offset, + }, ) else: query = gin_jsonb_value(criteria) cursor.execute( GIN_QUERY_1WAY, - { - 'query': jsonify(query), - 'limit': limit, - 'offset': offset - } + {'query': jsonify(query), 'limit': limit, 'offset': offset}, ) results = [ diff --git a/conceptnet5/edges.py b/conceptnet5/edges.py index abe5abb6..4ebdf9e2 100644 --- a/conceptnet5/edges.py +++ b/conceptnet5/edges.py @@ -4,15 +4,24 @@ an edge. """ -from conceptnet5.uri import ( - assertion_uri, uri_prefix, conjunction_uri, is_concept -) -from conceptnet5.nodes import ld_node import re - -def make_edge(rel, start, end, dataset, license, sources, - surfaceText=None, surfaceStart=None, surfaceEnd=None, weight=1.0): +from conceptnet5.nodes import ld_node +from conceptnet5.uri import assertion_uri, conjunction_uri, is_concept, uri_prefix + + +def make_edge( + rel, + start, + end, + dataset, + license, + sources, + surfaceText=None, + surfaceStart=None, + surfaceEnd=None, + weight=1.0, +): """ Take in the information representing an edge (a justified assertion), and output that edge in dictionary form. 
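
Calls to make_edge throughout this patch are reformatted to one keyword argument per line. A minimal sketch of a call in that style — only the signature and Licenses.cc_attribution come from this patch; the URIs, dataset, contributor and surface text below are hypothetical:

from conceptnet5.edges import make_edge
from conceptnet5.uri import Licenses

# Build a single ConceptNet edge dictionary; every field is passed by keyword.
edge = make_edge(
    rel='/r/IsA',
    start='/c/en/dog',
    end='/c/en/animal',
    dataset='/d/example',                # hypothetical dataset URI
    license=Licenses.cc_attribution,
    sources=[{'contributor': '/s/contributor/omcs/example_user'}],  # hypothetical source
    surfaceText='[[a dog]] is [[an animal]]',
    weight=1.0,
)
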
@@ -49,7 +58,7 @@ def make_edge(rel, start, end, dataset, license, sources, features = [ "%s %s -" % (pstart, rel), "%s - %s" % (pstart, pend), - "- %s %s" % (rel, pend) + "- %s %s" % (rel, pend), ] else: features = [] @@ -72,7 +81,7 @@ def make_edge(rel, start, end, dataset, license, sources, 'weight': weight, 'surfaceText': surfaceText, 'surfaceStart': surfaceStart, - 'surfaceEnd': surfaceEnd + 'surfaceEnd': surfaceEnd, } return obj diff --git a/conceptnet5/formats/convert.py b/conceptnet5/formats/convert.py index 8f2e5c8d..7c2edd8a 100644 --- a/conceptnet5/formats/convert.py +++ b/conceptnet5/formats/convert.py @@ -1,11 +1,12 @@ -from __future__ import unicode_literals, print_function -from conceptnet5.languages import COMMON_LANGUAGES -from conceptnet5.uri import get_uri_language, join_uri, split_uri -from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter, read_msgpack_stream -from conceptnet5.formats.json_stream import JSONStreamWriter, read_json_stream +import json from collections import defaultdict + import click -import json + +from conceptnet5.formats.json_stream import JSONStreamWriter, read_json_stream +from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter, read_msgpack_stream +from conceptnet5.languages import COMMON_LANGUAGES +from conceptnet5.uri import get_uri_language, join_uri, split_uri def msgpack_to_json(input_filename, output_filename): @@ -39,7 +40,7 @@ def msgpack_to_tab_separated(input_filename, output_filename): 'weight': round(info['weight'], 3), 'sources': info['sources'], 'dataset': info['dataset'], - 'license': info['license'] + 'license': info['license'], } for extra_key in 'surfaceText', 'surfaceStart', 'surfaceEnd': if info.get(extra_key): @@ -77,8 +78,8 @@ def msgpack_to_assoc(input_filename, output_filename): start_uri = info['start'] end_uri = info['end'] if not ( - get_uri_language(start_uri) in COMMON_LANGUAGES and - get_uri_language(end_uri) in COMMON_LANGUAGES + get_uri_language(start_uri) in COMMON_LANGUAGES + and get_uri_language(end_uri) in COMMON_LANGUAGES ): continue rel = info['rel'] @@ -95,7 +96,7 @@ def msgpack_to_assoc(input_filename, output_filename): end=prefix, weight=1., dataset=dataset, - rel='/r/SenseOf' + rel='/r/SenseOf', ) weight_by_dataset[dataset] += 1. 
count_by_dataset[dataset] += 1 @@ -120,8 +121,7 @@ def msgpack_to_assoc(input_filename, output_filename): for (start, end) in pairs: line = "{start}\t{end}\t{weight}\t{dataset}\t{rel}".format( - start=start, end=end, weight=weight, dataset=dataset, - rel=rel + start=start, end=end, weight=weight, dataset=dataset, rel=rel ) weight_by_dataset[dataset] += weight count_by_dataset[dataset] += 1 diff --git a/conceptnet5/formats/json_stream.py b/conceptnet5/formats/json_stream.py index cfbd342c..d351798b 100644 --- a/conceptnet5/formats/json_stream.py +++ b/conceptnet5/formats/json_stream.py @@ -1,6 +1,6 @@ +import gzip import json import sys -import gzip class JSONStreamWriter(object): diff --git a/conceptnet5/formats/msgpack_stream.py b/conceptnet5/formats/msgpack_stream.py index dc1b413f..8f41dc56 100644 --- a/conceptnet5/formats/msgpack_stream.py +++ b/conceptnet5/formats/msgpack_stream.py @@ -1,5 +1,4 @@ import msgpack -import sys class MsgpackStreamWriter(object): diff --git a/conceptnet5/formats/semantic_web.py b/conceptnet5/formats/semantic_web.py index ebb54e50..41e7ee4e 100644 --- a/conceptnet5/formats/semantic_web.py +++ b/conceptnet5/formats/semantic_web.py @@ -1,7 +1,9 @@ -from ftfy.fixes import decode_escapes +import re import urllib + import langcodes -import re + +from ftfy.fixes import decode_escapes SEE_ALSO = 'http://www.w3.org/2000/01/rdf-schema#seeAlso' @@ -86,7 +88,8 @@ def resource_name(url): return path.split('/')[-1] -NQUADS_ITEM_RE = re.compile(r''' +NQUADS_ITEM_RE = re.compile( + r''' ( < # A URL (URI, IRI) enclosed in angle brackets (?P [^> ]+) > @@ -102,7 +105,9 @@ def resource_name(url): (?P[A-Za-z0-9_]+) # A blank node identifier | [#] (?P.*) # The line could end with a comment )\s* - ''', re.VERBOSE) + ''', + re.VERBOSE, +) def parse_nquads_line(line): diff --git a/conceptnet5/language/lemmatize.py b/conceptnet5/language/lemmatize.py index 4807cf97..66d71f16 100644 --- a/conceptnet5/language/lemmatize.py +++ b/conceptnet5/language/lemmatize.py @@ -1,8 +1,8 @@ import sqlite3 + import wordfreq +from conceptnet5.uri import join_uri, split_uri from conceptnet5.util import get_data_filename -from conceptnet5.uri import split_uri, join_uri - WORDFREQ_LANGUAGES = set(wordfreq.available_languages()) @@ -193,6 +193,7 @@ def lemmatize_uri(self, uri): root, _form = self.lookup(language, text, pos) return join_uri('c', language, root, *rest) + LEMMATIZER = DBLemmatizer() diff --git a/conceptnet5/language/token_utils.py b/conceptnet5/language/token_utils.py index 160ab146..b71b61d9 100644 --- a/conceptnet5/language/token_utils.py +++ b/conceptnet5/language/token_utils.py @@ -1,15 +1,9 @@ -# coding: utf-8 """ This file contains some generally useful operations you would perform to separate and join tokens. The tools apply most to English, but should also be able to do their job in any Western language that uses spaces. """ -from __future__ import unicode_literals - import re -import sys - -PY2 = (sys.version_info.major < 3) def untokenize(tokens): diff --git a/conceptnet5/nodes.py b/conceptnet5/nodes.py index 1b8c262b..a44bcc83 100644 --- a/conceptnet5/nodes.py +++ b/conceptnet5/nodes.py @@ -4,11 +4,8 @@ terms and languages into a standard form (english_filter, simple_tokenize, LCODE_ALIASES). 
""" -from urllib.parse import urlparse - import re -from wordfreq import simple_tokenize -from wordfreq.preprocess import preprocess_text +from urllib.parse import urlparse from conceptnet5.language.english import english_filter from conceptnet5.languages import LCODE_ALIASES @@ -20,6 +17,8 @@ uri_prefix, uri_to_label, ) +from wordfreq import simple_tokenize +from wordfreq.preprocess import preprocess_text def preprocess_and_tokenize_text(lang, text): diff --git a/conceptnet5/readers/cc_cedict.py b/conceptnet5/readers/cc_cedict.py index 09b62317..96932315 100644 --- a/conceptnet5/readers/cc_cedict.py +++ b/conceptnet5/readers/cc_cedict.py @@ -32,14 +32,17 @@ DATE_RANGE_REGEX = re.compile(r'(.+?)\s\(.+\d.+\),') # date range DEFINITIONS_REGEX = re.compile(r'/|;') # separate definitions HAN_CHAR_REGEX = regex.compile('([\p{IsIdeo}]+[\|ยท]?)+') # Han characters -LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/') # separate traditional and simplified words +LINE_REGEX = re.compile( + r'(.+)\s(.+)\[.+\]\s/(.+)/' +) # separate traditional and simplified words LIT_FIG_REGEX = re.compile(r'(\b|\s)(fig|lit).\s') # literally/figuratively PAREN_REGEX = re.compile(r'\(.+?\)') # parenthesis SB_REGEX = re.compile(r'\b(sb)\b') STH_REGEX = re.compile(r'\b(sth)\b') SEE_ALSO_REGEX = re.compile(r'see( also)?') # see also -VARIANT_REGEX = re.compile(r'((old |Japanese )?variant of|archaic version of|also ' - r'written|same as)\s') # variant syntax +VARIANT_REGEX = re.compile( + r'((old |Japanese )?variant of|archaic version of|also ' r'written|same as)\s' +) # variant syntax def remove_reference_syntax(definition): @@ -116,12 +119,14 @@ def handle_file(filename, output_file): traditional, simplified, definitions = re.match(LINE_REGEX, line).groups() # Make an edge between the traditional and simplified version - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hant', traditional), - end=standardized_concept_uri('zh-Hans', simplified), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hant', traditional), + end=standardized_concept_uri('zh-Hans', simplified), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) for definition in re.split(DEFINITIONS_REGEX, definitions): @@ -135,20 +140,24 @@ def handle_file(filename, output_file): if person_match: persons = extract_person(person_match) for person in persons: - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hant', traditional), - end=standardized_concept_uri('en', person), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hant', traditional), + end=standardized_concept_uri('en', person), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hans', simplified), - end=standardized_concept_uri('en', person), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hans', simplified), + end=standardized_concept_uri('en', person), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) continue @@ -156,20 +165,24 @@ def handle_file(filename, output_file): if definition.startswith('CL:'): related_words = extract_measure_words(definition) for word in related_words: - edge = make_edge(rel='/r/RelatedTo', - start=standardized_concept_uri('zh-Hant', 
traditional), - end=standardized_concept_uri('zh', word), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/RelatedTo', + start=standardized_concept_uri('zh-Hant', traditional), + end=standardized_concept_uri('zh', word), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) - edge = make_edge(rel='/r/RelatedTo', - start=standardized_concept_uri('zh-Hans', simplified), - end=standardized_concept_uri('zh', word), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/RelatedTo', + start=standardized_concept_uri('zh-Hans', simplified), + end=standardized_concept_uri('zh', word), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) continue @@ -180,40 +193,48 @@ def handle_file(filename, output_file): if re.match(VARIANT_REGEX, definition) or re.match(ABBR_REGEX, definition): variants = extract_han_characters(definition) for variant in variants: - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hant', traditional), - end=standardized_concept_uri('zh', variant), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hant', traditional), + end=standardized_concept_uri('zh', variant), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hans', simplified), - end=standardized_concept_uri('zh', variant), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hans', simplified), + end=standardized_concept_uri('zh', variant), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) continue if re.match(SEE_ALSO_REGEX, definition): references = extract_han_characters(definition) for reference in references: - edge = make_edge(rel='/r/RelatedTo', - start=standardized_concept_uri('zh-Hant', traditional), - end=standardized_concept_uri('zh', reference), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/RelatedTo', + start=standardized_concept_uri('zh-Hant', traditional), + end=standardized_concept_uri('zh', reference), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) - edge = make_edge(rel='/r/RelatedTo', - start=standardized_concept_uri('zh-Hans', simplified), - end=standardized_concept_uri('zh', reference), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/RelatedTo', + start=standardized_concept_uri('zh-Hans', simplified), + end=standardized_concept_uri('zh', reference), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) # Remove 'lit.', 'fig.' 
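
The regexes at the top of cc_cedict.py drive this whole reader. A small self-contained sketch of how one CC-CEDICT-style entry is split — the patterns are copied from the module, but the entry line itself is made up:

import re

LINE_REGEX = re.compile(r'(.+)\s(.+)\[.+\]\s/(.+)/')  # traditional, simplified, definitions
DEFINITIONS_REGEX = re.compile(r'/|;')                # separates individual glosses

line = '蘋果 苹果 [ping2 guo3] /apple/apple tree/'      # hypothetical dictionary entry
traditional, simplified, definitions = re.match(LINE_REGEX, line).groups()
for definition in re.split(DEFINITIONS_REGEX, definitions):
    print(traditional.strip(), simplified.strip(), definition)
# -> 蘋果 苹果 apple
# -> 蘋果 苹果 apple tree
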
@@ -229,18 +250,22 @@ def handle_file(filename, output_file): # Skip long definitions and make an edge out of remaining information if len(definition.split()) < 6: - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hant', traditional), - end=standardized_concept_uri('en', definition), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hant', traditional), + end=standardized_concept_uri('en', definition), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) - edge = make_edge(rel='/r/Synonym', - start=standardized_concept_uri('zh-Hans', simplified), - end=standardized_concept_uri('en', definition), - dataset=DATASET, - license=LICENSE, - sources=SOURCE) + edge = make_edge( + rel='/r/Synonym', + start=standardized_concept_uri('zh-Hans', simplified), + end=standardized_concept_uri('en', definition), + dataset=DATASET, + license=LICENSE, + sources=SOURCE, + ) out.write(edge) diff --git a/conceptnet5/readers/cli.py b/conceptnet5/readers/cli.py index eefef787..fa431f78 100644 --- a/conceptnet5/readers/cli.py +++ b/conceptnet5/readers/cli.py @@ -1,7 +1,18 @@ import click + from . import ( - cc_cedict, conceptnet4, dbpedia, emoji, jmdict, kyoto_yahoo, - nadya, ptt_petgame, opencyc, verbosity, wiktionary, wordnet + cc_cedict, + conceptnet4, + dbpedia, + emoji, + jmdict, + kyoto_yahoo, + nadya, + opencyc, + ptt_petgame, + verbosity, + wiktionary, + wordnet, ) @@ -24,12 +35,16 @@ def run_conceptnet4(input, output): @cli.command(name='jmdict') -@click.argument('input', type=click.Path(readable=True, dir_okay=False), - # help='XML file containing JMDict' - ) -@click.argument('output', type=click.Path(writable=True, dir_okay=False), - # help='msgpack file to output to' - ) +@click.argument( + 'input', + type=click.Path(readable=True, dir_okay=False), + # help='XML file containing JMDict' +) +@click.argument( + 'output', + type=click.Path(writable=True, dir_okay=False), + # help='msgpack file to output to' +) def run_jmdict(input, output): """ Import JMDict (a multilingual Japanese dictionary) from its XML format. @@ -112,8 +127,7 @@ def run_verbosity(input, output): @cli.command(name='wiktionary_pre') -@click.argument('inputs', type=click.Path(readable=True, dir_okay=False), - nargs=-1) +@click.argument('inputs', type=click.Path(readable=True, dir_okay=False), nargs=-1) @click.argument('output', type=click.Path(writable=True, dir_okay=False)) def run_wiktionary_pre(inputs, output): """ diff --git a/conceptnet5/readers/conceptnet4.py b/conceptnet5/readers/conceptnet4.py index 96a522fa..f23cadd2 100644 --- a/conceptnet5/readers/conceptnet4.py +++ b/conceptnet5/readers/conceptnet4.py @@ -1,55 +1,168 @@ -from __future__ import unicode_literals +from conceptnet5.edges import make_edge +from conceptnet5.formats.json_stream import read_json_stream +from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter +from conceptnet5.language.lemmatize import lemmatize_uri +from conceptnet5.nodes import standardized_concept_uri, valid_concept_name +from conceptnet5.uri import Licenses, join_uri +from wordfreq import simple_tokenize + """ This script reads the ConceptNet 4 data out of the flat files in raw_data, and builds ConceptNet 5 edges from the data. 
""" -from wordfreq import simple_tokenize -from conceptnet5.formats.json_stream import read_json_stream -from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.nodes import ( - standardized_concept_uri, valid_concept_name -) -from conceptnet5.edges import make_edge -from conceptnet5.language.lemmatize import lemmatize_uri -from conceptnet5.uri import join_uri, Licenses # bedume is a prolific OMCS contributor who seemed to go off the rails at some # point, adding lots of highly correlated nonsense assertions. We need to # filter them out without losing his informative statements. BEDUME_FLAGGED_CONCEPTS = [ - 'cute', 'lost', 'sewing', 'brat', 'working', 'have sex', 'shopping', - 'driving to work', 'typing', 'in jail', 'jogging in park', 'washing his car', - 'washing her car', 'poor', 'pulling weeds', 'dancing', 'sleeping', 'pouting', - 'raking leaves', 'washing her car', 'chopping wood', 'writing book', - 'shouting', 'taking out garbage', 'crying', 'it', 'running', 'cooking', - 'late', 'happy', 'eating', 'afraid', 'voting', 'it', 'thief', - 'shoveling snow', 'drinking', 'drunk', 'watching tv', 'nut', 'early', 'well', - 'ill', 'jogging', 'dead', 'naked', 'playing cards', 'sick', 'painting', - 'reading', 'hunter', 'playing monopoly', 'building new house', 'riding horse', - 'playing in football game', 'making love', 'knitting', - 'going to take vacation', 'fishing', 'going to dentist', 'going to store', - 'going to airport', 'going to go to store', 'kid', 'computer', 'stew', - 'taking walk', 'tired', 'new computer', 'horn', 'serving mealfish', - 'potatoe shed', 'hunting', 'crazy', 'buying new car', 'laughing', - 'intoxicated', 'eating hamburger', 'wok' + 'cute', + 'lost', + 'sewing', + 'brat', + 'working', + 'have sex', + 'shopping', + 'driving to work', + 'typing', + 'in jail', + 'jogging in park', + 'washing his car', + 'washing her car', + 'poor', + 'pulling weeds', + 'dancing', + 'sleeping', + 'pouting', + 'raking leaves', + 'washing her car', + 'chopping wood', + 'writing book', + 'shouting', + 'taking out garbage', + 'crying', + 'it', + 'running', + 'cooking', + 'late', + 'happy', + 'eating', + 'afraid', + 'voting', + 'it', + 'thief', + 'shoveling snow', + 'drinking', + 'drunk', + 'watching tv', + 'nut', + 'early', + 'well', + 'ill', + 'jogging', + 'dead', + 'naked', + 'playing cards', + 'sick', + 'painting', + 'reading', + 'hunter', + 'playing monopoly', + 'building new house', + 'riding horse', + 'playing in football game', + 'making love', + 'knitting', + 'going to take vacation', + 'fishing', + 'going to dentist', + 'going to store', + 'going to airport', + 'going to go to store', + 'kid', + 'computer', + 'stew', + 'taking walk', + 'tired', + 'new computer', + 'horn', + 'serving mealfish', + 'potatoe shed', + 'hunting', + 'crazy', + 'buying new car', + 'laughing', + 'intoxicated', + 'eating hamburger', + 'wok', ] BEDUME_FLAGGED_PLACES = [ - 'alaska', 'kansa', 'kansas', 'utah', 'austria', 'delaware', 'pennsylvania', 'italy', - 'cuba', 'norway', 'idaho', 'france', 'utha', 'mexico', 'connecticut', - 'massachusetts', 'montana', 'wyoming', 'every state', 'new york', 'maine', - 'suface of moon', 'germany', 'nebraska', 'finland', 'louisiana', 'belgium', - 'morrocco', 'ireland', 'ceylon', 'california', 'oregon', 'florida', - 'uraguay', 'egypt', 'maryland', 'washington', 'morocco', 'south dakota', - 'tuscon', 'panama', 'alberta', 'arizona', 'texas', 'new jersey', 'colorado', - 'jamaica', 'vermont', 'nevada', 'delawere', 'hawaii', 'minnesota', 'tuscony', - 'costa 
rica', 'south dakato', 'south dakota', 'china', 'argentina', - 'venazuela', 'honduras', 'opera', 'wisconsin', 'great britain', -] -AROUND_PREPOSITIONS = [ - 'in', 'on', 'at', 'under', 'near' + 'alaska', + 'kansa', + 'kansas', + 'utah', + 'austria', + 'delaware', + 'pennsylvania', + 'italy', + 'cuba', + 'norway', + 'idaho', + 'france', + 'utha', + 'mexico', + 'connecticut', + 'massachusetts', + 'montana', + 'wyoming', + 'every state', + 'new york', + 'maine', + 'suface of moon', + 'germany', + 'nebraska', + 'finland', + 'louisiana', + 'belgium', + 'morrocco', + 'ireland', + 'ceylon', + 'california', + 'oregon', + 'florida', + 'uraguay', + 'egypt', + 'maryland', + 'washington', + 'morocco', + 'south dakota', + 'tuscon', + 'panama', + 'alberta', + 'arizona', + 'texas', + 'new jersey', + 'colorado', + 'jamaica', + 'vermont', + 'nevada', + 'delawere', + 'hawaii', + 'minnesota', + 'tuscony', + 'costa rica', + 'south dakato', + 'south dakota', + 'china', + 'argentina', + 'venazuela', + 'honduras', + 'opera', + 'wisconsin', + 'great britain', ] +AROUND_PREPOSITIONS = ['in', 'on', 'at', 'under', 'near'] # Some specific relations were once added to ConceptNet that have no purpose # for us anymore, especially ones connected with a project that was trying to @@ -59,7 +172,10 @@ # supposed to make it into the actual database. RELATIONS_TO_DROP = { - '/r/HasPainIntensity', '/r/HasPainCharacter', '/r/InheritsFrom', '/r/SimilarSize' + '/r/HasPainIntensity', + '/r/HasPainCharacter', + '/r/InheritsFrom', + '/r/SimilarSize', } CONTRIBUTOR_BLACKLIST = { '/s/contributor/omcs/brunogodoifred', @@ -91,20 +207,45 @@ } CONCEPT_BLACKLIST = { # Too vague - '/c/en/', '/c/en/he', '/c/en/i', '/c/en/it', '/c/en/she', - '/c/en/something', '/c/en/someone', '/c/en/that', '/c/en/there', - '/c/en/they', '/c/en/this', '/c/en/you', - '/c/en/often', '/c/en/sometimes', '/c/en/usually', '/c/en/if', - '/c/en/when', '/c/en/whether', '/c/en/nothing', '/c/en/nobody', + '/c/en/', + '/c/en/he', + '/c/en/i', + '/c/en/it', + '/c/en/she', + '/c/en/something', + '/c/en/someone', + '/c/en/that', + '/c/en/there', + '/c/en/they', + '/c/en/this', + '/c/en/you', + '/c/en/often', + '/c/en/sometimes', + '/c/en/usually', + '/c/en/if', + '/c/en/when', + '/c/en/whether', + '/c/en/nothing', + '/c/en/nobody', '/c/en/no_one', - # OMCS users tended to give unfortunate, stereotyped answers when asked # about terms distinguished by their gender. As part of the de-biasing # effort, we should skip these. We can learn enough about 'man' and 'woman' # from dictionary definitions and from statements about 'person'. 
- '/c/en/man', '/c/en/woman', '/c/en/boy', '/c/en/girl', '/c/en/boyfriend', - '/c/en/girlfriend', '/c/en/brother', '/c/en/sister', '/c/en/mother', - '/c/en/father', '/c/en/daughter', '/c/en/son', '/c/en/wife', '/c/en/husband' + '/c/en/man', + '/c/en/woman', + '/c/en/boy', + '/c/en/girl', + '/c/en/boyfriend', + '/c/en/girlfriend', + '/c/en/brother', + '/c/en/sister', + '/c/en/mother', + '/c/en/father', + '/c/en/daughter', + '/c/en/son', + '/c/en/wife', + '/c/en/husband', } ACTIVITY_BLACKLIST = { "20 Questions", @@ -113,15 +254,49 @@ "response to diagram", "commons2_reject", "globalmind", - "pycommons/question" + "pycommons/question", } MORE_STOPWORDS = [ - 'a', 'an', 'the', 'be', 'is', 'are', - 'some', 'any', 'you', 'me', 'him', 'it', 'them', 'i', 'we', 'she', 'he', 'they', - 'your', 'my', 'our', 'his', 'her', 'its', 'their', 'this', 'that', - 'these', 'those', 'something', 'someone', 'somebody', 'anything', 'anyone', - "someone's", "something's", "anything's", "somebody's", "anyone's", + 'a', + 'an', + 'the', + 'be', + 'is', + 'are', + 'some', + 'any', + 'you', + 'me', + 'him', + 'it', + 'them', + 'i', + 'we', + 'she', + 'he', + 'they', + 'your', + 'my', + 'our', + 'his', + 'her', + 'its', + 'their', + 'this', + 'that', + 'these', + 'those', + 'something', + 'someone', + 'somebody', + 'anything', + 'anyone', + "someone's", + "something's", + "anything's", + "somebody's", + "anyone's", ] @@ -150,7 +325,9 @@ def can_skip(parts_dict): return True if len(parts_dict["startText"]) == 0 or len(parts_dict["endText"]) == 0: return True - if lang == 'pt' and (len(parts_dict["startText"]) <= 2 or len(parts_dict["endText"]) <= 2): + if lang == 'pt' and ( + len(parts_dict["startText"]) <= 2 or len(parts_dict["endText"]) <= 2 + ): return True if 'rubycommons' in parts_dict["activity"]: return True @@ -161,8 +338,8 @@ def can_skip(parts_dict): if parts_dict["activity"] in ACTIVITY_BLACKLIST: return True if not ( - valid_concept_name(parts_dict["startText"]) and - valid_concept_name(parts_dict["endText"]) + valid_concept_name(parts_dict["startText"]) + and valid_concept_name(parts_dict["endText"]) ): return True return False @@ -174,7 +351,10 @@ def skip_assertion(source_dict, start, end): Filter out assertions that we can tell will be unhelpful after we've extracted them. 
""" - if lemmatize_uri(start) in CONCEPT_BLACKLIST or lemmatize_uri(end) in CONCEPT_BLACKLIST: + if ( + lemmatize_uri(start) in CONCEPT_BLACKLIST + or lemmatize_uri(end) in CONCEPT_BLACKLIST + ): return True if source_dict['contributor'] in CONTRIBUTOR_BLACKLIST: return True @@ -204,7 +384,9 @@ def build_frame_text(parts_dict): frame_text = frame_text.replace('{%}', '') else: frame_text = frame_text.replace('{%}', 'not ') - frame_text = frame_text.replace('{1}', '[[%s]]' % parts_dict["startText"]).replace('{2}', '[[%s]]' % parts_dict["endText"]) + frame_text = frame_text.replace('{1}', '[[%s]]' % parts_dict["startText"]).replace( + '{2}', '[[%s]]' % parts_dict["endText"] + ) return frame_text @@ -234,10 +416,7 @@ def filtered_uri(lang, text): def filter_stopwords(text): - words = [ - word for word in simple_tokenize(text) - if word not in MORE_STOPWORDS - ] + words = [word for word in simple_tokenize(text) if word not in MORE_STOPWORDS] text2 = ' '.join(words) if not text2: text2 = text @@ -286,8 +465,7 @@ def build_sources(parts_dict, preposition_fix=False): """ creator_source = {} creator_node = join_uri( - '/s/contributor/omcs', - standardize_username(parts_dict["creator"]) + '/s/contributor/omcs', standardize_username(parts_dict["creator"]) ) creator_source['contributor'] = creator_node @@ -308,9 +486,11 @@ def build_sources(parts_dict, preposition_fix=False): vote_int = vote[1] vote_source = { - 'contributor': join_uri('/s/contributor/omcs', standardize_username(username)), + 'contributor': join_uri( + '/s/contributor/omcs', standardize_username(username) + ), 'activity': '/s/activity/omcs/vote', - 'weight': float(vote_int) + 'weight': float(vote_int), } sources.append(vote_source) return sources @@ -348,14 +528,11 @@ def handle_assertion(self, parts_dict): if '} around {' in parts_dict['frame_text']: for prep in AROUND_PREPOSITIONS: if parts_dict['endText'].startswith(prep + ' '): - parts_dict['endText'] = \ - parts_dict['endText'][len(prep) + 1:] + parts_dict['endText'] = parts_dict['endText'][len(prep) + 1 :] replacement = '} %s {' % prep - parts_dict['frame_text'] = \ - parts_dict['frame_text'].replace( - '} around {', - replacement - ) + parts_dict['frame_text'] = parts_dict['frame_text'].replace( + '} around {', replacement + ) preposition_fix = True break @@ -381,13 +558,16 @@ def handle_assertion(self, parts_dict): if not skip_assertion(source_dict, start, end): weight = source_dict.pop('weight') yield make_edge( - rel=relation, start=start, end=end, - dataset=dataset, license=Licenses.cc_attribution, - sources=[source_dict], surfaceText=frame_text, - + rel=relation, + start=start, + end=end, + dataset=dataset, + license=Licenses.cc_attribution, + sources=[source_dict], + surfaceText=frame_text, # The edge weight is the weight computed by build_sources, # times the multiplier set on this instance - weight=weight * self.weight + weight=weight * self.weight, ) def transform_file(self, input_filename, output_file): @@ -400,4 +580,3 @@ def transform_file(self, input_filename, output_file): def handle_file(input_filename, output_file): builder = CN4Builder() builder.transform_file(input_filename, output_file) - diff --git a/conceptnet5/readers/dbpedia.py b/conceptnet5/readers/dbpedia.py index 8205abfd..63e2cbd2 100644 --- a/conceptnet5/readers/dbpedia.py +++ b/conceptnet5/readers/dbpedia.py @@ -25,18 +25,20 @@ 'instance_types' and 'mappingbased_objects' files. 
""" -from conceptnet5.language.token_utils import un_camel_case -from conceptnet5.uri import Licenses, uri_prefix, split_uri -from conceptnet5.nodes import standardized_concept_uri, topic_to_concept -from conceptnet5.edges import make_edge -from conceptnet5.languages import ALL_LANGUAGES, LCODE_ALIASES -from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.formats.semantic_web import resource_name, parse_nquads -import urllib import bz2 +import itertools import pathlib +import urllib from operator import itemgetter -import itertools + +from conceptnet5.edges import make_edge +from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter +from conceptnet5.formats.semantic_web import parse_nquads, resource_name +from conceptnet5.language.token_utils import un_camel_case +from conceptnet5.languages import ALL_LANGUAGES, LCODE_ALIASES +from conceptnet5.nodes import standardized_concept_uri, topic_to_concept +from conceptnet5.uri import Licenses, split_uri, uri_prefix + parse_url = urllib.parse.urlparse @@ -44,19 +46,16 @@ 'isPartOf': '/r/PartOf', 'series': '/r/PartOf', 'languageFamily': '/r/PartOf', - 'location': '/r/AtLocation', 'place': '/r/AtLocation', 'locatedInArea': '/r/AtLocation', 'spokenIn': '/r/AtLocation', - # leave out differentFrom, as it is mostly about confusable names 'sameAs': '/r/Synonym', 'similar': '/r/SimilarTo', 'related': '/r/RelatedTo', 'seeAlso': '/r/RelatedTo', 'type': '/r/InstanceOf', - 'field': '/r/dbpedia/field', 'academicDiscipline': '/r/dbpedia/field', 'genre': '/r/dbpedia/genre', @@ -68,19 +67,17 @@ 'language': '/r/dbpedia/language', 'occupation': '/r/dbpedia/occupation', 'profession': '/r/dbpedia/occupation', - - #'author': '/r/dbpedia/writer', - #'writer': '/r/dbpedia/writer', - #'director': '/r/dbpedia/director', - #'starring': '/r/dbpedia/starring', - #'producer': '/r/dbpedia/producer', - #'associatedBand': '/r/dbpedia/associatedBand', - #'associatedMusicalArtist': '/r/dbpedia/associatedMusicalArtist', - #'bandMember': '/r/dbpedia/bandMember', - #'artist': '/r/dbpedia/artist', - #'musicalArtist': '/r/dbpedia/artist', - #'musicalBand': '/r/dbpedia/artist', - + # 'author': '/r/dbpedia/writer', + # 'writer': '/r/dbpedia/writer', + # 'director': '/r/dbpedia/director', + # 'starring': '/r/dbpedia/starring', + # 'producer': '/r/dbpedia/producer', + # 'associatedBand': '/r/dbpedia/associatedBand', + # 'associatedMusicalArtist': '/r/dbpedia/associatedMusicalArtist', + # 'bandMember': '/r/dbpedia/bandMember', + # 'artist': '/r/dbpedia/artist', + # 'musicalArtist': '/r/dbpedia/artist', + # 'musicalBand': '/r/dbpedia/artist', 'genus': '/r/dbpedia/genus', 'leader': '/r/dbpedia/leader', 'capital': '/r/dbpedia/capital', @@ -90,14 +87,21 @@ # Ban some concepts that are way too generic and often differ from the common # way that people use these words -CONCEPT_BLACKLIST = { - '/c/en/work/n', '/c/en/agent/n', '/c/en/artist/n', '/c/en/thing/n' -} +CONCEPT_BLACKLIST = {'/c/en/work/n', '/c/en/agent/n', '/c/en/artist/n', '/c/en/thing/n'} TYPE_BLACKLIST = { - 'Settlement', 'Railway Line', 'Road', 'Sports Event', 'Event', - 'Olympic Event', 'Soccer Tournament', 'Election', 'Diocese', - 'Year', 'Football League Season', 'Grand Prix' + 'Settlement', + 'Railway Line', + 'Road', + 'Sports Event', + 'Event', + 'Olympic Event', + 'Soccer Tournament', + 'Election', + 'Diocese', + 'Year', + 'Football League Season', + 'Grand Prix', } @@ -188,7 +192,15 @@ def interlanguage_mapping(interlang_path, ok_concepts): pieces = split_uri(subj_concept) if 
len(pieces) >= 6: sense = pieces[5] - if 'album' in sense or 'film' in sense or 'series' in sense or 'disambiguation' in sense or 'song' in sense or 'album' in sense or 'band' in sense: + if ( + 'album' in sense + or 'film' in sense + or 'series' in sense + or 'disambiguation' in sense + or 'song' in sense + or 'album' in sense + or 'band' in sense + ): continue if uri_prefix(subj_concept) in ok_concepts: targets = [subj_url] @@ -227,9 +239,11 @@ def process_dbpedia(input_dir, output_file, concept_file): for subj, pred, obj, _graph in quads: subj_url = subj['url'] if ( - 'Category:' in subj_url or 'File:' in subj_url or - 'List_of' in subj_url or '__' in subj_url or - 'Template:' in subj_url + 'Category:' in subj_url + or 'File:' in subj_url + or 'List_of' in subj_url + or '__' in subj_url + or 'Template:' in subj_url ): continue if subj_url in mapped_urls: @@ -239,24 +253,27 @@ def process_dbpedia(input_dir, output_file, concept_file): obj_concept = standardized_concept_uri('en', obj_type, 'n') if obj_concept not in CONCEPT_BLACKLIST: edge = make_edge( - '/r/IsA', subj_concept, obj_concept, + '/r/IsA', + subj_concept, + obj_concept, dataset='/d/dbpedia/en', license=Licenses.cc_sharealike, sources=[{'contributor': '/s/resource/dbpedia/2015/en'}], weight=0.5, surfaceStart=url_to_label(subj['url']), - surfaceEnd=url_to_label(obj['url']) + surfaceEnd=url_to_label(obj['url']), ) out.write(edge) for other_url in mapped_urls[subj_url]: if other_url.startswith('http://wikidata.dbpedia.org/'): urledge = make_edge( '/r/ExternalURL', - subj_concept, other_url, + subj_concept, + other_url, dataset='/d/dbpedia/en', license=Licenses.cc_sharealike, sources=[{'contributor': '/s/resource/dbpedia/2015/en'}], - weight=1.0 + weight=1.0, ) out.write(urledge) else: @@ -264,22 +281,28 @@ def process_dbpedia(input_dir, output_file, concept_file): if other_concept: urledge = make_edge( '/r/ExternalURL', - other_concept, other_url, + other_concept, + other_url, dataset='/d/dbpedia/en', license=Licenses.cc_sharealike, - sources=[{'contributor': '/s/resource/dbpedia/2015/en'}], - weight=1.0 + sources=[ + {'contributor': '/s/resource/dbpedia/2015/en'} + ], + weight=1.0, ) out.write(urledge) edge = make_edge( '/r/Synonym', - other_concept, subj_concept, + other_concept, + subj_concept, dataset='/d/dbpedia/en', license=Licenses.cc_sharealike, - sources=[{'contributor': '/s/resource/dbpedia/2015/en'}], + sources=[ + {'contributor': '/s/resource/dbpedia/2015/en'} + ], weight=0.5, surfaceStart=url_to_label(other_url), - surfaceEnd=url_to_label(subj_url) + surfaceEnd=url_to_label(subj_url), ) out.write(edge) @@ -290,21 +313,24 @@ def process_dbpedia(input_dir, output_file, concept_file): obj_concept = translate_dbpedia_url(obj['url']) rel_name = resource_name(pred['url']) if ( - subj_concept and obj_concept and - subj['url'] in mapped_urls and obj['url'] in mapped_urls + subj_concept + and obj_concept + and subj['url'] in mapped_urls + and obj['url'] in mapped_urls ): if rel_name in RELATIONS: rel = RELATIONS[rel_name] edge = make_edge( - rel, subj_concept, obj_concept, + rel, + subj_concept, + obj_concept, dataset='/d/dbpedia/en', license=Licenses.cc_sharealike, sources=[{'contributor': '/s/resource/dbpedia/2015/en'}], weight=0.5, surfaceStart=url_to_label(subj['url']), - surfaceEnd=url_to_label(obj['url']) + surfaceEnd=url_to_label(obj['url']), ) out.write(edge) out.close() - diff --git a/conceptnet5/readers/emoji.py b/conceptnet5/readers/emoji.py index 7726cc34..0dfee18e 100644 --- a/conceptnet5/readers/emoji.py +++ 
b/conceptnet5/readers/emoji.py @@ -1,9 +1,9 @@ +import xml.etree.ElementTree as ET + from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.uri import Licenses from conceptnet5.nodes import standardized_concept_uri - -import xml.etree.ElementTree as ET +from conceptnet5.uri import Licenses REL = '/r/SymbolOf' DATASET = '/d/emoji' @@ -26,7 +26,9 @@ def handle_file(input_file, output_file): tree = ET.parse(input_file) out = MsgpackStreamWriter(output_file) root = tree.getroot() - lang = root[0][1].attrib['type'] # language is at position [1] within the child node [0] + lang = root[0][1].attrib[ + 'type' + ] # language is at position [1] within the child node [0] if len(root) >= 2: for annotation in root[1]: diff --git a/conceptnet5/readers/jmdict.py b/conceptnet5/readers/jmdict.py index e396f84d..16b6e116 100644 --- a/conceptnet5/readers/jmdict.py +++ b/conceptnet5/readers/jmdict.py @@ -1,17 +1,13 @@ -from __future__ import unicode_literals, print_function -import xmltodict -import re import codecs +import re + import langcodes +import xmltodict + +from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.uri import Licenses from conceptnet5.nodes import standardized_concept_uri, valid_concept_name -from conceptnet5.edges import make_edge - - -# Now that Unicode literals are on, get the type of a Unicode string, -# regardless of whether this is Python 2 or 3. -STRING_TYPE = type("") +from conceptnet5.uri import Licenses # I took the time to record these, but in the end I don't think I plan # to use them. Japanese parts of speech don't fit neatly enough into @@ -22,7 +18,7 @@ # from their more helpful entity form). NOUN_TYPES = [ "noun (comm", - "adverbial ", + "adverbial ", # don't ask me what an adverbial noun is "noun, used", "noun (temp", "noun or pa", @@ -33,10 +29,7 @@ "adjectival", "pre-noun a", ] -ADV_TYPES = [ - "adverb (fu", - "adverb tak", -] +ADV_TYPES = ["adverb (fu", "adverb tak"] VERB_TYPES = [ "Ichidan ve", "Nidan verb", @@ -61,7 +54,7 @@ def convert_lang_code(code): def fix_context(context): ending = ' term' if context.endswith(ending): - return context[:-len(ending)] + return context[: -len(ending)] return context @@ -84,14 +77,17 @@ def get_list(node, tag): return [subnode] -GLOSS_RE = re.compile(r''' +GLOSS_RE = re.compile( + r''' # Separate out text in parentheses or brackets. ^ (\(.*?\)|\[.*?\] )? # possibly a bracketed expression before the gloss (.*?) # the gloss itself ( \(.*?\)|\[.*?\])? # possibly a bracketed expression after the gloss $ -''', re.VERBOSE) +''', + re.VERBOSE, +) def parse_gloss(text): @@ -172,10 +168,7 @@ def handle_file(filename, output_file): # # Get all the glosses, including the lsource if it's there. glosses = get_list(sense, 'gloss') + get_list(sense, 'lsource') - contexts = [ - fix_context(context) - for context in get_list(sense, 'field') - ] + contexts = [fix_context(context) for context in get_list(sense, 'field')] pos = '_' for pos_tag in get_list(sense, 'pos'): if pos_tag[:10] in NOUN_TYPES: @@ -194,7 +187,7 @@ def handle_file(filename, output_file): # '@xml:lang' elements. text = parse_gloss(gloss['#text']) lang = convert_lang_code(gloss['@xml:lang']) - elif isinstance(gloss, STRING_TYPE): + elif isinstance(gloss, str): # If there's no 'lang' attribute, the gloss is in English, # and xmltodict gives it to us as a plain Unicode string. 
lang = 'en' @@ -208,13 +201,17 @@ def handle_file(filename, output_file): # we can't expand them), and we also don't want to deal with texts # that are more than five words long. if ( - text is not None and '.' not in text and - text.count(' ') <= 4 and valid_concept_name(text) + text is not None + and '.' not in text + and text.count(' ') <= 4 + and valid_concept_name(text) ): for head in headwords: if len(senses) >= 2: sensekey = '%d' % (sense_num + 1) - ja_concept = standardized_concept_uri('ja', head, pos, 'jmdict', sensekey) + ja_concept = standardized_concept_uri( + 'ja', head, pos, 'jmdict', sensekey + ) else: ja_concept = standardized_concept_uri('ja', head, pos) other_concept = standardized_concept_uri(lang, text) @@ -229,10 +226,13 @@ def output_edge(out, rel, subj_concept, obj_concept): """ Write an edge to `out`, an instance of MsgpackStreamWriter. """ - edge = make_edge(rel, subj_concept, obj_concept, - dataset='/d/jmdict', - license=Licenses.cc_sharealike, - sources=[{'contributor': '/s/resource/jmdict/1.07'}], - weight=2.0) + edge = make_edge( + rel, + subj_concept, + obj_concept, + dataset='/d/jmdict', + license=Licenses.cc_sharealike, + sources=[{'contributor': '/s/resource/jmdict/1.07'}], + weight=2.0, + ) out.write(edge) - diff --git a/conceptnet5/readers/kyoto_yahoo.py b/conceptnet5/readers/kyoto_yahoo.py index 994434de..d3a2b4e3 100644 --- a/conceptnet5/readers/kyoto_yahoo.py +++ b/conceptnet5/readers/kyoto_yahoo.py @@ -5,8 +5,8 @@ http://coling2016.okbqa.org/OKBQA201602.pdf """ -from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.edges import make_edge +from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.uri import Licenses # Assign a unique dataset and source to this data @@ -14,11 +14,7 @@ SOURCE = '/s/activity/kyoto_yahoo' -WEIGHT_TABLE = { - '3': 0.5, - '4': 1.0, - '5': 2.0 -} +WEIGHT_TABLE = {'3': 0.5, '4': 1.0, '5': 2.0} def handle_file(input_filename, output_file): @@ -36,6 +32,6 @@ def handle_file(input_filename, output_file): dataset=DATASET, sources=[{'activity': SOURCE}], license=Licenses.cc_attribution, - weight=WEIGHT_TABLE[weight] + weight=WEIGHT_TABLE[weight], ) out.write(edge) diff --git a/conceptnet5/readers/nadya.py b/conceptnet5/readers/nadya.py index da258075..a323a8a1 100644 --- a/conceptnet5/readers/nadya.py +++ b/conceptnet5/readers/nadya.py @@ -2,8 +2,8 @@ Handle data that has been collected from nadya.jp, an online word game created to collect data for ConceptNet, by Nihon Unisys and Dentsu. """ -from conceptnet5.readers.conceptnet4 import CN4Builder from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter +from conceptnet5.readers.conceptnet4 import CN4Builder # The nadya.jp data is distributed as a PostgreSQL database. The following # command will extract a file in the form of 'nadya-2017.csv' from such a @@ -54,8 +54,19 @@ def handle_line(line, builder): ConceptNet edges that can be extracted from it. 
""" parts = line.rstrip('\n').split('\t') - (cnet4_id, lang, frame_text, relname, start_text, end_text, - freq, vote, email, creator, voter) = parts + ( + cnet4_id, + lang, + frame_text, + relname, + start_text, + end_text, + freq, + vote, + email, + creator, + voter, + ) = parts if cnet4_id == 'cnet4_id': return @@ -85,7 +96,7 @@ def handle_line(line, builder): 'creator': voter, 'votes': [], 'activity': 'nadya.jp', - 'goodness': 3 + 'goodness': 3, } yield from builder.handle_assertion(parts_dict) diff --git a/conceptnet5/readers/opencyc.py b/conceptnet5/readers/opencyc.py index 9053bbdd..47f2c423 100644 --- a/conceptnet5/readers/opencyc.py +++ b/conceptnet5/readers/opencyc.py @@ -9,15 +9,14 @@ from collections import defaultdict -from wordfreq import simple_tokenize - from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.formats.semantic_web import resource_name, parse_nquads +from conceptnet5.formats.semantic_web import parse_nquads, resource_name from conceptnet5.language.token_utils import un_camel_case from conceptnet5.nodes import standardized_concept_uri from conceptnet5.readers.conceptnet4 import filter_stopwords from conceptnet5.uri import Licenses +from wordfreq import simple_tokenize SOURCE = {'contributor': '/s/resource/opencyc/2012'} RDF_LABEL = 'http://www.w3.org/2000/01/rdf-schema#label' @@ -28,33 +27,61 @@ def opencyc_edge(rel, start, end, start_text, end_text): Get the ConceptNet representation of an OpenCyc edge. """ return make_edge( - rel=rel, start=start, end=end, + rel=rel, + start=start, + end=end, dataset='/d/opencyc', license=Licenses.cc_attribution, sources=[SOURCE], weight=1.0, surfaceStart=start_text, - surfaceEnd=end_text + surfaceEnd=end_text, ) def external_url_edge(start, end): return make_edge( - rel='/r/ExternalURL', start=start, end=end, + rel='/r/ExternalURL', + start=start, + end=end, dataset='/d/opencyc', license=Licenses.cc_attribution, sources=[SOURCE], - weight=1.0 + weight=1.0, ) # These words tend to indicate Cyc internals that are presented the same way # as facts about the external world. 
BLACKLIST_WORDS = { - 'arg', 'arity', 'aura', 'bugzilla', 'cbl', 'cblask', 'cblassign', - 'centroid', 'cw', 'cwe', 'cyc', 'cycl', 'deprecated', 'fn', 'individual', - 'mett', 'microtheory', 'mr', 'mt', 'obo', 'opencyc', 'pcw', 'temporally', - 'type', 'union', 'underspecified', 'wn', 'wordnet' + 'arg', + 'arity', + 'aura', + 'bugzilla', + 'cbl', + 'cblask', + 'cblassign', + 'centroid', + 'cw', + 'cwe', + 'cyc', + 'cycl', + 'deprecated', + 'fn', + 'individual', + 'mett', + 'microtheory', + 'mr', + 'mt', + 'obo', + 'opencyc', + 'pcw', + 'temporally', + 'type', + 'union', + 'underspecified', + 'wn', + 'wordnet', } @@ -82,7 +109,12 @@ def run_opencyc(input_file, output_file): rel_name = resource_name(pred['url']) web_subj = subj.get('url') web_obj = obj.get('url') - if rel_name == 'subClassOf' and web_obj is not None and web_subj in labels and web_obj in labels: + if ( + rel_name == 'subClassOf' + and web_obj is not None + and web_subj in labels + and web_obj in labels + ): subj_label = labels[web_subj] obj_label = labels[web_obj] if '_' in subj_label or '_' in obj_label: @@ -105,7 +137,11 @@ def run_opencyc(input_file, output_file): if (obj_uri, web_obj) not in seen_external_urls: out.write(external_url_edge(obj_uri, web_obj)) seen_external_urls.add((obj_uri, web_obj)) - elif rel_name == 'sameAs' and web_subj in labels and web_obj.startswith('http://umbel.org/'): + elif ( + rel_name == 'sameAs' + and web_subj in labels + and web_obj.startswith('http://umbel.org/') + ): subj_label = labels[web_subj] subj_uri = standardized_concept_uri('en', subj_label) if (subj_uri, web_obj) not in seen_external_urls: diff --git a/conceptnet5/readers/ptt_petgame.py b/conceptnet5/readers/ptt_petgame.py index eca03178..29e946e9 100644 --- a/conceptnet5/readers/ptt_petgame.py +++ b/conceptnet5/readers/ptt_petgame.py @@ -1,12 +1,11 @@ -from __future__ import unicode_literals import codecs import json + +from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter from conceptnet5.nodes import standardized_concept_uri -from conceptnet5.edges import make_edge -from conceptnet5.util import get_support_data_filename from conceptnet5.uri import Licenses - +from conceptnet5.util import get_support_data_filename FRAME_DATA = json.load( codecs.open(get_support_data_filename('zh_frames.json'), encoding='utf-8') @@ -20,7 +19,9 @@ def handle_raw_assertion(line): ftext = fdata['text'] rel = fdata['relation'] - surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace('{2}', '[[' + concept2 + ']]') + surfaceText = ftext.replace('{1}', '[[' + concept1 + ']]').replace( + '{2}', '[[' + concept2 + ']]' + ) # We mark surface texts with * if {2} comes before {1}. 
if ftext.find('{2}') < ftext.find('{1}'): surfaceText = '*' + surfaceText @@ -29,11 +30,18 @@ def handle_raw_assertion(line): end = standardized_concept_uri('zh_TW', concept2) source = { 'contributor': '/s/contributor/petgame/' + user, - 'activity': '/s/activity/ptt/petgame' + 'activity': '/s/activity/ptt/petgame', } - yield make_edge(rel, start, end, dataset='/d/conceptnet/4/zh', - license=Licenses.cc_attribution, sources=[source], - surfaceText=surfaceText, weight=1) + yield make_edge( + rel, + start, + end, + dataset='/d/conceptnet/4/zh', + license=Licenses.cc_attribution, + sources=[source], + surfaceText=surfaceText, + weight=1, + ) def handle_file(input_filename, output_file): diff --git a/conceptnet5/readers/verbosity.py b/conceptnet5/readers/verbosity.py index dba3780e..d372a188 100644 --- a/conceptnet5/readers/verbosity.py +++ b/conceptnet5/readers/verbosity.py @@ -1,11 +1,11 @@ -from __future__ import print_function, unicode_literals, division -from conceptnet5.uri import Licenses -from conceptnet5.nodes import standardized_concept_uri +import re +from collections import defaultdict + from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter +from conceptnet5.nodes import standardized_concept_uri +from conceptnet5.uri import Licenses from conceptnet5.util.sounds_like import sounds_like_score -from collections import defaultdict -import re # If any word in a clue matches one of these words, it is probably a bad # common-sense assertion. @@ -30,11 +30,54 @@ # assertions. The list is much more extensive than the three and a half # stopwords that ConceptNet uses for English in general. STOPWORDS = { - 'a', 'an', 'the', 'to', 'of', 'for', 'in', 'on', 'at', 'by', 'with', 'and', - 'or', 'far', 'near', 'away', 'from', 'thing', 'something', 'things', 'be', - 'is', 'are', 'was', 'were', 'as', 'so', 'get', 'i', 'me', 'you', 'it', 'he', - 'she', 'him', 'her', 'this', 'that', 'they', 'them', 'some', 'many', 'no', - 'one', 'all', 'either', 'both', 'er' + 'a', + 'an', + 'the', + 'to', + 'of', + 'for', + 'in', + 'on', + 'at', + 'by', + 'with', + 'and', + 'or', + 'far', + 'near', + 'away', + 'from', + 'thing', + 'something', + 'things', + 'be', + 'is', + 'are', + 'was', + 'were', + 'as', + 'so', + 'get', + 'i', + 'me', + 'you', + 'it', + 'he', + 'she', + 'him', + 'her', + 'this', + 'that', + 'they', + 'them', + 'some', + 'many', + 'no', + 'one', + 'all', + 'either', + 'both', + 'er', } @@ -138,7 +181,7 @@ def handle_file(infile, outfile): # revisit. # # The weight is the score divided by 100. All divisions are floating - # point, as defined by the __future__ import at the top of this module. + # point. 
score = (freq * 2 - 1) * (1 - sls) * (1 - orderscore / 1000) if score <= 1.: outcomes['low score'] += 1 @@ -156,9 +199,7 @@ def handle_file(infile, outfile): rightwords.extend(morewords) for i, rightword in enumerate(rightwords): - source = { - 'contributor': '/s/resource/verbosity' - } + source = {'contributor': '/s/resource/verbosity'} if i > 0: source['process'] = '/s/process/split_words' @@ -169,8 +210,14 @@ def handle_file(infile, outfile): outcomes['success'] += 1 leftc = standardized_concept_uri('en', left) rightc = standardized_concept_uri('en', rightword) - edge = make_edge(rel, leftc, rightc, dataset='/d/verbosity', - license=Licenses.cc_attribution, - sources=[source], surfaceText=text, - weight=weight) + edge = make_edge( + rel, + leftc, + rightc, + dataset='/d/verbosity', + license=Licenses.cc_attribution, + sources=[source], + surfaceText=text, + weight=weight, + ) writer.write(edge) diff --git a/conceptnet5/readers/wiktionary.py b/conceptnet5/readers/wiktionary.py index 81ef5be1..e8e6ff78 100644 --- a/conceptnet5/readers/wiktionary.py +++ b/conceptnet5/readers/wiktionary.py @@ -1,14 +1,14 @@ +import os +import pathlib +import sqlite3 +from collections import Counter + +from conceptnet5.edges import make_edge from conceptnet5.formats.json_stream import read_json_stream from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.nodes import standardized_concept_uri from conceptnet5.languages import ALL_LANGUAGES, valid_language -from conceptnet5.edges import make_edge +from conceptnet5.nodes import standardized_concept_uri from conceptnet5.uri import Licenses, uri_prefix -import sqlite3 -import pathlib -import os -from collections import Counter - PARSER_RULE = '/s/process/wikiparsec/2' @@ -41,7 +41,9 @@ def prepare_db(inputs, dbfile): # use these to disambiguate definitions later. if item['rel'] != 'definition': if 'language' in tfrom and valid_language(tfrom['language']): - add_title(db, file_language, tfrom['language'], tfrom['text']) + add_title( + db, file_language, tfrom['language'], tfrom['text'] + ) if 'language' in tto and valid_language(tto['language']): add_title(db, file_language, tto['language'], tto['text']) @@ -55,10 +57,18 @@ def prepare_db(inputs, dbfile): # Use only Etymology 1 entries for learning word forms. 
if (tfrom.get('etym') or '1') == '1': language = tfrom.get('language', tto.get('language')) - if valid_language(language) and tfrom['text'] != tto['text']: + if ( + valid_language(language) + and tfrom['text'] != tto['text'] + ): add_form( - db, file_language, language, - tfrom['text'], pos, tto['text'], form_name + db, + file_language, + language, + tfrom['text'], + pos, + tto['text'], + form_name, ) db.commit() finally: @@ -66,15 +76,20 @@ def prepare_db(inputs, dbfile): def make_tables(db): - db.execute("CREATE TABLE titles " - "(id integer primary key, site_language text, language text, " - "title text)") - db.execute("CREATE UNIQUE INDEX titles_uniq ON titles " - "(site_language, language, title)") + db.execute( + "CREATE TABLE titles " + "(id integer primary key, site_language text, language text, " + "title text)" + ) + db.execute( + "CREATE UNIQUE INDEX titles_uniq ON titles " "(site_language, language, title)" + ) db.execute("CREATE INDEX titles_search ON titles (language, title)") - db.execute("CREATE TABLE forms " - "(id integer primary key, site_language text, language text, " - "word text, pos text, root text, form text)") + db.execute( + "CREATE TABLE forms " + "(id integer primary key, site_language text, language text, " + "word text, pos text, root text, form text)" + ) db.execute("CREATE INDEX forms_search ON forms (language, word)") @@ -82,7 +97,7 @@ def add_title(db, file_language, language, title): db.execute( "INSERT OR IGNORE INTO titles (site_language, language, title) " "VALUES (?, ?, ?)", - (file_language, language, title.lower()) + (file_language, language, title.lower()), ) @@ -90,7 +105,14 @@ def add_form(db, file_language, language, word, pos, root, form): db.execute( "INSERT INTO forms (site_language, language, word, pos, root, form) " "VALUES (?, ?, ?, ?, ?, ?)", - (file_language, language, word.lower(), pos.lower(), root.lower(), form.lower()) + ( + file_language, + language, + word.lower(), + pos.lower(), + root.lower(), + form.lower(), + ), ) @@ -114,7 +136,7 @@ def add_form(db, file_language, language, word, pos, root, form): "coordinate": ("/r/SimilarTo", False), "quasi-synonym": ("/r/SimilarTo", False), "translation": ("/r/Synonym", False), - "definition": (None, False) + "definition": (None, False), } @@ -187,8 +209,10 @@ def disambiguate_language(text, assumed_languages, db): ok_languages = [] for language in assumed_languages: c = db.cursor() - c.execute('SELECT * from titles where language=? and title=? limit 1', - (language, text)) + c.execute( + 'SELECT * from titles where language=? and title=? limit 1', + (language, text), + ) if c.fetchone(): ok_languages.append(language) @@ -238,10 +262,7 @@ def read_wiktionary(input_file, db_file, output_file): web_url = 'http://{}.wiktionary.org/wiki/{}'.format(language, url_title) web_source = '/s/resource/wiktionary/{}'.format(language) - source = { - 'contributor': web_source, - 'process': PARSER_RULE - } + source = {'contributor': web_source, 'process': PARSER_RULE} # Scan through the 'from' items, such as the start nodes of # translations, looking for distinct etymologies. 
If we get more than @@ -250,7 +271,8 @@ def read_wiktionary(input_file, db_file, output_file): all_etyms = { (item['from']['language'], etym_label(language, item['from'])) for item in items - if 'language' in item['from'] and item['from']['text'] == title + if 'language' in item['from'] + and item['from']['text'] == title and etym_label(language, item['from']) is not None } word_languages = {wlang for (wlang, _) in all_etyms} @@ -258,16 +280,19 @@ def read_wiktionary(input_file, db_file, output_file): if valid_language(wlang): cpage = standardized_concept_uri(wlang, title) ld_edge = make_edge( - '/r/ExternalURL', cpage, web_url, - dataset=dataset, weight=0.25, sources=[source], - license=Licenses.cc_sharealike + '/r/ExternalURL', + cpage, + web_url, + dataset=dataset, + weight=0.25, + sources=[source], + license=Licenses.cc_sharealike, ) out.write(ld_edge) etym_to_translation_sense = {} language_etym_counts = Counter(lang for (lang, etym) in all_etyms) polysemous_languages = { - lang for lang in language_etym_counts - if language_etym_counts[lang] > 1 + lang for lang in language_etym_counts if language_etym_counts[lang] > 1 } for item in items: @@ -282,13 +307,19 @@ def read_wiktionary(input_file, db_file, output_file): assumed_languages.append(lang2) cfrom = transform_term( - language, tfrom, assumed_languages, db, - use_etyms=(lang1 in polysemous_languages) + language, + tfrom, + assumed_languages, + db, + use_etyms=(lang1 in polysemous_languages), ) cpage = cfrom cto = transform_term( - language, tto, assumed_languages, db, - use_etyms=(lang2 in polysemous_languages) + language, + tto, + assumed_languages, + db, + use_etyms=(lang2 in polysemous_languages), ) if cfrom is None or cto is None: @@ -317,11 +348,17 @@ def read_wiktionary(input_file, db_file, output_file): weight = 1. 
if rel == '/r/EtymologicallyRelatedTo': weight = 0.25 - edge = make_edge(rel, cfrom, cto, dataset=dataset, weight=weight, - sources=[source], - surfaceStart=tfrom['text'], - surfaceEnd=tto['text'], - license=Licenses.cc_sharealike) + edge = make_edge( + rel, + cfrom, + cto, + dataset=dataset, + weight=weight, + sources=[source], + surfaceStart=tfrom['text'], + surfaceEnd=tto['text'], + license=Licenses.cc_sharealike, + ) out.write(edge) out.close() diff --git a/conceptnet5/readers/wordnet.py b/conceptnet5/readers/wordnet.py index 658e05d1..6f3bd10d 100644 --- a/conceptnet5/readers/wordnet.py +++ b/conceptnet5/readers/wordnet.py @@ -1,56 +1,51 @@ -from __future__ import unicode_literals from collections import defaultdict -from conceptnet5.uri import Licenses -from conceptnet5.nodes import standardized_concept_uri + from conceptnet5.edges import make_edge from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter -from conceptnet5.formats.semantic_web import ( - resource_name, parse_nquads -) - +from conceptnet5.formats.semantic_web import parse_nquads, resource_name +from conceptnet5.nodes import standardized_concept_uri +from conceptnet5.uri import Licenses -SOURCE = {'contributor': '/s/resource/wordnet/rdf/3.1'} -DATASET = '/d/wordnet/3.1' -WN20_URL = 'http://www.w3.org/2006/03/wn/wn20/instances/' +SOURCE = {"contributor": "/s/resource/wordnet/rdf/3.1"} +DATASET = "/d/wordnet/3.1" +WN20_URL = "http://www.w3.org/2006/03/wn/wn20/instances/" PARTS_OF_SPEECH = { - 'noun': 'n', - 'verb': 'v', - 'adjective': 'a', - 'adjectivesatellite': 'a', - 'adverb': 'r', - 'phrase': 'p' + "noun": "n", + "verb": "v", + "adjective": "a", + "adjectivesatellite": "a", + "adverb": "r", + "phrase": "p", } REL_MAPPING = { - 'hypernym': ('IsA', '{0} is a type of {1}'), - 'hypernym-v': ('MannerOf', '{0} is a way to {1}'), - 'part_meronym': ('PartOf', '{0} is a part of {1}'), - 'domain_category': ('HasContext', '{0} is used in the context of {1}'), - 'domain_region': ('HasContext', '{0} is used in the region of {1}'), - 'cause': ('Causes', '{0} causes {1}'), - 'action': ('UsedFor', '{0} is used for {1}'), - 'result': ('UsedFor', '{0} is used for {1}'), - 'beneficiary': ('UsedFor', '{0} is used for the benefit of {1}'), - 'location': ('AtLocation', '{0} is located in {1}'), - 'creator': ('CreatedBy', '{0} is created by {1}'), - 'entail': ('Entails', '{0} entails {1}'), - 'similar': ('SimilarTo', '{0} is similar to {1}'), - 'also': ('RelatedTo', '{0} is related to {1}'), - 'antonym': ('Antonym', '{0} is the opposite of {1}'), - 'derivation': ('DerivedFrom', 'The word "{0}" is derived from "{1}"'), - 'translation': ('~Synonym', '{0} is a translation of {1}'), - + "hypernym": ("IsA", "{0} is a type of {1}"), + "hypernym-v": ("MannerOf", "{0} is a way to {1}"), + "part_meronym": ("PartOf", "{0} is a part of {1}"), + "domain_category": ("HasContext", "{0} is used in the context of {1}"), + "domain_region": ("HasContext", "{0} is used in the region of {1}"), + "cause": ("Causes", "{0} causes {1}"), + "action": ("UsedFor", "{0} is used for {1}"), + "result": ("UsedFor", "{0} is used for {1}"), + "beneficiary": ("UsedFor", "{0} is used for the benefit of {1}"), + "location": ("AtLocation", "{0} is located in {1}"), + "creator": ("CreatedBy", "{0} is created by {1}"), + "entail": ("Entails", "{0} entails {1}"), + "similar": ("SimilarTo", "{0} is similar to {1}"), + "also": ("RelatedTo", "{0} is related to {1}"), + "antonym": ("Antonym", "{0} is the opposite of {1}"), + "derivation": ("DerivedFrom", 'The word "{0}" is 
derived from "{1}"'), + "translation": ("~Synonym", "{0} is a translation of {1}"), # We may want some of these to be more specific, but it's hard to classify # them in terms of existing relations, and there aren't enough of them to # justify their own relations. - 'pertainym': ('RelatedTo', '{0} is related to {1}'), - 'agent': ('RelatedTo', '{0} is related to {1}'), - 'patient': ('RelatedTo', '{0} is related to {1}'), - 'theme': ('RelatedTo', '{0} is related to {1}'), - 'instrument': ('RelatedTo', '{0} is related to {1}'), - 'goal': ('RelatedTo', '{0} is related to {1}'), - + "pertainym": ("RelatedTo", "{0} is related to {1}"), + "agent": ("RelatedTo", "{0} is related to {1}"), + "patient": ("RelatedTo", "{0} is related to {1}"), + "theme": ("RelatedTo", "{0} is related to {1}"), + "instrument": ("RelatedTo", "{0} is related to {1}"), + "goal": ("RelatedTo", "{0} is related to {1}"), # Do we want a relation for verbs in the same VerbNet group? } @@ -59,9 +54,7 @@ # http://compling.hss.ntu.edu.sg/omw/, but the language codes are translated # into BCP 47. -SHAREALIKE_LANGUAGES = [ - 'ar', 'arb', 'nl', 'pt', 'ro', 'lt', 'sk', 'sl' -] +SHAREALIKE_LANGUAGES = ["ar", "arb", "nl", "pt", "ro", "lt", "sk", "sl"] def label_sort_key(label): @@ -89,7 +82,13 @@ def label_sort_key(label): won't be disambiguated by adding the category "person". For people, we apply this rule no matter what, choosing their longest name. """ - return (not label[0].isdigit(), label[-1].isdigit(), not label.islower(), -len(label), label) + return ( + not label[0].isdigit(), + label[-1].isdigit(), + not label.islower(), + -len(label), + label, + ) def run_wordnet(input_file, output_file): @@ -106,20 +105,20 @@ def run_wordnet(input_file, output_file): synset_uris = {} # First pass: find data about synsets - quads = parse_nquads(open(input_file, encoding='utf-8')) + quads = parse_nquads(open(input_file, encoding="utf-8")) for subj_dict, rel_dict, obj_dict, _graph in quads: - if 'url' not in subj_dict or 'url' not in rel_dict: + if "url" not in subj_dict or "url" not in rel_dict: continue - subj = subj_dict['url'] - rel = rel_dict['url'] - obj = obj_dict.get('url') - objtext = obj_dict.get('text') + subj = subj_dict["url"] + rel = rel_dict["url"] + obj = obj_dict.get("url") + objtext = obj_dict.get("text") relname = resource_name(rel) - if relname == 'label': - if obj_dict['lang'] == 'en': + if relname == "label": + if obj_dict["lang"] == "en": synset_labels[subj].append(objtext) - elif relname == 'sameAs': + elif relname == "sameAs": if obj.startswith(WN20_URL): # If we have a link to RDF WordNet 2.0, the URL (URI? IRI?) # will contain a standardized label for this concept, which @@ -128,22 +127,29 @@ def run_wordnet(input_file, output_file): # a number of labels in no particular order, making it hard to # determine from 3.1 alone what to name a category. objname = resource_name(obj) - parts = objname.split('-')[1:-2] + parts = objname.split("-")[1:-2] # Handle missing apostrophes - label = '-'.join(parts).replace('_s_', "'s_").replace('_s-', "'s_").replace("s__", "s'_").replace("s_-", "s'-").replace('_', ' ') + label = ( + "-".join(parts) + .replace("_s_", "'s_") + .replace("_s-", "'s_") + .replace("s__", "s'_") + .replace("s_-", "s'-") + .replace("_", " ") + ) synset_canonical_labels[subj] = label - elif relname == 'domain_category': + elif relname == "domain_category": synset_categories[subj] = obj - elif relname == 'lexical_domain': + elif relname == "lexical_domain": target = resource_name(obj) - if '.' 
in target: - domain = target.split('.')[1] + if "." in target: + domain = target.split(".")[1] synset_domains[subj] = domain - elif relname == 'gloss': + elif relname == "gloss": synset_glosses[subj] = objtext - elif relname == 'reference': + elif relname == "reference": lemma = resource_name(subj) synset = obj synset_senses[synset].append(lemma) @@ -153,8 +159,9 @@ def run_wordnet(input_file, output_file): for synset, values in synset_labels.items(): values.sort(key=lambda label: (label in used_labels,) + label_sort_key(label)) if ( - synset not in synset_canonical_labels or - synset_canonical_labels[synset][0].isupper() and synset_domains.get(synset) == 'person' + synset not in synset_canonical_labels + or synset_canonical_labels[synset][0].isupper() + and synset_domains.get(synset) == "person" ): label = values[0] synset_canonical_labels[synset] = label @@ -165,63 +172,71 @@ def run_wordnet(input_file, output_file): category_name = synset_canonical_labels[synset_categories[synset]] else: category_name = synset_domains.get(synset, None) - synset_no_fragment = synset.split('#')[0] + synset_no_fragment = synset.split("#")[0] pos = synset_no_fragment[-1].lower() - assert pos in 'nvarsp', synset - if pos == 's': - pos = 'a' - elif pos == 'p': - pos = '-' - if category_name in ('pert', 'all', 'tops'): + assert pos in "nvarsp", synset + if pos == "s": + pos = "a" + elif pos == "p": + pos = "-" + if category_name in ("pert", "all", "tops"): category_name = None synset_disambig[synset] = (pos, category_name) canon = synset_canonical_labels[synset] - canon_uri = standardized_concept_uri('en', canon, pos, 'wn', category_name) + canon_uri = standardized_concept_uri("en", canon, pos, "wn", category_name) synset_uris[synset] = canon_uri for label in labels: if label != canon: - other_uri = standardized_concept_uri('en', label, pos, 'wn', category_name) - rel_uri = '/r/Synonym' - surface = '[[{0}]] is a synonym of [[{1}]]'.format(label, canon) + other_uri = standardized_concept_uri( + "en", label, pos, "wn", category_name + ) + rel_uri = "/r/Synonym" + surface = "[[{0}]] is a synonym of [[{1}]]".format(label, canon) edge = make_edge( - rel_uri, other_uri, canon_uri, dataset=DATASET, surfaceText=surface, - license=Licenses.cc_attribution, sources=[SOURCE], weight=2.0 + rel_uri, + other_uri, + canon_uri, + dataset=DATASET, + surfaceText=surface, + license=Licenses.cc_attribution, + sources=[SOURCE], + weight=2.0, ) out.write(edge) - quads = parse_nquads(open(input_file, encoding='utf-8')) + quads = parse_nquads(open(input_file, encoding="utf-8")) for subj_dict, rel_dict, obj_dict, _graph in quads: - if 'url' not in subj_dict or 'url' not in rel_dict: + if "url" not in subj_dict or "url" not in rel_dict: continue - subj = subj_dict['url'] - rel = rel_dict['url'] - obj = obj_dict.get('url') + subj = subj_dict["url"] + rel = rel_dict["url"] + obj = obj_dict.get("url") relname = resource_name(rel) if relname in REL_MAPPING: pos, sense = synset_disambig.get(subj, (None, None)) - if relname == 'hypernym' and pos == 'v': - relname = 'hypernym-v' + if relname == "hypernym" and pos == "v": + relname = "hypernym-v" rel, frame = REL_MAPPING[relname] reversed_frame = False - if rel.startswith('~'): + if rel.startswith("~"): rel = rel[1:] reversed_frame = True - rel_uri = '/r/' + rel + rel_uri = "/r/" + rel if obj is not None: obj_uri = synset_uris.get(obj) if obj not in synset_canonical_labels: continue obj_label = synset_canonical_labels[obj] else: - text = obj_dict['text'] + text = obj_dict["text"] # Some 
WordNets use strings with "!" in them to indicate # out-of-band information, such as a missing translation - if (not text) or '!' in text: + if (not text) or "!" in text: continue - lang = obj_dict['lang'] - obj_uri = standardized_concept_uri(lang, text, pos, 'wn', sense) + lang = obj_dict["lang"] + obj_uri = standardized_concept_uri(lang, text, pos, "wn", sense) obj_label = text if subj not in synset_uris or subj not in synset_canonical_labels: @@ -229,7 +244,7 @@ def run_wordnet(input_file, output_file): subj_uri = synset_uris[subj] subj_label = synset_canonical_labels[subj] license = Licenses.cc_attribution - langcode = subj_uri.split('/')[2] + langcode = subj_uri.split("/")[2] if langcode in SHAREALIKE_LANGUAGES: license = Licenses.cc_sharealike @@ -237,19 +252,30 @@ def run_wordnet(input_file, output_file): subj_uri, obj_uri = obj_uri, subj_uri subj_label, obj_label = obj_label, subj_label - surface = frame.format('[[%s]]' % subj_label, '[[%s]]' % obj_label) + surface = frame.format("[[%s]]" % subj_label, "[[%s]]" % obj_label) edge = make_edge( - rel_uri, subj_uri, obj_uri, dataset=DATASET, surfaceText=surface, - license=license, sources=[SOURCE], weight=2.0 + rel_uri, + subj_uri, + obj_uri, + dataset=DATASET, + surfaceText=surface, + license=license, + sources=[SOURCE], + weight=2.0, ) out.write(edge) for wn_url in sorted(synset_uris): cn_uri = synset_uris[wn_url] edge = make_edge( - '/r/ExternalURL', cn_uri, wn_url, dataset=DATASET, - license=Licenses.cc_sharealike, sources=[SOURCE], weight=1.0 + "/r/ExternalURL", + cn_uri, + wn_url, + dataset=DATASET, + license=Licenses.cc_sharealike, + sources=[SOURCE], + weight=1.0, ) out.write(edge) diff --git a/conceptnet5/util/__init__.py b/conceptnet5/util/__init__.py index f2c5db8f..188be3d9 100644 --- a/conceptnet5/util/__init__.py +++ b/conceptnet5/util/__init__.py @@ -1,7 +1,12 @@ -import pkg_resources import os -DATA_DIR = os.environ.get('CONCEPTNET_DATA') or os.environ.get('CONCEPTNET_BUILD_DATA') or os.path.expanduser('~/.conceptnet5') +import pkg_resources + +DATA_DIR = ( + os.environ.get('CONCEPTNET_DATA') + or os.environ.get('CONCEPTNET_BUILD_DATA') + or os.path.expanduser('~/.conceptnet5') +) if not os.path.exists(DATA_DIR): DATA_DIR = 'data' diff --git a/conceptnet5/util/sounds_like.py b/conceptnet5/util/sounds_like.py index 31c31a5e..24e7c6bc 100644 --- a/conceptnet5/util/sounds_like.py +++ b/conceptnet5/util/sounds_like.py @@ -1,8 +1,8 @@ -from __future__ import with_statement, print_function, unicode_literals, division from conceptnet5.util import get_support_data_filename - PHONETIC_DICT = {} + + def _setup(): """ Read the dictionary file, creating a mapping from words to their @@ -12,10 +12,13 @@ def _setup(): """ with open(get_support_data_filename('cmudict.0.7a')) as rhymelist: for line in rhymelist: - if line.startswith(';;;'): continue + if line.startswith(';;;'): + continue word, phon = line.strip().split(' ') phon = phon.split(' ') PHONETIC_DICT[word] = phon + + _setup() @@ -58,17 +61,17 @@ def edit_distance(list1, list2): """ m = len(list1) n = len(list2) - data = [[0 for col in range(n+1)] for row in range(m+1)] - for col in range(n+1): + data = [[0 for col in range(n + 1)] for row in range(m + 1)] + for col in range(n + 1): data[0][col] = col - for row in range(m+1): + for row in range(m + 1): data[row][0] = row - for a in range(1, m+1): - for b in range(1, n+1): - if list1[a-1] == list2[b-1]: - data[a][b] = data[a-1][b-1] + for a in range(1, m + 1): + for b in range(1, n + 1): + if list1[a - 1] == list2[b - 1]: + 
data[a][b] = data[a - 1][b - 1] else: - data[a][b] = 1 + min(data[a-1][b], data[a][b-1], data[a-1][b-1]) + data[a][b] = 1 + min(data[a - 1][b], data[a][b - 1], data[a - 1][b - 1]) return data[m][n] @@ -87,11 +90,11 @@ def longest_match(list1, list2): """ m = len(list1) n = len(list2) - data = [[0 for col in range(n+1)] for row in range(m+1)] - for a in range(1, m+1): - for b in range(1, n+1): - if list1[a-1] == list2[b-1]: - data[a][b] = 1 + data[a-1][b-1] + data = [[0 for col in range(n + 1)] for row in range(m + 1)] + for a in range(1, m + 1): + for b in range(1, n + 1): + if list1[a - 1] == list2[b - 1]: + data[a][b] = 1 + data[a - 1][b - 1] else: data[a][b] = 0 maxes = [max(row) for row in data] @@ -166,7 +169,7 @@ def scaled_prefix_match(list1, list2): """ The length of the longest common prefix between two lists, as a proportion of their minimum length. - + >>> scaled_prefix_match('test', 'toast') 0.25 """ @@ -190,10 +193,12 @@ def combined_score(list1, list2): This measure is the average of the four similarity measures above. """ - return (scaled_edit_distance_match(list1, list2) - + scaled_suffix_match(list1, list2) - + scaled_prefix_match(list1, list2) - + scaled_longest_match(list1, list2)) / 4 + return ( + scaled_edit_distance_match(list1, list2) + + scaled_suffix_match(list1, list2) + + scaled_prefix_match(list1, list2) + + scaled_longest_match(list1, list2) + ) / 4 def _sounds_like_score(text1, text2): @@ -202,8 +207,10 @@ def _sounds_like_score(text1, text2): spelling or their phonetics. The higher this is, the more likely it is that one is a 'pun' on the other. """ - result = max(combined_score(text1.replace(' ', ''), text2.replace(' ', '')), - combined_score(get_phonetic(text1), get_phonetic(text2))) + result = max( + combined_score(text1.replace(' ', ''), text2.replace(' ', '')), + combined_score(get_phonetic(text1), get_phonetic(text2)), + ) return result @@ -222,8 +229,7 @@ def sounds_like_score(target, clue): subscores = [] for word in clue.split(): subscores.append(_sounds_like_score(target, word)) - scores = [_sounds_like_score(target, clue), - sum(subscores) / len(subscores)] + scores = [_sounds_like_score(target, clue), sum(subscores) / len(subscores)] return max(scores) @@ -236,21 +242,21 @@ def test(cutoff=0.35): assert sounds_like_score('research', 're search') > cutoff assert sounds_like_score('feet', 'eat') > cutoff assert sounds_like_score('mother', 'other') > cutoff - assert sounds_like_score('fish', 'swish') > cutoff - assert sounds_like_score('heat', 'feat meat') > cutoff - assert sounds_like_score('love', 'above') > cutoff - assert sounds_like_score('love', 'of') > cutoff + assert sounds_like_score('fish', 'swish') > cutoff + assert sounds_like_score('heat', 'feat meat') > cutoff + assert sounds_like_score('love', 'above') > cutoff + assert sounds_like_score('love', 'of') > cutoff # Negative tests: these are not sufficiently similar, and should be # less than the cutoff assert sounds_like_score('spam', 'eggs') < cutoff assert sounds_like_score('cow', 'logical') < cutoff assert sounds_like_score('sister', 'brother') < cutoff - assert sounds_like_score('a', 'b') < cutoff - assert sounds_like_score('fish', 'chips') < cutoff - assert sounds_like_score('behind', 'not') < cutoff - assert sounds_like_score('name', 'nomenclature') < cutoff - assert sounds_like_score('clothing', 'covering') < cutoff + assert sounds_like_score('a', 'b') < cutoff + assert sounds_like_score('fish', 'chips') < cutoff + assert sounds_like_score('behind', 'not') < cutoff + assert 
sounds_like_score('name', 'nomenclature') < cutoff + assert sounds_like_score('clothing', 'covering') < cutoff assert sounds_like_score('love', 'of another') < cutoff diff --git a/conceptnet5/util/whereami.py b/conceptnet5/util/whereami.py index 83ff5f07..c06f19b7 100644 --- a/conceptnet5/util/whereami.py +++ b/conceptnet5/util/whereami.py @@ -1,7 +1,8 @@ -import conceptnet5 import inspect import os +import conceptnet5 + def get_code_base(): return os.path.dirname(inspect.getsourcefile(conceptnet5)) diff --git a/conceptnet5/vectors/cli.py b/conceptnet5/vectors/cli.py index 24f02748..5a9f8d1a 100644 --- a/conceptnet5/vectors/cli.py +++ b/conceptnet5/vectors/cli.py @@ -1,17 +1,18 @@ -import click from os import path +import click + from .debias import de_bias_frame -from .evaluation import wordsim, analogy, bias +from .evaluation import analogy, bias, wordsim from .evaluation.compare import compare_embeddings, graph_comparison from .formats import ( - convert_glove, - convert_word2vec, convert_fasttext, + convert_glove, convert_polyglot, + convert_word2vec, + export_text, load_hdf, save_hdf, - export_text, save_labels, save_npy, ) @@ -19,7 +20,7 @@ from .miniaturize import miniaturize from .propagate import sharded_propagate from .query import VectorSpaceWrapper -from .retrofit import sharded_retrofit, join_shards +from .retrofit import join_shards, sharded_retrofit from .transforms import make_big_frame, make_small_frame ANALOGY_FILENAME = 'data/raw/analogy/SAT-package-V3.txt' diff --git a/conceptnet5/vectors/debias.py b/conceptnet5/vectors/debias.py index 0b7313df..d2b5d7ad 100644 --- a/conceptnet5/vectors/debias.py +++ b/conceptnet5/vectors/debias.py @@ -3,8 +3,7 @@ from sklearn import svm from sklearn.preprocessing import normalize -from conceptnet5.vectors import standardized_uri, normalize_vec - +from conceptnet5.vectors import normalize_vec, standardized_uri # A list of English words referring to nationalities, nations, ethnicities, and # religions. Our goal is to prevent ConceptNet from learning insults and @@ -361,11 +360,11 @@ def make_shard_endpoints(total_length, shard_size=int(1e6)): """ - Partition the half-open integer interval [0, total_length) into a - sequence of half-open subintervals [s0,e0), [s1,e1), ... [s_n, e_n) - such that s0 = 0, s_(k+1) = e_k, e_n = total_length, and each of these - subintervals (except possibly the last) has length equal to the given - shard_size. Return the sequence of pairs of endpoints of the + Partition the half-open integer interval [0, total_length) into a + sequence of half-open subintervals [s0,e0), [s1,e1), ... [s_n, e_n) + such that s0 = 0, s_(k+1) = e_k, e_n = total_length, and each of these + subintervals (except possibly the last) has length equal to the given + shard_size. Return the sequence of pairs of endpoints of the subintervals. """ shard_end = 0 @@ -595,11 +594,15 @@ def de_bias_frame(frame): The resulting space attempts not to learn stereotyped associations with anyone's race, color, religion, national origin, sex, gender presentation, or sexual orientation. - - The input frame is modified in-place; this can save considerable memory + + The input frame is modified in-place; this can save considerable memory with realistically sized semantic spaces. 
""" de_bias_category(frame, PEOPLE_BY_ETHNICITY, CULTURE_PREJUDICES + SEX_PREJUDICES) de_bias_category(frame, PEOPLE_BY_BELIEF, CULTURE_PREJUDICES + SEX_PREJUDICES) - de_bias_category(frame, FEMALE_WORDS + MALE_WORDS + ORIENTATION_WORDS + AGE_WORDS, CULTURE_PREJUDICES + SEX_PREJUDICES) + de_bias_category( + frame, + FEMALE_WORDS + MALE_WORDS + ORIENTATION_WORDS + AGE_WORDS, + CULTURE_PREJUDICES + SEX_PREJUDICES + ) de_bias_binary(frame, GENDER_NEUTRAL_WORDS, GENDERED_WORDS, MALE_WORDS, FEMALE_WORDS) diff --git a/conceptnet5/vectors/evaluation/analogy.py b/conceptnet5/vectors/evaluation/analogy.py index 8d21c490..b512e470 100644 --- a/conceptnet5/vectors/evaluation/analogy.py +++ b/conceptnet5/vectors/evaluation/analogy.py @@ -3,13 +3,16 @@ import numpy as np import pandas as pd -import wordfreq -from scipy.stats import spearmanr, hmean +from scipy.stats import hmean, spearmanr from statsmodels.stats.proportion import proportion_confint +import wordfreq from conceptnet5.util import get_support_data_filename from conceptnet5.vectors import standardized_uri -from conceptnet5.vectors.evaluation.wordsim import confidence_interval, empty_comparison_table +from conceptnet5.vectors.evaluation.wordsim import ( + confidence_interval, + empty_comparison_table, +) from conceptnet5.vectors.query import VectorSpaceWrapper @@ -21,7 +24,7 @@ def read_google_analogies(filename): [standardized_uri('en', term) for term in line.rstrip().split(' ')] for line in open(filename, encoding='utf-8') if not line.startswith(':') - ] + ] return quads @@ -43,12 +46,16 @@ def read_turney_analogies(filename): # Line 0 is a header we can discard. raw_pairs = [qline.split(' ')[:2] for qline in question_lines[1:]] - concept_pairs = [tuple(standardized_uri('en', term) for term in pair) for pair - in raw_pairs] + concept_pairs = [ + tuple(standardized_uri('en', term) for term in pair) + for pair in raw_pairs + ] # The first of the pairs we got is the prompt pair. The others are # answers (a) through (e). - questions.append((concept_pairs[0], concept_pairs[1:], answer_index)) + questions.append( + (concept_pairs[0], concept_pairs[1:], answer_index) + ) question_lines.clear() else: question_lines.append(line) @@ -164,14 +171,18 @@ def read_bats(category): quads = [] for i in range(len(pairs)): first_pair = pairs[i] - first_pair[1] = first_pair[1][0] # select only one term for b1, even if more may be available + first_pair[1] = first_pair[1][ + 0 + ] # select only one term for b1, even if more may be available second_pairs = [pair for j, pair in enumerate(pairs) if j != i] for second_pair in second_pairs: quad = [] # the first three elements of a quad are the two terms in first_pair and the first # term of the second_pair - quad.extend([standardized_uri('en', term) for term in first_pair + second_pair[:1]]) + quad.extend( + [standardized_uri('en', term) for term in first_pair + second_pair[:1]] + ) # if the second element of the second pair (b2) is a list, it means there are multiple # correct answers for b2. We want to keep all of them. 
@@ -195,11 +206,7 @@ def analogy_func(wrap, a1, b1, a2, weight_direct=2 / 3, weight_transpose=1 / 3): vb1 = wrap.get_vector(b1) va2 = wrap.get_vector(a2) - return ( - (vb1 - va1) * weight_direct + - (va2 - va1) * weight_transpose + - vb1 - ) + return (vb1 - va1) * weight_direct + (va2 - va1) * weight_transpose + vb1 def best_analogy_3cosmul(wrap, subframe, a1, b1, a2): @@ -237,15 +244,20 @@ def pairwise_analogy_func(wrap, a1, b1, a2, b2, weight_direct, weight_transpose) value = ( weight_direct * (vb2 - va2).dot(vb1 - va1) + weight_transpose * (vb2 - vb1).dot(va2 - va1) - + vb2.dot(vb1) + va2.dot(va1) + + vb2.dot(vb1) + + va2.dot(va1) ) return value -def eval_pairwise_analogies(vectors, eval_filename, weight_direct, weight_transpose, subset='all'): +def eval_pairwise_analogies( + vectors, eval_filename, weight_direct, weight_transpose, subset='all' +): total = 0 correct = 0 - for idx, (prompt, choices, answer) in enumerate(read_turney_analogies(eval_filename)): + for idx, (prompt, choices, answer) in enumerate( + read_turney_analogies(eval_filename) + ): # Enable an artificial training/test split if subset == 'all' or (subset == 'dev') == (idx % 2 == 0): a1, b1 = prompt @@ -253,7 +265,9 @@ def eval_pairwise_analogies(vectors, eval_filename, weight_direct, weight_transp for choice in choices: a2, b2 = choice choice_values.append( - pairwise_analogy_func(vectors, a1, b1, a2, b2, weight_direct, weight_transpose) + pairwise_analogy_func( + vectors, a1, b1, a2, b2, weight_direct, weight_transpose + ) ) our_answer = np.argmax(choice_values) if our_answer == answer: @@ -279,15 +293,36 @@ def optimize_weights(func, *args): """ print('Tuning analogy weights') weights = [ - 0., 0.05, 0.1, 0.15, 0.2, 0.3, 0.35, 0.4, 0.5, 0.6, 0.65, 0.7, 0.8, - 0.9, 1.0, 1.5, 2.0, 2.5, 3.0 + 0., + 0.05, + 0.1, + 0.15, + 0.2, + 0.3, + 0.35, + 0.4, + 0.5, + 0.6, + 0.65, + 0.7, + 0.8, + 0.9, + 1.0, + 1.5, + 2.0, + 2.5, + 3.0, ] best_weights = None best_acc = 0. 
for weight_direct in weights: for weight_transpose in weights: - scores = func(*args, weight_direct=weight_direct, weight_transpose=weight_transpose, - subset='dev') + scores = func( + *args, + weight_direct=weight_direct, + weight_transpose=weight_transpose, + subset='dev' + ) if isinstance(scores, list): # If a function to optimize returns two results, like eval_semeval2012_analogies(), # take their harmonic mean to compute the weights optimal for both results @@ -339,10 +374,10 @@ def eval_open_vocab_analogies(vectors, quads, vocab_size=200000, verbose=False): for quad in quads: prompt = quad[:3] answer = quad[3] - result = best_analogy_3cosmul( - vectors, tframe, *prompt + result = best_analogy_3cosmul(vectors, tframe, *prompt) + is_correct = (isinstance(answer, list) and result in answer) or ( + result == answer ) - is_correct = (isinstance(answer, list) and result in answer) or (result == answer) if is_correct: correct += 1 else: @@ -376,16 +411,18 @@ def choose_vocab(quads, vocab_size): vocab = [ standardized_uri('en', word) for word in sorted(set([quad[3] for quad in quads])) - ] + ] else: vocab = [ standardized_uri('en', word) for word in wordfreq.top_n_list('en', vocab_size) - ] + ] return vocab -def eval_semeval2012_analogies(vectors, weight_direct, weight_transpose, subset, subclass): +def eval_semeval2012_analogies( + vectors, weight_direct, weight_transpose, subset, subclass +): """ For a set of test pairs: * Compute a Spearman correlation coefficient between the ranks produced by vectors and @@ -394,7 +431,9 @@ def eval_semeval2012_analogies(vectors, weight_direct, weight_transpose, subset, """ train_pairs = read_train_pairs_semeval2012(subset, subclass) test_questions = read_test_questions_semeval2012(subset, subclass) - pairqnum2least, pairqnum2most = read_turk_answers_semeval2012(subset, subclass, test_questions) + pairqnum2least, pairqnum2most = read_turk_answers_semeval2012( + subset, subclass, test_questions + ) turk_rank = read_turk_ranks_semeval2012(subset, subclass) pairs_to_rank = [pair for pair, score in turk_rank] @@ -404,12 +443,15 @@ def eval_semeval2012_analogies(vectors, weight_direct, weight_transpose, subset, rank_pair_scores = [] for train_pair in train_pairs: pair_to_rank = pair.strip().replace('"', '').split(':') - score = pairwise_analogy_func(vectors, standardized_uri('en', train_pair[0]), - standardized_uri('en', train_pair[1]), - standardized_uri('en', pair_to_rank[0]), - standardized_uri('en', pair_to_rank[1]), - weight_direct, - weight_transpose) + score = pairwise_analogy_func( + vectors, + standardized_uri('en', train_pair[0]), + standardized_uri('en', train_pair[1]), + standardized_uri('en', pair_to_rank[0]), + standardized_uri('en', pair_to_rank[1]), + weight_direct, + weight_transpose, + ) rank_pair_scores.append(score) our_pair_scores[pair] = np.mean(rank_pair_scores) @@ -456,8 +498,12 @@ def eval_semeval2012_analogies(vectors, weight_direct, weight_transpose, subset, # Compute an accuracy score on MaxDiff questions maxdiff = (correct_least + correct_most) / (2 * total) - low_maxdiff, high_maxdiff = proportion_confint((correct_least + correct_most), (2 * total)) - maxdiff_results = pd.Series([maxdiff, low_maxdiff, high_maxdiff], index=['acc', 'low', 'high']) + low_maxdiff, high_maxdiff = proportion_confint( + (correct_least + correct_most), (2 * total) + ) + maxdiff_results = pd.Series( + [maxdiff, low_maxdiff, high_maxdiff], index=['acc', 'low', 'high'] + ) return [maxdiff_results, spearman_results] @@ -471,9 +517,9 @@ def 
eval_semeval2012_global(vectors, weight_direct, weight_transpose, subset): for subclass in product(range(1, 11), 'a b c d e f g h i j'): subclass = ''.join([str(element) for element in subclass]) try: - maxdiff, spearman = eval_semeval2012_analogies(vectors, weight_direct, - weight_transpose, - subset, subclass) + maxdiff, spearman = eval_semeval2012_analogies( + vectors, weight_direct, weight_transpose, subset, subclass + ) spearman_scores.append(spearman) maxdiff_scores.append(maxdiff) except FileNotFoundError: @@ -487,8 +533,10 @@ def eval_semeval2012_global(vectors, weight_direct, weight_transpose, subset): spearman_output.append(average_spearman_score) maxdiff_output.append(average_maxdiff_score) - return [pd.Series(maxdiff_output, index=['acc', 'low', 'high']), - pd.Series(spearman_output, index=['acc', 'low', 'high'])] + return [ + pd.Series(maxdiff_output, index=['acc', 'low', 'high']), + pd.Series(spearman_output, index=['acc', 'low', 'high']), + ] def eval_bats_category(vectors, category, vocab_size=200000, verbose=False): @@ -500,8 +548,14 @@ def eval_bats_category(vectors, category, vocab_size=200000, verbose=False): return category_results -def evaluate(frame, analogy_filename, subset='test', tune_analogies=True, scope='global', - google_vocab_size=200000): +def evaluate( + frame, + analogy_filename, + subset='test', + tune_analogies=True, + scope='global', + google_vocab_size=200000, +): """ Run SAT and Semeval12-2 evaluations. @@ -524,21 +578,23 @@ def evaluate(frame, analogy_filename, subset='test', tune_analogies=True, scope= results = empty_comparison_table() if tune_analogies: - sat_weights = optimize_weights(eval_pairwise_analogies, vectors, analogy_filename) + sat_weights = optimize_weights( + eval_pairwise_analogies, vectors, analogy_filename + ) semeval_weights = optimize_weights(eval_semeval2012_global, vectors) else: sat_weights = (0.35, 0.65) semeval_weights = (0.3, 0.35) - sat_results = eval_pairwise_analogies(vectors, - analogy_filename, - sat_weights[0], - sat_weights[1], - subset) + sat_results = eval_pairwise_analogies( + vectors, analogy_filename, sat_weights[0], sat_weights[1], subset + ) results.loc['sat-analogies'] = sat_results for gsubset in ['semantic', 'syntactic']: - google_results = eval_google_analogies(vectors, subset=gsubset, vocab_size=google_vocab_size) + google_results = eval_google_analogies( + vectors, subset=gsubset, vocab_size=google_vocab_size + ) results.loc['google-%s' % gsubset] = google_results # There's no meaningful "all" subset for semeval12, because the dev and @@ -548,21 +604,22 @@ def evaluate(frame, analogy_filename, subset='test', tune_analogies=True, scope= else: semeval12_subset = 'test' if scope == 'global': - maxdiff_score, spearman_score = eval_semeval2012_global(vectors, - semeval_weights[0], - semeval_weights[1], - semeval12_subset) + maxdiff_score, spearman_score = eval_semeval2012_global( + vectors, semeval_weights[0], semeval_weights[1], semeval12_subset + ) results.loc['semeval12-spearman'] = spearman_score results.loc['semeval12-maxdiff'] = maxdiff_score else: for subclass in product(range(1, 11), 'a b c d e f g h i j'): subclass = ''.join([str(element) for element in subclass]) try: - maxdiff_score, spearman_score = eval_semeval2012_analogies(vectors, - semeval_weights[0], - semeval_weights[1], - semeval12_subset, - subclass) + maxdiff_score, spearman_score = eval_semeval2012_analogies( + vectors, + semeval_weights[0], + semeval_weights[1], + semeval12_subset, + subclass, + ) 
results.loc['semeval12-{}-spearman'.format(subclass)] = spearman_score results.loc['semeval12-{}-maxdiff'.format(subclass)] = maxdiff_score except FileNotFoundError: @@ -578,7 +635,9 @@ def evaluate(frame, analogy_filename, subset='test', tune_analogies=True, scope= if scope == 'global': average_scores = [] for interval in ['acc', 'low', 'high']: - average_scores.append(np.mean([result[interval] for name, result in bats_results])) + average_scores.append( + np.mean([result[interval] for name, result in bats_results]) + ) results.loc['bats'] = pd.Series(average_scores, index=['acc', 'low', 'high']) else: for name, result in bats_results: diff --git a/conceptnet5/vectors/evaluation/bias.py b/conceptnet5/vectors/evaluation/bias.py index cd260038..de302f9a 100644 --- a/conceptnet5/vectors/evaluation/bias.py +++ b/conceptnet5/vectors/evaluation/bias.py @@ -2,14 +2,14 @@ import pandas as pd import scipy -from conceptnet5.vectors import standardized_uri, normalize_vec, get_vector -from conceptnet5.vectors.transforms import ( - l2_normalize_rows, subtract_mean_vector -) +from conceptnet5.vectors import get_vector, normalize_vec, standardized_uri from conceptnet5.vectors.debias import ( FEMALE_WORDS, MALE_WORDS, PEOPLE_BY_BELIEF, PEOPLE_BY_ETHNICITY, get_category_axis, get_vocabulary_vectors ) +from conceptnet5.vectors.transforms import ( + l2_normalize_rows, subtract_mean_vector +) # A list of gender-stereotyped pairs, from Bolukbasi et al.: # https://arxiv.org/pdf/1607.06520.pdf @@ -195,12 +195,16 @@ def measure_bias(frame): - Coarse-grained ethnicity - Religious beliefs """ - gender_binary_axis = normalize_vec(get_category_axis(frame, FEMALE_WORDS) - get_category_axis(frame, MALE_WORDS)) + gender_binary_axis = normalize_vec( + get_category_axis(frame, FEMALE_WORDS) - get_category_axis(frame, MALE_WORDS) + ) gender_bias_numbers = [] for female_biased_word, male_biased_word in GENDER_BIAS_PAIRS: female_biased_uri = standardized_uri('en', female_biased_word) male_biased_uri = standardized_uri('en', male_biased_word) - diff = normalize_vec(get_vector(frame, female_biased_uri) - get_vector(frame, male_biased_uri)).dot(gender_binary_axis) + diff = normalize_vec( + get_vector(frame, female_biased_uri) - get_vector(frame, male_biased_uri) + ).dot(gender_binary_axis) gender_bias_numbers.append(diff) mean = np.mean(gender_bias_numbers) diff --git a/conceptnet5/vectors/evaluation/compare.py b/conceptnet5/vectors/evaluation/compare.py index 7868d477..edfbc881 100644 --- a/conceptnet5/vectors/evaluation/compare.py +++ b/conceptnet5/vectors/evaluation/compare.py @@ -1,8 +1,15 @@ -from conceptnet5.vectors.evaluation import analogy, story, wordsim, bias -from conceptnet5.vectors.formats import load_hdf, save_hdf, load_glove, load_fasttext, load_word2vec_bin import numpy as np import pandas as pd +from conceptnet5.vectors.evaluation import analogy, story, wordsim +from conceptnet5.vectors.formats import ( + load_fasttext, + load_glove, + load_hdf, + load_word2vec_bin, + save_hdf, +) + # The filename of Turney's SAT evaluation data, which cannot be distributed # with this code and must be requested from Peter Turney. 
ANALOGY_FILENAME = 'data/raw/analogy/SAT-package-V3.txt' @@ -26,7 +33,9 @@ def compare_embeddings(filenames, subset='dev', run_analogies=False): for filename in filenames: print(filename) frame = load_any_embeddings(filename) - wordsim_results = wordsim.evaluate(frame, subset=subset, semeval_scope='per-language') + wordsim_results = wordsim.evaluate( + frame, subset=subset, semeval_scope='per-language' + ) story_results = story.evaluate(frame, subset=subset).to_frame('story-cloze').T frame_results = [wordsim_results, story_results] @@ -34,11 +43,7 @@ def compare_embeddings(filenames, subset='dev', run_analogies=False): analogy_results = analogy.evaluate(frame, ANALOGY_FILENAME, subset=subset) frame_results.append(analogy_results) - results.append( - pd.concat( - frame_results, axis=0 - ) - ) + results.append(pd.concat(frame_results, axis=0)) result = pd.concat(results, keys=filenames) save_hdf(result, '/tmp/numberbatch-comparison.h5') return result @@ -46,23 +51,33 @@ def compare_embeddings(filenames, subset='dev', run_analogies=False): def graph_comparison(table_filename, out_filename): import matplotlib.pyplot as plt + result = load_hdf(table_filename) # plt.style.use('bmh') plt.rcParams['xtick.labelsize'] = 'x-large' plt.rcParams['ytick.labelsize'] = 'x-large' evals = ['men3000', 'rw', 'mturk', 'ws353', 'semeval-2a-en'] - eval_labels = ['MEN-3000', 'Rare Words', 'MTurk-771', 'WordSim-353', 'SemEval 2017-2a'] + eval_labels = [ + 'MEN-3000', + 'Rare Words', + 'MTurk-771', + 'WordSim-353', + 'SemEval 2017-2a', + ] prop_cycle = list(plt.rcParams['axes.prop_cycle']) colors = [props['color'] for props in prop_cycle] systems = [ - ('word2vec Google News', 'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'), + ( + 'word2vec Google News', + 'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz', + ), ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'), ('GloVe renormalized', 'data/vectors/glove12-840B.h5'), ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'), # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'), - ('ConceptNet Numberbatch', 'data/vectors/numberbatch.h5') + ('ConceptNet Numberbatch', 'data/vectors/numberbatch.h5'), ] width = 0.84 / len(systems) ind = np.arange(len(evals)) @@ -72,39 +87,64 @@ def graph_comparison(table_filename, out_filename): eval_table = result.xs(syspath, level=0).loc[evals] value = eval_table['acc'] errs = [eval_table['high'] - value, value - eval_table['low']] - ax.bar(ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k') + ax.bar( + ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k' + ) ax.set_ylim(0.0, 1.0) ax.set_yticks(np.arange(0.0, 1.1, 0.1)) - ax.legend([name for (name, path) in systems], bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.) 
+ ax.legend( + [name for (name, path) in systems], + bbox_to_anchor=(1.02, 1), + loc=2, + borderaxespad=0., + ) ax.set_xticks(ind + width * len(systems) / 2) ax.set_xticklabels(eval_labels) ax.xaxis.grid(False) ax.yaxis.grid(True) ax.set_axisbelow(True) - plt.ylabel('Evaluation score (Spearman \N{GREEK SMALL LETTER RHO})', fontsize='x-large') + plt.ylabel( + 'Evaluation score (Spearman \N{GREEK SMALL LETTER RHO})', fontsize='x-large' + ) plt.savefig(out_filename, bbox_inches="tight", dpi=300) def graph_bias_comparison(table_filename, out_filename): import matplotlib.pyplot as plt + result = load_hdf(table_filename) # plt.style.use('bmh') plt.rcParams['xtick.labelsize'] = 'x-large' plt.rcParams['ytick.labelsize'] = 'x-large' - evals = ['gender', 'beliefs', 'ethnicity-coarse', 'ethnicity-fine', 'ethnicity-names'] - eval_labels = ['Gender bias', 'Religious bias', 'Ethnic bias (coarse)', 'Ethnic bias (fine)', 'Bias from names'] + evals = [ + 'gender', + 'beliefs', + 'ethnicity-coarse', + 'ethnicity-fine', + 'ethnicity-names', + ] + eval_labels = [ + 'Gender bias', + 'Religious bias', + 'Ethnic bias (coarse)', + 'Ethnic bias (fine)', + 'Bias from names', + ] prop_cycle = list(plt.rcParams['axes.prop_cycle']) colors = [props['color'] for props in prop_cycle] systems = [ - ('word2vec Google News', 'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz'), + ( + 'word2vec Google News', + 'data/raw/vectors/GoogleNews-vectors-negative300.bin.gz', + ), ('GloVe 1.2 840B', 'data/raw/vectors/glove12.840B.300d.txt.gz'), ('GloVe renormalized', 'data/vectors/glove12-840B.h5'), ('fastText enWP (without OOV)', 'data/raw/vectors/fasttext-wiki-en.vec.gz'), # ('ConceptNet Numberbatch biased', 'data/vectors/numberbatch-biased.h5'), - ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5') + ('ConceptNet Numberbatch 17.04', 'data/vectors/numberbatch.h5'), ] width = 0.84 / len(systems) ind = np.arange(len(evals)) @@ -114,11 +154,18 @@ def graph_bias_comparison(table_filename, out_filename): eval_table = result.xs(syspath, level=0).loc[evals] value = eval_table['bias'] errs = [eval_table['high'] - value, value - eval_table['low']] - ax.bar(ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k') + ax.bar( + ind + i * width, value, width * 0.9, color=colors[i], yerr=errs, ecolor='k' + ) ax.set_ylim(0.0, 0.4) ax.set_yticks(np.arange(0.0, 0.5, 0.1)) - ax.legend([name for (name, path) in systems], bbox_to_anchor=(1.02, 1), loc=2, borderaxespad=0.) 
+ ax.legend( + [name for (name, path) in systems], + bbox_to_anchor=(1.02, 1), + loc=2, + borderaxespad=0., + ) ax.set_xticks(ind + width * len(systems) / 2) ax.set_xticklabels(eval_labels) ax.xaxis.grid(False) diff --git a/conceptnet5/vectors/evaluation/story.py b/conceptnet5/vectors/evaluation/story.py index 9f3eb69d..80769e86 100644 --- a/conceptnet5/vectors/evaluation/story.py +++ b/conceptnet5/vectors/evaluation/story.py @@ -1,9 +1,9 @@ +import pandas as pd +from statsmodels.stats.proportion import proportion_confint + from conceptnet5.util import get_support_data_filename -from conceptnet5.vectors import get_vector, standardized_uri, get_vector, cosine_similarity +from conceptnet5.vectors import cosine_similarity from conceptnet5.vectors.query import VectorSpaceWrapper -from statsmodels.stats.proportion import proportion_confint -import numpy as np -import pandas as pd def read_cloze(filename): @@ -39,7 +39,9 @@ def evaluate(frame, subset='val'): elif subset == 'all': # for the final evaluation, use just the test data subset = 'test' - filename = get_support_data_filename('story-cloze/cloze_test_spring2016_%s.tsv' % subset) + filename = get_support_data_filename( + 'story-cloze/cloze_test_spring2016_%s.tsv' % subset + ) vectors = VectorSpaceWrapper(frame=frame) total = 0 correct = 0 diff --git a/conceptnet5/vectors/evaluation/wordsim.py b/conceptnet5/vectors/evaluation/wordsim.py index 2c323e4b..556ee8e1 100644 --- a/conceptnet5/vectors/evaluation/wordsim.py +++ b/conceptnet5/vectors/evaluation/wordsim.py @@ -1,10 +1,12 @@ -from conceptnet5.util import get_support_data_filename -from conceptnet5.vectors import standardized_uri, get_vector, cosine_similarity -from conceptnet5.vectors.query import VectorSpaceWrapper -from scipy.stats import spearmanr, pearsonr, tmean, hmean from itertools import combinations + import numpy as np import pandas as pd +from scipy.stats import hmean, pearsonr, spearmanr, tmean + +from conceptnet5.util import get_support_data_filename +from conceptnet5.vectors import cosine_similarity, get_vector, standardized_uri +from conceptnet5.vectors.query import VectorSpaceWrapper SAMPLE_SIZES = { 'ws353': 353, @@ -19,14 +21,12 @@ 'scws': 2003, 'pku500-zh': 500, 'jsim-ja': 4429, - 'semeval-2a-en': 500, 'semeval-2a-de': 500, 'semeval-2a-es': 500, 'semeval-2a-it': 500, 'semeval-2a-fa': 500, 'semeval17-2a': 2000, - 'semeval-2b-de-es': 956, 'semeval-2b-de-fa': 888, 'semeval-2b-de-it': 912, @@ -49,7 +49,7 @@ 'Stanford': 'Pennington et al. (2014)', 'UFRGS': 'Salle et al. 
(2016)', 'Google+HL': 'Soricut and Och (2015)', - 'Oxford': 'Botha and Blunsom (2014)' + 'Oxford': 'Botha and Blunsom (2014)', } @@ -62,16 +62,11 @@ def confidence_interval(rho, N): interval = 1.96 / np.sqrt(N - 3) low = z - interval high = z + interval - return pd.Series( - [rho, np.tanh(low), np.tanh(high)], - index=['acc', 'low', 'high'] - ) + return pd.Series([rho, np.tanh(low), np.tanh(high)], index=['acc', 'low', 'high']) def empty_comparison_table(): - return pd.DataFrame( - columns=['acc', 'low', 'high'] - ) + return pd.DataFrame(columns=['acc', 'low', 'high']) def make_comparison_table(scores): @@ -91,293 +86,269 @@ def make_comparison_table(scores): # available # Levy et al., 2015 -COMPARISONS['Bar-Ilan', 'PPMI'] = make_comparison_table({ - 'men3000': .745, - 'mturk': .686, - 'rw': .462, - 'simlex': .393, - 'ws353': .721 # estimate -}) - -COMPARISONS['Bar-Ilan', 'SVD'] = make_comparison_table({ - 'men3000': .778, - 'mturk': .666, - 'rw': .514, - 'simlex': .432, - 'ws353': .733 # estimate -}) - -COMPARISONS['Bar-Ilan', 'SGNS'] = make_comparison_table({ - 'men3000': .774, - 'mturk': .693, - 'rw': .470, - 'simlex': .438, - 'ws353': .729 # estimate -}) - -COMPARISONS['Bar-Ilan', 'GloVe'] = make_comparison_table({ - 'men3000': .729, - 'mturk': .632, - 'rw': .403, - 'simlex': .398, - 'ws353': .654 # estimate -}) - -COMPARISONS['Google', 'word2vec SGNS'] = make_comparison_table({ - 'men3000': .732, - 'rw': .385, - 'ws353': .624, - 'scws': .574 -}) +COMPARISONS['Bar-Ilan', 'PPMI'] = make_comparison_table( + { + 'men3000': .745, + 'mturk': .686, + 'rw': .462, + 'simlex': .393, + 'ws353': .721, # estimate + } +) + +COMPARISONS['Bar-Ilan', 'SVD'] = make_comparison_table( + { + 'men3000': .778, + 'mturk': .666, + 'rw': .514, + 'simlex': .432, + 'ws353': .733, # estimate + } +) + +COMPARISONS['Bar-Ilan', 'SGNS'] = make_comparison_table( + { + 'men3000': .774, + 'mturk': .693, + 'rw': .470, + 'simlex': .438, + 'ws353': .729, # estimate + } +) + +COMPARISONS['Bar-Ilan', 'GloVe'] = make_comparison_table( + { + 'men3000': .729, + 'mturk': .632, + 'rw': .403, + 'simlex': .398, + 'ws353': .654, # estimate + } +) + +COMPARISONS['Google', 'word2vec SGNS'] = make_comparison_table( + {'men3000': .732, 'rw': .385, 'ws353': .624, 'scws': .574} +) # Speer and Chin, 2016 - arXiv:1604.01692v1 -COMPARISONS['Luminoso', 'GloVe'] = make_comparison_table({ - 'rw': .528, - 'men3000': .840, - 'ws353': .798 -}) - -COMPARISONS['Luminoso', 'word2vec SGNS'] = make_comparison_table({ - 'rw': .476, - 'men3000': .778, - 'ws353': .731 -}) - -COMPARISONS['Luminoso', 'Numberbatch 2016.04'] = make_comparison_table({ - 'rw': .596, - 'men3000': .859, - 'ws353': .821 -}) - -COMPARISONS['Luminoso', 'PPMI'] = make_comparison_table({ - 'rw': .420, - 'men3000': .764, - 'ws353': .651, - 'scws': .608 -}) +COMPARISONS['Luminoso', 'GloVe'] = make_comparison_table( + {'rw': .528, 'men3000': .840, 'ws353': .798} +) + +COMPARISONS['Luminoso', 'word2vec SGNS'] = make_comparison_table( + {'rw': .476, 'men3000': .778, 'ws353': .731} +) + +COMPARISONS['Luminoso', 'Numberbatch 2016.04'] = make_comparison_table( + {'rw': .596, 'men3000': .859, 'ws353': .821} +) + +COMPARISONS['Luminoso', 'PPMI'] = make_comparison_table( + {'rw': .420, 'men3000': .764, 'ws353': .651, 'scws': .608} +) # Pennington et al., 2014 -COMPARISONS['Stanford', 'GloVe'] = make_comparison_table({ - 'rw': .477, - 'men3000': .816, - 'ws353': .759 -}) +COMPARISONS['Stanford', 'GloVe'] = make_comparison_table( + {'rw': .477, 'men3000': .816, 'ws353': .759} +) # Joulin et al., 
2016 - "Bag of Tricks" # Rounded-off numbers from the blog post at https://research.facebook.com/blog/fasttext/ -COMPARISONS['Facebook', 'fastText'] = make_comparison_table({ - 'rw': .46, - 'ws353': .73, - 'gur350-de': .69, - 'zg222-de': .37, -}) +COMPARISONS['Facebook', 'fastText'] = make_comparison_table( + {'rw': .46, 'ws353': .73, 'gur350-de': .69, 'zg222-de': .37} +) # Salle et al., 2016 - LexVec # https://github.com/alexandres/lexvec -COMPARISONS['UFRGS', 'LexVec'] = make_comparison_table({ - 'rw': .489, - 'simlex': .384, - 'scws': .652, - 'ws353': .661, - 'men3000': .759, - 'mturk': .655 -}) - -COMPARISONS['Google+HL', 'SG+Morph'] = make_comparison_table({ - 'rw': .418, - 'ws353': .712, - 'gur350-de': .641, - 'zg222-de': .215, - 'ws353-es': .473, -}) - -COMPARISONS['Oxford', 'BB2014'] = make_comparison_table({ - 'rw': .300, - 'ws353': .400, - 'gur350-de': .560, - 'zg222-de': .250 -}) +COMPARISONS['UFRGS', 'LexVec'] = make_comparison_table( + { + 'rw': .489, + 'simlex': .384, + 'scws': .652, + 'ws353': .661, + 'men3000': .759, + 'mturk': .655, + } +) + +COMPARISONS['Google+HL', 'SG+Morph'] = make_comparison_table( + {'rw': .418, 'ws353': .712, 'gur350-de': .641, 'zg222-de': .215, 'ws353-es': .473} +) + +COMPARISONS['Oxford', 'BB2014'] = make_comparison_table( + {'rw': .300, 'ws353': .400, 'gur350-de': .560, 'zg222-de': .250} +) # Comparisons from SemEval results -COMPARISONS['SemEval2017', 'Luminoso'] = make_comparison_table({ - 'semeval-2a-en': .789, - 'semeval-2a-de': .700, - 'semeval-2a-es': .743, - 'semeval-2a-it': .741, - 'semeval-2a-fa': .503, - - 'semeval-2b-en-de': .763, - 'semeval-2b-en-es': .761, - 'semeval-2b-en-it': .776, - 'semeval-2b-en-fa': .598, - 'semeval-2b-de-es': .728, - 'semeval-2b-de-it': .741, - 'semeval-2b-de-fa': .598, - 'semeval-2b-es-it': .753, - 'semeval-2b-es-fa': .627, - 'semeval-2b-it-fa': .604, -}) - -COMPARISONS['SemEval2017', 'Nasari'] = make_comparison_table({ - # This is the baseline system, by Uniroma - 'semeval-2a-en': .682, - 'semeval-2a-de': .514, - 'semeval-2a-es': .600, - 'semeval-2a-it': .596, - 'semeval-2a-fa': .405, - - 'semeval-2b-en-de': .598, - 'semeval-2b-en-es': .633, - 'semeval-2b-en-it': .648, - 'semeval-2b-en-fa': .505, - 'semeval-2b-de-es': .549, - 'semeval-2b-de-it': .561, - 'semeval-2b-de-fa': .458, - 'semeval-2b-es-it': .595, - 'semeval-2b-es-fa': .479, - 'semeval-2b-it-fa': .486, -}) - -COMPARISONS['SemEval2017', 'QLUT'] = make_comparison_table({ - 'semeval-2a-en': .778, -}) - -COMPARISONS['SemEval2017', 'HCCL'] = make_comparison_table({ - 'semeval-2a-en': .687, - 'semeval-2a-de': .594, - 'semeval-2a-es': .701, - 'semeval-2a-it': .651, - 'semeval-2a-fa': .436, - - 'semeval-2b-en-de': .307, - 'semeval-2b-en-es': .087, - 'semeval-2b-en-it': .055, - 'semeval-2b-en-fa': .012, - 'semeval-2b-de-es': .045, - 'semeval-2b-de-it': .037, - 'semeval-2b-de-fa': .023, - 'semeval-2b-es-it': .064, - 'semeval-2b-es-fa': .048, - 'semeval-2b-it-fa': .000, -}) - -COMPARISONS['SemEval2017', 'Mahtab'] = make_comparison_table({ - 'semeval-2a-fa': .715, -}) - -COMPARISONS['SemEval2017', 'hhu'] = make_comparison_table({ - 'semeval-2a-en': .704, - 'semeval-2a-fa': .604, - 'semeval-2b-en-fa': .513, -}) - -COMPARISONS['SemEval2017', 'OoO'] = make_comparison_table({ - 'semeval-2b-en-de': .570, - 'semeval-2b-en-es': .584, - 'semeval-2b-en-it': .584, - 'semeval-2b-de-es': .549, - 'semeval-2b-de-it': .548, - 'semeval-2b-es-it': .570, -}) - -COMPARISONS['SemEval2017', 'SEW'] = make_comparison_table({ - 'semeval-2a-en': .464, - 'semeval-2a-de': .449, - 
'semeval-2a-es': .616, - 'semeval-2a-it': .569, - 'semeval-2a-fa': .393, - - 'semeval-2b-en-de': .464, - 'semeval-2b-en-es': .505, - 'semeval-2b-en-it': .526, - 'semeval-2b-en-fa': .420, - 'semeval-2b-de-es': .530, - 'semeval-2b-de-it': .520, - 'semeval-2b-de-fa': .428, - 'semeval-2b-es-it': .595, - 'semeval-2b-es-fa': .515, - 'semeval-2b-it-fa': .489, -}) - -COMPARISONS['SemEval2017', 'RUFINO'] = make_comparison_table({ - 'semeval-2a-en': .656, - 'semeval-2a-de': .539, - 'semeval-2a-es': .549, - 'semeval-2a-it': .476, - 'semeval-2a-fa': .360, - - 'semeval-2b-en-de': .330, - 'semeval-2b-en-es': .340, - 'semeval-2b-en-it': .342, - 'semeval-2b-en-fa': .373, - 'semeval-2b-de-es': .318, - 'semeval-2b-de-it': .327, - 'semeval-2b-de-fa': .267, - 'semeval-2b-es-it': .356, - 'semeval-2b-es-fa': .300, - 'semeval-2b-it-fa': .249, -}) - -COMPARISONS['SemEval2017', 'Citius'] = make_comparison_table({ - 'semeval-2a-en': .651, - 'semeval-2a-es': .523, - 'semeval-2b-en-es': .577, -}) - -COMPARISONS['SemEval2017', 'l2f'] = make_comparison_table({ - 'semeval-2a-en': .649, -}) - -COMPARISONS['SemEval2017', 'gpv8'] = make_comparison_table({ - 'semeval-2a-en': .555, - 'semeval-2a-de': .347, - 'semeval-2a-it': .499, -}) - -COMPARISONS['SemEval2017', 'MERALI'] = make_comparison_table({ - 'semeval-2a-en': .594, -}) - -COMPARISONS['SemEval2017', 'Amateur'] = make_comparison_table({ - 'semeval-2a-en': .589, -}) - -COMPARISONS['SemEval2017', 'Wild Devs'] = make_comparison_table({ - 'semeval-2a-en': .468, -}) +COMPARISONS['SemEval2017', 'Luminoso'] = make_comparison_table( + { + 'semeval-2a-en': .789, + 'semeval-2a-de': .700, + 'semeval-2a-es': .743, + 'semeval-2a-it': .741, + 'semeval-2a-fa': .503, + 'semeval-2b-en-de': .763, + 'semeval-2b-en-es': .761, + 'semeval-2b-en-it': .776, + 'semeval-2b-en-fa': .598, + 'semeval-2b-de-es': .728, + 'semeval-2b-de-it': .741, + 'semeval-2b-de-fa': .598, + 'semeval-2b-es-it': .753, + 'semeval-2b-es-fa': .627, + 'semeval-2b-it-fa': .604, + } +) + +COMPARISONS['SemEval2017', 'Nasari'] = make_comparison_table( + { + # This is the baseline system, by Uniroma + 'semeval-2a-en': .682, + 'semeval-2a-de': .514, + 'semeval-2a-es': .600, + 'semeval-2a-it': .596, + 'semeval-2a-fa': .405, + 'semeval-2b-en-de': .598, + 'semeval-2b-en-es': .633, + 'semeval-2b-en-it': .648, + 'semeval-2b-en-fa': .505, + 'semeval-2b-de-es': .549, + 'semeval-2b-de-it': .561, + 'semeval-2b-de-fa': .458, + 'semeval-2b-es-it': .595, + 'semeval-2b-es-fa': .479, + 'semeval-2b-it-fa': .486, + } +) + +COMPARISONS['SemEval2017', 'QLUT'] = make_comparison_table({'semeval-2a-en': .778}) + +COMPARISONS['SemEval2017', 'HCCL'] = make_comparison_table( + { + 'semeval-2a-en': .687, + 'semeval-2a-de': .594, + 'semeval-2a-es': .701, + 'semeval-2a-it': .651, + 'semeval-2a-fa': .436, + 'semeval-2b-en-de': .307, + 'semeval-2b-en-es': .087, + 'semeval-2b-en-it': .055, + 'semeval-2b-en-fa': .012, + 'semeval-2b-de-es': .045, + 'semeval-2b-de-it': .037, + 'semeval-2b-de-fa': .023, + 'semeval-2b-es-it': .064, + 'semeval-2b-es-fa': .048, + 'semeval-2b-it-fa': .000, + } +) + +COMPARISONS['SemEval2017', 'Mahtab'] = make_comparison_table({'semeval-2a-fa': .715}) + +COMPARISONS['SemEval2017', 'hhu'] = make_comparison_table( + {'semeval-2a-en': .704, 'semeval-2a-fa': .604, 'semeval-2b-en-fa': .513} +) + +COMPARISONS['SemEval2017', 'OoO'] = make_comparison_table( + { + 'semeval-2b-en-de': .570, + 'semeval-2b-en-es': .584, + 'semeval-2b-en-it': .584, + 'semeval-2b-de-es': .549, + 'semeval-2b-de-it': .548, + 'semeval-2b-es-it': .570, + } +) + 
+COMPARISONS['SemEval2017', 'SEW'] = make_comparison_table( + { + 'semeval-2a-en': .464, + 'semeval-2a-de': .449, + 'semeval-2a-es': .616, + 'semeval-2a-it': .569, + 'semeval-2a-fa': .393, + 'semeval-2b-en-de': .464, + 'semeval-2b-en-es': .505, + 'semeval-2b-en-it': .526, + 'semeval-2b-en-fa': .420, + 'semeval-2b-de-es': .530, + 'semeval-2b-de-it': .520, + 'semeval-2b-de-fa': .428, + 'semeval-2b-es-it': .595, + 'semeval-2b-es-fa': .515, + 'semeval-2b-it-fa': .489, + } +) + +COMPARISONS['SemEval2017', 'RUFINO'] = make_comparison_table( + { + 'semeval-2a-en': .656, + 'semeval-2a-de': .539, + 'semeval-2a-es': .549, + 'semeval-2a-it': .476, + 'semeval-2a-fa': .360, + 'semeval-2b-en-de': .330, + 'semeval-2b-en-es': .340, + 'semeval-2b-en-it': .342, + 'semeval-2b-en-fa': .373, + 'semeval-2b-de-es': .318, + 'semeval-2b-de-it': .327, + 'semeval-2b-de-fa': .267, + 'semeval-2b-es-it': .356, + 'semeval-2b-es-fa': .300, + 'semeval-2b-it-fa': .249, + } +) + +COMPARISONS['SemEval2017', 'Citius'] = make_comparison_table( + {'semeval-2a-en': .651, 'semeval-2a-es': .523, 'semeval-2b-en-es': .577} +) + +COMPARISONS['SemEval2017', 'l2f'] = make_comparison_table({'semeval-2a-en': .649}) + +COMPARISONS['SemEval2017', 'gpv8'] = make_comparison_table( + {'semeval-2a-en': .555, 'semeval-2a-de': .347, 'semeval-2a-it': .499} +) + +COMPARISONS['SemEval2017', 'MERALI'] = make_comparison_table({'semeval-2a-en': .594}) + +COMPARISONS['SemEval2017', 'Amateur'] = make_comparison_table({'semeval-2a-en': .589}) + +COMPARISONS['SemEval2017', 'Wild Devs'] = make_comparison_table({'semeval-2a-en': .468}) # Hypothetical SemEval runs of existing systems -COMPARISONS['SemEval2017', 'fastText'] = make_comparison_table({ - 'semeval-2a-en': .468, - 'semeval-2a-de': .507, - 'semeval-2a-es': .417, - 'semeval-2a-it': .344, - 'semeval-2a-fa': .334, -}) +COMPARISONS['SemEval2017', 'fastText'] = make_comparison_table( + { + 'semeval-2a-en': .468, + 'semeval-2a-de': .507, + 'semeval-2a-es': .417, + 'semeval-2a-it': .344, + 'semeval-2a-fa': .334, + } +) # Hypothetical SemEval runs of existing systems -COMPARISONS['SemEval2017', 'Luminoso, no OOV'] = make_comparison_table({ - 'semeval-2a-en': .747, - 'semeval-2a-de': .599, - 'semeval-2a-es': .611, - 'semeval-2a-it': .606, - 'semeval-2a-fa': .363, - - 'semeval-2b-en-de': .696, - 'semeval-2b-en-es': .675, - 'semeval-2b-en-it': .677, - 'semeval-2b-en-fa': .502, - 'semeval-2b-de-es': .620, - 'semeval-2b-de-it': .612, - 'semeval-2b-de-fa': .501, - 'semeval-2b-es-it': .613, - 'semeval-2b-es-fa': .482, - 'semeval-2b-it-fa': .474, -}) - -COMPARISONS['SemEval2017', 'word2vec'] = make_comparison_table({ - 'semeval-2a-en': .575, -}) +COMPARISONS['SemEval2017', 'Luminoso, no OOV'] = make_comparison_table( + { + 'semeval-2a-en': .747, + 'semeval-2a-de': .599, + 'semeval-2a-es': .611, + 'semeval-2a-it': .606, + 'semeval-2a-fa': .363, + 'semeval-2b-en-de': .696, + 'semeval-2b-en-es': .675, + 'semeval-2b-en-it': .677, + 'semeval-2b-en-fa': .502, + 'semeval-2b-de-es': .620, + 'semeval-2b-de-it': .612, + 'semeval-2b-de-fa': .501, + 'semeval-2b-es-it': .613, + 'semeval-2b-es-fa': .482, + 'semeval-2b-it-fa': .474, + } +) + +COMPARISONS['SemEval2017', 'word2vec'] = make_comparison_table({'semeval-2a-en': .575}) def read_ws353(): @@ -464,7 +435,9 @@ def read_men3000(subset='dev'): as more related compared to another randomly chosen pair. 
""" lang1, lang2 = 'en', 'en' - filename = get_support_data_filename('mensim/MEN_dataset_lemma_form.{}'.format(subset)) + filename = get_support_data_filename( + 'mensim/MEN_dataset_lemma_form.{}'.format(subset) + ) with open(filename) as file: for line in file: parts = line.rstrip().split() @@ -505,7 +478,9 @@ def read_jsim(): """ lang1, lang2 = 'ja', 'ja' for pos in ('noun', 'verb', 'adj', 'adv'): - filename = get_support_data_filename('jSIM/similarity_full/score_{}_new_full.csv'.format(pos)) + filename = get_support_data_filename( + 'jSIM/similarity_full/score_{}_new_full.csv'.format(pos) + ) with open(filename, encoding='utf-8') as file: for line in file: if line.startswith('word1'): @@ -541,7 +516,9 @@ def read_semeval_crosslingual(lang1, lang2, subset='test'): """ Parses Semeval2017-Task2 crosslingual word similarity (Subtask2) test collection. """ - filename = get_support_data_filename('semeval17-2/{}-{}.{}.txt'.format(lang1, lang2, subset)) + filename = get_support_data_filename( + 'semeval17-2/{}-{}.{}.txt'.format(lang1, lang2, subset) + ) with open(filename) as file: for line in file: @@ -558,25 +535,28 @@ def compute_semeval_score(pearson_score, spearman_score): intervals = ['acc', 'low', 'high'] scores = [] for interval in intervals: - if any(np.isnan(x) for x in [spearman_score[interval], pearson_score[interval]]): + if any( + np.isnan(x) for x in [spearman_score[interval], pearson_score[interval]] + ): scores.append(float('NaN')) elif any(x <= 0 for x in [spearman_score[interval], pearson_score[interval]]): scores.append(0) else: scores.append(hmean([spearman_score[interval], pearson_score[interval]])) - return pd.Series( - scores, - index=intervals - ) + return pd.Series(scores, index=intervals) def evaluate_semeval_monolingual(vectors, lang): """ Get a semeval score for a single monolingual test set. 
""" - spearman_score = measure_correlation(spearmanr, vectors, read_semeval_monolingual(lang)) - pearson_score = measure_correlation(pearsonr, vectors, read_semeval_monolingual(lang)) + spearman_score = measure_correlation( + spearmanr, vectors, read_semeval_monolingual(lang) + ) + pearson_score = measure_correlation( + pearsonr, vectors, read_semeval_monolingual(lang) + ) score = compute_semeval_score(spearman_score, pearson_score) return score @@ -585,8 +565,12 @@ def evaluate_semeval_crosslingual(vectors, lang1, lang2): """ Get a semeval score for a single crosslingual test set """ - spearman_score = measure_correlation(spearmanr, vectors, read_semeval_crosslingual(lang1, lang2)) - pearson_score = measure_correlation(pearsonr, vectors, read_semeval_crosslingual(lang1, lang2)) + spearman_score = measure_correlation( + spearmanr, vectors, read_semeval_crosslingual(lang1, lang2) + ) + pearson_score = measure_correlation( + pearsonr, vectors, read_semeval_crosslingual(lang1, lang2) + ) score = compute_semeval_score(spearman_score, pearson_score) return score @@ -602,13 +586,14 @@ def evaluate_semeval_monolingual_global(vectors): score = evaluate_semeval_monolingual(vectors, lang) scores.append(score) - top_scores = sorted(scores, key=lambda x: x['acc'] if not np.isnan(x['acc']) else 0)[-4:] + top_scores = sorted( + scores, key=lambda x: x['acc'] if not np.isnan(x['acc']) else 0 + )[-4:] acc_average = tmean([score['acc'] for score in top_scores]) low_average = tmean([score['low'] for score in top_scores]) high_average = tmean([score['high'] for score in top_scores]) return pd.Series( - [acc_average, low_average, high_average], - index=['acc', 'low', 'high'] + [acc_average, low_average, high_average], index=['acc', 'low', 'high'] ) @@ -624,13 +609,14 @@ def evaluate_semeval_crosslingual_global(vectors): score = evaluate_semeval_crosslingual(vectors, lang1, lang2) scores.append(score) - top_scores = sorted(scores, key=lambda x: x['acc'] if not np.isnan(x['acc']) else 0)[-6:] + top_scores = sorted( + scores, key=lambda x: x['acc'] if not np.isnan(x['acc']) else 0 + )[-6:] acc_average = tmean([score['acc'] for score in top_scores]) low_average = tmean([score['low'] for score in top_scores]) high_average = tmean([score['high'] for score in top_scores]) return pd.Series( - [acc_average, low_average, high_average], - index=['acc', 'low', 'high'] + [acc_average, low_average, high_average], index=['acc', 'low', 'high'] ) @@ -651,8 +637,9 @@ def measure_correlation(correlation_function, vectors, standard, verbose=0): our_score = vectors.get_similarity(uri1, uri2) else: - our_score = cosine_similarity(get_vector(vectors, term1, lang1), - get_vector(vectors, term2, lang2)) + our_score = cosine_similarity( + get_vector(vectors, term1, lang1), get_vector(vectors, term2, lang2) + ) if verbose > 1: print('%s\t%s\t%3.3f\t%3.3f' % (term1, term2, gold_score, our_score)) @@ -715,11 +702,14 @@ def evaluate(frame, subset='dev', semeval_scope='global'): languages = ['en', 'de', 'es', 'it', 'fa'] for lang in languages: - results.loc['semeval-2a-{}'.format(lang)] = evaluate_semeval_monolingual(vectors, lang) + results.loc['semeval-2a-{}'.format(lang)] = evaluate_semeval_monolingual( + vectors, lang + ) for lang1, lang2 in combinations(languages, 2): - results.loc['semeval-2b-{}-{}'.format(lang1, lang2)] = evaluate_semeval_crosslingual( - vectors, lang1, lang2) + results.loc[ + 'semeval-2b-{}-{}'.format(lang1, lang2) + ] = evaluate_semeval_crosslingual(vectors, lang1, lang2) return results @@ -763,19 +753,24 @@ def 
evaluate_raw(frame, subset='dev', semeval_scope='global'): languages = ['en', 'de', 'es', 'it', 'fa'] for lang in languages: - results.loc['semeval-2a-{}'.format(lang)] = evaluate_semeval_monolingual(frame, lang) + results.loc['semeval-2a-{}'.format(lang)] = evaluate_semeval_monolingual( + frame, lang + ) for lang1, lang2 in combinations(languages, 2): - results.loc['semeval-2b-{}-{}'.format(lang1, lang2)] = evaluate_semeval_crosslingual( - frame, lang1, lang2) + results.loc[ + 'semeval-2b-{}-{}'.format(lang1, lang2) + ] = evaluate_semeval_crosslingual(frame, lang1, lang2) return results def comparison_table(): comparisons = dict(COMPARISONS) comparison_list = sorted(comparisons) - big_frame = pd.concat([comparisons[key] for key in comparison_list], - keys=pd.MultiIndex.from_tuples(comparison_list)) + big_frame = pd.concat( + [comparisons[key] for key in comparison_list], + keys=pd.MultiIndex.from_tuples(comparison_list), + ) return big_frame.dropna() @@ -784,7 +779,9 @@ def results_in_context(results, name=('Luminoso', 'Numberbatch 17.02')): comparisons = dict(COMPARISONS) comparisons[name] = results comparison_list = sorted(comparisons) - big_frame = pd.concat([comparisons[key] for key in comparison_list], - keys=pd.MultiIndex.from_tuples(comparison_list)) + big_frame = pd.concat( + [comparisons[key] for key in comparison_list], + keys=pd.MultiIndex.from_tuples(comparison_list), + ) return big_frame.dropna() diff --git a/conceptnet5/vectors/formats.py b/conceptnet5/vectors/formats.py index 9641431f..4ac03090 100644 --- a/conceptnet5/vectors/formats.py +++ b/conceptnet5/vectors/formats.py @@ -1,9 +1,12 @@ -import pandas as pd -import numpy as np import gzip -import struct import pickle +import struct + +import numpy as np +import pandas as pd + from ordered_set import OrderedSet + from .transforms import l1_normalize_columns, l2_normalize_rows, standardize_row_labels @@ -144,7 +147,7 @@ def load_glove(filename, max_rows=1000000): arr[i] = values if len(label_list) < max_rows: - arr = arr[:len(label_list)] + arr = arr[: len(label_list)] return pd.DataFrame(arr, index=label_list, dtype='f') @@ -171,7 +174,7 @@ def load_fasttext(filename, max_rows=1000000): label_list.append(label) if len(label_list) < max_rows: - arr = arr[:len(label_list)] + arr = arr[: len(label_list)] return pd.DataFrame(arr, index=label_list, dtype='f') diff --git a/conceptnet5/vectors/merge.py b/conceptnet5/vectors/merge.py index 23f8a17a..a21b1480 100644 --- a/conceptnet5/vectors/merge.py +++ b/conceptnet5/vectors/merge.py @@ -1,9 +1,10 @@ -import pandas as pd import numpy as np +import pandas as pd from sklearn.preprocessing import normalize -from conceptnet5.uri import get_uri_language from conceptnet5.languages import CORE_LANGUAGES +from conceptnet5.uri import get_uri_language + from .formats import load_hdf @@ -26,8 +27,8 @@ def dataframe_svd_projection(frame, k): def concat_intersect(frame_filenames): """ - Find the intersection of the labels of all the frames in the given - files , and concatenate the vectors that the frames have for each of + Find the intersection of the labels of all the frames in the given + files , and concatenate the vectors that the frames have for each of those labels. This is exactly what `pd.concat` is for. 
However, `pd.concat` uses too @@ -67,7 +68,7 @@ def concat_intersect(frame_filenames): frame = load_hdf(frame_filename) width = frame.shape[1] for i, label in enumerate(label_intersection): - joindata[i, offset:(offset + width)] = frame.loc[label].values + joindata[i, offset : (offset + width)] = frame.loc[label].values del frame # Convert the array to a DataFrame with the appropriate labels, and @@ -95,11 +96,15 @@ def merge_intersect(frame_filenames, subsample=20, k=300): # are single words in our CORE_LANGUAGES. Even those are too numerous, # so we take an arbitrary 1/n sample of them, where n is given by the # `subsample` parameter. - filtered_labels = pd.Series([ - label for (i, label) in enumerate(joined.index) - if i % subsample == 0 and '_' not in label - and get_uri_language(label) in CORE_LANGUAGES - ]) + filtered_labels = pd.Series( + [ + label + for (i, label) in enumerate(joined.index) + if i % subsample == 0 + and '_' not in label + and get_uri_language(label) in CORE_LANGUAGES + ] + ) # Mean-center and L_2-normalize the data, to prevent artifacts # in dimensionality reduction. diff --git a/conceptnet5/vectors/miniaturize.py b/conceptnet5/vectors/miniaturize.py index 4a0cb683..361c21d1 100644 --- a/conceptnet5/vectors/miniaturize.py +++ b/conceptnet5/vectors/miniaturize.py @@ -1,9 +1,10 @@ -import wordfreq import numpy as np import pandas as pd -from conceptnet5.uri import split_uri +import wordfreq from conceptnet5.languages import CORE_LANGUAGES +from conceptnet5.uri import split_uri + from .debias import de_bias_frame @@ -46,12 +47,16 @@ def miniaturize(frame, other_vocab=None, k=300, debias=True): # # Non-English languages use terms with frequency 1e-6 or greater, because # only that much of the list has been loaded. - vocab1 = [term for term in frame.index if '_' not in term - and term_freq(term) >= 1e-8] + vocab1 = [ + term for term in frame.index if '_' not in term and term_freq(term) >= 1e-8 + ] vocab_set = set(vocab1) if other_vocab is not None: - extra_vocab = [term for term in other_vocab if '_' in term and - term in frame.index and term not in vocab_set] + extra_vocab = [ + term + for term in other_vocab + if '_' in term and term in frame.index and term not in vocab_set + ] extra_vocab = extra_vocab[:20000] else: extra_vocab = [] diff --git a/conceptnet5/vectors/ppmi.py b/conceptnet5/vectors/ppmi.py index f3ddbcd3..0b9fc5e8 100644 --- a/conceptnet5/vectors/ppmi.py +++ b/conceptnet5/vectors/ppmi.py @@ -2,6 +2,7 @@ import pandas as pd from scipy import sparse from scipy.sparse import linalg + from .sparse_matrix_builder import build_from_conceptnet_table diff --git a/conceptnet5/vectors/propagate.py b/conceptnet5/vectors/propagate.py index f5da1a1e..9fbe7851 100644 --- a/conceptnet5/vectors/propagate.py +++ b/conceptnet5/vectors/propagate.py @@ -1,42 +1,46 @@ """ -Implements 'propagation', whereby terms from the full ConceptNet graph are -assigned vectors from the embeddings produced by retrofitting against the +Implements 'propagation', whereby terms from the full ConceptNet graph are +assigned vectors from the embeddings produced by retrofitting against the reduced graph. 
""" import numpy as np import pandas as pd from scipy.sparse import diags + from conceptnet5.builders.reduce_assoc import ConceptNetAssociationGraph from conceptnet5.uri import get_uri_language -from .sparse_matrix_builder import SparseMatrixBuilder + from .formats import load_hdf, save_hdf +from .sparse_matrix_builder import SparseMatrixBuilder class ConceptNetAssociationGraphForPropagation(ConceptNetAssociationGraph): """ - Subclass of ConceptNetAssociationGraph specialized for use in making + Subclass of ConceptNetAssociationGraph specialized for use in making the full graph of a set of associations as required for propagation. """ + def __init__(self): super().__init__() self.edges = set() def add_edge(self, left, right, value, dataset, relation): """ - In addition to the superclass's handling of a new edge, + In addition to the superclass's handling of a new edge, saves the edges as a set of (left, right) pairs. """ super().add_edge(left, right, value, dataset, relation) self.edges.add((left, right)) - self.edges.add((right, left)) # save undirected edges + self.edges.add((right, left)) # save undirected edges -def sharded_propagate(assoc_filename, embedding_filename, - output_filename, nshards=6, iterations=20): +def sharded_propagate( + assoc_filename, embedding_filename, output_filename, nshards=6, iterations=20 +): """ - A wrapper around propagate which reduces memory requirements by - splitting the embedding into shards (along the dimensions of the + A wrapper around propagate which reduces memory requirements by + splitting the embedding into shards (along the dimensions of the embedding feature space). """ # frame_box is basically a reference to a single large DataFrame. The @@ -44,8 +48,9 @@ def sharded_propagate(assoc_filename, embedding_filename, # contains one item, which is the DataFrame. When it's absent, the list # is empty. frame_box = [load_hdf(embedding_filename)] - adjacency_matrix, combined_index, n_new_english = \ - make_adjacency_matrix(assoc_filename, frame_box[0].index) + adjacency_matrix, combined_index, n_new_english = make_adjacency_matrix( + assoc_filename, frame_box[0].index + ) shard_width = frame_box[0].shape[1] // nshards for i in range(nshards): @@ -54,34 +59,37 @@ def sharded_propagate(assoc_filename, embedding_filename, shard_to = shard_from + shard_width if len(frame_box) == 0: frame_box.append(load_hdf(embedding_filename)) - embedding_shard = pd.DataFrame( - frame_box[0].iloc[:, shard_from:shard_to]) + embedding_shard = pd.DataFrame(frame_box[0].iloc[:, shard_from:shard_to]) # Delete full_dense_frame while running retrofitting, because it takes # up a lot of memory and we can reload it from disk later. frame_box.clear() - propagated = propagate(combined_index, embedding_shard, - adjacency_matrix, n_new_english, - iterations=iterations) + propagated = propagate( + combined_index, + embedding_shard, + adjacency_matrix, + n_new_english, + iterations=iterations, + ) save_hdf(propagated, temp_filename) del propagated def make_adjacency_matrix(assoc_filename, embedding_vocab): """ - Build a sparse adjacency matrix for the ConceptNet graph presented - in the given assoc file, including all terms from the given embedding - vocabulary and removing all terms from connected components of the graph - that do not overlap that vocabulary. - - Also builds an index giving all terms from the resulting joined - graph+embedding vocabulary in the order corresponding to the rows and - columns of the matrix. 
Note that it is guaranteed that the terms from - the embedding vocabulary will preceed the remaining terms in that index, - and that among the remaining terms the terms in English will follow all + Build a sparse adjacency matrix for the ConceptNet graph presented + in the given assoc file, including all terms from the given embedding + vocabulary and removing all terms from connected components of the graph + that do not overlap that vocabulary. + + Also builds an index giving all terms from the resulting joined + graph+embedding vocabulary in the order corresponding to the rows and + columns of the matrix. Note that it is guaranteed that the terms from + the embedding vocabulary will preceed the remaining terms in that index, + and that among the remaining terms the terms in English will follow all the others. - + Returns the matrix and index, and the number of new English terms. """ # First eliminate all connected components of the graph that don't @@ -89,19 +97,22 @@ def make_adjacency_matrix(assoc_filename, embedding_vocab): # those terms. graph = ConceptNetAssociationGraphForPropagation.from_csv( - assoc_filename, reject_negative_relations=False) + assoc_filename, reject_negative_relations=False + ) component_labels = graph.find_components() # Get the labels of components that overlap the embedding vocabulary. - good_component_labels = set(label for term, label - in component_labels.items() - if term in embedding_vocab) + good_component_labels = set( + label for term, label in component_labels.items() if term in embedding_vocab + ) # Now get the concepts in those components. - good_concepts = set(term for term, label - in component_labels.items() - if label in good_component_labels) - + good_concepts = set( + term + for term, label in component_labels.items() + if label in good_component_labels + ) + del component_labels, good_component_labels # Put terms from the embedding first, then terms from the good part @@ -113,49 +124,51 @@ def make_adjacency_matrix(assoc_filename, embedding_vocab): # than list comprehensions.) new_vocab = good_concepts - set(embedding_vocab) good_concepts = embedding_vocab.append( - pd.Index([term for term in new_vocab - if get_uri_language(term) != 'en'])) + pd.Index([term for term in new_vocab if get_uri_language(term) != 'en']) + ) n_good_concepts_not_new_en = len(good_concepts) good_concepts = good_concepts.append( - pd.Index([term for term in new_vocab - if get_uri_language(term) == 'en'])) + pd.Index([term for term in new_vocab if get_uri_language(term) == 'en']) + ) del new_vocab n_new_english = len(good_concepts) - n_good_concepts_not_new_en - - good_concepts_map = {term : i for i, term in enumerate(good_concepts)} - + + good_concepts_map = {term: i for i, term in enumerate(good_concepts)} + # Convert the good part of the graph to an adjacency matrix representation. - # Note: the edges added differ slightly from the way it is done in (e.g.) + # Note: the edges added differ slightly from the way it is done in (e.g.) # build_from_conceptnet_table (in sparse_matrix_builder.py), in that we # do not add edges linking specific senses of terms to their more general # forms (as defined by uri_prefixes). Currently no such specific senses # show up in the input to retrofitting (i.e. the output of # build_from_conceptnet_table), so it doesn't matter, but in the future # we may want to add such edges here as well. 
- + builder = SparseMatrixBuilder() - for v,w in graph.edges: + for v, w in graph.edges: try: index0 = good_concepts_map[v] index1 = good_concepts_map[w] builder[index0, index1] = 1 except KeyError: - pass # one of v, w wasn't good + pass # one of v, w wasn't good del graph - + adjacency_matrix = builder.tocsr( - shape=(len(good_concepts), len(good_concepts)), dtype=np.int8) + shape=(len(good_concepts), len(good_concepts)), dtype=np.int8 + ) return adjacency_matrix, good_concepts, n_new_english -def propagate(combined_index, embedding, adjacency_matrix, n_new_english, - iterations=20): +def propagate( + combined_index, embedding, adjacency_matrix, n_new_english, iterations=20 +): """ - For as many non-English terms as possible in the ConceptNet graph whose - edges are presented in the given adjacency matrix (with corresponding term - labels in the given index), find a vector in the target space of the vector + For as many non-English terms as possible in the ConceptNet graph whose + edges are presented in the given adjacency matrix (with corresponding term + labels in the given index), find a vector in the target space of the vector embedding presented in the given embedding file. """ @@ -164,28 +177,35 @@ def propagate(combined_index, embedding, adjacency_matrix, n_new_english, embedding_dimension = embedding.values.shape[1] new_vocab_size = len(combined_index) - embedding.values.shape[0] - vectors = np.vstack([embedding.values, - np.zeros((new_vocab_size, embedding_dimension), - dtype=embedding.values.dtype)]) - + vectors = np.vstack( + [ + embedding.values, + np.zeros( + (new_vocab_size, embedding_dimension), dtype=embedding.values.dtype + ), + ] + ) + for iteration in range(iterations): - zero_indicators = (np.abs(vectors).sum(1) == 0) + zero_indicators = np.abs(vectors).sum(1) == 0 if not np.any(zero_indicators): break # Find terms with zero vectors having neighbors with nonzero vectors. 
nonzero_indicators = np.logical_not(zero_indicators) - fringe = (adjacency_matrix.dot(nonzero_indicators.astype(np.int8)) != 0) + fringe = adjacency_matrix.dot(nonzero_indicators.astype(np.int8)) != 0 fringe = np.logical_and(fringe, zero_indicators) # Update each as the average of its nonzero neighbors adjacent_nonzeros = adjacency_matrix[fringe, :].dot( - diags([nonzero_indicators.astype(np.int8)], [0], format='csc')) + diags([nonzero_indicators.astype(np.int8)], [0], format='csc') + ) n_adjacent_nonzeros = adjacent_nonzeros.sum(axis=1).A[:, 0] weights = 1.0 / n_adjacent_nonzeros vectors[fringe, :] = adjacency_matrix[fringe, :].dot(vectors) - vectors[fringe, :] = diags([weights], [0], format='csr').dot( - vectors[fringe, :]) + vectors[fringe, :] = diags([weights], [0], format='csr').dot(vectors[fringe, :]) n_old_plus_new_non_en = len(combined_index) - n_new_english - result = pd.DataFrame(index=combined_index[0:n_old_plus_new_non_en], - data=vectors[0:n_old_plus_new_non_en, :]) + result = pd.DataFrame( + index=combined_index[0:n_old_plus_new_non_en], + data=vectors[0:n_old_plus_new_non_en, :], + ) return result diff --git a/conceptnet5/vectors/query.py b/conceptnet5/vectors/query.py index 5f385b96..7b9a42c0 100644 --- a/conceptnet5/vectors/query.py +++ b/conceptnet5/vectors/query.py @@ -1,18 +1,17 @@ import marisa_trie - import numpy as np import pandas as pd -import wordfreq +import wordfreq from conceptnet5.db.query import AssertionFinder from conceptnet5.uri import get_uri_language, split_uri, uri_prefix from conceptnet5.util import get_data_filename from conceptnet5.vectors import ( - similar_to_vec, - weighted_average, - normalize_vec, cosine_similarity, + normalize_vec, + similar_to_vec, standardized_uri, + weighted_average, ) from conceptnet5.vectors.formats import load_hdf from conceptnet5.vectors.transforms import l2_normalize_rows diff --git a/conceptnet5/vectors/retrofit.py b/conceptnet5/vectors/retrofit.py index 9ded2fcc..dfd79a0c 100644 --- a/conceptnet5/vectors/retrofit.py +++ b/conceptnet5/vectors/retrofit.py @@ -1,19 +1,29 @@ -import pandas as pd import numpy as np +import pandas as pd from sklearn.preprocessing import normalize -from .sparse_matrix_builder import build_from_conceptnet_table + from .formats import load_hdf, save_hdf +from .sparse_matrix_builder import build_from_conceptnet_table -def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename, - iterations=5, nshards=6, verbosity=0, - max_cleanup_iters=20, orig_vec_weight=0.15): +def sharded_retrofit( + dense_hdf_filename, + conceptnet_filename, + output_filename, + iterations=5, + nshards=6, + verbosity=0, + max_cleanup_iters=20, + orig_vec_weight=0.15, +): # frame_box is basically a reference to a single large DataFrame. The # DataFrame will at times be present or absent. When it's present, the list # contains one item, which is the DataFrame. When it's absent, the list # is empty. frame_box = [load_hdf(dense_hdf_filename)] - sparse_csr, combined_index = build_from_conceptnet_table(conceptnet_filename, orig_index=frame_box[0].index) + sparse_csr, combined_index = build_from_conceptnet_table( + conceptnet_filename, orig_index=frame_box[0].index + ) shard_width = frame_box[0].shape[1] // nshards for i in range(nshards): @@ -28,7 +38,15 @@ def sharded_retrofit(dense_hdf_filename, conceptnet_filename, output_filename, # up a lot of memory and we can reload it from disk later. 
frame_box.clear() - retrofitted = retrofit(combined_index, dense_frame, sparse_csr, iterations, verbosity, max_cleanup_iters, orig_vec_weight) + retrofitted = retrofit( + combined_index, + dense_frame, + sparse_csr, + iterations, + verbosity, + max_cleanup_iters, + orig_vec_weight, + ) save_hdf(retrofitted, temp_filename) del retrofitted @@ -42,7 +60,7 @@ def join_shards(output_filename, nshards=6, sort=False): if joined_matrix is None: joined_matrix = np.zeros((nrows, ncols * nshards), dtype='f') joined_labels = shard.index - joined_matrix[:, (ncols * i):(ncols * (i + 1))] = shard.values + joined_matrix[:, (ncols * i) : (ncols * (i + 1))] = shard.values del shard normalize(joined_matrix, axis=1, norm='l2', copy=False) @@ -52,9 +70,15 @@ def join_shards(output_filename, nshards=6, sort=False): save_hdf(dframe, output_filename) -def retrofit(row_labels, dense_frame, sparse_csr, - iterations=5, verbosity=0, max_cleanup_iters=20, - orig_vec_weight=0.15): +def retrofit( + row_labels, + dense_frame, + sparse_csr, + iterations=5, + verbosity=0, + max_cleanup_iters=20, + orig_vec_weight=0.15, +): """ Retrofitting is a process of combining information from a machine-learned space of term vectors with further structured information about those @@ -82,14 +106,12 @@ def retrofit(row_labels, dense_frame, sparse_csr, appropriately. """ # Initialize a DataFrame with rows that we know - retroframe = pd.DataFrame( - index=row_labels, columns=dense_frame.columns, dtype='f' - ) + retroframe = pd.DataFrame(index=row_labels, columns=dense_frame.columns, dtype='f') retroframe.update(dense_frame) # orig_weights = 1 for known vectors, 0 for unknown vectors orig_weights = 1 - retroframe.iloc[:, 0].isnull() - orig_vec_indicators = (orig_weights.values != 0) + orig_vec_indicators = orig_weights.values != 0 orig_vecs = retroframe.fillna(0).values # Subtract the mean so that vectors don't just clump around common @@ -102,7 +124,7 @@ def retrofit(row_labels, dense_frame, sparse_csr, vecs = orig_vecs for iteration in range(iterations): if verbosity >= 1: - print('Retrofitting: Iteration %s of %s' % (iteration+1, iterations)) + print('Retrofitting: Iteration %s of %s' % (iteration + 1, iterations)) # Since the sparse weight matrix is row-stochastic and has self-loops, # pre-multiplication by it replaces each vector by a weighted average @@ -113,7 +135,7 @@ def retrofit(row_labels, dense_frame, sparse_csr, # terms with lots of zero neighbors. # Find, for every term, the total weight of its nonzero neighbors. - nonzero_indicators = (np.abs(vecs).sum(1) != 0) + nonzero_indicators = np.abs(vecs).sum(1) != 0 total_neighbor_weights = sparse_csr.dot(nonzero_indicators) # Now average with all the neighbors. @@ -126,17 +148,20 @@ def retrofit(row_labels, dense_frame, sparse_csr, # that are nonzero now, after averaging. Also, we reshape the total # weights into a column vector so that numpy will broadcast the # division by weights across the columns of the embedding matrix. - nonzero_indicators = (np.abs(vecs).sum(1) != 0) + nonzero_indicators = np.abs(vecs).sum(1) != 0 total_neighbor_weights = total_neighbor_weights[nonzero_indicators] - total_neighbor_weights = total_neighbor_weights.reshape((len(total_neighbor_weights), 1)) + total_neighbor_weights = total_neighbor_weights.reshape( + (len(total_neighbor_weights), 1) + ) vecs[nonzero_indicators] /= total_neighbor_weights # Re-center the (new) non-zero vectors. 
vecs[nonzero_indicators] -= vecs[nonzero_indicators].mean(0) # Average known rows with original vectors - vecs[orig_vec_indicators, :] = \ - (1.0 - orig_vec_weight) * vecs[orig_vec_indicators, :] + orig_vec_weight * orig_vecs[orig_vec_indicators, :] + vecs[orig_vec_indicators, :] = (1.0 - orig_vec_weight) * vecs[ + orig_vec_indicators, : + ] + orig_vec_weight * orig_vecs[orig_vec_indicators, :] # Clean up as many all-zero vectors as possible. Zero vectors # can either come from components of the conceptnet graph that @@ -152,7 +177,7 @@ def retrofit(row_labels, dense_frame, sparse_csr, # this code. n_zero_indicators_old = -1 for iteration in range(max_cleanup_iters): - zero_indicators = (np.abs(vecs).sum(1) == 0) + zero_indicators = np.abs(vecs).sum(1) == 0 n_zero_indicators = np.sum(zero_indicators) if n_zero_indicators == 0 or n_zero_indicators == n_zero_indicators_old: break @@ -162,9 +187,15 @@ def retrofit(row_labels, dense_frame, sparse_csr, vecs[zero_indicators, :] = sparse_csr[zero_indicators, :].dot(vecs) # Now divide each newly nonzero vector (row) by the total weight of its # old nonzero neighbors. - new_nonzero_indicators = np.logical_and(zero_indicators, np.abs(vecs).sum(1) != 0) - total_neighbor_weights = sparse_csr[new_nonzero_indicators, :].dot(np.logical_not(zero_indicators)) - total_neighbor_weights = total_neighbor_weights.reshape((len(total_neighbor_weights), 1)) + new_nonzero_indicators = np.logical_and( + zero_indicators, np.abs(vecs).sum(1) != 0 + ) + total_neighbor_weights = sparse_csr[new_nonzero_indicators, :].dot( + np.logical_not(zero_indicators) + ) + total_neighbor_weights = total_neighbor_weights.reshape( + (len(total_neighbor_weights), 1) + ) vecs[new_nonzero_indicators, :] /= total_neighbor_weights else: print('Warning: cleanup iteration limit exceeded.') diff --git a/conceptnet5/vectors/sparse_matrix_builder.py b/conceptnet5/vectors/sparse_matrix_builder.py index d2bf1559..9ee5a4bf 100644 --- a/conceptnet5/vectors/sparse_matrix_builder.py +++ b/conceptnet5/vectors/sparse_matrix_builder.py @@ -1,12 +1,14 @@ -import pandas as pd from collections import defaultdict -from ordered_set import OrderedSet + +import pandas as pd from scipy import sparse from sklearn.preprocessing import normalize from conceptnet5.languages import CORE_LANGUAGES from conceptnet5.relations import SYMMETRIC_RELATIONS from conceptnet5.uri import get_uri_language, uri_prefix, uri_prefixes +from ordered_set import OrderedSet + from ..vectors import replace_numbers @@ -15,6 +17,7 @@ class SparseMatrixBuilder: SparseMatrixBuilder is a utility class that helps build a matrix of unknown shape. 
""" + def __init__(self): self.row_index = [] self.col_index = [] @@ -30,8 +33,9 @@ def add(self, row, col, val): self.values.append(val) def tocsr(self, shape, dtype=float): - return sparse.coo_matrix((self.values, (self.row_index, self.col_index)), - shape=shape, dtype=dtype).tocsr() + return sparse.coo_matrix( + (self.values, (self.row_index, self.col_index)), shape=shape, dtype=dtype + ).tocsr() def build_from_conceptnet_table(filename, orig_index=(), self_loops=True): diff --git a/conceptnet5/vectors/transforms.py b/conceptnet5/vectors/transforms.py index c64ba70f..b2c3008b 100644 --- a/conceptnet5/vectors/transforms.py +++ b/conceptnet5/vectors/transforms.py @@ -18,11 +18,15 @@ def standardize_row_labels(frame, language='en', forms=True): # Check for en/term format we use to train fastText on OpenSubtitles data if all(label.count('/') == 1 for label in frame.index[0:5]): tuples = [label.partition('/') for label in frame.index] - frame.index = [uri_prefix(standardized_uri(language, text)) - for language, _slash, text in tuples] + frame.index = [ + uri_prefix(standardized_uri(language, text)) + for language, _slash, text in tuples + ] # Re-label the DataFrame with standardized, non-unique row labels - frame.index = [uri_prefix(standardized_uri(language, label)) for label in frame.index] + frame.index = [ + uri_prefix(standardized_uri(language, label)) for label in frame.index + ] # Assign row n a weight of 1/(n+1) for weighted averaging nrows = frame.shape[0] @@ -57,7 +61,9 @@ def l1_normalize_columns(frame): post-processing GloVe output. """ index = frame.index - return pd.DataFrame(data=normalize(frame, norm='l1', copy=False, axis=0), index=index) + return pd.DataFrame( + data=normalize(frame, norm='l1', copy=False, axis=0), index=index + ) def l2_normalize_rows(frame): @@ -72,7 +78,9 @@ def l2_normalize_rows(frame): if frame.shape[0] == 0: return frame index = frame.index - return pd.DataFrame(data=normalize(frame, norm='l2', copy=False, axis=1), index=index) + return pd.DataFrame( + data=normalize(frame, norm='l2', copy=False, axis=1), index=index + ) def subtract_mean_vector(frame): @@ -119,4 +127,3 @@ def make_small_frame(big_frame, concepts): """ small_vocab = choose_small_vocabulary(big_frame.index, concepts) return big_frame.ix[small_vocab] - diff --git a/setup.py b/setup.py index dd01a7bd..d8b2c60f 100644 --- a/setup.py +++ b/setup.py @@ -1,9 +1,10 @@ #!/usr/bin/env python -from setuptools import setup, find_packages, Command -from setuptools.command.install import install -from setuptools.command.develop import develop import sys +from setuptools import Command, find_packages, setup +from setuptools.command.develop import develop +from setuptools.command.install import install + packages = find_packages() version_str = '5.7.0' diff --git a/tests/full-build/test_all_relations_recorded.py b/tests/full-build/test_all_relations_recorded.py index 834d8f0c..def7c191 100644 --- a/tests/full-build/test_all_relations_recorded.py +++ b/tests/full-build/test_all_relations_recorded.py @@ -3,8 +3,9 @@ ConceptNet build are in fact recorded in the relations.py file. 
''' -from conceptnet5.util import get_data_filename from conceptnet5.relations import ALL_RELATIONS +from conceptnet5.util import get_data_filename + def collect_relations(path): ''' @@ -19,6 +20,7 @@ def collect_relations(path): relations.add(relation) return relations + def test_relations_recorded(): built_relations_file = get_data_filename('stats/relations.txt') built_relations = collect_relations(built_relations_file) diff --git a/tests/full-build/test_enough_data_exists.py b/tests/full-build/test_enough_data_exists.py index 64a78377..fba50b6d 100644 --- a/tests/full-build/test_enough_data_exists.py +++ b/tests/full-build/test_enough_data_exists.py @@ -3,9 +3,9 @@ we have data in all the appropriate languages, from all the appropriate sources. """ -from conceptnet5.util import get_data_filename -from conceptnet5.languages import CORE_LANGUAGES, COMMON_LANGUAGES, ALL_LANGUAGES from conceptnet5.db.query import AssertionFinder +from conceptnet5.languages import ALL_LANGUAGES, COMMON_LANGUAGES, CORE_LANGUAGES +from conceptnet5.util import get_data_filename test_finder = None @@ -34,12 +34,20 @@ def test_languages_exist(): def test_datasets_exist(): for dataset in [ - '/d/conceptnet/4/en', '/d/conceptnet/4/pt', '/d/conceptnet/4/ja', - '/d/conceptnet/4/zh', '/d/conceptnet/4/nl', - '/d/dbpedia', '/d/jmdict', '/d/opencyc', '/d/verbosity', '/d/wordnet', - '/d/wiktionary/en', '/d/wiktionary/fr', '/d/wiktionary/de' + '/d/conceptnet/4/en', + '/d/conceptnet/4/pt', + '/d/conceptnet/4/ja', + '/d/conceptnet/4/zh', + '/d/conceptnet/4/nl', + '/d/dbpedia', + '/d/jmdict', + '/d/opencyc', + '/d/verbosity', + '/d/wordnet', + '/d/wiktionary/en', + '/d/wiktionary/fr', + '/d/wiktionary/de', ]: # Test that each dataset has at least 100 assertions q = test_finder.query({'dataset': dataset}, limit=100) assert len(q) == 100, dataset - diff --git a/tests/small-build/test_lemmatizer.py b/tests/small-build/test_lemmatizer.py index 29419196..0c804bf4 100644 --- a/tests/small-build/test_lemmatizer.py +++ b/tests/small-build/test_lemmatizer.py @@ -1,6 +1,7 @@ -from conceptnet5.language.lemmatize import lemmatize from nose.tools import eq_ +from conceptnet5.language.lemmatize import lemmatize + def test_lemmatize(): eq_(lemmatize('en', 'eating'), ('eat', 'pres+ptcp')) diff --git a/tests/small-build/test_propagate.py b/tests/small-build/test_propagate.py index c87396d6..19941661 100644 --- a/tests/small-build/test_propagate.py +++ b/tests/small-build/test_propagate.py @@ -1,12 +1,15 @@ import io +from unittest.mock import Mock, patch + import numpy as np import pandas as pd - -from conceptnet5.uri import concept_uri, get_uri_language -from conceptnet5.vectors.propagate import sharded_propagate, make_adjacency_matrix, propagate from numpy.testing import assert_allclose from scipy import sparse -from unittest.mock import patch, Mock + +from conceptnet5.uri import concept_uri, get_uri_language +from conceptnet5.vectors.propagate import ( + make_adjacency_matrix, propagate, sharded_propagate +) # Constant parameters. 
N_TRIALS = 20 diff --git a/tests/small-build/test_queries.py b/tests/small-build/test_queries.py index 4d55cedc..4f22b8c4 100644 --- a/tests/small-build/test_queries.py +++ b/tests/small-build/test_queries.py @@ -1,4 +1,5 @@ from nose.tools import eq_ + from conceptnet5.db.query import AssertionFinder test_finder = None diff --git a/tests/small-build/test_vectors.py b/tests/small-build/test_vectors.py index ddd18a9a..26d8d9c6 100644 --- a/tests/small-build/test_vectors.py +++ b/tests/small-build/test_vectors.py @@ -2,13 +2,14 @@ import numpy as np import pandas as pd -from nose.tools import ok_, assert_almost_equal, with_setup +from nose.tools import assert_almost_equal, ok_, with_setup from conceptnet5.uri import is_term from conceptnet5.vectors import get_vector -from conceptnet5.vectors.transforms import standardize_row_labels, \ - l1_normalize_columns, \ - l2_normalize_rows, make_big_frame, make_small_frame, shrink_and_sort +from conceptnet5.vectors.transforms import ( + l1_normalize_columns, l2_normalize_rows, make_big_frame, make_small_frame, + shrink_and_sort, standardize_row_labels +) DATA = os.environ.get("CONCEPTNET_BUILD_DATA", "testdata") TEST_FRAME = None diff --git a/tests/test_convert.py b/tests/test_convert.py index 172a9328..473dd3ea 100644 --- a/tests/test_convert.py +++ b/tests/test_convert.py @@ -1,11 +1,14 @@ -from conceptnet5.formats.convert import msgpack_to_json, json_to_msgpack -from conceptnet5.formats.json_stream import JSONStreamWriter, read_json_stream -from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter, read_msgpack_stream +import os +from itertools import zip_longest +from tempfile import TemporaryDirectory from nose.tools import eq_ -from tempfile import TemporaryDirectory -from itertools import zip_longest -import os + +from conceptnet5.formats.convert import json_to_msgpack, msgpack_to_json +from conceptnet5.formats.json_stream import JSONStreamWriter, read_json_stream +from conceptnet5.formats.msgpack_stream import ( + MsgpackStreamWriter, read_msgpack_stream +) DATA = [ {'a': 1}, diff --git a/web/conceptnet_web/api.py b/web/conceptnet_web/api.py index 5d8993cd..33921056 100644 --- a/web/conceptnet_web/api.py +++ b/web/conceptnet_web/api.py @@ -4,14 +4,15 @@ import os import flask +from flask_cors import CORS +from flask_limiter import Limiter + from conceptnet5 import api as responses from conceptnet5.api import VALID_KEYS, error from conceptnet5.nodes import standardized_concept_uri from conceptnet_web.error_logging import try_configuring_sentry from conceptnet_web.filters import FILTERS from conceptnet_web.json_rendering import jsonify -from flask_cors import CORS -from flask_limiter import Limiter # Configuration diff --git a/web/conceptnet_web/error_logging.py b/web/conceptnet_web/error_logging.py index de7eecd1..a63f7d57 100644 --- a/web/conceptnet_web/error_logging.py +++ b/web/conceptnet_web/error_logging.py @@ -1,8 +1,10 @@ -from conceptnet5.util import get_data_filename -import os import logging +import os + from raven.contrib.flask import Sentry +from conceptnet5.util import get_data_filename + def try_configuring_sentry(app): dsn_path = get_data_filename('deploy/sentry-dsn.txt') diff --git a/web/conceptnet_web/filters.py b/web/conceptnet_web/filters.py index e7d87c83..4bc2ae92 100644 --- a/web/conceptnet_web/filters.py +++ b/web/conceptnet_web/filters.py @@ -1,6 +1,8 @@ from jinja2.ext import Markup + from conceptnet5.languages import get_language_name from conceptnet5.uri import split_uri, uri_prefix + from .json_rendering 
import highlight_and_link_json diff --git a/web/conceptnet_web/json_rendering.py b/web/conceptnet_web/json_rendering.py index 1214ce23..7ef13436 100644 --- a/web/conceptnet_web/json_rendering.py +++ b/web/conceptnet_web/json_rendering.py @@ -1,10 +1,11 @@ +import json +import re + +import flask +from jinja2.ext import Markup +from pygments import highlight from pygments.formatters import HtmlFormatter from pygments.lexers import get_lexer_by_name -from pygments import highlight -from jinja2.ext import Markup -import flask -import re -import json def request_wants_json(): diff --git a/web/conceptnet_web/web.py b/web/conceptnet_web/web.py index 2dd5bd3f..40d08446 100644 --- a/web/conceptnet_web/web.py +++ b/web/conceptnet_web/web.py @@ -1,18 +1,18 @@ """ This file sets up Flask to serve the ConceptNet 5 API in JSON-LD format. """ -from conceptnet5 import api as responses -from conceptnet_web.filters import FILTERS -from conceptnet_web.relations import REL_HEADINGS -from conceptnet_web.error_logging import try_configuring_sentry -from conceptnet5.uri import split_uri -from conceptnet5.nodes import standardized_concept_uri -from conceptnet5.languages import COMMON_LANGUAGES, LANGUAGE_NAMES +import os import flask from flask_limiter import Limiter -import os +from conceptnet5 import api as responses +from conceptnet5.languages import COMMON_LANGUAGES, LANGUAGE_NAMES +from conceptnet5.nodes import standardized_concept_uri +from conceptnet5.uri import split_uri +from conceptnet_web.error_logging import try_configuring_sentry +from conceptnet_web.filters import FILTERS +from conceptnet_web.relations import REL_HEADINGS # Configuration app = flask.Flask('conceptnet_web') @@ -205,4 +205,3 @@ def render_error(status, details): if __name__ == '__main__': app.debug = True app.run('127.0.0.1', debug=True, port=8084) - diff --git a/web/setup.py b/web/setup.py index 7bbd2b13..a016cd37 100644 --- a/web/setup.py +++ b/web/setup.py @@ -1,9 +1,10 @@ #!/usr/bin/env python -from setuptools import setup, find_packages, Command -from setuptools.command.install import install -from setuptools.command.develop import develop import sys +from setuptools import Command, find_packages, setup +from setuptools.command.develop import develop +from setuptools.command.install import install + packages = find_packages() version_str = '5.7.0'
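
Note: the largest hunks in this patch reformat conceptnet5/vectors/propagate.py, whose propagate() function fills in vectors for terms that have no embedding by repeatedly averaging the nonzero vectors of their graph neighbors. The standalone sketch below illustrates that neighbor-averaging loop on a toy example; it is not part of the patch, and the four-term graph, the 2-d vectors, and the names `adjacency` and `vectors` are invented for illustration, assuming only numpy and scipy as shown.

    # Minimal sketch of the propagation step, assuming a tiny made-up graph.
    import numpy as np
    from scipy.sparse import csr_matrix

    # Undirected edges among 4 terms: 0-1, 1-2, 2-3, stored symmetrically
    # (both directions), as the propagation graph in the patch does.
    rows = [0, 1, 1, 2, 2, 3]
    cols = [1, 0, 2, 1, 3, 2]
    adjacency = csr_matrix(
        (np.ones(len(rows), dtype=np.int8), (rows, cols)), shape=(4, 4)
    )

    # Terms 0 and 1 already have embedding vectors; terms 2 and 3 start at zero.
    vectors = np.array([[1.0, 0.0],
                        [0.0, 1.0],
                        [0.0, 0.0],
                        [0.0, 0.0]])

    for _ in range(20):
        zero = np.abs(vectors).sum(1) == 0
        if not zero.any():
            break
        nonzero = ~zero
        # "Fringe": zero-vector terms with at least one nonzero neighbor.
        fringe = (adjacency.dot(nonzero.astype(np.int8)) != 0) & zero
        # Each fringe term becomes the average of its nonzero neighbors.
        neighbor_counts = adjacency[fringe, :].dot(nonzero.astype(np.int8))
        vectors[fringe, :] = (
            adjacency[fringe, :].dot(vectors) / neighbor_counts[:, np.newaxis]
        )

    print(vectors)
    # After one pass, term 2 takes term 1's vector; after a second pass,
    # term 3 takes term 2's (now nonzero) vector, and the loop stops.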