Merge pull request #257 from commonsense/reformat
Meaningless formatting cleanups
Robyn Speer authored Apr 9, 2019
2 parents 0cbdffb + dd5ef67 commit 895ea23
Showing 63 changed files with 1,859 additions and 1,193 deletions.
2 changes: 1 addition & 1 deletion conceptnet5/api.py
@@ -2,8 +2,8 @@
 This file defines the ConceptNet web API responses.
 """
 
+from conceptnet5.nodes import ld_node, standardized_concept_uri
 from conceptnet5.vectors.query import VectorSpaceWrapper
-from conceptnet5.nodes import standardized_concept_uri, ld_node
 
 VECTORS = VectorSpaceWrapper()
 FINDER = VECTORS.finder
11 changes: 7 additions & 4 deletions conceptnet5/builders/cli.py
@@ -1,7 +1,8 @@
 import click
+
 from .combine_assertions import combine_assertions
-from .reduce_assoc import reduce_assoc
 from .morphology import prepare_vocab_for_morphology, subwords_to_edges
+from .reduce_assoc import reduce_assoc
 
 
 @click.group()
@@ -25,12 +26,14 @@ def run_combine(input, output):
 
 @cli.command(name='reduce_assoc')
 @click.argument('assoc_filename', type=click.Path(readable=True, dir_okay=False))
-@click.argument('embedding_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False))
+@click.argument(
+    'embedding_filenames', nargs=-1, type=click.Path(readable=True, dir_okay=False)
+)
 @click.argument('output', type=click.Path(writable=True, dir_okay=False))
 def run_reduce_assoc(assoc_filename, embedding_filenames, output):
     """
-    Takes in a file of tab-separated simple associations, one or more
-    hdf5 files defining vector embeddings, and removes from the associations
+    Takes in a file of tab-separated simple associations, one or more
+    hdf5 files defining vector embeddings, and removes from the associations
     low-frequency terms and associations that are judged unlikely to be
     useful by various filters.
     """
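The reduce_assoc command above takes a tab-separated association file, one or more HDF5 embedding files, and an output path. A minimal sketch of driving it through click's test runner, assuming placeholder file names that are not part of this repository:

# Hypothetical invocation of the reduce_assoc command defined above.
# The input and output file names are placeholders for this sketch.
from click.testing import CliRunner

from conceptnet5.builders.cli import cli

runner = CliRunner()
result = runner.invoke(
    cli,
    ['reduce_assoc', 'assoc.csv', 'embeddings-1.h5', 'embeddings-2.h5', 'reduced.csv'],
)
print(result.exit_code, result.output)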
24 changes: 16 additions & 8 deletions conceptnet5/builders/combine_assertions.py
@@ -1,16 +1,18 @@
-from __future__ import unicode_literals, print_function
-
 import itertools
 import json
-
 import os
 
 from conceptnet5.edges import make_edge
 from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
 from conceptnet5.languages import ALL_LANGUAGES
 from conceptnet5.readers.wiktionary import valid_language
-from conceptnet5.uri import conjunction_uri,get_uri_language, is_absolute_url, Licenses, \
-    split_uri, uri_prefix
+from conceptnet5.uri import (
+    Licenses,
+    conjunction_uri,
+    get_uri_language,
+    is_absolute_url,
+    split_uri,
+)
 from conceptnet5.util import get_support_data_filename
 
 N = 100
@@ -91,9 +93,14 @@ def make_assertion(line_group):
     license = Licenses.cc_attribution
 
     return make_edge(
-        rel=rel, start=start, end=end, weight=weight,
-        dataset=dataset, license=license, sources=sources,
-        surfaceText=surface_text
+        rel=rel,
+        start=start,
+        end=end,
+        weight=weight,
+        dataset=dataset,
+        license=license,
+        sources=sources,
+        surfaceText=surface_text,
     )
 
 
@@ -113,6 +120,7 @@ def combine_assertions(input_filename, output_filename):
     This process requires its input to be a sorted CSV so that all edges for
     the same assertion will appear consecutively.
     """
+
     def group_func(line):
         "Group lines by their URI (their first column)."
         return line.split('\t', 1)[0]
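The group_func helper above is why combine_assertions needs sorted input: itertools.groupby only merges lines that are already consecutive. A small illustration, with invented assertion lines:

# Illustration of grouping sorted assertion lines by their first column (the URI).
# The URIs and weights here are invented for the example.
import itertools


def group_func(line):
    "Group lines by their URI (their first column)."
    return line.split('\t', 1)[0]


lines = [
    '/a/[/r/IsA/,/c/en/cat/,/c/en/animal/]\t1.0',
    '/a/[/r/IsA/,/c/en/cat/,/c/en/animal/]\t0.5',
    '/a/[/r/IsA/,/c/en/dog/,/c/en/animal/]\t2.0',
]
for uri, group in itertools.groupby(lines, group_func):
    print(uri, len(list(group)))
# Only consecutive lines merge, which is why the input CSV must be sorted by URI.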
8 changes: 5 additions & 3 deletions conceptnet5/builders/morphology.py
@@ -4,7 +4,7 @@
 from conceptnet5.formats.msgpack_stream import MsgpackStreamWriter
 from conceptnet5.languages import ATOMIC_SPACE_LANGUAGES
 from conceptnet5.nodes import split_uri
-from conceptnet5.uri import get_uri_language, join_uri, Licenses
+from conceptnet5.uri import Licenses, get_uri_language, join_uri
 
 
 def prepare_vocab_for_morphology(language, input, output):
@@ -61,11 +61,13 @@ def subwords_to_edges(language, input, output):
            if chunk != '_':
                start = join_uri('x', language, chunk.strip('_'))
                edge = make_edge(
-                    '/r/SubwordOf', start, end,
+                    '/r/SubwordOf',
+                    start,
+                    end,
                     dataset='/d/morphology',
                     license=Licenses.cc_attribution,
                     sources=MORPH_SOURCES,
-                    weight=0.01
+                    weight=0.01,
                 )
                writer.write(edge)
     writer.close()
104 changes: 54 additions & 50 deletions conceptnet5/builders/reduce_assoc.py
@@ -5,10 +5,11 @@
 
 from collections import defaultdict
 
+import pandas as pd
+
 from conceptnet5.relations import is_negative_relation
 from conceptnet5.uri import is_concept, uri_prefix
 from conceptnet5.vectors.formats import load_hdf
-import pandas as pd
 
 
 def concept_is_bad(uri):
@@ -19,14 +20,19 @@
     specific phrase, possibly mis-parsed. A concept with a colon is probably
     detritus from a wiki.
     """
-    return (':' in uri or uri.count('_') >= 3 or
-            uri.startswith('/a/') or uri.count('/') <= 2)
+    return (
+        ':' in uri
+        or uri.count('_') >= 3
+        or uri.startswith('/a/')
+        or uri.count('/') <= 2
+    )
 
 
 class ConceptNetAssociationGraph:
     '''
     Class to hold the concept-association edge graph.
     '''
+
     def __init__(self):
         '''Construct a graph with no vertices or edges.'''
         self.vertex_to_neighbors = defaultdict(set)
@@ -43,15 +49,15 @@ def vertices(self):
 
     def find_components(self):
         '''
-        Returns a dict mapping the vertices of the graph to labels,
-        such that two vertices map to the same label if and only if
-        they belong to the same connected component of the undirected
-        graph obtained by adding the reversal of every edge to the
-        graph. (But note that this function does not modify the graph,
+        Returns a dict mapping the vertices of the graph to labels,
+        such that two vertices map to the same label if and only if
+        they belong to the same connected component of the undirected
+        graph obtained by adding the reversal of every edge to the
+        graph. (But note that this function does not modify the graph,
         i.e. it does not add any edges.)
         '''
-        component_labels = {vertex : -1 for vertex in self.vertices()}
-
+        component_labels = {vertex: -1 for vertex in self.vertices()}
         vertices_to_examine = set(self.vertices())
         new_label = -1
         while len(vertices_to_examine) > 0:
@@ -72,19 +78,18 @@ def find_components(self):
         return component_labels
 
     @classmethod
-    def from_csv(cls, filename, filtered_concepts=None,
-                 reject_negative_relations=True):
+    def from_csv(cls, filename, filtered_concepts=None, reject_negative_relations=True):
         """
-        Reads an association file and builds an (undirected) graph from it.
+        Reads an association file and builds an (undirected) graph from it.
 
-        If filtered_concepts isn't None, it should be a collection of concepts,
-        and only vertices from this collection and edges that link two such
-        vertices will be added to the graph. If it _is_ None (the default),
-        however, please note that no such filtering will be done (i.e. the
-        effective filter collection is then the universal set of concepts, not
+        If filtered_concepts isn't None, it should be a collection of concepts,
+        and only vertices from this collection and edges that link two such
+        vertices will be added to the graph.  If it _is_ None (the default),
+        however, please note that no such filtering will be done (i.e. the
+        effective filter collection is then the universal set of concepts, not
         the empty set).
 
-        If reject_negative_relations is True (the default), only edges not
+        If reject_negative_relations is True (the default), only edges not
         corresponding to negative relations will be added to the graph.
         """
         graph = cls()
@@ -119,29 +124,29 @@ def from_csv(cls, filename, filtered_concepts=None,
 
 class ConceptNetAssociationGraphForReduction(ConceptNetAssociationGraph):
     """
-    Subclass of ConceptNetAssociationGraph specialized for use in making
+    Subclass of ConceptNetAssociationGraph specialized for use in making
     the reduced subgraph of a full set of associations.
     """
+
     def __init__(self):
         super().__init__()
         self.edges = []
 
     def add_edge(self, left, right, value, dataset, relation):
         """
-        In addition to the superclass's handling of a new edge,
+        In addition to the superclass's handling of a new edge,
         saves the full edge data.
         """
         super().add_edge(left, right, value, dataset, relation)
         self.edges.append((left, right, value, dataset, relation))
 
 
-
 def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
     """
-    Takes in a file of tab-separated associations, and returns a set of
-    concepts from which those which are unlikely to be useful have been
-    removed.
+    Takes in a file of tab-separated associations, and returns a set of
+    concepts from which those which are unlikely to be useful have been
+    removed.
     All concepts that occur fewer than `cutoff` times will be removed.
     All English concepts that occur fewer than `en_cutoff` times will be removed.
     """
@@ -160,20 +165,18 @@ def make_filtered_concepts(filename, cutoff=3, en_cutoff=3):
            counts[gright] += 1
 
     filtered_concepts = {
-        concept for (concept, count) in counts.items()
-        if (
-            count >= en_cutoff or
-            (not is_concept(concept) and count >= cutoff)
-        )
+        concept
+        for (concept, count) in counts.items()
+        if (count >= en_cutoff or (not is_concept(concept) and count >= cutoff))
     }
     return filtered_concepts
 
 
 def read_embedding_vocabularies(filenames):
     """
-    Reads every vector embedding file in the given collection of
-    filenames, and returns the union of their vocabularies. (The
-    files are assumed to be hdf5 files containing dataframes, and
+    Reads every vector embedding file in the given collection of
+    filenames, and returns the union of their vocabularies. (The
+    files are assumed to be hdf5 files containing dataframes, and
     the vocabularies are their indices.
     """
     result = pd.Index([])
@@ -183,28 +186,29 @@ def read_embedding_vocabularies(filenames):
     return result
 
 
-
-def reduce_assoc(assoc_filename, embedding_filenames, output_filename,
-                 cutoff=3, en_cutoff=3):
+def reduce_assoc(
+    assoc_filename, embedding_filenames, output_filename, cutoff=3, en_cutoff=3
+):
     """
     Takes in a file of tab-separated simple associations, and removes
-    uncommon associations and associations unlikely to be useful. Also
-    requires one or more vector embedding files (from which only the
-    vocabularies are used; associations involving terms that have no
-    connection, no matter how distant, to the union of those vocabularies
+    uncommon associations and associations unlikely to be useful. Also
+    requires one or more vector embedding files (from which only the
+    vocabularies are used; associations involving terms that have no
+    connection, no matter how distant, to the union of those vocabularies
     will be removed).
     All concepts that occur fewer than `cutoff` times will be removed.
     All English concepts that occur fewer than `en_cutoff` times will be removed.
     """
 
-    filtered_concepts = make_filtered_concepts(assoc_filename, cutoff=cutoff,
-                                               en_cutoff=en_cutoff)
+    filtered_concepts = make_filtered_concepts(
+        assoc_filename, cutoff=cutoff, en_cutoff=en_cutoff
+    )
 
     graph = ConceptNetAssociationGraphForReduction.from_csv(
         assoc_filename,
         filtered_concepts=filtered_concepts,
-        reject_negative_relations=True
+        reject_negative_relations=True,
     )
 
     component_labels = graph.find_components()
@@ -215,11 +219,11 @@ def reduce_assoc(assoc_filename, embedding_filenames, output_filename,
     # from any of the embedding vocabularies, there will be no way to assign
     # vectors to any of its vertices, so we remove that component from the
     # output.
-    good_component_labels = set(label for term, label
-                                in component_labels.items()
-                                if term in embedding_vocab)
-
+    good_component_labels = set(
+        label for term, label in component_labels.items() if term in embedding_vocab
+    )
+
     with open(output_filename, 'w', encoding='utf-8') as out:
         for gleft, gright, value, dataset, rel in graph.edges:
             if component_labels[gleft] not in good_component_labels:
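As a rough illustration of the two filters whose docstrings appear above, concept_is_bad for single URIs and find_components for whole connected components, here is a hedged sketch. It assumes the graph class behaves as those docstrings describe; the URIs, weights, and dataset labels are made up.

# Sketch only; all URIs, weights, and dataset labels are illustrative.
from conceptnet5.builders.reduce_assoc import (
    ConceptNetAssociationGraphForReduction,
    concept_is_bad,
)

# concept_is_bad rejects URIs with colons, three or more underscores,
# an /a/ prefix, or too few path components.
print(concept_is_bad('/c/en/example'))                   # False: an ordinary concept
print(concept_is_bad('/c/en/a_very_long_phrase_here'))   # True: too many underscores
print(concept_is_bad('/c/en'))                           # True: too few path components

# find_components should give two vertices the same label exactly when they are
# connected, treating every edge as undirected.
graph = ConceptNetAssociationGraphForReduction()
graph.add_edge('/c/en/cat', '/c/en/pet', 1.0, '/d/example', '/r/RelatedTo')
graph.add_edge('/c/en/pet', '/c/en/dog', 1.0, '/d/example', '/r/RelatedTo')
graph.add_edge('/c/en/island', '/c/en/isolated', 1.0, '/d/example', '/r/RelatedTo')

labels = graph.find_components()
print(labels['/c/en/cat'] == labels['/c/en/dog'])     # expected True: same component
print(labels['/c/en/cat'] == labels['/c/en/island'])  # expected False: different components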
14 changes: 10 additions & 4 deletions conceptnet5/db/cli.py
@@ -1,7 +1,8 @@
 import click
-from .connection import get_db_connection, check_db_connection
+
+from .connection import check_db_connection, get_db_connection
 from .prepare_data import assertions_to_sql_csv, load_sql_csv
-from .schema import create_tables, create_indices
+from .schema import create_indices, create_tables
 
 
 @click.group()
@@ -11,13 +12,18 @@ def cli():
 
 @cli.command(name='prepare_data')
 @click.argument('input_filename', type=click.Path(readable=True, dir_okay=False))
-@click.argument('output_dir', type=click.Path(writable=True, dir_okay=True, file_okay=False))
+@click.argument(
+    'output_dir', type=click.Path(writable=True, dir_okay=True, file_okay=False)
+)
 def prepare_data(input_filename, output_dir):
     assertions_to_sql_csv(input_filename, output_dir)
 
 
 @cli.command(name='load_data')
-@click.argument('input_dir', type=click.Path(readable=True, writable=True, dir_okay=True, file_okay=False))
+@click.argument(
+    'input_dir',
+    type=click.Path(readable=True, writable=True, dir_okay=True, file_okay=False),
+)
 def load_data(input_dir):
     conn = get_db_connection()
     create_tables(conn)
1 change: 1 addition & 0 deletions conceptnet5/db/connection.py
@@ -1,4 +1,5 @@
 import psycopg2
+
 from conceptnet5.db import config
 
 _CONNECTIONS = {}
5 changes: 3 additions & 2 deletions conceptnet5/db/prepare_data.py
@@ -1,8 +1,9 @@
+import json
+
 from conceptnet5.formats.msgpack_stream import read_msgpack_stream
-from conceptnet5.uri import uri_prefixes
 from conceptnet5.relations import SYMMETRIC_RELATIONS
+from conceptnet5.uri import uri_prefixes
 from ordered_set import OrderedSet
-import json
 
 
 def write_row(outfile, items):