46 changes: 26 additions & 20 deletions polyglotdb/corpus/audio.py
@@ -742,10 +742,15 @@ def get_utterance_acoustics(self, acoustic_name, utterance_id, discourse, speake
columns = '"time", {}'.format(', '.join(property_names))
speaker = speaker.replace("'", r"\'") # Escape apostrophes
discourse = discourse.replace("'", r"\'") # Escape apostrophes
query = '''select {} from "{}"
WHERE "utterance_id" = '{}'
AND "discourse" = '{}'
AND "speaker" = '{}';'''.format(columns, acoustic_name, utterance_id, discourse, speaker)
if utterance_id is not None:
query = '''select {} from "{}"
WHERE "utterance_id" = '{}'
AND "discourse" = '{}'
AND "speaker" = '{}';'''.format(columns, acoustic_name, utterance_id, discourse, speaker)
else:
query = '''select {} from "{}"
WHERE "discourse" = '{}'
AND "speaker" = '{}';'''.format(columns, acoustic_name, discourse, speaker)
result = self.execute_influxdb(query)
track = Track()
for r in result.get_points(acoustic_name):
@@ -906,15 +911,16 @@ def _save_measurement(self, sound_file, track, acoustic_name, **kwargs):
phone_type.begin.column_name('begin'),
phone_type.end.column_name('end'),
phone_type.word.label.column_name('word_label'),
phone_type.speaker.name.column_name('speaker'),
phone_type.utterance.id.column_name('utterance_id')]
phone_type.speaker.name.column_name('speaker')]
column_labels = ['label', 'begin', 'end', 'word_label', 'speaker']
if "utterance" in self.annotation_types:
columns.append(phone_type.syllable.label.column_name('utterance_id'))
column_labels.append('utterance_id')
if 'syllable' in self.annotation_types:
columns.append(phone_type.syllable.label.column_name('syllable_label'))
q = q.columns(*columns).order_by(phone_type.begin)
phones = [(x['label'], x['begin'], x['end'], x['word_label'], x['speaker'], x['utterance_id'], x['syllable_label']) for x in q.all()]
else:
q = q.columns(*columns).order_by(phone_type.begin)
phones = [(x['label'], x['begin'], x['end'], x['word_label'], x['speaker'], x['utterance_id']) for x in q.all()]
column_labels.append('syllable_label')
q = q.columns(*columns).order_by(phone_type.begin)
phones = [{y: x[y] for y in column_labels} for x in q.all()]
for time_point, value in track.items():
fields = {}
for name, type in measures:
@@ -925,20 +931,20 @@ def _save_measurement(self, sound_file, track, acoustic_name, **kwargs):
continue
label = None
speaker = None
word_label = None
syllable_label = None
for i, p in enumerate(phones):
if p[1] > time_point:
if p['begin'] > time_point:
break
label = p[0]
speaker = p[4]
label = p['label']
speaker = p['speaker']
if 'syllable' in self.annotation_types:
syllable_label = p[6]
word_label = p[3]
utterance_id = p[5]
syllable_label = p['syllable_label']
word_label = p['word_label']
if utterance_id is None:
utterance_id = p.get('utterance_id', None)
if i == len(phones) - 1:
break
else:
label = None
speaker = None
if speaker is None:
continue
t_dict = {'speaker': speaker}
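The new branch in get_utterance_acoustics makes the utterance filter optional: with no utterance_id, the whole discourse's track is fetched for the speaker. A minimal sketch of the same conditional WHERE-clause assembly, using hypothetical measure and column names rather than PolyglotDB's actual schema:

def build_acoustics_query(measure, columns, discourse, speaker, utterance_id=None):
    # Build an InfluxQL SELECT, adding the utterance filter only when an id is given.
    clauses = ['"discourse" = \'{}\''.format(discourse),
               '"speaker" = \'{}\''.format(speaker)]
    if utterance_id is not None:
        clauses.insert(0, '"utterance_id" = \'{}\''.format(utterance_id))
    return 'SELECT {} FROM "{}" WHERE {};'.format(columns, measure, ' AND '.join(clauses))

print(build_acoustics_query('"time", F1, F2', 'formants', 'interview01', 'speaker01'))
print(build_acoustics_query('"time", F1, F2', 'formants', 'interview01', 'speaker01', utterance_id='u-1'))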
20 changes: 10 additions & 10 deletions polyglotdb/corpus/syllabic.py
@@ -60,18 +60,18 @@ def find_onsets(self, syllabic_label='syllabic'):
for d in discourses:
statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
(w)-[:spoken_in]->(d:Discourse:{corpus_name})
where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
where (w)<-[:contained_by]-()-[:is_a]->(:{syllabic_name})
AND s.name = $speaker
AND d.name = $discourse
with w
match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
(n)-[:contained_by*1..2]->(w)
(n)-[:contained_by]->(w)
with w, n
order by n.begin
with w,collect(n)[0..1] as coll unwind coll as n

MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
where not (pn)<-[:precedes]-()-[:contained_by*1..2]->(w)
MATCH (pn:{phone_name}:{corpus_name})-[:contained_by]->(w)
where not (pn)<-[:precedes]-()-[:contained_by]->(w)
with w, n,pn
match p = shortestPath((pn)-[:precedes*0..10]->(n))
with [x in nodes(p)[0..-1]|x.label] as onset
@@ -105,18 +105,18 @@ def find_codas(self, syllabic_label='syllabic'):
for d in discourses:
statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
(w)-[:spoken_in]->(d:Discourse:{corpus_name})
where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
where (w)<-[:contained_by]-()-[:is_a]->(:{syllabic_name})
AND s.name = $speaker
AND d.name = $discourse
with w
match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
(n)-[:contained_by*1..2]->(w)
(n)-[:contained_by]->(w)
with w, n
order by n.begin DESC
with w,collect(n)[0..1] as coll unwind coll as n

MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
where not (pn)-[:precedes]->()-[:contained_by*1..2]->(w)
MATCH (pn:{phone_name}:{corpus_name})-[:contained_by]->(w)
where not (pn)-[:precedes]->()-[:contained_by]->(w)
with w, n,pn
match p = shortestPath((n)-[:precedes*0..10]->(pn))
with [x in nodes(p)[1..]|x.label] as coda
@@ -287,8 +287,8 @@ def encode_syllables(self, algorithm='maxonset', syllabic_label='syllabic', call
codas = norm_count_dict(codas, onset=False)
elif algorithm == 'maxonset':
if custom_onsets is None:
onsets = set(onsets.keys())
print(onsets)
onsets = sorted(set(onsets.keys()))
print(f"Onsets found by max onset: {onsets}")
else:
raise NotImplementedError

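Both find_onsets and find_codas now assume a phone is linked to its word by a single contained_by hop, rather than a one- or two-hop path through an optional syllable node; the direct links are created at import time (see from_csv.py below). Reduced to hypothetical phone and word labels, the two match shapes are:

# Old shape: tolerate an intervening syllable node between phone and word.
old_match = 'MATCH (n:phone)-[:contained_by*1..2]->(w:word) RETURN count(*)'
# New shape: one direct hop, valid once phones link straight to their words.
new_match = 'MATCH (n:phone)-[:contained_by]->(w:word) RETURN count(*)'

Single-hop patterns avoid variable-length path expansion, which Neo4j cannot plan as cheaply as a fixed-length match.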
1 change: 0 additions & 1 deletion polyglotdb/corpus/utterance.py
@@ -65,7 +65,6 @@ def encode_utterances(self, min_pause_length=0.5, min_utterance_length=0,
call_back(i)
call_back('Parsing utterances for discourse {} of {} ({})...'.format(i, len(discourses), d))
utt_data = self.get_utterance_ids(d, min_pause_length, min_utterance_length)
speaker_data = {}
for s, utterances in utt_data.items():
speaker_data = []
prev_id = None
4 changes: 2 additions & 2 deletions polyglotdb/databases/neo4j.conf
@@ -32,8 +32,8 @@ dbms.security.auth_enabled=false
# calculated based on available system resources.
# Uncomment these lines to set specific initial and maximum
# heap size.
server.memory.heap.initial_size=512m
server.memory.heap.max_size=512m
server.memory.heap.initial_size=2g
server.memory.heap.max_size=4g

# The amount of memory to use for mapping the store files, in bytes (or
# kilobytes with the 'k' suffix, megabytes with 'm' and gigabytes with 'g').
84 changes: 53 additions & 31 deletions polyglotdb/io/importer/from_csv.py
@@ -133,7 +133,7 @@ def _begin_index(tx, at):

def _end_index(tx, at):
tx.run('CREATE INDEX FOR (n:%s) ON (n.end)' % at)

corpus_name = corpus_context.cypher_safe_name
with corpus_context.graph_driver.session() as session:
for i, s in enumerate(speakers):
speaker_statements = []
@@ -258,12 +258,15 @@ def _end_index(tx, at):
except:
raise
finally:
# with open(path, 'w'):
# pass
os.remove(s[2])
log.info('Finished loading {} relationships for speaker {}!'.format(s[3], s[4]))
log.debug('{} relationships loading took: {} seconds.'.format(s[3], time.time() - begin))

statement = f"""
MATCH (subunit:{corpus_name}:speech)-[:contained_by*2..]->(superunit:{corpus_name}:speech)
with subunit, superunit
CREATE (subunit)-[:contained_by]->(superunit)
"""
corpus_context.execute_cypher(statement)
log.info('Finished importing into the graph database!')
log.debug('Graph importing took: {} seconds'.format(time.time() - initial_begin))

@@ -768,6 +771,19 @@ def import_utterance_csv(corpus_context, call_back=None, stop_check=None):
corpus=corpus_context.cypher_safe_name,
word_type=corpus_context.word_name)
corpus_context.execute_cypher(statement)
word_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (n)-[:contained_by*]->()-[:contained_by]->(utt:utterance:{corpus}:speech {{id: csvLine.id}})
WITH utt, collect(n) AS subunits
UNWIND subunits AS w
CREATE (w)-[:contained_by]->(utt)
}} IN TRANSACTIONS OF 1000 ROWS
'''
statement = word_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_type=corpus_context.word_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Hierarchical relationship creation took {} seconds.'.format(time.time() - begin))
os.remove(path)
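The CALL { ... } IN TRANSACTIONS wrapper used by these import statements commits every batch of CSV rows separately, so a large LOAD CSV never accumulates into one enormous transaction. A stripped-down sketch of the pattern with hypothetical labels and columns, assuming the official neo4j Python driver:

from neo4j import GraphDatabase

statement = '''
CALL {
  LOAD CSV WITH HEADERS FROM "file:///units.csv" AS csvLine
  MATCH (n:unit {id: csvLine.id}), (u:utterance {id: csvLine.utterance_id})
  CREATE (n)-[:contained_by]->(u)
} IN TRANSACTIONS OF 1000 ROWS
'''

driver = GraphDatabase.driver('bolt://localhost:7687', auth=None)
with driver.session() as session:
    # CALL ... IN TRANSACTIONS manages its own commits, so it must run in an
    # auto-commit context like session.run, not inside an explicit transaction.
    session.run(statement).consume()
driver.close()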
@@ -895,6 +911,21 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
rel_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (s:syllable:{corpus}:speech {{id: csvLine.id}})-[:contained_by]->(w:{word_name}:{corpus}:speech),
(w)-[:contained_by]->(n)
WITH s, collect(n) as superunits
UNWIND superunits AS u
CREATE (s)-[:contained_by]->(u)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = rel_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Hierarchical relationship creation took {} seconds.'.format(time.time() - begin))

Expand Down Expand Up @@ -937,21 +968,6 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
if corpus_context.config.debug:
print('Precedence relationship creation took {} seconds.'.format(time.time() - begin))

begin = time.time()
del_rel_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (n:{phone_name}:{corpus}:speech:nucleus {{id: csvLine.vowel_id}})-[r:contained_by]->(w:{word_name}:{corpus}:speech)
DELETE r
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = del_rel_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Phone-word relationship deletion took {} seconds.'.format(time.time() - begin))

begin = time.time()
onset_statement = '''
@@ -964,12 +980,9 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
onspath = (onset)-[:precedes*1..10]->(n)
WITH n, w, s, csvLine, onspath
UNWIND (CASE WHEN onspath IS NOT NULL THEN nodes(onspath)[0..-1] ELSE [NULL] END) AS o
OPTIONAL MATCH (o)-[r:contained_by]->(w)
WITH n, w, s, csvLine, [x IN collect(o) WHERE x IS NOT NULL | x] AS ons,
[x IN collect(r) WHERE x IS NOT NULL | x] AS rels
WITH n, s, csvLine, [x IN collect(o) WHERE x IS NOT NULL | x] AS ons
FOREACH (o IN ons | SET o :onset, o.syllable_position = 'onset')
FOREACH (o IN ons | CREATE (o)-[:contained_by]->(s))
FOREACH (r IN rels | DELETE r)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = onset_statement.format(path=csv_path,
@@ -991,12 +1004,9 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
codapath = (n)-[:precedes*1..10]->(coda)
WITH n, w, s, codapath
UNWIND (CASE WHEN codapath IS NOT NULL THEN nodes(codapath)[1..] ELSE [NULL] END) AS c
OPTIONAL MATCH (c)-[r:contained_by]->(w)
WITH n, w, s, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod,
[x IN collect(r) WHERE x IS NOT NULL | x] AS rels
WITH n, s, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod
FOREACH (c IN cod | SET c :coda, c.syllable_position = 'coda')
FOREACH (c IN cod | CREATE (c)-[:contained_by]->(s))
FOREACH (r IN rels | DELETE r)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = coda_statement.format(path=csv_path,
@@ -1118,6 +1128,21 @@ def import_nonsyl_csv(corpus_context, call_back=None, stop_check=None):
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
rel_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (s:syllable:{corpus}:speech {{id: csvLine.id}})-[:contained_by]->(w:{word_name}:{corpus}:speech),
(w)-[:contained_by]->(n)
WITH s, collect(n) as superunits
UNWIND superunits AS u
CREATE (s)-[:contained_by]->(u)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = rel_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Hierarchical and spoken relationship creation took {} seconds.'.format(time.time() - begin))

Expand Down Expand Up @@ -1169,13 +1194,10 @@ def import_nonsyl_csv(corpus_context, call_back=None, stop_check=None):
p = (o)-[:precedes*..10]->(c)
WITH o, w, s, p, csvLine
UNWIND (CASE WHEN p IS NOT NULL THEN nodes(p) ELSE [o] END) AS c
OPTIONAL MATCH (c)-[r:contained_by]->(w)
WITH w, s, toInteger(csvLine.break) AS break, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod,
[x IN collect(r) WHERE x IS NOT NULL | x] AS rels
WITH s, toInteger(csvLine.break) AS break, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod
FOREACH (c IN cod[break..] | SET c :coda, c.syllable_position = 'coda')
FOREACH (c IN cod[..break] | SET c :onset, c.syllable_position = 'onset')
FOREACH (c IN cod | CREATE (c)-[:contained_by]->(s))
FOREACH (r IN rels | DELETE r)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = phone_statement.format(path=csv_path,
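Taken together, these changes denormalize the containment hierarchy: instead of deleting a phone's contained_by edge to its word when it joins a syllable (the removed DELETE r blocks), the import now adds direct edges from every unit to each of its transitive containers. Any container is then a single hop away; a sketch of the resulting shape, with simplified labels and an illustrative $phone_id parameter:

containers_query = '''
MATCH (p:phone {id: $phone_id})-[:contained_by]->(container)
RETURN labels(container) AS container_labels
'''
# With the direct edges in place this returns one row per level
# (syllable, word, utterance) without a variable-length
# [:contained_by*] expansion.

The trade-off is more relationships on disk in exchange for the cheaper fixed-length matches used in syllabic.py above.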
2 changes: 1 addition & 1 deletion polyglotdb/pgdb.py
@@ -215,7 +215,7 @@ def stop():
pid = int(f.read().strip())
try:
os.kill(pid, signal.SIGINT)
except ProcessLookupError:
except (ProcessLookupError, OSError):
pass
os.remove(pid_file)
except FileNotFoundError:
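ProcessLookupError is already a subclass of OSError, so the widened clause mainly documents intent while also covering other kill failures, such as PermissionError when the recorded pid belongs to another user's process. A minimal sketch of the shutdown pattern, with a hypothetical stop_pid helper name:

import os
import signal

def stop_pid(pid_file):
    # Send SIGINT to the recorded process, tolerating stale or dead pids.
    try:
        with open(pid_file) as f:
            pid = int(f.read().strip())
        try:
            os.kill(pid, signal.SIGINT)
        except (ProcessLookupError, OSError):
            pass  # process already exited, or we may not signal it
        os.remove(pid_file)
    except FileNotFoundError:
        pass  # no pid file: nothing was running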
9 changes: 1 addition & 8 deletions polyglotdb/query/annotations/attributes/base.py
@@ -341,15 +341,8 @@ def __getattr__(self, key):
return AcousticAttribute(self, key)
elif self.hierarchy is not None and key in self.hierarchy.get_higher_types(self.node_type):
from .hierarchical import HierarchicalAnnotation
types = self.hierarchy.get_higher_types(self.node_type)
prev_node = self
cur_node = None
for t in types:
higher_node = AnnotationNode(t, corpus=self.corpus, hierarchy=self.hierarchy)
cur_node = HierarchicalAnnotation(prev_node, higher_node)
prev_node = cur_node
if t == key:
break
cur_node = HierarchicalAnnotation(prev_node, AnnotationNode(key, corpus=self.corpus, hierarchy=self.hierarchy))
return cur_node
elif self.hierarchy is not None and key in self.hierarchy.get_lower_types(self.node_type):
from .path import SubPathAnnotation
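With every annotation now one contained_by hop from each of its containers (per the import changes above), attribute access no longer needs to chain a HierarchicalAnnotation through each intermediate type; a single wrapper for the requested type suffices. A toy model of the new dispatch, with hypothetical stand-in classes:

class ToyNode:
    def __init__(self, node_type, higher_types):
        self.node_type = node_type
        self.higher_types = higher_types

    def __getattr__(self, key):
        # Called only when normal lookup fails; pair this node directly
        # with the requested higher type instead of walking every level.
        if key in self.higher_types:
            return (self.node_type, 'contained_by', key)
        raise AttributeError(key)

phone = ToyNode('phone', ['syllable', 'word', 'utterance'])
print(phone.utterance)  # ('phone', 'contained_by', 'utterance')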