46 changes: 26 additions & 20 deletions polyglotdb/corpus/audio.py
@@ -742,10 +742,15 @@ def get_utterance_acoustics(self, acoustic_name, utterance_id, discourse, speake
columns = '"time", {}'.format(', '.join(property_names))
speaker = speaker.replace("'", r"\'") # Escape apostrophes
discourse = discourse.replace("'", r"\'") # Escape apostrophes
query = '''select {} from "{}"
WHERE "utterance_id" = '{}'
AND "discourse" = '{}'
AND "speaker" = '{}';'''.format(columns, acoustic_name, utterance_id, discourse, speaker)
if utterance_id is not None:
query = '''select {} from "{}"
WHERE "utterance_id" = '{}'
AND "discourse" = '{}'
AND "speaker" = '{}';'''.format(columns, acoustic_name, utterance_id, discourse, speaker)
else:
query = '''select {} from "{}"
WHERE "discourse" = '{}'
AND "speaker" = '{}';'''.format(columns, acoustic_name, discourse, speaker)
result = self.execute_influxdb(query)
track = Track()
for r in result.get_points(acoustic_name):
@@ -906,15 +911,16 @@ def _save_measurement(self, sound_file, track, acoustic_name, **kwargs):
phone_type.begin.column_name('begin'),
phone_type.end.column_name('end'),
phone_type.word.label.column_name('word_label'),
phone_type.speaker.name.column_name('speaker'),
phone_type.utterance.id.column_name('utterance_id')]
phone_type.speaker.name.column_name('speaker')]
column_labels = ['label', 'begin', 'end', 'word_label', 'speaker']
if "utterance" in self.annotation_types:
columns.append(phone_type.syllable.label.column_name('utterance_id'))
column_labels.append('utterance_id')
if 'syllable' in self.annotation_types:
columns.append(phone_type.syllable.label.column_name('syllable_label'))
q = q.columns(*columns).order_by(phone_type.begin)
phones = [(x['label'], x['begin'], x['end'], x['word_label'], x['speaker'], x['utterance_id'], x['syllable_label']) for x in q.all()]
else:
q = q.columns(*columns).order_by(phone_type.begin)
phones = [(x['label'], x['begin'], x['end'], x['word_label'], x['speaker'], x['utterance_id']) for x in q.all()]
column_labels.append('syllable_label')
q = q.columns(*columns).order_by(phone_type.begin)
phones = [{y: x[y] for y in column_labels} for x in q.all()]
for time_point, value in track.items():
fields = {}
for name, type in measures:
@@ -925,20 +931,20 @@ def _save_measurement(self, sound_file, track, acoustic_name, **kwargs):
continue
label = None
speaker = None
word_label = None
syllable_label = None
for i, p in enumerate(phones):
if p[1] > time_point:
if p['begin'] > time_point:
break
label = p[0]
speaker = p[4]
label = p['label']
speaker = p['speaker']
if 'syllable' in self.annotation_types:
syllable_label = p[6]
word_label = p[3]
utterance_id = p[5]
syllable_label = p['syllable_label']
word_label = p['word_label']
if utterance_id is None:
utterance_id = p.get('utterance_id', None)
if i == len(phones) - 1:
break
else:
label = None
speaker = None
if speaker is None:
continue
t_dict = {'speaker': speaker}
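The new branch in get_utterance_acoustics makes the utterance filter optional: with no utterance_id, the whole discourse's track is fetched for the speaker. A minimal sketch of the same conditional WHERE-clause assembly, using hypothetical measure and column names rather than PolyglotDB's actual schema:

def build_acoustics_query(measure, columns, discourse, speaker, utterance_id=None):
    # Build an InfluxQL SELECT, adding the utterance filter only when an id is given.
    clauses = ['"discourse" = \'{}\''.format(discourse),
               '"speaker" = \'{}\''.format(speaker)]
    if utterance_id is not None:
        clauses.insert(0, '"utterance_id" = \'{}\''.format(utterance_id))
    return 'SELECT {} FROM "{}" WHERE {};'.format(columns, measure, ' AND '.join(clauses))

print(build_acoustics_query('"time", F1, F2', 'formants', 'interview01', 'speaker01'))
print(build_acoustics_query('"time", F1, F2', 'formants', 'interview01', 'speaker01', utterance_id='u-1'))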
20 changes: 10 additions & 10 deletions polyglotdb/corpus/syllabic.py
@@ -60,18 +60,18 @@ def find_onsets(self, syllabic_label='syllabic'):
for d in discourses:
statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
(w)-[:spoken_in]->(d:Discourse:{corpus_name})
where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
where (w)<-[:contained_by]-()-[:is_a]->(:{syllabic_name})
AND s.name = $speaker
AND d.name = $discourse
with w
match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
(n)-[:contained_by*1..2]->(w)
(n)-[:contained_by]->(w)
with w, n
order by n.begin
with w,collect(n)[0..1] as coll unwind coll as n

MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
where not (pn)<-[:precedes]-()-[:contained_by*1..2]->(w)
MATCH (pn:{phone_name}:{corpus_name})-[:contained_by]->(w)
where not (pn)<-[:precedes]-()-[:contained_by]->(w)
with w, n,pn
match p = shortestPath((pn)-[:precedes*0..10]->(n))
with [x in nodes(p)[0..-1]|x.label] as onset
@@ -105,18 +105,18 @@ def find_codas(self, syllabic_label='syllabic'):
for d in discourses:
statement = '''match (w:{word_name}:{corpus_name})-[:spoken_by]->(s:Speaker:{corpus_name}),
(w)-[:spoken_in]->(d:Discourse:{corpus_name})
where (w)<-[:contained_by*1..2]-()-[:is_a]->(:{syllabic_name})
where (w)<-[:contained_by]-()-[:is_a]->(:{syllabic_name})
AND s.name = $speaker
AND d.name = $discourse
with w
match (n:{phone_name}:{corpus_name})-[:is_a]->(t:{corpus_name}:{syllabic_name}),
(n)-[:contained_by*1..2]->(w)
(n)-[:contained_by]->(w)
with w, n
order by n.begin DESC
with w,collect(n)[0..1] as coll unwind coll as n

MATCH (pn:{phone_name}:{corpus_name})-[:contained_by*1..2]->(w)
where not (pn)-[:precedes]->()-[:contained_by*1..2]->(w)
MATCH (pn:{phone_name}:{corpus_name})-[:contained_by]->(w)
where not (pn)-[:precedes]->()-[:contained_by]->(w)
with w, n,pn
match p = shortestPath((n)-[:precedes*0..10]->(pn))
with [x in nodes(p)[1..]|x.label] as coda
@@ -287,8 +287,8 @@ def encode_syllables(self, algorithm='maxonset', syllabic_label='syllabic', call
codas = norm_count_dict(codas, onset=False)
elif algorithm == 'maxonset':
if custom_onsets is None:
onsets = set(onsets.keys())
print(onsets)
onsets = sorted(set(onsets.keys()))
print(f"Onsets found by max onset: {onsets}")
else:
raise NotImplementedError

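Both find_onsets and find_codas now assume a phone is linked to its word by a single contained_by hop, rather than a one- or two-hop path through an optional syllable node; the direct links are created at import time (see from_csv.py below). Reduced to hypothetical phone and word labels, the two match shapes are:

# Old shape: tolerate an intervening syllable node between phone and word.
old_match = 'MATCH (n:phone)-[:contained_by*1..2]->(w:word) RETURN count(*)'
# New shape: one direct hop, valid once phones link straight to their words.
new_match = 'MATCH (n:phone)-[:contained_by]->(w:word) RETURN count(*)'

Single-hop patterns avoid variable-length path expansion, which Neo4j cannot plan as cheaply as a fixed-length match.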
1 change: 0 additions & 1 deletion polyglotdb/corpus/utterance.py
@@ -65,7 +65,6 @@ def encode_utterances(self, min_pause_length=0.5, min_utterance_length=0,
call_back(i)
call_back('Parsing utterances for discourse {} of {} ({})...'.format(i, len(discourses), d))
utt_data = self.get_utterance_ids(d, min_pause_length, min_utterance_length)
speaker_data = {}
for s, utterances in utt_data.items():
speaker_data = []
prev_id = None
4 changes: 2 additions & 2 deletions polyglotdb/databases/neo4j.conf
@@ -32,8 +32,8 @@ dbms.security.auth_enabled=false
# calculated based on available system resources.
# Uncomment these lines to set specific initial and maximum
# heap size.
server.memory.heap.initial_size=512m
server.memory.heap.max_size=512m
server.memory.heap.initial_size=2g
server.memory.heap.max_size=4g

# The amount of memory to use for mapping the store files, in bytes (or
# kilobytes with the 'k' suffix, megabytes with 'm' and gigabytes with 'g').
84 changes: 53 additions & 31 deletions polyglotdb/io/importer/from_csv.py
@@ -133,7 +133,7 @@ def _begin_index(tx, at):

def _end_index(tx, at):
tx.run('CREATE INDEX FOR (n:%s) ON (n.end)' % at)

corpus_name = corpus_context.cypher_safe_name
with corpus_context.graph_driver.session() as session:
for i, s in enumerate(speakers):
speaker_statements = []
@@ -258,12 +258,15 @@ def _end_index(tx, at):
except:
raise
finally:
# with open(path, 'w'):
# pass
os.remove(s[2])
log.info('Finished loading {} relationships for speaker {}!'.format(s[3], s[4]))
log.debug('{} relationships loading took: {} seconds.'.format(s[3], time.time() - begin))

statement = f"""
MATCH (subunit:{corpus_name}:speech)-[:contained_by*2..]->(superunit:{corpus_name}:speech)
with subunit, superunit
CREATE (subunit)-[:contained_by]->(superunit)
"""
corpus_context.execute_cypher(statement)
log.info('Finished importing into the graph database!')
log.debug('Graph importing took: {} seconds'.format(time.time() - initial_begin))

@@ -768,6 +771,19 @@ def import_utterance_csv(corpus_context, call_back=None, stop_check=None):
corpus=corpus_context.cypher_safe_name,
word_type=corpus_context.word_name)
corpus_context.execute_cypher(statement)
word_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (n)-[:contained_by*]->()-[:contained_by]->(utt:utterance:{corpus}:speech {{id: csvLine.id}})
WITH utt, collect(n) AS subunits
UNWIND subunits AS w
CREATE (w)-[:contained_by]->(utt)
}} IN TRANSACTIONS OF 1000 ROWS
'''
statement = word_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_type=corpus_context.word_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Hierarchical relationship creation took {} seconds.'.format(time.time() - begin))
os.remove(path)
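The CALL { ... } IN TRANSACTIONS wrapper used by these import statements commits every batch of CSV rows separately, so a large LOAD CSV never accumulates into one enormous transaction. A stripped-down sketch of the pattern with hypothetical labels and columns, assuming the official neo4j Python driver:

from neo4j import GraphDatabase

statement = '''
CALL {
  LOAD CSV WITH HEADERS FROM "file:///units.csv" AS csvLine
  MATCH (n:unit {id: csvLine.id}), (u:utterance {id: csvLine.utterance_id})
  CREATE (n)-[:contained_by]->(u)
} IN TRANSACTIONS OF 1000 ROWS
'''

driver = GraphDatabase.driver('bolt://localhost:7687', auth=None)
with driver.session() as session:
    # CALL ... IN TRANSACTIONS manages its own commits, so it must run in an
    # auto-commit context like session.run, not inside an explicit transaction.
    session.run(statement).consume()
driver.close()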
@@ -895,6 +911,21 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
rel_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (s:syllable:{corpus}:speech {{id: csvLine.id}})-[:contained_by]->(w:{word_name}:{corpus}:speech),
(w)-[:contained_by]->(n)
WITH s, collect(n) as superunits
UNWIND superunits AS u
CREATE (s)-[:contained_by]->(u)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = rel_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Hierarchical relationship creation took {} seconds.'.format(time.time() - begin))

Expand Down Expand Up @@ -937,21 +968,6 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
if corpus_context.config.debug:
print('Precedence relationship creation took {} seconds.'.format(time.time() - begin))

begin = time.time()
del_rel_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (n:{phone_name}:{corpus}:speech:nucleus {{id: csvLine.vowel_id}})-[r:contained_by]->(w:{word_name}:{corpus}:speech)
DELETE r
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = del_rel_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Phone-word relationship deletion took {} seconds.'.format(time.time() - begin))

begin = time.time()
onset_statement = '''
@@ -964,12 +980,9 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
onspath = (onset)-[:precedes*1..10]->(n)
WITH n, w, s, csvLine, onspath
UNWIND (CASE WHEN onspath IS NOT NULL THEN nodes(onspath)[0..-1] ELSE [NULL] END) AS o
OPTIONAL MATCH (o)-[r:contained_by]->(w)
WITH n, w, s, csvLine, [x IN collect(o) WHERE x IS NOT NULL | x] AS ons,
[x IN collect(r) WHERE x IS NOT NULL | x] AS rels
WITH n, s, csvLine, [x IN collect(o) WHERE x IS NOT NULL | x] AS ons
FOREACH (o IN ons | SET o :onset, o.syllable_position = 'onset')
FOREACH (o IN ons | CREATE (o)-[:contained_by]->(s))
FOREACH (r IN rels | DELETE r)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = onset_statement.format(path=csv_path,
@@ -991,12 +1004,9 @@ def import_syllable_csv(corpus_context, call_back=None, stop_check=None):
codapath = (n)-[:precedes*1..10]->(coda)
WITH n, w, s, codapath
UNWIND (CASE WHEN codapath IS NOT NULL THEN nodes(codapath)[1..] ELSE [NULL] END) AS c
OPTIONAL MATCH (c)-[r:contained_by]->(w)
WITH n, w, s, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod,
[x IN collect(r) WHERE x IS NOT NULL | x] AS rels
WITH n, s, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod
FOREACH (c IN cod | SET c :coda, c.syllable_position = 'coda')
FOREACH (c IN cod | CREATE (c)-[:contained_by]->(s))
FOREACH (r IN rels | DELETE r)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = coda_statement.format(path=csv_path,
@@ -1118,6 +1128,21 @@ def import_nonsyl_csv(corpus_context, call_back=None, stop_check=None):
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
rel_statement = '''
CALL {{
LOAD CSV WITH HEADERS FROM "{path}" AS csvLine
MATCH (s:syllable:{corpus}:speech {{id: csvLine.id}})-[:contained_by]->(w:{word_name}:{corpus}:speech),
(w)-[:contained_by]->(n)
WITH s, collect(n) as superunits
UNWIND superunits AS u
CREATE (s)-[:contained_by]->(u)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = rel_statement.format(path=csv_path,
corpus=corpus_context.cypher_safe_name,
word_name=corpus_context.word_name,
phone_name=corpus_context.phone_name)
corpus_context.execute_cypher(statement)
if corpus_context.config.debug:
print('Hierarchical and spoken relationship creation took {} seconds.'.format(time.time() - begin))

Expand Down Expand Up @@ -1169,13 +1194,10 @@ def import_nonsyl_csv(corpus_context, call_back=None, stop_check=None):
p = (o)-[:precedes*..10]->(c)
WITH o, w, s, p, csvLine
UNWIND (CASE WHEN p IS NOT NULL THEN nodes(p) ELSE [o] END) AS c
OPTIONAL MATCH (c)-[r:contained_by]->(w)
WITH w, s, toInteger(csvLine.break) AS break, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod,
[x IN collect(r) WHERE x IS NOT NULL | x] AS rels
WITH s, toInteger(csvLine.break) AS break, [x IN collect(c) WHERE x IS NOT NULL | x] AS cod
FOREACH (c IN cod[break..] | SET c :coda, c.syllable_position = 'coda')
FOREACH (c IN cod[..break] | SET c :onset, c.syllable_position = 'onset')
FOREACH (c IN cod | CREATE (c)-[:contained_by]->(s))
FOREACH (r IN rels | DELETE r)
}} IN TRANSACTIONS OF 2000 ROWS
'''
statement = phone_statement.format(path=csv_path,
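Taken together, these changes denormalize the containment hierarchy: instead of deleting a phone's contained_by edge to its word when it joins a syllable (the removed DELETE r blocks), the import now adds direct edges from every unit to each of its transitive containers. Any container is then a single hop away; a sketch of the resulting shape, with simplified labels and an illustrative $phone_id parameter:

containers_query = '''
MATCH (p:phone {id: $phone_id})-[:contained_by]->(container)
RETURN labels(container) AS container_labels
'''
# With the direct edges in place this returns one row per level
# (syllable, word, utterance) without a variable-length
# [:contained_by*] expansion.

The trade-off is more relationships on disk in exchange for the cheaper fixed-length matches used in syllabic.py above.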
2 changes: 1 addition & 1 deletion polyglotdb/pgdb.py
@@ -215,7 +215,7 @@ def stop():
pid = int(f.read().strip())
try:
os.kill(pid, signal.SIGINT)
except ProcessLookupError:
except (ProcessLookupError, OSError):
pass
os.remove(pid_file)
except FileNotFoundError:
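ProcessLookupError is already a subclass of OSError, so the widened clause mainly documents intent while also covering other kill failures, such as PermissionError when the recorded pid belongs to another user's process. A minimal sketch of the shutdown pattern, with a hypothetical stop_pid helper name:

import os
import signal

def stop_pid(pid_file):
    # Send SIGINT to the recorded process, tolerating stale or dead pids.
    try:
        with open(pid_file) as f:
            pid = int(f.read().strip())
        try:
            os.kill(pid, signal.SIGINT)
        except (ProcessLookupError, OSError):
            pass  # process already exited, or we may not signal it
        os.remove(pid_file)
    except FileNotFoundError:
        pass  # no pid file: nothing was running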
9 changes: 1 addition & 8 deletions polyglotdb/query/annotations/attributes/base.py
@@ -341,15 +341,8 @@ def __getattr__(self, key):
return AcousticAttribute(self, key)
elif self.hierarchy is not None and key in self.hierarchy.get_higher_types(self.node_type):
from .hierarchical import HierarchicalAnnotation
types = self.hierarchy.get_higher_types(self.node_type)
prev_node = self
cur_node = None
for t in types:
higher_node = AnnotationNode(t, corpus=self.corpus, hierarchy=self.hierarchy)
cur_node = HierarchicalAnnotation(prev_node, higher_node)
prev_node = cur_node
if t == key:
break
cur_node = HierarchicalAnnotation(prev_node, AnnotationNode(key, corpus=self.corpus, hierarchy=self.hierarchy))
return cur_node
elif self.hierarchy is not None and key in self.hierarchy.get_lower_types(self.node_type):
from .path import SubPathAnnotation
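With every annotation now one contained_by hop from each of its containers (per the import changes above), attribute access no longer needs to chain a HierarchicalAnnotation through each intermediate type; a single wrapper for the requested type suffices. A toy model of the new dispatch, with hypothetical stand-in classes:

class ToyNode:
    def __init__(self, node_type, higher_types):
        self.node_type = node_type
        self.higher_types = higher_types

    def __getattr__(self, key):
        # Called only when normal lookup fails; pair this node directly
        # with the requested higher type instead of walking every level.
        if key in self.higher_types:
            return (self.node_type, 'contained_by', key)
        raise AttributeError(key)

phone = ToyNode('phone', ['syllable', 'word', 'utterance'])
print(phone.utterance)  # ('phone', 'contained_by', 'utterance')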