MontrealCorpusTools · mmcauliffe · Jan 2, 2026 · Jan 2, 2026
diff --git a/.github/workflows/pytest.yml b/.github/workflows/pytest.yml
@@ -4,7 +4,7 @@ on:
   push:
   pull_request:
   workflow_dispatch:
-  
+
 jobs:
   test:
     runs-on: ubuntu-latest
@@ -26,20 +26,20 @@ jobs:
     - name: Set up JDK 21
       uses: actions/setup-java@v3
       with:
-        distribution: 'temurin' 
-        java-version: '21'   
+        distribution: 'temurin'
+        java-version: '21'
 
     - name: Download and set up Praat
       run: |
         wget https://github.com/praat/praat/releases/download/v6.4.21/praat6421_linux-intel64-barren.tar.gz -O praat.tar.gz
         tar -xvzf praat.tar.gz
         echo "praat=$(pwd)/praat_barren" >> $GITHUB_ENV
-        
+
     - name: Install system dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y libsndfile1 
-        
+        sudo apt-get install -y libsndfile1
+
     - name: Install required packages and run pytest
       run: |
         python -m venv venv

diff --git a/.github/workflows/stress-test.yml b/.github/workflows/stress-test.yml
@@ -35,19 +35,19 @@ jobs:
       uses: actions/setup-python@v4
       with:
         python-version: '3.12'
-        
+
     - name: Set up JDK 21
       uses: actions/setup-java@v3
       with:
         distribution: 'temurin'  # This specifies the JDK distribution. Temurin is a popular choice.
-        java-version: '21'   
+        java-version: '21'
 
     - name: Install system dependencies
       run: |
         sudo apt-get update
-        sudo apt-get install -y libsndfile1 
+        sudo apt-get install -y libsndfile1
+
 
-
     - name: Install required packages
       run: |
         python -m venv venv
@@ -57,7 +57,7 @@ jobs:
         pip install pyyaml setuptools pandas
         pgdb install $PGDB_HOME
         pgdb start
-        
+
     - name: Download and set up Praat
       run: |
         wget https://github.com/praat/praat/releases/download/v6.4.21/praat6421_linux-intel64-barren.tar.gz -O praat.tar.gz

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -0,0 +1,48 @@
+repos:
+  - repo: local  # project-local hook, no external repository is cloned
+    hooks:
+      - id: profile-check
+        name: no profiling
+        entry: '@profile'  # pygrep pattern: the hook fails on any Python file that contains it
+        language: pygrep
+        types: [python]
+  - repo: https://github.com/psf/black
+    rev: 23.9.1
+    hooks:
+      - id: black
+  - repo: https://github.com/pycqa/flake8
+    rev: 7.0.0
+    hooks:
+      - id: flake8
+        entry: pflake8  # wrapper provided by pyproject-flake8 (installed below)
+        additional_dependencies:
+          - pyproject-flake8
+  - repo: https://github.com/pre-commit/mirrors-isort  # NOTE(review): mirror appears superseded by pycqa/isort's own hooks; confirm
+    rev: v5.10.1
+    hooks:
+      - id: isort
+        additional_dependencies: [toml]
+  - repo: https://github.com/asottile/setup-cfg-fmt
+    rev: v2.2.0
+    hooks:
+      - id: setup-cfg-fmt
+        args:
+          - --min-py3-version
+          - "3.8"  # quoted so YAML keeps it a string rather than a float
+  - repo: https://github.com/pre-commit/pre-commit-hooks
+    rev: v4.0.1
+    hooks:
+      - id: check-ast
+      - id: check-builtin-literals
+      - id: check-docstring-first
+      - id: check-merge-conflict
+      - id: check-yaml
+      - id: check-toml
+      - id: debug-statements
+      - id: trailing-whitespace
+        exclude: '\.txt$'  # dot escaped so only a literal ".txt" suffix is skipped
+      - id: end-of-file-fixer
+        exclude: '\.txt$'  # same pattern as trailing-whitespace above
+      - id: check-added-large-files
+        args: ['--maxkb=2000']
+      - id: mixed-line-ending
diff --git a/LICENSE b/LICENSE
@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 SOFTWARE.
-
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -1,2 +1,2 @@
 include README.md
-include LICENSE
+include LICENSE
diff --git a/docker/v1.3.0/Dockerfile b/docker/v1.3.0/Dockerfile
@@ -15,7 +15,7 @@ RUN apt update && apt install -y \
     libsndfile1 \
     git \
     cmake \
-    nano \ 
+    nano \
     vim
 
 # Install praat

diff --git a/docker/v1.3.0/config.ini b/docker/v1.3.0/config.ini
@@ -1,2 +1,2 @@
 [Data]
-directory: /temp
+directory: /temp
diff --git a/docker/v1.3.3/Dockerfile b/docker/v1.3.3/Dockerfile
@@ -13,7 +13,7 @@ RUN apt update && apt install -y \
     tar \
     git \
     cmake \
-    nano \ 
+    nano \
     vim
 
 # Install praat
@@ -40,7 +40,7 @@ ENV reaper=/pgdb/tools/reaper
 
 COPY ./config.ini /pgdb/config.ini
 
-# Create a new conda environment named polyglotdb with Python 3.12  
+# Create a new conda environment named polyglotdb with Python 3.12
 RUN conda create -n polyglotdb -c conda-forge polyglotdb python=3.12
 
 # Make sure bash is the default shell

diff --git a/docker/v1.3.3/config.ini b/docker/v1.3.3/config.ini
@@ -1,2 +1,2 @@
 [Data]
-directory: /temp
+directory: /temp
diff --git a/docs/source/ISCANrestart.rst b/docs/source/ISCANrestart.rst
@@ -25,7 +25,7 @@ This section describes instructions for restarting the Roquefort ISCAN server at
     - ``cd /data/mmcauliffe/dev/iscan-spade-server/``
     - ``celery -A iscan_server worker -l info``
 
-#. Press *ctrl+a then d* to close the screen without stopping it. Then, in the main window, run the following command: 
+#. Press *ctrl+a then d* to close the screen without stopping it. Then, in the main window, run the following command:
 
     - ``sudo service apache2 restart``
 

diff --git a/docs/source/acoustics_encoding.rst b/docs/source/acoustics_encoding.rst
@@ -118,9 +118,9 @@ Encoding formants
 
 There are several ways of encoding formants.  The first is encodes formant tracks similar to encoding pitch or intensity
 tracks (i.e., done over utterances).
-There is also support for encoding formants tracks just over specified vowel segments.  
+There is also support for encoding formants tracks just over specified vowel segments.
 Finally, point measures of formants can be encoded.
-Both formant tracks and points can be calculated using either just a simple one-pass algorithm 
+Both formant tracks and points can be calculated using either just a simple one-pass algorithm
 or by using a multiple-pass refinement algorithm.
 
 Basic formant tracks
@@ -158,7 +158,7 @@ These formant tracks do not do any specialised analysis to ensure that they are
 Basic formant point measurements
 --------------------------------
 
-The :code:`analyze_formant_points` function will generate measure for F1, F2, F3, B1, B2, and B3 at the time 
+The :code:`analyze_formant_points` function will generate measures for F1, F2, F3, B1, B2, and B3 at the time
 point 33% of the way through the vowel for every vowel specified.
 
 .. code-block:: python
@@ -183,9 +183,9 @@ multiple values of :code:`n_formants` from 4 to 7.  To pick the best measurement
 means and standard deviations with the :code:`F1, F2, F3, B1, B2, B3` values
 generated by :code:`n_formants=5`.  Then, it performs multiple iterations that select the new best track as the one that
 minimizes the Mahalanobis distance to the relevant prototype.
-In order to choose whether you wish to save tracks or points in the database, just change the `output_tracks` parameter to `true` if you would 
+In order to choose whether you wish to save tracks or points in the database, just change the `output_tracks` parameter to `true` if you would
 like tracks, and `false` otherwise.
-When operating over tracks, the algorithm still only evaluates the best parameters by using the 33% point. 
+When operating over tracks, the algorithm still only evaluates the best parameters by using the 33% point.
 
 .. code-block:: python
 
@@ -197,7 +197,7 @@ Following encoding, phone types that were analyzed will have properties for :cod
 
 .. _script_encoding:
 
-Encoding Voice Onset Time(VOT) 
+Encoding Voice Onset Time(VOT)
 ==============================
 
 Currently there is only one method to encode Voice Onset Times(VOTs) into PolyglotDB.
@@ -223,12 +223,12 @@ VOTs are encoded over a specific subset of phones using :code:`analyze_vot` as f
 Parameters
 ----------
 The :code:`analyze_vot` function has a variety of parameters that are important for running the function properly.
-`classifier` is a string which has a paht to an AutoVOT classifier directory. 
+`classifier` is a string which has a path to an AutoVOT classifier directory.
 A default classifier is available in `/tests/data/classifier/sotc_classifiers`.
 
-`stop_label` refers to the name of the subset of phones that you intend to calculate VOTs for. 
+`stop_label` refers to the name of the subset of phones that you intend to calculate VOTs for.
 
-`vot_min` and `vot_max` refer to the minimum and maximum duration of any VOT that is calculated. 
+`vot_min` and `vot_max` refer to the minimum and maximum duration of any VOT that is calculated.
 The `AutoVOT repo <https://github.com/mlml/autovot>` has some sane defaults for English voiced and voiceless stops.
 
 `window_min` and `window_max` refer to the edges of a given phone's duration.
@@ -240,17 +240,17 @@ a `window_max` of 30 means that it will look up to 30 milliseconds after the end
 Encoding other measures using a Praat script
 ============================================
 
-You can encode additional acoustic measures by passing a Praat script to either 
-:code:`analyze_script` or :code:`analyze_track_script`. It is essential to follow the exact input and output format for 
+You can encode additional acoustic measures by passing a Praat script to either
+:code:`analyze_script` or :code:`analyze_track_script`. It is essential to follow the exact input and output format for
 your Praat script to ensure compatibility with the system.
 
-- :code:`analyze_script`: Designed for single-point measurements. This function works for user-specific 
-  measurements that occur at exactly one point in time for any target annotation type 
+- :code:`analyze_script`: Designed for single-point measurements. This function works for user-specific
+  measurements that occur at exactly one point in time for any target annotation type
   (or a defined subset of that type) in the hierarchy, such as a predefined set of vowels within all phones.
 
-- :code:`analyze_track_script`: Use this for continuous measurements or when measurements are required 
-  at multiple time points per annotation. This function allows you to configure your Praat script to 
-  output results for multiple time points. 
+- :code:`analyze_track_script`: Use this for continuous measurements or when measurements are required
+  at multiple time points per annotation. This function allows you to configure your Praat script to
+  output results for multiple time points.
 
 analyze_script
 --------------
@@ -259,34 +259,34 @@ There are two input formats available for designing your Praat script:
 
 Format 1:
 ~~~~~~~~~
-This is sufficient for most use cases and should be your default choice unless runtime efficiency is critical. 
-In this format, the system generates temporary sound files, each containing one instance of your chosen annotation type. 
+This is sufficient for most use cases and should be your default choice unless runtime efficiency is critical.
+In this format, the system generates temporary sound files, each containing one instance of your chosen annotation type.
 
 **Input Requirements:**
 
 - One required input: the full path to the sound file. This input will be automatically filled by the system. You can define additional attributes as needed.
 
 Example Praat script using Format 1 can be found `here <https://github.com/MontrealCorpusTools/PolyglotDB/tree/main/examples/praat_scripts/mean_pitch.praat>`_.
-This script computes the mean F0 (pitch) over a sound file. 
+This script computes the mean F0 (pitch) over a sound file.
 
 Format 2 (for optimized analysis):
 ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 This format is more efficient as it reuses the same discourse sound file for all annotations in the same discourse, avoiding the creation of extra files.
 
 **Input Requirements:**
 
-- Five required inputs: 
+- Five required inputs:
     - Full path to the **long** sound file
     - `begin` time
     - `end` time
     - `channel`
     - `padding`
 
-Do not assign values to these five fields; the system will populate them during processing. You may include additional 
+Do not assign values to these five fields; the system will populate them during processing. You may include additional
 attributes beyond these five, but ensure that values are passed as an array via the API.
 
-Example Praat script using Format 2 can be found `here <https://github.com/MontrealCorpusTools/PolyglotDB/tree/main/examples/praat_scripts/mean_pitch_optimized.praat>`_. 
-Similar to the previous example script, this script computes the mean F0 (pitch) over a sound file, but this time includes the extra four inputs. 
+Example Praat script using Format 2 can be found `here <https://github.com/MontrealCorpusTools/PolyglotDB/tree/main/examples/praat_scripts/mean_pitch_optimized.praat>`_.
+Similar to the previous example script, this script computes the mean F0 (pitch) over a sound file, but this time includes the extra four inputs.
 
 **Key Notes:**
 
@@ -330,10 +330,10 @@ To run :code:`analyze_script`, follow these steps:
 analyze_track_script
 --------------------
 
-This function shares the same input formats and functionality as :code:`analyze_script`. However, 
+This function shares the same input formats and functionality as :code:`analyze_script`. However,
 :code:`analyze_track_script` is specifically designed for continuous measurements.
-Before using this functionality, you must add utterance encoding. When calling the API, you will 
-need to specify an annotation type (e.g., phone, syllable, or word) to perform the analysis. 
+Before using this functionality, you must add utterance encoding. When calling the API, you will
+need to specify an annotation type (e.g., phone, syllable, or word) to perform the analysis.
 The script will then run separately for each instance of the selected annotation type in a multiprocessing manner.
 
 **Output Requirements:**
@@ -369,11 +369,11 @@ Sometimes you may want to use external software to generate measurement tracks.
     - Voice quality tracks for each vowel, computed using `VoiceSauce`_
     - Vowel formant tracks, e.g. using `FastTrack`_.
 
-If you have generated tracks using other software, you can import them into PolyglotDB using the functions :code:`save_track_from_csvs` and :code:`save_track_from_csv` as long as the files 
+If you have generated tracks using other software, you can import them into PolyglotDB using the functions :code:`save_track_from_csvs` and :code:`save_track_from_csv` as long as the files
 follow the expected structure.
 
-CSV Format:: 
-    
+CSV Format::
+
     time, measurement1, measurement2, measurement3, ...
 
 Additionally, the file name should match the name of the discourse for which the track should be saved.
@@ -382,44 +382,44 @@ Calling the function :code:`save_track_from_csv` with the file path will save th
 
 To load multiple CSV files at once, pass a directory path to :code:`save_track_from_csvs`.
 
-**Example** (FastTrack output): 
+**Example** (FastTrack output):
 
 .. image:: images/fasttrack_csvoutput.png
    :width: 600
 
-To load all the measures from the generated tracks: 
+To load all the measures from the generated tracks:
 
-.. code-block:: python 
+.. code-block:: python
 
     with CorpusContext(config) as c:
         # loading one file
         c.save_track_from_csv('formants', '/path/to/csv', ['f1','b1','f2','b2','f3','b3','f1p','f2p','f3p','f0','intensity','harmonicity'])
-        # loading multiple csv files 
+        # loading multiple csv files
         c.save_track_from_csvs('formants', '/path/to/directory', ['f1','b1','f2','b2','f3','b3','f1p','f2p','f3p','f0','intensity','harmonicity'])
 
 
 Encoding acoustic track statistics
 ==================================
 
 After encoding an acoustic track measurement—either through the built-in algorithms or custom Praat scripts—
-you can perform statistical aggregation on these data tracks. The supported statistical measures are: mean, median, 
-standard deviation (stddev), sum, mode, and count. 
+you can perform statistical aggregation on these data tracks. The supported statistical measures are: mean, median,
+standard deviation (stddev), sum, mode, and count.
 
-Aggregation can be performed on a specified annotation type, such as phones, words, or syllables 
+Aggregation can be performed on a specified annotation type, such as phones, words, or syllables
 (if syllable encoding is available). The aggregation is conducted for all annotations with the same label.
 
-Aggregation can be performed by speaker, in which case the results will be grouped by speaker, 
+Aggregation can be performed by speaker, in which case the results will be grouped by speaker,
 and each (annotation_label, speaker) pair will have its corresponding statistical measure computed.
 
 Once encoded, the computed statistics are stored and can be queried later.
 
 .. code-block:: python
 
-    with CorpusContext(config) as c:        
+    with CorpusContext(config) as c:
         # Encode a statistic for an acoustic measure
         c.encode_acoustic_statistic('voice_quality', 'mean', by_annotation='phone', by_speaker=True)
-        
+
         # Alternatively, call the get function directly; it will encode the statistic if not already available
         results = c.get_acoustic_statistic('voice_quality', 'mean', by_annotation='phone', by_speaker=True)
-        # This would compute, save, and return the mean values for all voice quality measurements on a by speaker and by phone basis. 
+        # This would compute, save, and return the mean values for all voice quality measurements on a by speaker and by phone basis.
         # for example ('speaker1', 'AO1'): [1.4283178345991416, 5.21375241700153, 28.8672225446156, 18.57861883658481]
diff --git a/docs/source/api_acoustics.rst b/docs/source/api_acoustics.rst
@@ -82,5 +82,3 @@ Conch function generators
 -------------------------
 
 .. autofunction:: polyglotdb.acoustics.other.generate_praat_script_function
-
-
diff --git a/docs/source/api_corpus.rst b/docs/source/api_corpus.rst
@@ -80,4 +80,3 @@ Corpus config class
 -------------------
 
 .. autoclass:: polyglotdb.config.CorpusConfig
-
Original file line number	Diff line number	Diff line change
Expand Up		@@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
		LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
		OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
		SOFTWARE.
-Original file line number
+Diff line change
@@ Expand Up / @@ -15,7 +15,7 @@ RUN apt update && apt install -y \ @@
         libsndfile1 \
         git \
         cmake \
-        nano \
+        nano \
         vim
     # Install praat
@@ Expand Down @@
Original file line number	Diff line number	Diff line change
Expand Up		@@ -82,5 +82,3 @@ Conch function generators
		-------------------------

		.. autofunction:: polyglotdb.acoustics.other.generate_praat_script_function
Original file line number	Diff line number	Diff line change
Expand Up		@@ -80,4 +80,3 @@ Corpus config class
		-------------------

		.. autoclass:: polyglotdb.config.CorpusConfig