Add scripts and docs for making use of eduskunta-v1 dataset
vjoki committed Jan 26, 2021
1 parent 125a9c0 commit ae0bb5c
Showing 4 changed files with 193 additions and 0 deletions.
65 changes: 65 additions & 0 deletions EDUSKUNTA.md
@@ -0,0 +1,65 @@
# Preparing the Plenary Sessions of the Parliament of Finland dataset for speaker verification

The [Plenary Sessions of the Parliament of
Finland](http://urn.fi/urn:nbn:fi:lb-2017030901) dataset is a sizable corpus of
transcribed Finnish audio. The transcriptions consist of per-word aligned
annotations in EAF-format files. To convert them into a form more convenient
for speaker verification tasks, the annotations need to be grouped and the
utterances split out of the larger WAV files.
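
For orientation, here is a minimal sketch of what the per-word annotations look like when read with `pympi-ling` (the same package `scripts/eaf-word2sentence.py` uses); the file path is only a placeholder:

```python
import pympi.Elan as e

# Placeholder path; any .eaf file from the dataset works.
eaf = e.Eaf('2016-kevat/2016-001/example.eaf')

# Each tier corresponds to one speaker. Aligned annotations map an annotation
# id to (start timeslot, end timeslot, word, svg_ref), and eaf.timeslots
# resolves the timeslot ids to milliseconds.
for tier_id, (annotations, _, _, _) in eaf.tiers.items():
    for aid, (start_ts, end_ts, word, _) in annotations.items():
        start_ms = eaf.timeslots[start_ts]
        end_ms = eaf.timeslots[end_ts]
        print('{}: {}-{} ms "{}"'.format(tier_id, start_ms, end_ms, word))
```

Each annotation covers a single word, which is why the grouping step described below is needed before the utterances are usable as speaker verification samples.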

## Caveats of the described process

1. Samples have not been verified for overlapping speech or misaligned
timestamps.

2. The word grouping logic is quite rough and likely has a fair bit of room for
   improvement.

3. Hashing speaker ids (tier ids) is currently quite fragile: encoding and
   whitespace differences affect the results.

4. Some speakers have multiple tier ids due to additional prefixes (e.g. a
   ministerial portfolio).

5. Due to 3 and 4, the speaker ids (tier ids) of the resulting files likely
   require manual tweaking (a normalization sketch follows this list).

6. There might be some Finland Swedish mixed into the audio.
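
To mitigate caveats 3 and 4, the tier ids could be normalized before hashing. The sketch below is only a hypothetical starting point (the scripts do not currently do this), and it does not resolve the ministerial-portfolio prefixes of caveat 4, which still need a manual mapping:

```python
import hashlib
import unicodedata

def normalize_tier_id(tier_id: str) -> str:
    # Unify the Unicode form and collapse surrounding/duplicate whitespace,
    # both of which otherwise change the hash of an identical speaker name.
    tier_id = unicodedata.normalize('NFC', tier_id)
    return ' '.join(tier_id.split())

def speaker_id(tier_id: str) -> str:
    # Hash the normalized tier id; the MD5 digest here is only an example.
    return hashlib.md5(normalize_tier_id(tier_id).encode('utf-8')).hexdigest()
```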

# 1. Preparations

- Install the `scripts/eaf-word2sentence.py` dependencies into a virtualenv:
```shell
$ python -m venv word2sentence
$ source word2sentence/bin/activate
$ pip install pympi-ling
```

- Build `elan2split`, which requires Boost.Filesystem and Xerces-C++.
  TODO: This step could be replaced with a simple iteration step in `scripts/eaf-word2sentence.py` (see the sketch below for one possible shape of it).
```shell
$ git clone https://github.com/vjoki/ELAN2split
$ cd ELAN2split/
$ mkdir build/ && cd build/
$ cmake ../
$ make
```
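
The replacement iteration step hinted at in the TODO could look roughly like the sketch below, which slices one segment per grouped annotation out of the linked WAV file using only the standard library `wave` module. The function name and its arguments are hypothetical, and this is not how `elan2split` is actually implemented:

```python
import wave

def write_segment(src_wav, dst_wav, start_ms, end_ms):
    # Copy the [start_ms, end_ms) slice of src_wav into a new WAV file.
    with wave.open(src_wav, 'rb') as src:
        params = src.getparams()
        rate = src.getframerate()
        src.setpos(int(start_ms / 1000 * rate))
        frames = src.readframes(int((end_ms - start_ms) / 1000 * rate))
    with wave.open(dst_wav, 'wb') as dst:
        dst.setparams(params)
        dst.writeframes(frames)
```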

# 2. Converting the EAF per-word annotations into longer groups of words

1. Unpack the dataset.

2. Iterate through the `.eaf` files:
```shell
$ source word2sentence/bin/activate
$ for eaf in 2016-kevat/2016-*/*.eaf; do
python scripts/eaf-word2sentence.py --file_path "$eaf"
elan2split --name -o ./eduskunta/ "$eaf"
done
```

3. Organize the files into per-speaker directories with `scripts/organize.sh`, passing the output directory (e.g. `./eduskunta/`) as the argument.

4. Optionally take a subset of the dataset using `scripts/sample_dataset.py` (adjust the `PICKS`, `DATA_DIR` and `DEST_DIR` constants at the top of the script as needed).

5. Manually fix any remaining issues with the tier ids.
67 changes: 67 additions & 0 deletions scripts/eaf-word2sentence.py
@@ -0,0 +1,67 @@
# Script for grouping per word annotations in an EAF file into sentences or longer groups of words.
# Intended for converting Plenary Sessions of the Parliament of Finland, Downloadable Version 1 EAF
# files into a form suitable for use in speaker verification testing.
#
# Replaces the original EAF file (original is moved out of the way by appending ".bak" to the filename).
import os
import argparse
import pympi.Elan as e

MIN_DURATION = 3000
MAX_DURATION = 25000

parser = argparse.ArgumentParser()
parser.add_argument('--file_path', type=str)
args = parser.parse_args()

eaf = e.Eaf(args.file_path)

# The dataset's MEDIA_DESCRIPTORs only have MEDIA_URL pointing to the original mp4 video,
# but we need the path to the WAV files for the extraction step.
linked_path, ext = os.path.splitext(args.file_path)
for linked in eaf.get_linked_files():
eaf.remove_linked_files(linked['MEDIA_URL'])
eaf.add_linked_file(linked_path + '.wav',
relpath=os.path.basename(linked_path) + '.wav',
mimetype='audio/wav')

# Try to group word annotations into sentences, only keep sentences < MAX_DURATION.
for tid, (anno, _, _, _) in eaf.tiers.items():
utterances = []
utterance = []
utterance_str = []
utterance_start = 0

for aid, (start_ts, end_ts, val, _) in anno.items():
utterance.append(aid)
utterance_str.append(val)
if utterance_start == 0:
utterance_start = eaf.timeslots[start_ts]

duration = eaf.timeslots[end_ts] - utterance_start

if (val.rstrip().endswith('.') and duration > MIN_DURATION) \
or duration > MAX_DURATION*0.8:
value = ' '.join(utterance_str)

if duration < MAX_DURATION:
utterances.append((utterance, utterance_start,
eaf.timeslots[end_ts], value))
# print('added {}s [{}]: {}'.format(duration/1000,
# tid, value))
else:
print('skipped {}s [{}]: {}'.format(duration/1000,
tid, value))

utterance = []
utterance_start = 0
utterance_str = []

eaf.remove_all_annotations_from_tier(tid)

for (aid, start, end, val) in utterances:
eaf.add_annotation(tid, start, end, val)

print('Collected {} annotations for {} speakers.'
.format(len(eaf.annotations), len(eaf.tiers)))
eaf.to_file(args.file_path)
31 changes: 31 additions & 0 deletions scripts/organize.sh
@@ -0,0 +1,31 @@
#!/bin/bash
# Move files of the form DIR/TIER_* into per-tier subdirs, to form a DIR/TIER/TIER_* directory structure.
DIR="$1"
ARR=()
for f in "$DIR"/*; do
[ -d "$f" ] && continue

TIER="${f%%_*}"
if [ ! -d "./$TIER" ]; then
mkdir -v "./$TIER"
ARR+=("./$TIER")
continue
fi

MATCH=0
for d in "${ARR[@]}"; do
if [ "$d" == "./$TIER" ]; then
MATCH=1
break
fi
done

if [ "$MATCH" == 0 ]; then
ARR+=("./$TIER")
fi
done

for d in "${ARR[@]}"; do
echo "moving files to $d"
mv -vn "./${d}_"* "./${d}"
done
30 changes: 30 additions & 0 deletions scripts/sample_dataset.py
@@ -0,0 +1,30 @@
# Make a smaller subset of a bigger dataset.
#
# Made ad hoc for the Plenary Sessions of the Parliament of Finland, Downloadable Version 1 dataset.
import os
import random
import glob
import shutil

PICKS = 80
DEST_DIR = 'data/edus80/'
DATA_DIR = 'data/eduskunta/'

# Take PICKS samples from each speaker, skipping speakers that don't have enough samples.
for d in os.listdir(DATA_DIR):
# Each .wav file is accompanied by a .txt file.
if len(os.listdir(DATA_DIR + d)) < 2*PICKS:
print('skipping ' + d)
continue

newd = DEST_DIR + os.path.basename(d)
os.makedirs(newd)
#print('mkdir ' + newd)

picks = random.sample(glob.glob(DATA_DIR + d + '/*wav'), k=PICKS)

for f in picks:
base, _ = os.path.splitext(f)
#print('copy ' + DATA_DIR + d + base + ' to ' + newd)
shutil.copy(base + '.wav', newd)
shutil.copy(base + '.txt', newd)
