diff --git a/EDUSKUNTA.md b/EDUSKUNTA.md new file mode 100644 index 0000000..995c78a --- /dev/null +++ b/EDUSKUNTA.md @@ -0,0 +1,66 @@ +# Preparing Plenary Sessions of the Parliament of Finland dataset for use in speaker verification
+
+The [Plenary Sessions of the Parliament of
+Finland](http://urn.fi/urn:nbn:fi:lb-2017030901) dataset is a sizable corpus of
+transcribed Finnish audio. The transcriptions consist of per-word aligned
+annotations in EAF-format files. To convert them into a more convenient form
+for use in speaker verification tasks it's necessary to group the annotations
+and split the utterances from the larger WAV files.
+
+## Caveats of the described process
+
+1. Samples have not been verified for overlapping speech or misaligned
+   timestamps.
+
+2. Word grouping logic is quite rough and likely has a fair bit of room for
+   improvement.
+
+3. Hashing speaker ids (tier ids) is currently quite fragile: encoding and spaces
+   affect results.
+
+4. Some speakers have multiple tier ids due to additional prefixes (ministerial
+   portfolio).
+
+5. Due to 3. and 4. the speaker ids (tier ids) of the resulting files likely
+   require manual tweaking.
+
+6. There might be some Finland-Swedish mixed in with the audio.
+
+# 1. Preparations
+
+- Install `scripts/eaf-word2sentence` dependencies:
+```shell
+$ python -m venv word2sentence
+$ source word2sentence/bin/activate
+$ pip install pympi-ling
+```
+
+- Build `elan2split`, requires Boost.Filesystem and Xerces-C++:
+TODO: This could be replaced with a simple iteration step in `scripts/eaf-word2sentence.py`.
+```shell
+$ git clone https://github.com/vjoki/ELAN2split
+$ cd ELAN2split/
+$ mkdir build/
+$ cd build/
+$ cmake ../
+$ make
+```
+
+# 2. Converting the eaf per word annotations to longer groups of words.
+
+1. Unpack the dataset.
+
+2. 
Iterate through the `.eaf` files:
+```shell
+$ source word2sentence/bin/activate
+$ for eaf in 2016-kevat/2016-*/*.eaf; do
+    python scripts/eaf-word2sentence.py --file_path "$eaf"
+    elan2split --name -o ./eduskunta/ "$eaf"
+  done
+```
+
+3. Organize the files into directories per speaker with `scripts/organize.sh`.
+
+4. Optionally take a subset of the dataset using `scripts/sample_dataset.py`.
+
+5. Manually fix any issues with tier ids. diff --git a/scripts/eaf-word2sentence.py b/scripts/eaf-word2sentence.py new file mode 100644 index 0000000..6ef0e11 --- /dev/null +++ b/scripts/eaf-word2sentence.py @@ -0,0 +1,67 @@ +# Script for grouping per word annotations in an EAF file into sentences or longer groups of words.
+# Intended for converting Plenary Sessions of the Parliament of Finland, Downloadable Version 1 EAF
+# files into suitable form for use in speaker verification testing.
+#
+# Replaces the original EAF file (original is moved out of the way by appending ".bak" to the filename).
+import os
+import argparse
+import pympi.Elan as e
+
+MIN_DURATION = 3000
+MAX_DURATION = 25000
+
+parser = argparse.ArgumentParser()
+parser.add_argument('--file_path', type=str)
+args = parser.parse_args()
+
+eaf = e.Eaf(args.file_path)
+
+# Dataset MEDIA_DESCRIPTORs only have MEDIA_URL defined to the original mp4 video,
+# however we need path to the wav files for the extraction step.
+linked_path, ext = os.path.splitext(args.file_path)
+for linked in eaf.get_linked_files():
+    eaf.remove_linked_files(linked['MEDIA_URL'])
+eaf.add_linked_file(linked_path + '.wav',
+                    relpath=os.path.basename(linked_path) + '.wav',
+                    mimetype='audio/wav')
+
+# Try to group word annotations into sentences, only keep sentences < MAX_DURATION. 
+# For each tier (speaker): accumulate consecutive word annotations until a
+# sentence boundary (trailing '.') past MIN_DURATION, or until the group nears
+# MAX_DURATION, then emit the group as a single annotation.
+for tid, (anno, _, _, _) in eaf.tiers.items():
+    utterances = []
+    utterance = []
+    utterance_str = []
+    # None (not 0) marks "no group started yet": a genuine 0 ms timeslot is a
+    # valid start time, so 0 cannot be used as the sentinel.
+    utterance_start = None
+
+    for aid, (start_ts, end_ts, val, _) in anno.items():
+        utterance.append(aid)
+        utterance_str.append(val)
+        if utterance_start is None:
+            utterance_start = eaf.timeslots[start_ts]
+
+        duration = eaf.timeslots[end_ts] - utterance_start
+
+        if (val.rstrip().endswith('.') and duration > MIN_DURATION) \
+           or duration > MAX_DURATION*0.8:
+            value = ' '.join(utterance_str)
+
+            if duration < MAX_DURATION:
+                utterances.append((utterance, utterance_start,
+                                   eaf.timeslots[end_ts], value))
+            else:
+                print('skipped {}s [{}]: {}'.format(duration/1000,
+                                                    tid, value))
+
+            utterance = []
+            utterance_start = None
+            utterance_str = []
+
+    # NOTE: words accumulated after the last flush are silently discarded.
+    eaf.remove_all_annotations_from_tier(tid)
+
+    for (aid, start, end, val) in utterances:
+        eaf.add_annotation(tid, start, end, val)
+
+print('Collected {} annotations for {} speakers.'
+      .format(len(eaf.annotations), len(eaf.tiers)))
+eaf.to_file(args.file_path) diff --git a/scripts/organize.sh b/scripts/organize.sh new file mode 100644 index 0000000..961cb2b --- /dev/null +++ b/scripts/organize.sh @@ -0,0 +1,32 @@ +#!/bin/bash
+# Move files in the form DIR/TIER_* into subdirs, to form a DIR/TIER/TIER_* directory structure.
+# Usage: organize.sh DIR   (pass a single, quoted directory path)
+DIR="$1"
+ARR=()
+for f in "$DIR"/*; do
+    [ -d "$f" ] && continue
+
+    # Tier id is everything before the first underscore (keeps the DIR prefix,
+    # so the per-tier directories are created inside DIR).
+    TIER="${f%%_*}"
+
+    # Record each tier once; create its directory on first sight if missing.
+    MATCH=0
+    for d in "${ARR[@]}"; do
+        if [ "$d" == "./$TIER" ]; then
+            MATCH=1
+            break
+        fi
+    done
+
+    if [ "$MATCH" == 0 ]; then
+        [ -d "./$TIER" ] || mkdir -v "./$TIER"
+        ARR+=("./$TIER")
+    fi
+done
+
+for d in "${ARR[@]}"; do
+    echo "moving files to $d"
+    mv -vn "./${d}_"* "./${d}"
+done diff --git a/scripts/sample_dataset.py b/scripts/sample_dataset.py new file mode 100644 index 0000000..3ed97bf --- /dev/null +++ b/scripts/sample_dataset.py @@ -0,0 +1,30 @@ +# Make a smaller subset of a bigger dataset. 
+# +# Made adhoc for Plenary Sessions of the Parliament of Finland, Downloadable Version 1 dataset. +import os +import random +import glob +import shutil + +PICKS = 80 +DEST_DIR = 'data/edus80/' +DATA_DIR = 'data/eduskunta/' + +# Take PICKS samples from each speaker, skipping speakers that don't have enough samples. +for d in os.listdir(DATA_DIR): + # Each .wav file is accompanied by a .txt file. + if len(os.listdir(DATA_DIR + d)) < 2*PICKS: + print('skipping ' + d) + continue + + newd = DEST_DIR + os.path.basename(d) + os.makedirs(newd) + #print('mkdir ' + newd) + + picks = random.sample(glob.glob(DATA_DIR + d + '/*wav'), k=PICKS) + + for f in picks: + base, _ = os.path.splitext(f) + #print('copy ' + DATA_DIR + d + base + ' to ' + newd) + shutil.copy(base + '.wav', newd) + shutil.copy(base + '.txt', newd)