-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Add scripts and docs for making use of eduskunta-v1 dataset
- Loading branch information
Showing
4 changed files
with
193 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
# Preparing Plenary Sessions of the Parliament of Finland dataset for use in speaker verification | ||
|
||
The [Plenary Sessions of the Parliament of | ||
Finland](http://urn.fi/urn:nbn:fi:lb-2017030901) dataset is a sizable corpus of | ||
transcribed Finnish audio. The transcriptions consist of per-word aligned
annotations in EAF-format files. To convert them to a more convenient form for
use in speaker verification tasks it is necessary to group the annotations and
split the utterances out of the larger WAV files.
|
||
## Caveats of the described process | ||
|
||
1. Samples have not been verified for overlapping speech or misaligned | ||
timestamps. | ||
|
||
2. Word grouping logic is quite rough and likely has a fair bit of room for
improvement.
|
||
3. Hashing speaker ids (tier ids) is currently quite frail, encoding and spaces | ||
affect results. | ||
|
||
4. Some speakers have multiple tier ids due to additional prefixes (ministerial | ||
portfolio). | ||
|
||
5. Due to 3. and 4. the speaker ids (tier ids) of the resulting files likely | ||
require manual tweaking. | ||
|
||
6. There might be some Finland Swedish mixed in with the audio.
|
||
# 1. Preparations | ||
|
||
- Install `scripts/eaf-word2sentence` dependencies: | ||
```shell | ||
$ python -m venv word2sentence | ||
$ source word2sentence/bin/activate | ||
$ pip install pympi-ling | ||
``` | ||
|
||
- Build `elan2split`, requires Boost.Filesystem and Xerces-C++: | ||
TODO: This could be replaced with a simple iteration step in `scripts/eaf-word2sentence.py`. | ||
```shell | ||
$ git clone https://github.com/vjoki/ELAN2split
$ cd ELAN2split/
$ mkdir build/
$ cd build/
$ cmake ../
$ make
``` | ||
|
||
# 2. Converting the eaf per word annotations to longer groups of words. | ||
|
||
1. Unpack the dataset. | ||
|
||
2. Iterate through the `.eaf` files: | ||
```shell | ||
$ source word2sentence/bin/activate | ||
$ for eaf in 2016-kevat/2016-*/*.eaf; do | ||
python scripts/eaf-word2sentence.py --file_path "$eaf" | ||
elan2split --name -o ./eduskunta/ "$eaf" | ||
done | ||
``` | ||
|
||
3. Organize the files into directories per speaker with `scripts/organize.sh`. | ||
|
||
4. Optionally take a subset of the dataset using `scripts/sample_dataset.py`.

5. Manually fix any issues with tier ids.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,67 @@ | ||
# Script for grouping per word annotations in an EAF file into sentences or longer groups of words.
# Intended for converting Plenary Sessions of the Parliament of Finland, Downloadable Version 1 EAF
# files into suitable form for use in speaker verification testing.
#
# Replaces the original EAF file (original is moved out of the way by appending ".bak" to the filename).
import os
import argparse
import pympi.Elan as e

# Utterance duration bounds, in milliseconds.
MIN_DURATION = 3000
MAX_DURATION = 25000

parser = argparse.ArgumentParser()
# required=True: fail with a usage message instead of crashing later inside
# pympi with file_path=None.
parser.add_argument('--file_path', type=str, required=True)
args = parser.parse_args()

eaf = e.Eaf(args.file_path)

# Dataset MEDIA_DESCRIPTORs only have MEDIA_URL defined to the original mp4 video,
# however we need the path to the wav files for the extraction step.
linked_path, ext = os.path.splitext(args.file_path)
for linked in eaf.get_linked_files():
    eaf.remove_linked_files(linked['MEDIA_URL'])
eaf.add_linked_file(linked_path + '.wav',
                    relpath=os.path.basename(linked_path) + '.wav',
                    mimetype='audio/wav')

# Try to group word annotations into sentences, only keep sentences < MAX_DURATION.
# A group is closed when a word ends with '.' (and the group exceeds MIN_DURATION),
# or when the group has grown past 80% of MAX_DURATION.
for tid, (anno, _, _, _) in eaf.tiers.items():
    utterances = []     # completed groups: (word aids, start ms, end ms, text)
    utterance = []      # annotation ids of the words in the current group
    utterance_str = []  # word strings of the current group
    # None marks "no group in progress". Using 0 as the sentinel (as before)
    # would misfire for a word whose start timeslot is legitimately 0 ms.
    utterance_start = None

    for aid, (start_ts, end_ts, val, _) in anno.items():
        utterance.append(aid)
        utterance_str.append(val)
        if utterance_start is None:
            utterance_start = eaf.timeslots[start_ts]

        duration = eaf.timeslots[end_ts] - utterance_start

        if (val.rstrip().endswith('.') and duration > MIN_DURATION) \
           or duration > MAX_DURATION*0.8:
            value = ' '.join(utterance_str)

            if duration < MAX_DURATION:
                utterances.append((utterance, utterance_start,
                                   eaf.timeslots[end_ts], value))
            else:
                print('skipped {}s [{}]: {}'.format(duration/1000,
                                                    tid, value))

            utterance = []
            utterance_start = None
            utterance_str = []
    # NOTE(review): words remaining after the last closed group are discarded —
    # presumably acceptable for this rough grouping; confirm if losing the tail
    # of each tier matters.

    # Replace the per word annotations of the tier with the grouped ones.
    eaf.remove_all_annotations_from_tier(tid)

    for (_aids, start, end, val) in utterances:
        eaf.add_annotation(tid, start, end, val)

print('Collected {} annotations for {} speakers.'
      .format(len(eaf.annotations), len(eaf.tiers)))
eaf.to_file(args.file_path)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#!/bin/bash
# Move files in the form DIR/TIER_* into subdirs, to form a DIR/TIER/TIER_* directory structure.
#
# Usage: organize.sh DIR
# NOTE(review): the tier id is taken as everything before the first '_' in the
# whole path, so DIR itself must not contain an underscore.
DIR="$1"   # was "$*", which would join multiple arguments into one bogus path

# SEEN gives O(1) duplicate detection instead of a linear scan over the array;
# TIERS preserves insertion order for the move loop below.
declare -A SEEN=()
TIERS=()

for f in "$DIR"/*; do
    # Skip entries that are already directories.
    [ -d "$f" ] && continue

    # Everything before the first underscore is the tier (speaker) id.
    TIER="${f%%_*}"
    if [ ! -d "./$TIER" ]; then
        mkdir -v "./$TIER"
    fi

    if [ -z "${SEEN[$TIER]}" ]; then
        SEEN[$TIER]=1
        TIERS+=("./$TIER")
    fi
done

for d in "${TIERS[@]}"; do
    echo "moving files to $d"
    # -n: never overwrite a file that already exists in the target directory.
    mv -vn "${d}_"* "$d"
done
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,30 @@ | ||
# Make a smaller subset of a bigger dataset.
#
# Made adhoc for Plenary Sessions of the Parliament of Finland, Downloadable Version 1 dataset.
import os
import random
import glob
import shutil

PICKS = 80                   # samples to keep per speaker
DEST_DIR = 'data/edus80/'    # subset is written here
DATA_DIR = 'data/eduskunta/' # one subdirectory per speaker

# Take PICKS samples from each speaker, skipping speakers that don't have enough samples.
for d in os.listdir(DATA_DIR):
    speaker_dir = os.path.join(DATA_DIR, d)
    # Each .wav file is accompanied by a .txt file, so a speaker needs at
    # least 2*PICKS directory entries to yield PICKS samples.
    if len(os.listdir(speaker_dir)) < 2 * PICKS:
        print('skipping ' + d)
        continue

    newd = os.path.join(DEST_DIR, os.path.basename(d))
    # No exist_ok: fail loudly rather than silently mixing two runs.
    os.makedirs(newd)

    # '*.wav' (was '*wav', which would also match unrelated names merely
    # ending in "wav").
    picks = random.sample(glob.glob(os.path.join(speaker_dir, '*.wav')),
                          k=PICKS)

    # Copy each picked .wav together with its transcription .txt.
    for f in picks:
        base, _ = os.path.splitext(f)
        shutil.copy(base + '.wav', newd)
        shutil.copy(base + '.txt', newd)