adding multilingual support #2

Open · wants to merge 14 commits into base: main
Binary file added .DS_Store
Binary file not shown.
29 changes: 20 additions & 9 deletions configs/config.yml
@@ -8,6 +8,8 @@ speech_config:
normalize_signal: True
normalize_feature: True
normalize_per_feature: False
+ use_fma: True
+ use_neon: False

model_config:
name: acrnn
@@ -16,20 +18,26 @@ model_config:
kernel_size: [[11,5],[11,5],[11,5]]
rnn_cell: 256
seq_mask: True
+ num_languages: 100

dataset_config:
vocabulary: vocab/vocab.txt
data_path: ./data/wavs/
- corpus_name: ./data/demo_txt/demo
+ corpus_name: ./data/multilingual/
+ fleurs_path: ./data/fleurs/
file_nums: 1
max_audio_length: 2000
shuffle_size: 1200
data_length: None
suffix: .txt
- load_type: txt
+ load_type: multilingual
train: train
- dev: dev
+ dev: validation
test: test
+ languages_file: configs/languages.json
+ max_samples_per_language: 10000
+ audio_format: wav
+ metadata_format: json

optimizer_config:
init_steps: 0
@@ -38,12 +46,15 @@ optimizer_config:
beta1: 0.9
beta2: 0.999
epsilon: 1e-9
+ use_mixed_precision: True

running_config:
- prefetch: False
- load_weights: ./saved_weights/20230228-084356/last/model
+ prefetch: True
+ load_weights: ./saved_weights/multilingual/last/model
num_epochs: 100
- batch_size: 1
- train_steps: 50
- dev_steps: 10
- test_steps: 10
+ batch_size: 32
+ train_steps: 1000
+ dev_steps: 100
+ test_steps: 100
+ save_interval: 5
+ eval_interval: 1
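
To illustrate how the new dataset_config keys (fleurs_path, max_samples_per_language, metadata_format) line up with the per-language metadata.json files written by download_fleurs.py later in this diff, here is a minimal loader sketch; iter_fleurs_examples and its signature are hypothetical, not code from this branch.

import json
import os

def iter_fleurs_examples(fleurs_path, lang, split, max_samples_per_language=None):
    """Yield (audio_path, transcription, language) tuples from the
    metadata.json files that download_fleurs.py writes per language/split."""
    split_dir = os.path.join(fleurs_path, lang, split)
    with open(os.path.join(split_dir, "metadata.json"), encoding="utf-8") as f:
        metadata = json.load(f)
    for idx, item in enumerate(metadata["data"]):
        # Respect the per-language cap from dataset_config, if one is set
        if max_samples_per_language is not None and idx >= max_samples_per_language:
            break
        audio_path = os.path.join(split_dir, "audio", f"{item['id']}.wav")
        yield audio_path, item["transcription"], metadata["lang"]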
18 changes: 18 additions & 0 deletions configs/languages.json
@@ -0,0 +1,18 @@
{
"supported_languages": [
"be_by",
"bg_bg",
"bs_ba",
"ca_cs",
"cs_cz",
"cy_gb"
],
"language_names": {
"be_by": "Belarusian",
"bg_bg": "Bulgarian",
"bs_ba": "Bosnian",
"ca_cs": "Catalan",
"cs_cz": "Czech",
"cy_gb": "Welsh"
}
}
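
For reference, a small helper like the one below could turn this file into the integer label ids the classifier head expects (cf. num_languages in config.yml); load_language_map is a hypothetical name, not part of this PR.

import json

def load_language_map(path="configs/languages.json"):
    """Map language codes to label ids, and ids back to display names."""
    with open(path, encoding="utf-8") as f:
        langs = json.load(f)
    code_to_id = {code: idx for idx, code in enumerate(langs["supported_languages"])}
    id_to_name = {idx: langs["language_names"][code] for code, idx in code_to_id.items()}
    return code_to_id, id_to_name

# e.g. code_to_id["cs_cz"] == 4 and id_to_name[4] == "Czech"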
35 changes: 24 additions & 11 deletions convert_to_pb.py
@@ -20,25 +20,38 @@

vocab = Vocab(vocabulary)

- # build model
- model=Model(**config.model_config,vocab_size=len(vocab.token_list))
+ # Build model
+ model = Model(**config.model_config, vocab_size=len(vocab.token_list))
model.init_build([None, config.speech_config['num_feature_bins']])
model.load_weights(weights_dir + "last/model")
model.add_featurizers(speech_featurizer)


version = 2
- #****convert to pb******
- tf.saved_model.save(model, "saved_models/lang14/pb/" + str(version))
- print('convert to pb model successful')
-
- #****convert to serving******
+ # Convert to SavedModel format with signatures
+ @tf.function(input_signature=[tf.TensorSpec(shape=[None], dtype=tf.float32)])
+ def predict_fn(signal):
+     output, prob = model.predict_pb(signal)
+     return {"output_0": output, "output_1": prob}
+
+ # Save model with proper signatures
tf.saved_model.save(
    model,
-     "./saved_models/lang14/serving/"+str(version),
+     f"saved_models/lang14/pb/{version}",
    signatures={
-         'predict_pb': model.predict_pb
+         "serving_default": predict_fn,
+         "predict_pb": model.predict_pb
    }
)
+ print('Model converted to SavedModel format successfully')

- print('convert to serving model successful')
+ # Save model for TensorFlow Serving
+ tf.saved_model.save(
+     model,
+     f"saved_models/lang14/serving/{version}",
+     signatures={
+         "serving_default": predict_fn,
+         "predict_pb": model.predict_pb
+     }
+ )
+ print('Model converted for TensorFlow Serving successfully')
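
Once exported, the serving model can be smoke-tested by loading it back and calling the serving_default signature defined above. The snippet below is an assumed usage example, not part of the PR; the 16 kHz dummy waveform is a placeholder and should match whatever sample rate the featurizer expects.

import numpy as np
import tensorflow as tf

loaded = tf.saved_model.load("saved_models/lang14/serving/2")
infer = loaded.signatures["serving_default"]

# One second of silence as a stand-in waveform (assumes 16 kHz input)
signal = tf.constant(np.zeros(16000, dtype=np.float32))
result = infer(signal=signal)
print(result["output_0"], result["output_1"])  # keys set by predict_fn's return dict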
184 changes: 184 additions & 0 deletions download_fleurs.py
@@ -0,0 +1,184 @@
import os
import json
import argparse
import shutil
import time
from tqdm import tqdm
from datasets import load_dataset, get_dataset_config_names
import soundfile as sf
import numpy as np
from pathlib import Path

# All FLEURS languages
ALL_LANGUAGES = [
'af', 'am', 'ar', 'as', 'az', 'be', 'bg', 'bn', 'br', 'bs', 'ca', 'cs', 'cy', 'da',
'de', 'el', 'en', 'es', 'et', 'eu', 'fa', 'fi', 'fr', 'ga', 'gl', 'gu', 'ha', 'he',
'hi', 'hr', 'hu', 'hy', 'id', 'ig', 'is', 'it', 'ja', 'jv', 'ka', 'kk', 'km', 'kn',
'ko', 'ky', 'lb', 'lg', 'ln', 'lo', 'lt', 'lv', 'mg', 'mk', 'ml', 'mn', 'mr', 'ms',
'my', 'ne', 'nl', 'no', 'ny', 'or', 'pa', 'pl', 'ps', 'pt', 'ro', 'ru', 'rw', 'sd',
'si', 'sk', 'sl', 'sn', 'so', 'sq', 'sr', 'su', 'sv', 'sw', 'ta', 'te', 'tg', 'th',
'tk', 'tr', 'uk', 'ur', 'uz', 'vi', 'wo', 'xh', 'yi', 'yo', 'zh', 'zu'
]

def ensure_dir(path):
"""Create directory if it doesn't exist"""
Path(path).mkdir(parents=True, exist_ok=True)

def save_audio(audio_data, sample_rate, output_path):
"""Save audio data to WAV file"""
sf.write(output_path, audio_data, sample_rate)

def download_language(lang, output_dir, splits=None, retry_count=3, retry_delay=5):
"""Download and organize dataset for a specific language with retries"""
if splits is None:
splits = ['train', 'validation', 'test']

lang_dir = os.path.join(output_dir, lang)
print(f"\nProcessing language: {lang}")

for split in splits:
print(f"\nDownloading {split} split...")
split_dir = os.path.join(lang_dir, split)
audio_dir = os.path.join(split_dir, 'audio')

# Skip if already downloaded
metadata_path = os.path.join(split_dir, 'metadata.json')
if os.path.exists(metadata_path):
print(f"Skipping {lang} {split} - already downloaded")
continue

ensure_dir(audio_dir)

# Load dataset with retries
dataset = None
for attempt in range(retry_count):
try:
dataset = load_dataset("google/fleurs", lang, split=split)
break
except Exception as e:
if attempt < retry_count - 1:
print(f"Attempt {attempt + 1} failed for {lang} {split}: {str(e)}")
print(f"Retrying in {retry_delay} seconds...")
time.sleep(retry_delay)
else:
print(f"Error downloading {lang} {split} after {retry_count} attempts: {str(e)}")
return False

if dataset is None:
continue

# Prepare metadata
metadata = {
'data': [],
'lang': lang,
'split': split
}

# Process each example
for idx, item in enumerate(tqdm(dataset, desc=f"Processing {split}")):
try:
# Extract audio
audio_data = item['audio']['array']
sample_rate = item['audio']['sampling_rate']

# Generate ID
item_id = f"{lang}_{split}_{idx:06d}"

# Save audio file
audio_path = os.path.join(audio_dir, f"{item_id}.wav")
save_audio(audio_data, sample_rate, audio_path)

# Add to metadata
metadata['data'].append({
'id': item_id,
'transcription': item.get('transcription', ''),
'raw_transcription': item.get('raw_transcription', ''),
'language': item.get('language', lang),
'gender': item.get('gender', ''),
'lang_id': item.get('lang_id', -1)
})

except Exception as e:
print(f"Error processing item {idx} in {lang} {split}: {str(e)}")
continue

# Save metadata
with open(metadata_path, 'w', encoding='utf-8') as f:
json.dump(metadata, f, ensure_ascii=False, indent=2)

print(f"Saved {len(metadata['data'])} examples for {lang} {split}")

return True

def download_languages_in_batches(languages, output_dir, batch_size=5, splits=None):
"""Download languages in batches to manage memory usage"""
total_languages = len(languages)
successful = []
failed = []

for i in range(0, total_languages, batch_size):
batch = languages[i:i + batch_size]
print(f"\nProcessing batch {i//batch_size + 1} of {(total_languages + batch_size - 1)//batch_size}")
print(f"Languages in this batch: {', '.join(batch)}")

for lang in batch:
try:
if download_language(lang, output_dir, splits):
successful.append(lang)
else:
failed.append(lang)
except Exception as e:
print(f"Failed to download {lang}: {str(e)}")
failed.append(lang)

# Clear some memory
if i + batch_size < total_languages:
print("\nClearing memory before next batch...")
time.sleep(5) # Give some time for memory cleanup

return successful, failed

def main():
parser = argparse.ArgumentParser(description='Download and organize FLEURS dataset')
parser.add_argument('--output_dir', type=str, default='./data/fleurs',
help='Output directory for the dataset')
parser.add_argument('--languages', type=str, nargs='+',
help='List of language codes to download (default: all languages)')
parser.add_argument('--splits', type=str, nargs='+',
default=['train', 'validation', 'test'],
help='Dataset splits to download')
parser.add_argument('--batch_size', type=int, default=5,
help='Number of languages to download per batch')
args = parser.parse_args()

# Use all languages if none specified
languages = args.languages if args.languages else ALL_LANGUAGES

# Create output directory
ensure_dir(args.output_dir)

# Download languages in batches
print(f"Starting download of {len(languages)} languages in batches of {args.batch_size}")
successful, failed = download_languages_in_batches(
languages, args.output_dir, args.batch_size, args.splits
)

# Print summary
print("\n=== Download Summary ===")
print(f"Successfully downloaded: {len(successful)} languages")
print(f"Failed to download: {len(failed)} languages")

if failed:
print("\nFailed languages:")
print(", ".join(failed))

# Save failed languages to file for retry
failed_file = os.path.join(args.output_dir, "failed_languages.txt")
with open(failed_file, 'w') as f:
f.write("\n".join(failed))
print(f"\nFailed languages list saved to: {failed_file}")
print("You can retry failed languages using:")
print(f"python download_fleurs.py --languages {' '.join(failed)}")

if __name__ == '__main__':
main()
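
If some languages fail, the script already prints a retry command. Alternatively, a small wrapper (hypothetical, not included in this PR) can feed failed_languages.txt back into download_languages_in_batches:

import os
from download_fleurs import download_languages_in_batches

def retry_failed(output_dir="./data/fleurs", batch_size=5):
    """Re-run only the languages recorded in failed_languages.txt."""
    failed_file = os.path.join(output_dir, "failed_languages.txt")
    with open(failed_file) as f:
        failed = [line.strip() for line in f if line.strip()]
    if not failed:
        return [], []
    return download_languages_in_batches(failed, output_dir, batch_size)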