122 changes: 74 additions & 48 deletions benchmarks/benchmark.py
@@ -120,18 +120,22 @@ def normalize_text_cpp_style(text: str) -> str:


def calculate_wer(hypothesis: str, reference: str) -> Dict[str, Any]:
"""Calculate WER metrics using jiwer with Whisper normalization"""
"""Calculate WER and CER metrics using jiwer with Whisper normalization"""
hyp_norm = normalize_text(hypothesis)
ref_norm = normalize_text(reference)

# Calculate WER
wer_score = jiwer.wer(ref_norm, hyp_norm)

# Get detailed measures
# Get detailed word measures
output = jiwer.process_words(ref_norm, hyp_norm)

# Calculate CER (Character Error Rate)
cer_score = jiwer.cer(ref_norm, hyp_norm)

return {
"wer": wer_score * 100, # as percentage
"cer": cer_score * 100, # as percentage
"substitutions": output.substitutions,
"deletions": output.deletions,
"insertions": output.insertions,
@@ -204,6 +208,7 @@ def main():
ap.add_argument("--lib", default=None, help="Path to eddy_c shared library (auto-detected if not specified)")
ap.add_argument("--device", default="CPU", help="Device: CPU/GPU/NPU/AUTO (default: CPU)")
ap.add_argument("--model-dir", default=None, help="Directory with parakeet model files (defaults to Eddy cache)")
ap.add_argument("--model", choices=["parakeet-v2", "parakeet-v3"], default="parakeet-v2", help="Model version (default: parakeet-v2)")
ap.add_argument("--max-files", type=str, default="50", help="Max files to evaluate (default: 50, use 'all' for full dataset)")
ap.add_argument("--dataset-config", default="clean", help="HF datasets config (default: clean)")
ap.add_argument("--split", default="test", help="HF datasets split (default: test)")
@@ -281,13 +286,26 @@ def main():
ds = ds.cast_column("audio", Audio(sampling_rate=16000))

# Load library and create model
print(f"Loading model on device: {args.device}")
print(f"Loading model {args.model} on device: {args.device}")
lib = load_lib(str(lib_path))
err = C.c_char_p()

# Set blank_token_id based on model version
blank_token_id = 8192 if args.model == "parakeet-v3" else 1024

# Auto-resolve model directory from cache if not explicitly provided
# The C++ code will use get_model_assets_dir(model_name) based on blank_token_id
# So we can pass None and let C++ handle it, OR explicitly construct the path
model_dir_to_use = args.model_dir
if not model_dir_to_use:
# Let C++ auto-select based on blank_token_id (it infers model name from it)
# This ensures we use the correct cache directory: ~/.cache/eddy/models/parakeet-v{2,3}/
model_dir_to_use = None

cfg = EddyParakeetConfig(
device=args.device.encode("utf-8"),
model_dir=args.model_dir.encode("utf-8") if args.model_dir else None,
blank_token_id=1024,
model_dir=model_dir_to_use.encode("utf-8") if model_dir_to_use else None,
blank_token_id=blank_token_id,
)
handle = lib.eddy_parakeet_create(cfg, C.byref(err))
if not handle:
@@ -302,14 +320,6 @@ def main():

results = []

# C++ normalization totals (lowercase + remove punctuation)
total_cpp_edits = 0
total_cpp_words = 0

# Whisper normalization totals (full OpenAI normalization)
total_whisper_edits = 0
total_whisper_words = 0

total_audio_duration = 0.0
total_processing_time = 0.0
start_time = time.time()
@@ -361,24 +371,9 @@ def main():

hyp = (res.text or b"").decode("utf-8", errors="ignore")

# Calculate WER with C++ normalization (lowercase + remove punctuation - matches C++ TextNormalizer)
cpp_normalized_ref = normalize_text_cpp_style(ref)
cpp_normalized_hyp = normalize_text_cpp_style(hyp)
cpp_output = jiwer.process_words(cpp_normalized_ref, cpp_normalized_hyp)
cpp_wer = cpp_output.wer * 100.0

# Calculate WER with OpenAI Whisper normalization (full normalization)
# Calculate WER and CER with OpenAI Whisper normalization
wer_metrics = calculate_wer(hyp, ref)

# Track totals (C++ style normalization)
total_cpp_edits += cpp_output.substitutions + cpp_output.deletions + cpp_output.insertions
total_cpp_words += cpp_output.substitutions + cpp_output.deletions + cpp_output.hits

# Track totals (OpenAI Whisper normalization)
ref_words = len(normalize_text(ref).split())
total_whisper_edits += wer_metrics["substitutions"] + wer_metrics["deletions"] + wer_metrics["insertions"]
total_whisper_words += ref_words

total_audio_duration += audio_duration
total_processing_time += res.latency_ms / 1000.0

@@ -388,7 +383,7 @@ def main():
"reference": ref,
"hypothesis": hyp,
"wer": wer_metrics["wer"],
"wer_cpp": cpp_wer,
"cer": wer_metrics["cer"],
"audio_duration_sec": audio_duration,
"processing_time_sec": res.latency_ms / 1000.0,
"rtfx": audio_duration / (res.latency_ms / 1000.0),
@@ -403,36 +398,52 @@ def main():

# Progress update
if (i + 1) % 10 == 0 or i == n - 1:
current_wer = (total_whisper_edits / max(1, total_whisper_words)) * 100
# Calculate current average WER (per-file average, not corpus-level)
current_wer_values = [r["wer"] for r in results]
current_avg_wer = sum(current_wer_values) / len(current_wer_values) if current_wer_values else 0.0
current_rtfx = total_audio_duration / max(0.001, total_processing_time)
print(f"[{i+1}/{n}] WER: {current_wer:.2f}% RTFx: {current_rtfx:.1f}x Last: {wer_metrics['wer']:.2f}%")
print(f"[{i+1}/{n}] Avg WER: {current_avg_wer:.2f}% RTFx: {current_rtfx:.1f}x Last: {wer_metrics['wer']:.2f}%")

lib.eddy_parakeet_destroy(handle)

# Calculate final metrics
elapsed_time = time.time() - start_time
overall_wer_whisper = (total_whisper_edits / max(1, total_whisper_words)) * 100
overall_wer_cpp = (total_cpp_edits / max(1, total_cpp_words)) * 100
overall_rtfx = total_audio_duration / max(0.001, total_processing_time)

# Compute per-file WER statistics
# Compute per-file WER and CER statistics
wer_values = [r["wer"] for r in results]
cer_values = [r["cer"] for r in results]
rtfx_values = [r["rtfx"] for r in results]

wer_values.sort()
cer_values.sort()
rtfx_values.sort()

median_wer = wer_values[len(wer_values) // 2] if wer_values else 0.0
median_cer = cer_values[len(cer_values) // 2] if cer_values else 0.0
median_rtfx = rtfx_values[len(rtfx_values) // 2] if rtfx_values else 0.0

# Calculate average metrics
average_wer = sum(wer_values) / len(wer_values) if wer_values else 0.0
average_cer = sum(cer_values) / len(cer_values) if cer_values else 0.0

# Save results
output_data = {
"config": {
"device": args.device,
"model_dir": args.model_dir or "cache",
"model": args.model,
"blank_token_id": blank_token_id,
"dataset": f"librispeech_asr/{args.dataset_config}",
"split": args.split,
"num_files": n,
},
"metrics": {
"overall_wer": overall_wer_whisper,
"overall_wer_cpp": overall_wer_cpp,
"average_wer": average_wer,
"median_wer": median_wer,
"average_cer": average_cer,
"median_cer": median_cer,
"median_rtfx": median_rtfx,
"overall_rtfx": overall_rtfx,
"total_audio_duration_sec": total_audio_duration,
"total_processing_time_sec": total_processing_time,
@@ -446,19 +457,34 @@ def main():
with open(output_path, "w") as f:
json.dump(output_data, f, indent=2)

# Print summary
# Print summary (FluidAudio-style format)
print("\n" + "=" * 80)
print("BENCHMARK SUMMARY")
print("BENCHMARK RESULTS")
print("=" * 80)
print(f" Dataset: librispeech {args.split}-{args.dataset_config}")
print(f" Model: {args.model}")
print(f" Device: {args.device}")
print(f" Files processed: {n}")
print(f" Average WER: {average_wer:.1f}%")
print(f" Median WER: {median_wer:.1f}%")
print(f" Average CER: {average_cer:.1f}%")
print(f" Median CER: {median_cer:.1f}%")
print(f" Median RTFx: {median_rtfx:.1f}x")
print(f" Overall RTFx: {overall_rtfx:.1f}x ({total_audio_duration:.1f}s / {total_processing_time:.1f}s)")
print(f" Benchmark runtime: {elapsed_time:.1f}s")
print(f"Results saved to: {output_path}")
print(f" Normalization: OpenAI Whisper English")
print("=" * 80)
print()
print("=" * 80)
print("REFERENCE BENCHMARKS - FluidAudio CoreML (M4 Pro, 2620 files)")
print("=" * 80)
print("Model | Avg WER | Med WER | Avg CER | Med RTFx | Overall RTFx | Runtime")
print("-----------+---------+---------+---------+----------+--------------+---------")
print("v2 (EN) | 2.2% | 0.0% | 0.7% | 125.6x | 141.2x | 3m 25s")
print("v3 (multi) | 2.6% | 0.0% | 1.1% | 137.8x | 153.4x | 3m 2s")
print("=" * 80)
print(f"Files processed: {n}")
print(f"Overall WER (C++): {overall_wer_cpp:.2f}%")
print(f"Overall WER (norm): {overall_wer_whisper:.2f}% (OpenAI Whisper normalized)")
print(f"Median WER: {median_wer:.2f}%")
print(f"Overall RTFx: {overall_rtfx:.1f}x")
print(f"Total audio: {total_audio_duration:.1f}s")
print(f"Total processing: {total_processing_time:.1f}s")
print(f"Benchmark elapsed: {elapsed_time:.1f}s")
print(f"Results saved to: {output_path}")
print("Note: v2 is more accurate for English, v3 is faster and supports multilingual")
print("=" * 80)


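The WER/CER flow above is easy to sanity-check outside the harness. Below is a minimal sketch that uses only `jiwer`; `normalize` is a deliberately simplified stand-in for the benchmark's `normalize_text` (which applies the full OpenAI Whisper English normalization), so results on punctuation-heavy text will differ slightly.

```python
# Minimal sketch of the metric flow in calculate_wer(), assuming only
# jiwer is installed. normalize() is a rough stand-in for the Whisper
# normalizer the benchmark actually uses.
import re
import string

import jiwer


def normalize(text: str) -> str:
    # Lowercase, strip punctuation, collapse whitespace.
    text = text.lower().translate(str.maketrans("", "", string.punctuation))
    return re.sub(r"\s+", " ", text).strip()


ref = normalize("The quick brown fox, jumps over the lazy dog.")
hyp = normalize("the quick brown fox jumped over a lazy dog")

wer = jiwer.wer(ref, hyp) * 100        # word error rate, as a percentage
cer = jiwer.cer(ref, hyp) * 100        # character error rate, as a percentage
words = jiwer.process_words(ref, hyp)  # detailed S/D/I/hit counts

print(f"WER {wer:.2f}%  CER {cer:.2f}%  "
      f"S={words.substitutions} D={words.deletions} I={words.insertions}")
```

Reporting CER alongside WER is useful here because CER is far less sensitive to tokenization and small morphological differences, which is also why the v2/v3 reference table quotes both.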
9 changes: 8 additions & 1 deletion include/eddy/core/model_configs.hpp
@@ -35,9 +35,16 @@ namespace model_configs {
.cache_subdir = "parakeet-v2"
};

inline const ModelConfig PARAKEET_V3 = {
.repo_id = "FluidInference/parakeet-tdt-0.6b-v3-ov",
.required_files = PARAKEET_STANDARD_FILES,
.cache_subdir = "parakeet-v3"
};

// Model name lookup map
inline const std::map<std::string, ModelConfig> MODEL_MAP = {
{"parakeet-v2", PARAKEET_V2}
{"parakeet-v2", PARAKEET_V2},
{"parakeet-v3", PARAKEET_V3}
};

// Default model
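For scripts that want to mirror this registry, a sketch in Python follows; the v3 `repo_id` is the one added above, while the v2 `repo_id` sits outside this hunk and is left as a placeholder rather than guessed.

```python
# Python mirror of MODEL_MAP from model_configs.hpp. Only fields visible
# in this diff are filled in; the "..." placeholder marks the v2 repo_id,
# which is defined above the hunk shown here.
from dataclasses import dataclass


@dataclass(frozen=True)
class ModelConfig:
    repo_id: str
    cache_subdir: str


MODEL_MAP = {
    "parakeet-v2": ModelConfig(repo_id="...", cache_subdir="parakeet-v2"),
    "parakeet-v3": ModelConfig(repo_id="FluidInference/parakeet-tdt-0.6b-v3-ov",
                               cache_subdir="parakeet-v3"),
}
```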
35 changes: 24 additions & 11 deletions src/eddy_c.cpp
@@ -375,32 +375,45 @@ EDDY_API EddyParakeetModel eddy_parakeet_create(EddyParakeetConfig config, char*
try {
std::string device = config.device ? config.device : "CPU";

// Infer model version from blank_token_id
const char* model_name = (config.blank_token_id == 8192) ? "parakeet-v3" : "parakeet-v2";

auto backend = std::make_shared<eddy::OpenVINOBackend>(
eddy::OpenVINOOptions{ .device = device, .cache_dir = eddy::get_model_dir("parakeet-v2").string() }
eddy::OpenVINOOptions{ .device = device, .cache_dir = eddy::get_model_dir(model_name).string() }
);

// Resolve model directory: prefer explicit, else cache and ensure availability
std::filesystem::path model_dir;
if (config.model_dir && std::string(config.model_dir).size() > 0) {
model_dir = config.model_dir;
std::string dir_str = config.model_dir;
// Treat "cache" as a special value meaning "use default cache location"
if (dir_str == "cache") {
model_dir = eddy::get_model_assets_dir(model_name);
} else {
model_dir = config.model_dir;
}
} else {
model_dir = eddy::get_model_assets_dir("parakeet-v2");
model_dir = eddy::get_model_assets_dir(model_name);
}

std::string err;
(void)eddy::parakeet::check_models_available(model_dir, &err);
#if defined(_WIN32)
if (!std::filesystem::exists(model_dir)) {
auto legacy = eddy::get_app_data_dir() / "cache" / "models" / "parakeet-v2" / "files";
if (std::filesystem::exists(legacy)) model_dir = legacy;
}
#endif

// Both v2 and v3 use the same vocab filename (as per HuggingFace repos)
std::string vocab_filename = "parakeet_vocab.json";

eddy::parakeet::ModelPaths paths{
.preprocessor = {.path = (model_dir / "parakeet_melspectogram.xml").string()},
.encoder = {.path = (model_dir / "parakeet_encoder.xml").string()},
.decoder = {.path = (model_dir / "parakeet_decoder.xml").string()},
.joint = {.path = (model_dir / "parakeet_joint.xml").string()},
.tokenizer_json = (model_dir / "parakeet_vocab.json").string()
.tokenizer_json = (model_dir / vocab_filename).string()
};

eddy::parakeet::RuntimeConfig cfg{
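Everything in this resolution path keys off `blank_token_id`. A Python transliteration of the convention (helper names here are hypothetical; the real lookup is `get_model_assets_dir()` on the C++ side) looks like this:

```python
# Sketch of the blank_token_id -> model-name convention used by
# eddy_parakeet_create(): 8192 is the v3 blank id, anything else maps
# to v2. default_model_dir() is a hypothetical stand-in for
# get_model_assets_dir(), using the cache layout named in the benchmark
# comments (~/.cache/eddy/models/parakeet-v{2,3}/).
from pathlib import Path


def infer_model_name(blank_token_id: int) -> str:
    return "parakeet-v3" if blank_token_id == 8192 else "parakeet-v2"


def default_model_dir(blank_token_id: int) -> Path:
    return Path.home() / ".cache" / "eddy" / "models" / infer_model_name(blank_token_id)


assert infer_model_name(8192) == "parakeet-v3"
assert infer_model_name(1024) == "parakeet-v2"
```

From the benchmark side, a v3 run is then just `python benchmarks/benchmark.py --model parakeet-v3`, which sets `blank_token_id=8192` before calling `eddy_parakeet_create`, and the C API picks the matching cache directory.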
16 changes: 14 additions & 2 deletions src/models/parakeet-v2/tokenizer.cpp
@@ -21,16 +21,28 @@ void Tokenizer::load(const std::string& path, int blank_id) {
nlohmann::json json;
stream >> json;

// Handle two vocabulary formats:
// V2 format: {"0": "token0", "1": "token1", ...}
// V3 format: {"id_to_token": {"0": "token0", "1": "token1", ...}, "vocab_size": 8192, ...}
nlohmann::json vocab_json;
if (json.contains("id_to_token")) {
// V3 format with nested id_to_token
vocab_json = json["id_to_token"];
} else {
// V2 format (flat dictionary)
vocab_json = json;
}

// Find max token ID to size vocabulary array
size_t max_id = 0;
for (const auto& item : json.items()) {
for (const auto& item : vocab_json.items()) {
const size_t id = static_cast<size_t>(std::stoul(item.key()));
max_id = std::max(max_id, id);
}

// Populate vocabulary from JSON
vocab_.assign(max_id + 1, std::string{});
for (const auto& item : json.items()) {
for (const auto& item : vocab_json.items()) {
const size_t id = static_cast<size_t>(std::stoul(item.key()));
vocab_[id] = item.value().get<std::string>();
}
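The two vocabulary layouts the loader now accepts are easiest to see side by side. This is a Python transliteration of the C++ logic above; the example token strings in the comments are illustrative, not taken from the actual vocab files.

```python
# Python transliteration of Tokenizer::load()'s format handling.
# V2 file shape:  {"0": "<tok0>", "1": "<tok1>", ...}            (flat map)
# V3 file shape:  {"id_to_token": {"0": "<tok0>", ...},
#                  "vocab_size": 8192, ...}                      (nested map)
import json


def load_vocab(path: str) -> list[str]:
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    # V3 nests the mapping under "id_to_token"; V2 *is* the mapping.
    id_to_token = data.get("id_to_token", data)
    # Size the array by the max id, then fill -- the same two-pass loop
    # as the C++ version.
    vocab = [""] * (max(int(k) for k in id_to_token) + 1)
    for key, token in id_to_token.items():
        vocab[int(key)] = token
    return vocab
```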