Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,12 @@ set(MUTTERKEY_CORE_SOURCES
src/transcription/transcriptiontypes.h
src/transcription/transcriptionengine.cpp
src/transcription/transcriptionengine.h
src/transcription/audiochunker.cpp
src/transcription/audiochunker.h
src/transcription/transcriptassembler.cpp
src/transcription/transcriptassembler.h
src/transcription/transcriptioncompat.cpp
src/transcription/transcriptioncompat.h
src/transcription/transcriptionworker.cpp
src/transcription/transcriptionworker.h
src/transcription/whispercpptranscriber.cpp
Expand Down Expand Up @@ -87,7 +93,7 @@ target_include_directories(mutterkey-tray PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sr
target_link_libraries(mutterkey_core PUBLIC Qt6::Core Qt6::Gui Qt6::Multimedia KF6::GlobalAccel KF6::GuiAddons)
target_link_libraries(mutterkey_control PUBLIC Qt6::Core Qt6::Network mutterkey_core)
target_link_libraries(mutterkey_app PUBLIC Qt6::Core Qt6::Gui mutterkey_control)
target_link_libraries(mutterkey PRIVATE mutterkey_app whisper)
target_link_libraries(mutterkey PRIVATE mutterkey_app)
target_link_libraries(mutterkey-tray PRIVATE Qt6::Core Qt6::Gui Qt6::Widgets mutterkey_control)
set_target_properties(mutterkey PROPERTIES
BUILD_RPATH "$ORIGIN/../lib"
Expand Down Expand Up @@ -206,7 +212,7 @@ add_subdirectory(third_party/whisper.cpp EXCLUDE_FROM_ALL)
# upstream public headers as part of its own package layout.
set_target_properties(whisper ggml PROPERTIES PUBLIC_HEADER "")

target_link_libraries(mutterkey_core PUBLIC whisper)
target_link_libraries(mutterkey_core PRIVATE whisper)

install(TARGETS mutterkey RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
install(TARGETS mutterkey-tray RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
Expand Down
19 changes: 16 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,16 @@ Current behavior:
- copies the resulting text to the clipboard
- expects you to paste the text yourself with `Ctrl+V`

Current runtime shape:

- `TranscriptionEngine` is the immutable runtime/provider boundary
- `TranscriptionSession` is the mutable per-session decode boundary
- internal audio flow is streaming-first through normalized chunks and transcript events
- `BackendCapabilities` reports static backend support, while `RuntimeDiagnostics`
reports runtime/device/model inspection data
- the current daemon and `once` user flows still collapse the streaming path back
into a final clipboard-friendly transcript

Current direction:

- KDE-first
Expand Down Expand Up @@ -362,9 +372,12 @@ Repository layout:
- `src/audio/audiorecorder.*`: microphone capture
- `src/audio/recording.h`: shared recorded-audio payload passed between subsystems
- `src/audio/recordingnormalizer.*`: conversion to Whisper-ready mono `float32` at `16 kHz`
- `src/transcription/whispercpptranscriber.*`: embedded Whisper integration
- `src/transcription/audiochunker.*`: fixed-size normalized streaming chunk generation
- `src/transcription/transcriptassembler.*`: final transcript assembly from streaming events
- `src/transcription/transcriptioncompat.*`: compatibility wrapper from one-shot recordings to the streaming runtime path
- `src/transcription/whispercpptranscriber.*`: embedded Whisper integration behind the app-owned runtime seam
- `src/transcription/transcriptionworker.*`: worker object on a dedicated `QThread`
- `src/transcription/transcriptiontypes.h`: normalized-audio and transcription result value types
- `src/transcription/transcriptiontypes.h`: runtime diagnostics, normalized-audio, chunk, event, and error value types
- `src/clipboardwriter.*`: clipboard writes with KDE-first fallback behavior
- `src/config.*`: JSON config loading and defaults
- `src/app/*`: shared CLI/runtime command helpers used by the main entrypoint
Expand Down Expand Up @@ -462,7 +475,7 @@ Notes:
libraries without inheriting upstream header-install warnings
- the `valgrind` target runs the repo-owned Memcheck lane used for release readiness
- tests are small headless `Qt Test` cases
- `config` and `recordingnormalizer` currently have the main unit-test coverage because they contain the most deterministic logic without KDE session or device dependencies
- streaming runtime helpers and worker orchestration now also have deterministic headless coverage through fake backends
- GitHub Actions CI runs the hygiene job on Ubuntu 24.04 and the configure/build/test job in a Debian Trixie container because the needed KF6 dev packages are not available on the stock Ubuntu 24.04 runner image
- successful `main` branch CI runs publish `build/docs/doxygen/html` to GitHub Pages with the official Pages actions
- GitHub Actions release checks run a separate Valgrind Memcheck lane on manual dispatch and `v*` tags so normal PR CI stays faster
Expand Down
13 changes: 10 additions & 3 deletions docs/mainpage.md
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,27 @@ Current runtime shape:

- `TranscriptionEngine` is the immutable runtime/provider boundary
- `TranscriptionSession` is the mutable per-session decode boundary
- `BackendCapabilities` reports engine-owned runtime metadata used for
diagnostics and orchestration
- internal audio flow is streaming-first through normalized chunks and
transcript events
- `BackendCapabilities` reports static backend support used for orchestration
- `RuntimeDiagnostics` reports runtime/device/model inspection data separately
from static capabilities
- `RuntimeError` and `RuntimeErrorCode` provide typed runtime failures
- `TranscriptionWorker` hosts transcription on a dedicated `QThread` and
creates live sessions lazily on that worker thread
- the shipped daemon and `once` flows still use a compatibility wrapper that
assembles a final transcript from the streaming runtime path
- config parsing under `src/config.*` stays product-shaped and permissive, while
backend-specific support checks live in the runtime layer

Core API surface covered here:

- `HotkeyManager` registers the global push-to-talk shortcut through KDE.
- `AudioRecorder` captures microphone audio while the shortcut is held.
- `RecordingNormalizer` converts captured audio to Whisper-ready mono `float32`
- `RecordingNormalizer` converts captured audio to runtime-ready mono `float32`
samples at `16 kHz`.
- `AudioChunker` splits normalized audio into deterministic stream chunks.
- `TranscriptAssembler` builds final transcript text from streaming events.
- `TranscriptionEngine` and `TranscriptionSession` define the app-owned runtime
seam.
- `WhisperCppTranscriber` performs in-process transcription through vendored
Expand Down
9 changes: 6 additions & 3 deletions src/app/applicationcommands.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
#include "clipboardwriter.h"
#include "control/daemoncontrolserver.h"
#include "service.h"
#include "transcription/transcriptioncompat.h"
#include "transcription/transcriptionengine.h"
#include "transcription/transcriptiontypes.h"

Expand Down Expand Up @@ -82,7 +83,9 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds)
}
}

QTimer::singleShot(0, &app, [&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds]() {
QTimer::singleShot(0,
&app,
[&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds, normalizer = RecordingNormalizer()]() {
QString errorMessage;
if (!recorder.start(&errorMessage)) {
qCCritical(appLog) << "Failed to start one-shot recording:" << errorMessage;
Expand All @@ -91,15 +94,15 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds)
}

qCInfo(appLog) << "Recording for" << seconds << "seconds";
QTimer::singleShot(static_cast<int>(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter]() {
QTimer::singleShot(static_cast<int>(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter, normalizer]() {
const Recording recording = recorder.stop();
if (!recording.isValid()) {
qCCritical(appLog) << "Recorder returned no audio";
QGuiApplication::exit(1);
return;
}

const TranscriptionResult result = transcriber->transcribe(recording);
const TranscriptionResult result = transcribeRecordingViaStreaming(*transcriber, recording, normalizer);
if (!result.success) {
qCCritical(appLog) << "One-shot transcription failed:" << result.error.message;
QGuiApplication::exit(1);
Expand Down
2 changes: 1 addition & 1 deletion src/audio/recordingnormalizer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -95,7 +95,7 @@ std::vector<float> resampleLinear(const std::vector<float> &samples, int inputSa

} // namespace

bool RecordingNormalizer::normalizeForWhisper(const Recording &recording,
bool RecordingNormalizer::normalizeForRuntime(const Recording &recording,
NormalizedAudio *normalizedAudio,
QString *errorMessage) const
{
Expand Down
4 changes: 2 additions & 2 deletions src/audio/recordingnormalizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -17,11 +17,11 @@ class RecordingNormalizer final
{
public:
/**
* @brief Converts a captured recording into Whisper input audio.
* @brief Converts a captured recording into runtime input audio.
* @param recording Source recording and its original device format.
* @param normalizedAudio Output location for normalized samples.
* @param errorMessage Optional output for conversion failures.
* @return `true` when normalization succeeded.
*/
bool normalizeForWhisper(const Recording &recording, NormalizedAudio *normalizedAudio, QString *errorMessage = nullptr) const;
bool normalizeForRuntime(const Recording &recording, NormalizedAudio *normalizedAudio, QString *errorMessage = nullptr) const;
};
9 changes: 5 additions & 4 deletions src/service.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -80,11 +80,12 @@ QJsonObject MutterkeyService::diagnostics() const
object.insert(QStringLiteral("transcriptions_completed"), m_transcriptionsCompleted);
object.insert(QStringLiteral("transcriber_backend"),
m_transcriptionWorker != nullptr ? m_transcriptionWorker->backendName() : QStringLiteral("unconfigured"));
object.insert(QStringLiteral("transcriber_model"),
m_transcriptionWorker != nullptr ? m_transcriptionWorker->loadedModelDescription() : QString());
const RuntimeDiagnostics runtimeDiagnostics =
m_transcriptionWorker != nullptr ? m_transcriptionWorker->runtimeDiagnostics() : m_transcriptionEngine->diagnostics();
object.insert(QStringLiteral("transcriber_model"), runtimeDiagnostics.loadedModelDescription);
object.insert(QStringLiteral("transcriber_runtime"), runtimeDiagnostics.runtimeDescription);
const BackendCapabilities capabilities =
m_transcriptionWorker != nullptr ? m_transcriptionWorker->capabilities() : m_transcriptionEngine->capabilities();
object.insert(QStringLiteral("transcriber_runtime"), capabilities.runtimeDescription);
object.insert(QStringLiteral("transcriber_supports_translation"), capabilities.supportsTranslation);
object.insert(QStringLiteral("transcriber_supports_auto_language"), capabilities.supportsAutoLanguage);
return object;
Expand Down Expand Up @@ -168,7 +169,7 @@ void MutterkeyService::transcribeInBackground(Recording recording)
// another owner of the PCM payload alive on the service thread.
QMetaObject::invokeMethod(m_transcriptionWorker,
[worker = m_transcriptionWorker, recording = std::move(recording)]() mutable {
worker->transcribe(recording);
worker->transcribeRecordingCompat(recording);
},
Qt::QueuedConnection);
}
Expand Down
56 changes: 56 additions & 0 deletions src/transcription/audiochunker.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
#include "transcription/audiochunker.h"

#include <algorithm>
#include <cstddef>

namespace {

// Target duration of each generated chunk, in milliseconds. chunkAudio()
// converts this to a frame count using the sample rate of the input audio.
constexpr int kChunkDurationMs = 200;

} // namespace

/**
 * @brief Splits normalized mono audio into ordered, fixed-duration chunks.
 *
 * Each chunk holds `kChunkDurationMs` worth of frames at the audio's sample
 * rate (never fewer than one frame per chunk); only the last chunk may be
 * shorter. Every chunk carries the source sample rate and channel count plus
 * the offset, in frames, of its first sample from the start of the stream.
 *
 * @param audio Normalized utterance audio; must be valid, mono, and have a
 *        positive sample rate.
 * @param chunks Output destination. It is cleared before validation, so it is
 *        empty whenever the call fails. Must not be null.
 * @param errorMessage Optional output that receives a description of the
 *        validation failure.
 * @return `true` when at least one chunk was produced.
 */
bool AudioChunker::chunkAudio(const NormalizedAudio &audio, std::vector<AudioChunk> *chunks, QString *errorMessage) const
{
    if (chunks == nullptr) {
        if (errorMessage != nullptr) {
            *errorMessage = QStringLiteral("Internal error: missing audio chunk output");
        }
        return false;
    }

    chunks->clear();

    if (!audio.isValid()) {
        if (errorMessage != nullptr) {
            *errorMessage = QStringLiteral("Normalized audio is empty");
        }
        return false;
    }

    if (audio.sampleRate <= 0 || audio.channels != 1) {
        if (errorMessage != nullptr) {
            *errorMessage = QStringLiteral("Normalized audio format is invalid");
        }
        return false;
    }

    // Do the rate * duration product in 64 bits: the sample rate is only
    // validated as positive, so an `int` multiplication could overflow.
    const std::int64_t scaledFrames = (static_cast<std::int64_t>(audio.sampleRate) * kChunkDurationMs) / 1000;
    const std::size_t chunkFrames = static_cast<std::size_t>(std::max<std::int64_t>(1, scaledFrames));

    // Keep sizes in std::size_t so large buffers are not truncated to `int`,
    // and use div/mod for the ceiling division so the sum cannot wrap.
    const std::size_t totalFrames = audio.samples.size();
    chunks->reserve(totalFrames / chunkFrames + (totalFrames % chunkFrames != 0 ? 1 : 0));

    std::size_t startIndex = 0;
    while (startIndex < totalFrames) {
        // Clamp against the remaining frames instead of computing
        // startIndex + chunkFrames, which could wrap for huge chunk sizes.
        const std::size_t frameCount = std::min(chunkFrames, totalFrames - startIndex);
        const auto first = audio.samples.begin() + static_cast<std::ptrdiff_t>(startIndex);

        AudioChunk chunk;
        chunk.sampleRate = audio.sampleRate;
        chunk.channels = audio.channels;
        // Mono audio: one sample per frame, so the sample index is the frame offset.
        chunk.streamOffsetFrames = static_cast<std::int64_t>(startIndex);
        chunk.samples.assign(first, first + static_cast<std::ptrdiff_t>(frameCount));
        chunks->push_back(std::move(chunk));

        startIndex += frameCount;
    }

    return !chunks->empty();
}
27 changes: 27 additions & 0 deletions src/transcription/audiochunker.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
#pragma once

#include "transcription/transcriptiontypes.h"

#include <QString>
#include <vector>

/**
* @file
* @brief Helpers for splitting normalized audio into deterministic stream chunks.
*/

/**
* @brief Splits normalized utterance audio into fixed-size streaming chunks.
*/
class AudioChunker final
{
public:
/**
* @brief Converts normalized audio into ordered stream chunks.
*
* The output vector is cleared before the input is validated, so it is empty
* whenever the call fails. Validation rejects a null output pointer, audio for
* which `isValid()` is false, a non-positive sample rate, and any channel count
* other than one. Each chunk records the source sample rate and channel count
* and the frame offset of its first sample within the stream.
*
* @param audio Normalized utterance audio.
* @param chunks Output destination for generated chunks.
* @param errorMessage Optional output for validation failures.
* @return `true` when chunking succeeded and produced at least one chunk.
*/
bool chunkAudio(const NormalizedAudio &audio, std::vector<AudioChunk> *chunks, QString *errorMessage = nullptr) const;
};
33 changes: 33 additions & 0 deletions src/transcription/transcriptassembler.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,33 @@
#include "transcription/transcriptassembler.h"

// Discards everything accumulated so far: both the latest partial text and the
// joined final transcript are emptied so the assembler can be reused.
void TranscriptAssembler::reset()
{
    m_latestPartial.clear();
    m_finalTranscript.clear();
}

// Folds every event of `update` into the assembler state, in order.
//  - Events whose text is empty after trimming are ignored.
//  - Final events are appended to the final transcript, separated from any
//    earlier text by a single space, and clear the remembered partial.
//  - Every other event kind replaces the remembered partial text.
void TranscriptAssembler::applyUpdate(const TranscriptUpdate &update)
{
    for (const TranscriptEvent &entry : update.events) {
        const QString text = entry.text.trimmed();
        if (text.isEmpty()) {
            continue;
        }

        if (entry.kind != TranscriptEventKind::Final) {
            m_latestPartial = text;
            continue;
        }

        if (!m_finalTranscript.isEmpty()) {
            m_finalTranscript += QLatin1Char(' ');
        }
        m_finalTranscript += text;
        m_latestPartial.clear();
    }
}

// Returns the text accumulated from final events with surrounding whitespace
// removed. The remembered partial text is not part of the result.
QString TranscriptAssembler::finalTranscript() const
{
    QString assembled = m_finalTranscript.trimmed();
    return assembled;
}
38 changes: 38 additions & 0 deletions src/transcription/transcriptassembler.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
#pragma once

#include "transcription/transcriptiontypes.h"

#include <QString>

/**
* @file
* @brief Helpers for assembling clipboard-friendly text from transcript events.
*/

/**
* @brief Collects streaming transcript events into a final user-facing transcript.
*/
class TranscriptAssembler final
{
public:
/**
* @brief Resets any accumulated transcript state.
*
* Clears both the joined final transcript and the latest partial text.
*/
void reset();

/**
* @brief Applies a streaming update to the assembled transcript state.
*
* Events are processed in order. Events whose text is empty after trimming are
* skipped. `TranscriptEventKind::Final` events are appended to the final
* transcript, separated by a single space, and clear the latest partial text;
* events of any other kind replace the latest partial text.
*
* @param update Session update containing zero or more events.
*/
void applyUpdate(const TranscriptUpdate &update);

/**
* @brief Returns the current assembled final transcript.
*
* Only text from final events is included; the latest partial text is not.
*
* @return Clipboard-friendly joined final transcript text, trimmed.
*/
[[nodiscard]] QString finalTranscript() const;

private:
// Trimmed text of all final events seen so far, joined with single spaces.
QString m_finalTranscript;
// Trimmed text of the most recent non-final event; cleared whenever a final
// event is applied. No accessor for it is declared in this class.
QString m_latestPartial;
};
Loading
Loading