n-n-code · n-n-code · Mar 27, 2026 · Mar 27, 2026
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -45,6 +45,12 @@ set(MUTTERKEY_CORE_SOURCES
     src/transcription/transcriptiontypes.h
     src/transcription/transcriptionengine.cpp
     src/transcription/transcriptionengine.h
+    src/transcription/audiochunker.cpp
+    src/transcription/audiochunker.h
+    src/transcription/transcriptassembler.cpp
+    src/transcription/transcriptassembler.h
+    src/transcription/transcriptioncompat.cpp
+    src/transcription/transcriptioncompat.h
     src/transcription/transcriptionworker.cpp
     src/transcription/transcriptionworker.h
     src/transcription/whispercpptranscriber.cpp
@@ -87,7 +93,7 @@ target_include_directories(mutterkey-tray PRIVATE ${CMAKE_CURRENT_SOURCE_DIR}/sr
 target_link_libraries(mutterkey_core PUBLIC Qt6::Core Qt6::Gui Qt6::Multimedia KF6::GlobalAccel KF6::GuiAddons)
 target_link_libraries(mutterkey_control PUBLIC Qt6::Core Qt6::Network mutterkey_core)
 target_link_libraries(mutterkey_app PUBLIC Qt6::Core Qt6::Gui mutterkey_control)
-target_link_libraries(mutterkey PRIVATE mutterkey_app whisper)
+target_link_libraries(mutterkey PRIVATE mutterkey_app)
 target_link_libraries(mutterkey-tray PRIVATE Qt6::Core Qt6::Gui Qt6::Widgets mutterkey_control)
 set_target_properties(mutterkey PROPERTIES
     BUILD_RPATH "$ORIGIN/../lib"
@@ -206,7 +212,7 @@ add_subdirectory(third_party/whisper.cpp EXCLUDE_FROM_ALL)
 # upstream public headers as part of its own package layout.
 set_target_properties(whisper ggml PROPERTIES PUBLIC_HEADER "")
 
-target_link_libraries(mutterkey_core PUBLIC whisper)
+target_link_libraries(mutterkey_core PRIVATE whisper)
 
 install(TARGETS mutterkey RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})
 install(TARGETS mutterkey-tray RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR})

diff --git a/README.md b/README.md
@@ -18,6 +18,16 @@ Current behavior:
 - copies the resulting text to the clipboard
 - expects you to paste the text yourself with `Ctrl+V`
 
+Current runtime shape:
+
+- `TranscriptionEngine` is the immutable runtime/provider boundary
+- `TranscriptionSession` is the mutable per-session decode boundary
+- internal audio flow is streaming-first through normalized chunks and transcript events
+- `BackendCapabilities` reports static backend support, while `RuntimeDiagnostics`
+  reports runtime/device/model inspection data
+- the current daemon and `once` user flows still collapse the streaming path back
+  into a final clipboard-friendly transcript
+
 Current direction:
 
 - KDE-first
@@ -362,9 +372,12 @@ Repository layout:
 - `src/audio/audiorecorder.*`: microphone capture
 - `src/audio/recording.h`: shared recorded-audio payload passed between subsystems
 - `src/audio/recordingnormalizer.*`: conversion to Whisper-ready mono `float32` at `16 kHz`
-- `src/transcription/whispercpptranscriber.*`: embedded Whisper integration
+- `src/transcription/audiochunker.*`: fixed-size normalized streaming chunk generation
+- `src/transcription/transcriptassembler.*`: final transcript assembly from streaming events
+- `src/transcription/transcriptioncompat.*`: compatibility wrapper from one-shot recordings to the streaming runtime path
+- `src/transcription/whispercpptranscriber.*`: embedded Whisper integration behind the app-owned runtime seam
 - `src/transcription/transcriptionworker.*`: worker object on a dedicated `QThread`
-- `src/transcription/transcriptiontypes.h`: normalized-audio and transcription result value types
+- `src/transcription/transcriptiontypes.h`: runtime diagnostics, normalized-audio, chunk, event, and error value types
 - `src/clipboardwriter.*`: clipboard writes with KDE-first fallback behavior
 - `src/config.*`: JSON config loading and defaults
 - `src/app/*`: shared CLI/runtime command helpers used by the main entrypoint
@@ -462,7 +475,7 @@ Notes:
   libraries without inheriting upstream header-install warnings
 - the `valgrind` target runs the repo-owned Memcheck lane used for release readiness
 - tests are small headless `Qt Test` cases
-- `config` and `recordingnormalizer` currently have the main unit-test coverage because they contain the most deterministic logic without KDE session or device dependencies
+- in addition to `config` and `recordingnormalizer`, streaming runtime helpers and worker orchestration now have deterministic headless coverage through fake backends
 - GitHub Actions CI runs the hygiene job on Ubuntu 24.04 and the configure/build/test job in a Debian Trixie container because the needed KF6 dev packages are not available on the stock Ubuntu 24.04 runner image
 - successful `main` branch CI runs publish `build/docs/doxygen/html` to GitHub Pages with the official Pages actions
 - GitHub Actions release checks run a separate Valgrind Memcheck lane on manual dispatch and `v*` tags so normal PR CI stays faster

diff --git a/docs/mainpage.md b/docs/mainpage.md
@@ -21,20 +21,27 @@ Current runtime shape:
 
 - `TranscriptionEngine` is the immutable runtime/provider boundary
 - `TranscriptionSession` is the mutable per-session decode boundary
-- `BackendCapabilities` reports engine-owned runtime metadata used for
-  diagnostics and orchestration
+- internal audio flow is streaming-first through normalized chunks and
+  transcript events
+- `BackendCapabilities` reports static backend support used for orchestration
+- `RuntimeDiagnostics` reports runtime/device/model inspection data separately
+  from static capabilities
 - `RuntimeError` and `RuntimeErrorCode` provide typed runtime failures
 - `TranscriptionWorker` hosts transcription on a dedicated `QThread` and
   creates live sessions lazily on that worker thread
+- the shipped daemon and `once` flows still use a compatibility wrapper that
+  assembles a final transcript from the streaming runtime path
 - config parsing under `src/config.*` stays product-shaped and permissive, while
   backend-specific support checks live in the runtime layer
 
 Core API surface covered here:
 
 - `HotkeyManager` registers the global push-to-talk shortcut through KDE.
 - `AudioRecorder` captures microphone audio while the shortcut is held.
-- `RecordingNormalizer` converts captured audio to Whisper-ready mono `float32`
+- `RecordingNormalizer` converts captured audio to runtime-ready mono `float32`
   samples at `16 kHz`.
+- `AudioChunker` splits normalized audio into deterministic stream chunks.
+- `TranscriptAssembler` builds final transcript text from streaming events.
 - `TranscriptionEngine` and `TranscriptionSession` define the app-owned runtime
   seam.
 - `WhisperCppTranscriber` performs in-process transcription through vendored

diff --git a/src/app/applicationcommands.cpp b/src/app/applicationcommands.cpp
@@ -6,6 +6,7 @@
 #include "clipboardwriter.h"
 #include "control/daemoncontrolserver.h"
 #include "service.h"
+#include "transcription/transcriptioncompat.h"
 #include "transcription/transcriptionengine.h"
 #include "transcription/transcriptiontypes.h"
 
@@ -82,7 +83,9 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds)
         }
     }
 
-    QTimer::singleShot(0, &app, [&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds]() {
+    QTimer::singleShot(0,
+                       &app,
+                       [&app, &recorder, transcriber = transcriber.get(), &clipboardWriter, seconds, normalizer = RecordingNormalizer()]() {
         QString errorMessage;
         if (!recorder.start(&errorMessage)) {
             qCCritical(appLog) << "Failed to start one-shot recording:" << errorMessage;
@@ -91,15 +94,15 @@ int runOnce(QGuiApplication &app, const AppConfig &config, double seconds)
         }
 
         qCInfo(appLog) << "Recording for" << seconds << "seconds";
-        QTimer::singleShot(static_cast<int>(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter]() {
+        QTimer::singleShot(static_cast<int>(seconds * 1000), &app, [&app, &recorder, transcriber, &clipboardWriter, normalizer]() {
             const Recording recording = recorder.stop();
             if (!recording.isValid()) {
                 qCCritical(appLog) << "Recorder returned no audio";
                 QGuiApplication::exit(1);
                 return;
             }
 
-            const TranscriptionResult result = transcriber->transcribe(recording);
+            const TranscriptionResult result = transcribeRecordingViaStreaming(*transcriber, recording, normalizer);
             if (!result.success) {
                 qCCritical(appLog) << "One-shot transcription failed:" << result.error.message;
                 QGuiApplication::exit(1);

diff --git a/src/audio/recordingnormalizer.cpp b/src/audio/recordingnormalizer.cpp
@@ -95,7 +95,7 @@ std::vector<float> resampleLinear(const std::vector<float> &samples, int inputSa
 
 } // namespace
 
-bool RecordingNormalizer::normalizeForWhisper(const Recording &recording,
+bool RecordingNormalizer::normalizeForRuntime(const Recording &recording,
                                               NormalizedAudio *normalizedAudio,
                                               QString *errorMessage) const
 {

diff --git a/src/audio/recordingnormalizer.h b/src/audio/recordingnormalizer.h
@@ -17,11 +17,11 @@ class RecordingNormalizer final
 {
 public:
     /**
-     * @brief Converts a captured recording into Whisper input audio.
+     * @brief Converts a captured recording into runtime input audio.
      * @param recording Source recording and its original device format.
      * @param normalizedAudio Output location for normalized samples.
      * @param errorMessage Optional output for conversion failures.
      * @return `true` when normalization succeeded.
      */
-    bool normalizeForWhisper(const Recording &recording, NormalizedAudio *normalizedAudio, QString *errorMessage = nullptr) const;
+    bool normalizeForRuntime(const Recording &recording, NormalizedAudio *normalizedAudio, QString *errorMessage = nullptr) const;
 };
diff --git a/src/service.cpp b/src/service.cpp
@@ -80,11 +80,12 @@ QJsonObject MutterkeyService::diagnostics() const
     object.insert(QStringLiteral("transcriptions_completed"), m_transcriptionsCompleted);
     object.insert(QStringLiteral("transcriber_backend"),
                   m_transcriptionWorker != nullptr ? m_transcriptionWorker->backendName() : QStringLiteral("unconfigured"));
-    object.insert(QStringLiteral("transcriber_model"),
-                  m_transcriptionWorker != nullptr ? m_transcriptionWorker->loadedModelDescription() : QString());
+    const RuntimeDiagnostics runtimeDiagnostics =
+        m_transcriptionWorker != nullptr ? m_transcriptionWorker->runtimeDiagnostics() : m_transcriptionEngine->diagnostics();
+    object.insert(QStringLiteral("transcriber_model"), runtimeDiagnostics.loadedModelDescription);
+    object.insert(QStringLiteral("transcriber_runtime"), runtimeDiagnostics.runtimeDescription);
     const BackendCapabilities capabilities =
         m_transcriptionWorker != nullptr ? m_transcriptionWorker->capabilities() : m_transcriptionEngine->capabilities();
-    object.insert(QStringLiteral("transcriber_runtime"), capabilities.runtimeDescription);
     object.insert(QStringLiteral("transcriber_supports_translation"), capabilities.supportsTranslation);
     object.insert(QStringLiteral("transcriber_supports_auto_language"), capabilities.supportsAutoLanguage);
     return object;
@@ -168,7 +169,7 @@ void MutterkeyService::transcribeInBackground(Recording recording)
     // another owner of the PCM payload alive on the service thread.
     QMetaObject::invokeMethod(m_transcriptionWorker,
                               [worker = m_transcriptionWorker, recording = std::move(recording)]() mutable {
-                                  worker->transcribe(recording);
+                                  worker->transcribeRecordingCompat(recording);
                               },
                               Qt::QueuedConnection);
 }

diff --git a/src/transcription/audiochunker.cpp b/src/transcription/audiochunker.cpp
@@ -0,0 +1,56 @@
+#include "transcription/audiochunker.h"
+
+#include <algorithm>
+#include <cstddef>
+
+namespace {
+
+constexpr int kChunkDurationMs = 200; // Chunk length in milliseconds; chunkAudio converts it to frames.
+
+} // namespace
+
+bool AudioChunker::chunkAudio(const NormalizedAudio &audio, std::vector<AudioChunk> *chunks, QString *errorMessage) const
+{
+    // Stores the message in the optional error output and yields the failure result.
+    const auto fail = [errorMessage](const QString &message) {
+        if (errorMessage != nullptr) {
+            *errorMessage = message;
+        }
+        return false;
+    };
+
+    if (chunks == nullptr) {
+        return fail(QStringLiteral("Internal error: missing audio chunk output"));
+    }
+    chunks->clear(); // Stale chunks must not survive a failed validation below.
+
+    // Reject an empty sample buffer here so that no failure path lacks a message.
+    if (!audio.isValid() || audio.samples.empty()) {
+        return fail(QStringLiteral("Normalized audio is empty"));
+    }
+    if (audio.sampleRate <= 0 || audio.channels != 1) {
+        return fail(QStringLiteral("Normalized audio format is invalid"));
+    }
+
+    // Size chunks in 64-bit/size_t math so large rates or sample counts cannot overflow int.
+    const std::int64_t framesPerChunk = (static_cast<std::int64_t>(audio.sampleRate) * kChunkDurationMs) / 1000;
+    const auto chunkFrames = static_cast<std::size_t>(std::max<std::int64_t>(1, framesPerChunk));
+    const std::size_t sampleCount = audio.samples.size();
+    chunks->reserve(sampleCount / chunkFrames + (sampleCount % chunkFrames != 0 ? 1 : 0));
+
+    for (std::size_t startIndex = 0; startIndex < sampleCount; startIndex += chunkFrames) {
+        // The trailing chunk holds whatever remains and may be shorter than the rest.
+        const std::size_t endIndex = std::min(startIndex + chunkFrames, sampleCount);
+
+        AudioChunk chunk;
+        chunk.sampleRate = audio.sampleRate;
+        chunk.channels = audio.channels;
+        // Audio is mono at this point, so a sample index is also a frame offset.
+        chunk.streamOffsetFrames = static_cast<std::int64_t>(startIndex);
+        chunk.samples.assign(audio.samples.begin() + static_cast<std::ptrdiff_t>(startIndex),
+                             audio.samples.begin() + static_cast<std::ptrdiff_t>(endIndex));
+        chunks->push_back(std::move(chunk));
+    }
+
+    return true;
+}
diff --git a/src/transcription/audiochunker.h b/src/transcription/audiochunker.h
@@ -0,0 +1,27 @@
+#pragma once
+
+#include "transcription/transcriptiontypes.h"
+
+#include <QString>
+#include <vector>
+
+/**
+ * @file
+ * @brief Helpers for splitting normalized audio into deterministic stream chunks.
+ */
+
+/**
+ * @brief Splits normalized utterance audio into fixed-size streaming chunks.
+ */
+class AudioChunker final
+{
+public:
+    /**
+     * @brief Splits normalized audio into ordered chunks; a non-null `chunks` is cleared first.
+     * @param audio Normalized audio; it must be valid, mono, and have a positive sample rate.
+     * @param chunks Required output for the generated chunks; a null pointer is reported as an error.
+     * @param errorMessage Optional output describing why validation failed.
+     * @return `true` when at least one chunk was produced.
+     */
+    bool chunkAudio(const NormalizedAudio &audio, std::vector<AudioChunk> *chunks, QString *errorMessage = nullptr) const;
+};
diff --git a/src/transcription/transcriptassembler.cpp b/src/transcription/transcriptassembler.cpp
@@ -0,0 +1,33 @@
+#include "transcription/transcriptassembler.h"
+
+void TranscriptAssembler::reset()
+{
+    m_latestPartial.clear();   // Drop the pending partial text.
+    m_finalTranscript.clear(); // Drop every final segment gathered so far.
+}
+
+void TranscriptAssembler::applyUpdate(const TranscriptUpdate &update)
+{
+    for (const TranscriptEvent &event : update.events) {
+        const QString text = event.text.trimmed();
+        if (text.isEmpty()) {
+            continue; // Whitespace-only events contribute nothing.
+        }
+        if (event.kind != TranscriptEventKind::Final) {
+            m_latestPartial = text; // Non-final text replaces the previous partial.
+            continue;
+        }
+
+        // Final text is appended space-separated and supersedes the pending partial.
+        if (!m_finalTranscript.isEmpty()) {
+            m_finalTranscript += QLatin1Char(' ');
+        }
+        m_finalTranscript += text;
+        m_latestPartial.clear();
+    }
+}
+
+QString TranscriptAssembler::finalTranscript() const
+{
+    return m_finalTranscript.trimmed(); // Only final-event text; the latest partial is left out.
+}
diff --git a/src/transcription/transcriptassembler.h b/src/transcription/transcriptassembler.h
@@ -0,0 +1,38 @@
+#pragma once
+
+#include "transcription/transcriptiontypes.h"
+
+#include <QString>
+
+/**
+ * @file
+ * @brief Helpers for assembling clipboard-friendly text from transcript events.
+ */
+
+/**
+ * @brief Collects streaming transcript events into a final user-facing transcript.
+ */
+class TranscriptAssembler final
+{
+public:
+    /**
+     * @brief Clears both the accumulated final text and the latest partial text.
+     */
+    void reset();
+
+    /**
+     * @brief Folds a streaming update into the assembled transcript state.
+     * @param update Session update; events whose trimmed text is empty are ignored.
+     */
+    void applyUpdate(const TranscriptUpdate &update);
+
+    /**
+     * @brief Returns the final-event text gathered so far; partial text is excluded.
+     * @return Trimmed final segments joined by single spaces.
+     */
+    [[nodiscard]] QString finalTranscript() const;
+
+private:
+    QString m_finalTranscript; ///< Trimmed text of final events, joined with single spaces.
+    QString m_latestPartial;   ///< Most recent non-final text; cleared by a non-empty final event.
+};