Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -143,6 +143,38 @@ endif()
# voxtral: command-line executable built from src/main.cpp and linked against voxtral_lib.
add_executable(voxtral src/main.cpp)
target_link_libraries(voxtral PRIVATE voxtral_lib)

# ── voxtral-server (HTTP transcription server) ──────────────────────
# Optional target, ON by default; pass -DVOXTRAL_BUILD_SERVER=OFF to leave it out.
option(VOXTRAL_BUILD_SERVER "Build voxtral-server HTTP executable" ON)

if(VOXTRAL_BUILD_SERVER)
    include(FetchContent)

    # Must be set before the dependency is made available so its own CMake
    # project sees the value. NOTE(review): OFF is expected to keep cpp-httplib
    # in its header-only form — confirm against the pinned release.
    set(HTTPLIB_COMPILE OFF CACHE BOOL "" FORCE)

    # cpp-httplib is downloaded at configure time, pinned to a release tag.
    FetchContent_Declare(
        httplib
        GIT_REPOSITORY https://github.com/yhirose/cpp-httplib.git
        GIT_TAG        v0.20.0
        GIT_SHALLOW    TRUE
    )
    FetchContent_MakeAvailable(httplib)

    add_executable(voxtral-server src/server.cpp)
    target_link_libraries(voxtral-server PRIVATE voxtral_lib httplib::httplib)

    # Strict warnings, promoted to errors, only when the project-wide switch is on.
    if(VOXTRAL_WARNINGS_AS_ERRORS AND MSVC)
        target_compile_options(voxtral-server PRIVATE /W4 /WX)
    elseif(VOXTRAL_WARNINGS_AS_ERRORS)
        target_compile_options(voxtral-server PRIVATE -Wall -Wextra -Wpedantic -Werror)
    endif()

    # Host-tuned code generation; skipped for MSVC and on Apple platforms.
    if(VOXTRAL_NATIVE_OPT AND NOT MSVC AND NOT APPLE)
        target_compile_options(voxtral-server PRIVATE -march=native -mtune=native)
    endif()
endif()

# voxtral-quantize: built from src/voxtral-quantize.cpp; links ggml and Threads directly (not voxtral_lib).
add_executable(voxtral-quantize src/voxtral-quantize.cpp)
target_link_libraries(voxtral-quantize PRIVATE ggml Threads::Threads)

Expand Down
47 changes: 45 additions & 2 deletions src/main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ struct cli_params {
int32_t max_tokens = 256;
voxtral_log_level log_level = voxtral_log_level::info;
voxtral_gpu_backend gpu = voxtral_gpu_backend::none;
bool stdin_mode = false;
};

struct backend_reg_info {
Expand Down Expand Up @@ -213,6 +214,9 @@ void print_usage(const char * argv0) {
<< " --output-text PATH write decoded text to file (still prints to stdout)\n"
<< " --gpu BACKEND gpu backend: auto|cuda|metal|vulkan|none (default: none)\n"
<< " --metal alias for --gpu metal\n"
<< " --stdin interactive mode: read audio paths from stdin (one per line),\n"
<< " keeps model loaded between transcriptions.\n"
<< " Output ends with __VOXTRAL_END__ sentinel per file.\n"
<< " -h, --help show this help\n";
}

Expand Down Expand Up @@ -357,6 +361,8 @@ bool parse_args(int argc, char ** argv, cli_params & p) {
}
} else if (a == "--metal") {
p.gpu = voxtral_gpu_backend::metal;
} else if (a == "--stdin") {
p.stdin_mode = true;
} else {
std::cerr << "unknown option: " << a << "\n";
return false;
Expand All @@ -368,8 +374,8 @@ bool parse_args(int argc, char ** argv, cli_params & p) {
return false;
}

if (p.audio.empty()) {
std::cerr << "--audio is required\n";
if (p.audio.empty() && !p.stdin_mode) {
std::cerr << "--audio is required (or use --stdin for interactive mode)\n";
return false;
}

Expand Down Expand Up @@ -438,6 +444,43 @@ int main(int argc, char ** argv) {
return finish(3);
}

// Interactive stdin mode: the already-created context is reused for every
// request, so audio paths can be fed one per line without reloading anything.
//
// Protocol on stdout:
//   __VOXTRAL_READY__   once, before the first path is read
//   <one result line>   the transcript, "[no-transcript]" when the text is
//                       empty, or an "[error] ..." line on failure
//   __VOXTRAL_END__     after every processed path
// Progress and timing information is written to stderr.
if (p.stdin_mode) {
    std::cerr << "voxtral: stdin mode ready, waiting for audio paths...\n";
    // std::endl on purpose: flush so a reader on the other end of a pipe
    // sees the marker immediately.
    std::cout << "__VOXTRAL_READY__" << std::endl;

    // Characters stripped from both ends of each input line. '\r' covers
    // CRLF-terminated input; '\t' and leading blanks cover padded lines.
    const char * const blanks = " \t\r\n";

    std::string line;
    while (std::getline(std::cin, line)) {
        const std::string::size_type first = line.find_first_not_of(blanks);
        if (first == std::string::npos) {
            // Empty or whitespace-only line: nothing to transcribe, and no
            // sentinel is emitted for it.
            continue;
        }
        const std::string::size_type last = line.find_last_not_of(blanks);
        const std::string path = line.substr(first, last - first + 1);

        const auto t_req = std::chrono::steady_clock::now();

        voxtral_result result;
        if (!voxtral_transcribe_file(*ctx, path, p.max_tokens, result)) {
            // Failures are reported in-band so the reader still receives
            // exactly one result line followed by the sentinel.
            std::cout << "[error] transcription failed for: " << path << "\n";
        } else {
            const std::string text = result.text.empty() ? std::string("[no-transcript]") : result.text;
            std::cout << text << "\n";
        }

        const double req_ms = std::chrono::duration<double, std::milli>(
            std::chrono::steady_clock::now() - t_req).count();
        std::cerr << std::fixed << std::setprecision(2)
                  << "[stdin] transcribed " << path << " in " << req_ms << " ms\n";

        // Flushing end-of-response marker.
        std::cout << "__VOXTRAL_END__" << std::endl;
    }

    // stdin reached EOF (or failed): release resources and exit normally.
    voxtral_free(ctx);
    voxtral_model_free(model);
    return finish(0);
}

voxtral_result result;
if (!voxtral_transcribe_file(*ctx, p.audio, p.max_tokens, result)) {
voxtral_free(ctx);
Expand Down
Loading