fix: handle both UTF-8 and ANSI model paths on Windows

Replace `std::codecvt_utf8` with `MultiByteToWideChar` for converting
model file paths to wide strings on Windows.

The previous code assumed `path_model` was always UTF-8 encoded, but
when whisper-cli is invoked via `main(argc, argv)`, the MSVC C runtime
converts the UTF-16 command line to the system ANSI code page (e.g.
CP936 on Chinese Windows), not UTF-8. Passing these ANSI bytes to
`std::wstring_convert::from_bytes()` throws `std::range_error`; the
exception is not caught, so the process is terminated and crashes with
STATUS_STACK_BUFFER_OVERRUN (0xC0000409).

The fix tries `MultiByteToWideChar(CP_UTF8)` with `MB_ERR_INVALID_CHARS`
first, and if the string is not valid UTF-8, falls back to
`MultiByteToWideChar(CP_ACP)`. A path whose bytes are valid in both
encodings is therefore interpreted as UTF-8. This handles both:
- UTF-8 paths (from manifest-enabled or Unicode-aware callers)
- ANSI paths (from the default MSVC main() using the system code page)

Also changes the guard from `_MSC_VER` to `_WIN32` to cover MinGW/Clang
on Windows, and removes the deprecated `<codecvt>` header dependency.

Fixes model loading crashes for users with non-ASCII characters in their
model file paths (e.g. Chinese, Japanese, Korean, Hebrew, Arabic).
This commit is contained in:
corvo007 2026-02-06 21:46:28 +08:00
parent 941bdabbe4
commit 5ee29c5e5b
1 changed file with 32 additions and 9 deletions

View File

@ -34,8 +34,8 @@
#include <thread>
#include <vector>
#ifdef _MSC_VER
#include <codecvt>
#ifdef _WIN32
#include <windows.h>
#endif
#if defined(WHISPER_BIG_ENDIAN)
@ -3623,10 +3623,23 @@ struct whisper_context_params whisper_context_default_params() {
struct whisper_context * whisper_init_from_file_with_params_no_state(const char * path_model, struct whisper_context_params params) {
WHISPER_LOG_INFO("%s: loading model from '%s'\n", __func__, path_model);
#ifdef _MSC_VER
// Convert UTF-8 path to wide string (UTF-16) for Windows, resolving character encoding issues.
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring path_model_wide = converter.from_bytes(path_model);
#ifdef _WIN32
// Convert path to wide string (UTF-16) for Windows.
// Try UTF-8 first; if invalid, fall back to the system ANSI code page (e.g. CP936).
// This handles both UTF-8 paths (from manifest-enabled or Unicode-aware callers)
// and ANSI paths (from the default MSVC main() which uses the system code page).
std::wstring path_model_wide;
int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, path_model, -1, nullptr, 0);
if (wlen > 0) {
path_model_wide.resize(wlen - 1);
MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, path_model, -1, &path_model_wide[0], wlen);
} else {
wlen = MultiByteToWideChar(CP_ACP, 0, path_model, -1, nullptr, 0);
if (wlen > 0) {
path_model_wide.resize(wlen - 1);
MultiByteToWideChar(CP_ACP, 0, path_model, -1, &path_model_wide[0], wlen);
}
}
auto fin = std::ifstream(path_model_wide, std::ios::binary);
#else
auto fin = std::ifstream(path_model, std::ios::binary);
@ -4718,9 +4731,19 @@ struct whisper_vad_context * whisper_vad_init_from_file_with_params(
const char * path_model,
struct whisper_vad_context_params params) {
WHISPER_LOG_INFO("%s: loading VAD model from '%s'\n", __func__, path_model);
#ifdef _MSC_VER
std::wstring_convert<std::codecvt_utf8<wchar_t>> converter;
std::wstring path_model_wide = converter.from_bytes(path_model);
#ifdef _WIN32
std::wstring path_model_wide;
int wlen = MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, path_model, -1, nullptr, 0);
if (wlen > 0) {
path_model_wide.resize(wlen - 1);
MultiByteToWideChar(CP_UTF8, MB_ERR_INVALID_CHARS, path_model, -1, &path_model_wide[0], wlen);
} else {
wlen = MultiByteToWideChar(CP_ACP, 0, path_model, -1, nullptr, 0);
if (wlen > 0) {
path_model_wide.resize(wlen - 1);
MultiByteToWideChar(CP_ACP, 0, path_model, -1, &path_model_wide[0], wlen);
}
}
auto fin = std::ifstream(path_model_wide, std::ios::binary);
#else
auto fin = std::ifstream(path_model, std::ios::binary);