[libcxx] Fix using std::wcout/wcin on Windows with streams configured in wide mode

On Windows, the underlying file descriptors for stdout/stdin/stderr
can be reconfigured to wide mode. In the default (narrow) mode, the
charset usually isn't UTF-8 (as libcxx assumes), but normally a
locale-specific codepage (where each codepage can only represent a small
subset of Unicode characters).

By configuring the stdout file descriptor to wide mode, the user can
output wchar_t based strings without conversion to the narrow charset.
Within libcxx, don't try to use codecvt to convert this to a narrow
character encoding, but output these strings as such with fputwc.

In wide mode, such strings could be output directly with fwrite too,
but if the file descriptor hasn't been configured in wide mode, that
breaks the output (which currently works reasonably). By always
outputting one character at a time with fputwc, it works regardless
of mode of the stdout file descriptor.

For the narrow output stream, std::cout, outputting (via fwrite)
does fail when the file descriptor is set to wide mode. This matches
how it behaves with both MS STL and GNU libstdc++ too, so this is
probably acceptable.

This fixes https://github.com/llvm/llvm-project/issues/46646, and
the downstream bugs https://github.com/mstorsjo/llvm-mingw/issues/145
and https://github.com/mstorsjo/llvm-mingw/issues/222.

Differential Revision: https://reviews.llvm.org/D146398
This commit is contained in:
Martin Storsjö 2023-03-15 12:11:28 +02:00
parent 4b9764959d
commit fcbbd9649a
12 changed files with 387 additions and 36 deletions

View File

@ -552,3 +552,26 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a
* You are using allocator, which does not call destructor during deallocation.
* You are aware that memory allocated with an allocator may be accessed, even when unused by container.
Platform specific behavior
==========================
Windows
-------
The ``stdout``, ``stderr``, and ``stdin`` file streams can be placed in
Unicode mode by a suitable call to ``_setmode()``. When in this mode,
the sequence of bytes read from, or written to, these streams is interpreted
as a sequence of little-endian ``wchar_t`` elements. Thus, use of
``std::cout``, ``std::cerr``, or ``std::cin`` with streams in Unicode mode
will not behave as they usually do since bytes read or written won't be
interpreted as individual ``char`` elements. However, ``std::wcout``,
``std::wcerr``, and ``std::wcin`` will behave as expected.
Wide character streams such as ``std::wcin`` or ``std::wcout`` imbued with a
locale behave differently than they otherwise do. By default, wide character
streams don't convert wide characters but input/output them as is. If a
specific locale is imbued, the IO with the underlying stream happens with
regular ``char`` elements, which are converted to/from wide characters
according to the locale. Note that this doesn't behave as expected if the
stream has been set in Unicode mode.

View File

@ -60,6 +60,12 @@ private:
bool __last_consumed_is_next_;
bool __always_noconv_;
#if defined(_LIBCPP_WIN32API)
static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>;
#else
static constexpr bool __is_win32api_wide_char = false;
#endif
__stdinbuf(const __stdinbuf&);
__stdinbuf& operator=(const __stdinbuf&);
@ -74,6 +80,12 @@ __stdinbuf<_CharT>::__stdinbuf(FILE* __fp, state_type* __st)
__last_consumed_is_next_(false)
{
imbue(this->getloc());
// On Windows, in wchar_t mode, ignore the codecvt from the locale by
// default and assume noconv; this passes wchar_t through unmodified from
// getwc. If the user sets a custom locale with imbue(), that gets honored,
// the IO is done with getc() and converted with the provided codecvt.
if constexpr (__is_win32api_wide_char)
__always_noconv_ = true;
}
template <class _CharT>
@ -101,6 +113,36 @@ __stdinbuf<_CharT>::uflow()
return __getchar(true);
}
// Reads one narrow character from __fp with getc() and stores it in *__pbuf.
// Returns false, leaving *__pbuf untouched, when getc() reports EOF;
// returns true otherwise.
static bool __do_getc(FILE *__fp, char *__pbuf) {
  const int __ch = getc(__fp);
  const bool __got_char = (__ch != EOF);
  if (__got_char)
    *__pbuf = static_cast<char>(__ch);
  return __got_char;
}
#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
// Wide-character counterpart: reads one wchar_t from __fp with getwc() and
// stores it in *__pbuf. Returns false, leaving *__pbuf untouched, when
// getwc() reports WEOF; returns true otherwise.
static bool __do_getc(FILE *__fp, wchar_t *__pbuf) {
  const wint_t __wch = getwc(__fp);
  const bool __got_char = (__wch != WEOF);
  if (__got_char)
    *__pbuf = static_cast<wchar_t>(__wch);
  return __got_char;
}
#endif
// Pushes the narrow character __c back onto __fp with ungetc().
// Returns true on success and false when ungetc() reports EOF.
// __dummy is not read by the body; its type distinguishes this overload
// from the wchar_t one.
static bool __do_ungetc(int __c, FILE *__fp, char __dummy) {
  return ungetc(__c, __fp) != EOF;
}
#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
// Pushes the wide character __c back onto __fp with ungetwc().
// Returns true on success and false when ungetwc() reports WEOF.
// __dummy is not read by the body; its type distinguishes this overload
// from the char one.
static bool __do_ungetc(std::wint_t __c, FILE *__fp, wchar_t __dummy) {
  return ungetwc(__c, __fp) != WEOF;
}
#endif
template <class _CharT>
typename __stdinbuf<_CharT>::int_type
__stdinbuf<_CharT>::__getchar(bool __consume)
@ -115,6 +157,20 @@ __stdinbuf<_CharT>::__getchar(bool __consume)
}
return __result;
}
if (__always_noconv_) {
char_type __1buf;
if (!__do_getc(__file_, &__1buf))
return traits_type::eof();
if (!__consume)
{
if (!__do_ungetc(traits_type::to_int_type(__1buf), __file_, __1buf))
return traits_type::eof();
}
else
__last_consumed_ = traits_type::to_int_type(__1buf);
return traits_type::to_int_type(__1buf);
}
char __extbuf[__limit];
int __nread = _VSTD::max(1, __encoding_);
for (int __i = 0; __i < __nread; ++__i)
@ -125,42 +181,37 @@ __stdinbuf<_CharT>::__getchar(bool __consume)
__extbuf[__i] = static_cast<char>(__c);
}
char_type __1buf;
if (__always_noconv_)
__1buf = static_cast<char_type>(__extbuf[0]);
else
const char* __enxt;
char_type* __inxt;
codecvt_base::result __r;
do
{
const char* __enxt;
char_type* __inxt;
codecvt_base::result __r;
do
state_type __sv_st = *__st_;
__r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt,
&__1buf, &__1buf + 1, __inxt);
switch (__r)
{
state_type __sv_st = *__st_;
__r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt,
&__1buf, &__1buf + 1, __inxt);
switch (__r)
{
case _VSTD::codecvt_base::ok:
break;
case codecvt_base::partial:
*__st_ = __sv_st;
if (__nread == sizeof(__extbuf))
return traits_type::eof();
{
int __c = getc(__file_);
if (__c == EOF)
return traits_type::eof();
__extbuf[__nread] = static_cast<char>(__c);
}
++__nread;
break;
case codecvt_base::error:
case _VSTD::codecvt_base::ok:
break;
case codecvt_base::partial:
*__st_ = __sv_st;
if (__nread == sizeof(__extbuf))
return traits_type::eof();
case _VSTD::codecvt_base::noconv:
__1buf = static_cast<char_type>(__extbuf[0]);
break;
{
int __c = getc(__file_);
if (__c == EOF)
return traits_type::eof();
__extbuf[__nread] = static_cast<char>(__c);
}
} while (__r == _VSTD::codecvt_base::partial);
}
++__nread;
break;
case codecvt_base::error:
return traits_type::eof();
case _VSTD::codecvt_base::noconv:
__1buf = static_cast<char_type>(__extbuf[0]);
break;
}
} while (__r == _VSTD::codecvt_base::partial);
if (!__consume)
{
for (int __i = __nread; __i > 0;)
@ -188,8 +239,11 @@ __stdinbuf<_CharT>::pbackfail(int_type __c)
}
return __c;
}
if (__last_consumed_is_next_)
{
if (__always_noconv_ && __last_consumed_is_next_) {
if (!__do_ungetc(__last_consumed_, __file_,
traits_type::to_char_type(__last_consumed_)))
return traits_type::eof();
} else if (__last_consumed_is_next_) {
char __extbuf[__limit];
char* __enxt;
const char_type __ci = traits_type::to_char_type(__last_consumed_);
@ -244,6 +298,12 @@ private:
state_type* __st_;
bool __always_noconv_;
#if defined(_LIBCPP_WIN32API)
static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>;
#else
static constexpr bool __is_win32api_wide_char = false;
#endif
__stdoutbuf(const __stdoutbuf&);
__stdoutbuf& operator=(const __stdoutbuf&);
};
@ -255,8 +315,31 @@ __stdoutbuf<_CharT>::__stdoutbuf(FILE* __fp, state_type* __st)
__st_(__st),
__always_noconv_(__cv_->always_noconv())
{
// On Windows, in wchar_t mode, ignore the codecvt from the locale by
// default and assume noconv; this passes wchar_t through unmodified to
// fputwc, which handles it correctly depending on the actual mode of the
// output stream. If the user sets a custom locale with imbue(), that
// gets honored.
if constexpr (__is_win32api_wide_char)
__always_noconv_ = true;
}
// Writes the single narrow character __c to __fp via fwrite().
// Returns true only when fwrite() reports that exactly one element was
// written.
static bool __do_fputc(char __c, FILE* __fp) {
  return fwrite(&__c, sizeof(__c), 1, __fp) == 1;
}
#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
// Writes the single wide character __c to __fp and returns true unless
// fputwc() reports WEOF.
static bool __do_fputc(wchar_t __c, FILE* __fp) {
  // Use fputwc rather than fwrite: fputwc works whether stdout is in wide
  // or narrow mode, whereas an fwrite of wchar_t only works once the stream
  // has actually been switched into wide mode.
  return fputwc(__c, __fp) != WEOF;
}
#endif
template <class _CharT>
typename __stdoutbuf<_CharT>::int_type
__stdoutbuf<_CharT>::overflow(int_type __c)
@ -268,7 +351,7 @@ __stdoutbuf<_CharT>::overflow(int_type __c)
__1buf = traits_type::to_char_type(__c);
if (__always_noconv_)
{
if (fwrite(&__1buf, sizeof(char_type), 1, __file_) != 1)
if (!__do_fputc(__1buf, __file_))
return traits_type::eof();
}
else
@ -313,7 +396,10 @@ template <class _CharT>
streamsize
__stdoutbuf<_CharT>::xsputn(const char_type* __s, streamsize __n)
{
if (__always_noconv_)
// For wchar_t on Windows, don't call fwrite(), but write characters one
// at a time with fputwc(); that works both when stdout is in the default
// mode and when it is set to Unicode mode.
if (__always_noconv_ && !__is_win32api_wide_char)
return fwrite(__s, sizeof(char_type), __n, __file_);
streamsize __i = 0;
for (; __i < __n; ++__i, ++__s)

View File

@ -0,0 +1,5 @@
# Run a program and check that what it writes to stderr matches a reference
# file byte for byte.
#   $1: program to run
#   $2: file holding the expected stderr contents
# The program's stdout and stderr are captured in stdout.log and stderr.log
# in the current directory. The script's exit status is that of cmp.
program=${1}
expected_file=${2}
# Quote the expansion so that a program path containing whitespace or glob
# characters is still executed as a single command word.
"${program}" 2>stderr.log >stdout.log
cmp stderr.log "${expected_file}"

View File

@ -0,0 +1,5 @@
# Run a program and check that what it writes to stdout matches a reference
# file byte for byte.
#   $1: program to run
#   $2: file holding the expected stdout contents
# The program's stdout and stderr are captured in stdout.log and stderr.log
# in the current directory. The script's exit status is that of cmp.
program=${1}
expected_file=${2}
# Quote the expansion so that a program path containing whitespace or glob
# characters is still executed as a single command word.
"${program}" 2>stderr.log >stdout.log
cmp stdout.log "${expected_file}"

View File

@ -0,0 +1,4 @@
# Feed the contents of a reference file to a test executable on its stdin.
#   $1: program to run
#   $2: file whose contents become the program's stdin
program=${1}
input=${2}
# Quote both expansions so that paths containing whitespace or glob
# characters are each passed through as a single word.
cat "${input}" | "${program}"

View File

@ -0,0 +1,42 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <iostream>
// wostream wcerr;
// UNSUPPORTED: no-wide-characters
// UNSUPPORTED: executor-has-no-bash
// FILE_DEPENDENCIES: ../check-stderr.sh
// RUN: %{build}
// RUN: %{exec} bash check-stderr.sh "%t.exe" "zzzz"
#include <iostream>
// Codecvt facet used to check that an imbued locale is honored on output:
// it ignores the value of each wide character and converts it to the single
// narrow character 'z'.
struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {
  using base = std::codecvt<wchar_t, char, std::mbstate_t>;
protected:
  // Writes one 'z' per input character until either the input range or the
  // output range is exhausted. from_next/to_next report how far each range
  // advanced; the result is always ok. Marked override so that a signature
  // mismatch with the base class is a compile error instead of silently
  // leaving the default conversion in place.
  result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end,
                const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const override {
    for (; from != from_end && to != to_end; ++from)
      *to++ = 'z';
    from_next = from;
    to_next = to;
    return ok;
  }
};
// Imbues wcerr with a locale whose codecvt facet is custom_codecvt and then
// writes L"1234" to it; the RUN line compares the resulting stderr against
// "zzzz".
int main(int, char**) {
  const std::locale custom_loc(std::locale::classic(), new custom_codecvt);
  std::wcerr.imbue(custom_loc);
  std::wcerr << L"1234";
  return 0;
}

View File

@ -0,0 +1,32 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <iostream>
// wostream wcerr;
// UNSUPPORTED: no-wide-characters
// REQUIRES: target={{.+}}-windows-{{.+}}
// UNSUPPORTED: executor-has-no-bash
// FILE_DEPENDENCIES: check-stderr.sh, test.dat
// RUN: %{build}
// RUN: %{exec} bash check-stderr.sh "%t.exe" "test.dat"
// Check that wcerr works, preserving the unicode characters, after switching
// stderr to wide mode.
#include <iostream>
#include <io.h>
#include <fcntl.h>
// Switches the stderr file descriptor to _O_WTEXT mode and then writes a
// string containing non-ASCII characters through wcerr.
int main(int, char**) {
  const int stderr_fd = _fileno(stderr);
  _setmode(stderr_fd, _O_WTEXT);
  std::wcerr << L"1234\u20ac\u00e5\u00e4\u00f6";
  return 0;
}

View File

@ -0,0 +1,45 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <iostream>
// wistream wcin;
// UNSUPPORTED: no-wide-characters
// UNSUPPORTED: executor-has-no-bash
// FILE_DEPENDENCIES: ../send-stdin.sh
// RUN: %{build}
// RUN: %{exec} bash send-stdin.sh "%t.exe" "1234"
#include <iostream>
#include <cassert>
// Codecvt facet used to check that an imbued locale is honored on input:
// it ignores the value of each narrow character and converts it to the
// single wide character L'z'.
struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {
  using base = std::codecvt<wchar_t, char, std::mbstate_t>;
protected:
  // Produces one L'z' per input character until either the input range or
  // the output range is exhausted. from_next/to_next report how far each
  // range advanced; the result is always ok. Marked override so that a
  // signature mismatch with the base class is a compile error instead of
  // silently leaving the default conversion in place.
  result do_in(std::mbstate_t&, const char *from, const char *from_end,
               const char *&from_next, wchar_t *to, wchar_t *to_end, wchar_t *&to_next) const override {
    for (; from != from_end && to != to_end; ++from)
      *to++ = L'z';
    from_next = from;
    to_next = to;
    return ok;
  }
};
int main(int, char**) {
std::locale loc(std::locale::classic(), new custom_codecvt);
std::wcin.imbue(loc);
std::wstring str;
std::wcin >> str;
assert(str == L"zzzz");
return 0;
}

View File

@ -0,0 +1,35 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <iostream>
// wistream wcin;
// UNSUPPORTED: no-wide-characters
// REQUIRES: target={{.+}}-windows-{{.+}}
// UNSUPPORTED: executor-has-no-bash
// FILE_DEPENDENCIES: send-stdin.sh, test.dat
// RUN: %{build}
// RUN: %{exec} bash send-stdin.sh "%t.exe" "test.dat"
// Check that wcin works, preserving the unicode characters, after switching
// stdin to wide mode.
#include <iostream>
#include <cassert>
#include <io.h>
#include <fcntl.h>
// Switches the stdin file descriptor to _O_WTEXT mode, extracts one
// whitespace-delimited word through wcin and checks that the non-ASCII
// characters arrived intact.
int main(int, char**) {
  const int stdin_fd = _fileno(stdin);
  _setmode(stdin_fd, _O_WTEXT);
  std::wstring word;
  std::wcin >> word;
  assert(word == L"1234\u20ac\u00e5\u00e4\u00f6");
  return 0;
}

View File

@ -0,0 +1,42 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <iostream>
// wostream wcout;
// UNSUPPORTED: no-wide-characters
// UNSUPPORTED: executor-has-no-bash
// FILE_DEPENDENCIES: ../check-stdout.sh
// RUN: %{build}
// RUN: %{exec} bash check-stdout.sh "%t.exe" "zzzz"
#include <iostream>
// Codecvt facet used to check that an imbued locale is honored on output:
// it ignores the value of each wide character and converts it to the single
// narrow character 'z'.
struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {
  using base = std::codecvt<wchar_t, char, std::mbstate_t>;
protected:
  // Writes one 'z' per input character until either the input range or the
  // output range is exhausted. from_next/to_next report how far each range
  // advanced; the result is always ok. Marked override so that a signature
  // mismatch with the base class is a compile error instead of silently
  // leaving the default conversion in place.
  result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end,
                const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const override {
    for (; from != from_end && to != to_end; ++from)
      *to++ = 'z';
    from_next = from;
    to_next = to;
    return ok;
  }
};
// Imbues wcout with a locale whose codecvt facet is custom_codecvt and then
// writes L"1234" to it; the RUN line compares the resulting stdout against
// "zzzz".
int main(int, char**) {
  const std::locale custom_loc(std::locale::classic(), new custom_codecvt);
  std::wcout.imbue(custom_loc);
  std::wcout << L"1234";
  return 0;
}

View File

@ -0,0 +1,32 @@
//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
// <iostream>
// wostream wcout;
// UNSUPPORTED: no-wide-characters
// REQUIRES: target={{.+}}-windows-{{.+}}
// UNSUPPORTED: executor-has-no-bash
// FILE_DEPENDENCIES: check-stdout.sh, test.dat
// RUN: %{build}
// RUN: %{exec} bash check-stdout.sh "%t.exe" "test.dat"
// Check that wcout works, preserving the unicode characters, after switching
// stdout to wide mode.
#include <iostream>
#include <io.h>
#include <fcntl.h>
// Switches the stdout file descriptor to _O_WTEXT mode and then writes a
// string containing non-ASCII characters through wcout.
int main(int, char**) {
  const int stdout_fd = _fileno(stdout);
  _setmode(stdout_fd, _O_WTEXT);
  std::wcout << L"1234\u20ac\u00e5\u00e4\u00f6";
  return 0;
}