[libcxx] Fix using std::wcout/wcin on Windows with streams configured in wide mode

On Windows, the underlying file descriptors for stdout/stdin/stderr can be reconfigured to wide mode. In the default (narrow) mode, the charset usually isn't utf8 (as libcxx assumes), but normally a locale specific codepage (where each codepage can only represent a small subset of unicode characters). By configuring the stdout file descriptor to wide mode, the user can output wchar_t based strings without conversion to the narrow charset. Within libcxx, don't try to use codecvt to convert this to a narrow character encoding, but output these strings as such with fputwc. In wide mode, such strings could be output directly with fwrite too, but if the file descriptor hasn't been configured in wide mode, that breaks the output (which currently works reasonably). By always outputting one character at a time with fputwc, it works regardless of the mode of the stdout file descriptor. For the narrow output stream, std::cout, outputting (via fwrite) does fail when the file descriptor is set to wide mode. This matches how it behaves with both MS STL and GNU libstdc++ too, so this is probably acceptable. This fixes https://github.com/llvm/llvm-project/issues/46646, and the downstream bugs https://github.com/mstorsjo/llvm-mingw/issues/145 and https://github.com/mstorsjo/llvm-mingw/issues/222. Differential Revision: https://reviews.llvm.org/D146398
2024-10-08 03:13:58 +00:00 · 2023-03-15 12:11:28 +02:00 · 2023-03-15 12:11:28 +02:00 · fcbbd9649a
commit fcbbd9649a
parent 4b9764959d
12 changed files with 387 additions and 36 deletions
--- a/libcxx/docs/UsingLibcxx.rst
+++ b/libcxx/docs/UsingLibcxx.rst
@ -552,3 +552,26 @@ Unpoisoning may not be an option, if (for example) you are not maintaining the a

 * You are using allocator, which does not call destructor during deallocation.
 * You are aware that memory allocated with an allocator may be accessed, even when unused by container.
+
+Platform specific behavior
+==========================
+
+Windows
+-------
+
+The ``stdout``, ``stderr``, and ``stdin`` file streams can be placed in
+Unicode mode by a suitable call to ``_setmode()``. When in this mode,
+the sequence of bytes read from, or written to, these streams is interpreted
+as a sequence of little-endian ``wchar_t`` elements. Thus, use of
+``std::cout``, ``std::cerr``, or ``std::cin`` with streams in Unicode mode
+will not behave as they usually do since bytes read or written won't be
+interpreted as individual ``char`` elements. However, ``std::wcout``,
+``std::wcerr``, and ``std::wcin`` will behave as expected.
+
+Wide character streams such as ``std::wcin`` or ``std::wcout`` imbued with a
+locale behave differently than they otherwise do. By default, wide character
+streams don't convert wide characters but input/output them as is. If a
+specific locale is imbued, the IO with the underlying stream happens with
+regular ``char`` elements, which are converted to/from wide characters
+according to the locale. Note that this doesn't behave as expected if the
+stream has been set in Unicode mode.
--- a/libcxx/src/std_stream.h
+++ b/libcxx/src/std_stream.h
@ -60,6 +60,12 @@ private:
    bool __last_consumed_is_next_;
    bool __always_noconv_;

+#if defined(_LIBCPP_WIN32API)
+    static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>;
+#else
+    static constexpr bool __is_win32api_wide_char = false;
+#endif
+
    __stdinbuf(const __stdinbuf&);
    __stdinbuf& operator=(const __stdinbuf&);

@ -74,6 +80,12 @@ __stdinbuf<_CharT>::__stdinbuf(FILE* __fp, state_type* __st)
      __last_consumed_is_next_(false)
 {
    imbue(this->getloc());
+    // On Windows, in wchar_t mode, ignore the codecvt from the locale by
+    // default and assume noconv; this passes wchar_t through unmodified from
+    // getwc. If the user sets a custom locale with imbue(), that gets honored,
+    // the IO is done with getc() and converted with the provided codecvt.
+    if constexpr (__is_win32api_wide_char)
+        __always_noconv_ = true;
 }

 template <class _CharT>
@ -101,6 +113,36 @@ __stdinbuf<_CharT>::uflow()
    return __getchar(true);
 }

+static bool __do_getc(FILE *__fp, char *__pbuf) {  // Narrow overload: reads one char from __fp with getc().
+    int __c = getc(__fp);
+    if (__c == EOF)  // Nothing could be read; *__pbuf is left unmodified.
+        return false;
+    *__pbuf = static_cast<char>(__c);
+    return true;  // Success: the character read has been stored in *__pbuf.
+}
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+static bool __do_getc(FILE *__fp, wchar_t *__pbuf) {  // Wide overload: reads one wchar_t from __fp with getwc().
+    wint_t __c = getwc(__fp);
+    if (__c == WEOF)  // Nothing could be read; *__pbuf is left unmodified.
+        return false;
+    *__pbuf = static_cast<wchar_t>(__c);
+    return true;  // Success: the wide character read has been stored in *__pbuf.
+}
+#endif
+
+static bool __do_ungetc(int __c, FILE *__fp, char __dummy) {  // Narrow overload: pushes __c back onto __fp with ungetc().
+    if (ungetc(__c, __fp) == EOF)  // ungetc reports failure by returning EOF.
+        return false;
+    return true;  // NOTE(review): __dummy is never read; presumably it only selects this overload by character type.
+}
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+static bool __do_ungetc(std::wint_t __c, FILE *__fp, wchar_t __dummy) {  // Wide overload: pushes __c back onto __fp with ungetwc().
+    if (ungetwc(__c, __fp) == WEOF)  // ungetwc reports failure by returning WEOF.
+        return false;
+    return true;  // NOTE(review): __dummy is never read; presumably it only selects this overload by character type.
+}
+#endif
+
 template <class _CharT>
 typename __stdinbuf<_CharT>::int_type
 __stdinbuf<_CharT>::__getchar(bool __consume)
@ -115,6 +157,20 @@ __stdinbuf<_CharT>::__getchar(bool __consume)
        }
        return __result;
    }
+    if (__always_noconv_) {
+        char_type __1buf;
+        if (!__do_getc(__file_, &__1buf))
+            return traits_type::eof();
+        if (!__consume)
+        {
+            if (!__do_ungetc(traits_type::to_int_type(__1buf), __file_, __1buf))
+                return traits_type::eof();
+        }
+        else
+            __last_consumed_ = traits_type::to_int_type(__1buf);
+        return traits_type::to_int_type(__1buf);
+    }
+
    char __extbuf[__limit];
    int __nread = _VSTD::max(1, __encoding_);
    for (int __i = 0; __i < __nread; ++__i)
@ -125,42 +181,37 @@ __stdinbuf<_CharT>::__getchar(bool __consume)
        __extbuf[__i] = static_cast<char>(__c);
    }
    char_type __1buf;
-    if (__always_noconv_)
-        __1buf = static_cast<char_type>(__extbuf[0]);
-    else
+    const char* __enxt;
+    char_type* __inxt;
+    codecvt_base::result __r;
+    do
    {
-        const char* __enxt;
-        char_type* __inxt;
-        codecvt_base::result __r;
-        do
+        state_type __sv_st = *__st_;
+        __r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt,
+                               &__1buf, &__1buf + 1, __inxt);
+        switch (__r)
        {
-            state_type __sv_st = *__st_;
-            __r = __cv_->in(*__st_, __extbuf, __extbuf + __nread, __enxt,
-                                   &__1buf, &__1buf + 1, __inxt);
-            switch (__r)
-            {
-            case _VSTD::codecvt_base::ok:
-                break;
-            case codecvt_base::partial:
-                *__st_ = __sv_st;
-                if (__nread == sizeof(__extbuf))
-                    return traits_type::eof();
-                {
-                    int __c = getc(__file_);
-                    if (__c == EOF)
-                        return traits_type::eof();
-                    __extbuf[__nread] = static_cast<char>(__c);
-                }
-                ++__nread;
-                break;
-            case codecvt_base::error:
+        case _VSTD::codecvt_base::ok:
+            break;
+        case codecvt_base::partial:
+            *__st_ = __sv_st;
+            if (__nread == sizeof(__extbuf))
                return traits_type::eof();
-            case _VSTD::codecvt_base::noconv:
-                __1buf = static_cast<char_type>(__extbuf[0]);
-                break;
+            {
+                int __c = getc(__file_);
+                if (__c == EOF)
+                    return traits_type::eof();
+                __extbuf[__nread] = static_cast<char>(__c);
            }
-        } while (__r == _VSTD::codecvt_base::partial);
-    }
+            ++__nread;
+            break;
+        case codecvt_base::error:
+            return traits_type::eof();
+        case _VSTD::codecvt_base::noconv:
+            __1buf = static_cast<char_type>(__extbuf[0]);
+            break;
+        }
+    } while (__r == _VSTD::codecvt_base::partial);
    if (!__consume)
    {
        for (int __i = __nread; __i > 0;)
@ -188,8 +239,11 @@ __stdinbuf<_CharT>::pbackfail(int_type __c)
        }
        return __c;
    }
-    if (__last_consumed_is_next_)
-    {
+    if (__always_noconv_ && __last_consumed_is_next_) {
+        if (!__do_ungetc(__last_consumed_, __file_,
+                         traits_type::to_char_type(__last_consumed_)))
+            return traits_type::eof();
+    } else if (__last_consumed_is_next_) {
        char __extbuf[__limit];
        char* __enxt;
        const char_type __ci = traits_type::to_char_type(__last_consumed_);
@ -244,6 +298,12 @@ private:
    state_type* __st_;
    bool __always_noconv_;

+#if defined(_LIBCPP_WIN32API)
+    static constexpr bool __is_win32api_wide_char = !is_same_v<_CharT, char>;
+#else
+    static constexpr bool __is_win32api_wide_char = false;
+#endif
+
    __stdoutbuf(const __stdoutbuf&);
    __stdoutbuf& operator=(const __stdoutbuf&);
 };
@ -255,8 +315,31 @@ __stdoutbuf<_CharT>::__stdoutbuf(FILE* __fp, state_type* __st)
      __st_(__st),
      __always_noconv_(__cv_->always_noconv())
 {
+    // On Windows, in wchar_t mode, ignore the codecvt from the locale by
+    // default and assume noconv; this passes wchar_t through unmodified to
+    // fputwc, which handles it correctly depending on the actual mode of the
+    // output stream. If the user sets a custom locale with imbue(), that
+    // gets honored.
+    if constexpr (__is_win32api_wide_char)
+        __always_noconv_ = true;
 }

+static bool __do_fputc(char __c, FILE* __fp) {  // Narrow overload: writes the single char __c to __fp with fwrite().
+    if (fwrite(&__c, sizeof(__c), 1, __fp) != 1)  // One item was requested; any other count means the write failed.
+        return false;
+    return true;
+}
+#ifndef _LIBCPP_HAS_NO_WIDE_CHARACTERS
+static bool __do_fputc(wchar_t __c, FILE* __fp) {  // Wide overload: writes the single wchar_t __c to __fp with fputwc().
+    // fputwc works regardless of wide/narrow mode of stdout, while
+    // fwrite of wchar_t only works if the stream actually has been set
+    // into wide mode.
+    if (fputwc(__c, __fp) == WEOF)  // fputwc reports failure by returning WEOF.
+        return false;
+    return true;
+}
+#endif
+
 template <class _CharT>
 typename __stdoutbuf<_CharT>::int_type
 __stdoutbuf<_CharT>::overflow(int_type __c)
@ -268,7 +351,7 @@ __stdoutbuf<_CharT>::overflow(int_type __c)
        __1buf = traits_type::to_char_type(__c);
        if (__always_noconv_)
        {
-            if (fwrite(&__1buf, sizeof(char_type), 1, __file_) != 1)
+            if (!__do_fputc(__1buf, __file_))
                return traits_type::eof();
        }
        else
@ -313,7 +396,10 @@ template <class _CharT>
 streamsize
 __stdoutbuf<_CharT>::xsputn(const char_type* __s, streamsize __n)
 {
-    if (__always_noconv_)
+    // For wchar_t on Windows, don't call fwrite(), but write characters one
+    // at a time with fputwc(); that works both when stdout is in the default
+    // mode and when it is set to Unicode mode.
+    if (__always_noconv_ && !__is_win32api_wide_char)
        return fwrite(__s, sizeof(char_type), __n, __file_);
    streamsize __i = 0;
    for (; __i < __n; ++__i, ++__s)
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stderr.sh
@ -0,0 +1,5 @@
+# Check that the stderr of the executed program matches a reference file.
+program=${1}  # first argument: the command to execute
+expected_file=${2}  # second argument: file holding the expected stderr bytes
+${program} 2>stderr.log >stdout.log  # capture both streams to log files; the program's exit status is not checked here
+cmp stderr.log "${expected_file}"  # byte-wise comparison; as the last command, its status is the script's result
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/check-stdout.sh
@ -0,0 +1,5 @@
+# Check that the stdout of the executed program matches a reference file.
+program=${1}  # first argument: the command to execute
+expected_file=${2}  # second argument: file holding the expected stdout bytes
+${program} 2>stderr.log >stdout.log  # capture both streams to log files; the program's exit status is not checked here
+cmp stdout.log "${expected_file}"  # byte-wise comparison; as the last command, its status is the script's result
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/send-stdin.sh
@ -0,0 +1,4 @@
+# Pass a reference file as stdin to a test executable.
+program=${1}  # first argument: the command to execute
+input=${2}  # second argument: file whose contents are fed to the program
+cat ${input} | ${program}  # the program receives the file contents through a pipe on its stdin
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/test.dat
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/test.dat
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-imbue.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-imbue.sh.cpp
@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// wostream wcerr;
+
+// UNSUPPORTED: no-wide-characters
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: ../check-stderr.sh
+// RUN: %{build}
+// RUN: %{exec} bash check-stderr.sh "%t.exe" "zzzz"
+
+#include <iostream>
+
+struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {  // Test facet: every wide character is written out as 'z'.
+  using base = std::codecvt<wchar_t, char, std::mbstate_t>;  // Alias for the base facet; not referenced elsewhere in this test.
+protected:
+  result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end,
+                const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const {
+    while (from != from_end && to != to_end) {  // Stop when the input is exhausted or the output buffer is full.
+      ++from;  // Consume one input character without looking at its value.
+      *to++ = 'z';  // Emit exactly one 'z' per consumed input character.
+    }
+    from_next = from;  // Report how far the input was consumed...
+    to_next = to;  // ...and how far the output was filled.
+    return ok;  // Always reports success, even if input remains unconverted.
+  }
+};
+
+int main(int, char**) {  // Writes L"1234" to wcerr through a locale carrying custom_codecvt.
+    std::locale loc(std::locale::classic(), new custom_codecvt);  // Classic locale plus the custom_codecvt facet.
+    std::wcerr.imbue(loc);  // From here on, wcerr output is expected to be converted by custom_codecvt.
+    std::wcerr << L"1234";  // Four wide characters in; the RUN line compares stderr against "zzzz".
+    return 0;
+}
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcerr-wide-mode.sh.cpp
@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// wostream wcerr;
+
+// UNSUPPORTED: no-wide-characters
+// REQUIRES: target={{.+}}-windows-{{.+}}
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: check-stderr.sh, test.dat
+// RUN: %{build}
+// RUN: %{exec} bash check-stderr.sh "%t.exe" "test.dat"
+
+// Check that wcerr works, preserving the unicode characters, after switching
+// stderr to wide mode.
+
+#include <iostream>
+#include <io.h>
+#include <fcntl.h>
+
+int main(int, char**) {  // Writes a string with non-ASCII characters to wcerr after switching stderr to wide mode.
+    _setmode(_fileno(stderr), _O_WTEXT);  // Put stderr's file descriptor into wide mode; the return value is not checked.
+    std::wcerr << L"1234\u20ac\u00e5\u00e4\u00f6";  // The RUN line compares the resulting stderr bytes against test.dat.
+    return 0;
+}
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-imbue.sh.cpp
@ -0,0 +1,45 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// wistream wcin;
+
+// UNSUPPORTED: no-wide-characters
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: ../send-stdin.sh
+// RUN: %{build}
+// RUN: %{exec} bash send-stdin.sh "%t.exe" "1234"
+
+#include <iostream>
+#include <cassert>
+
+struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {  // Test facet: every input char is read as L'z'.
+  using base = std::codecvt<wchar_t, char, std::mbstate_t>;  // Alias for the base facet; not referenced elsewhere in this test.
+protected:
+  result do_in(std::mbstate_t&, const char *from, const char *from_end,
+                const char *&from_next, wchar_t *to, wchar_t *to_end, wchar_t *&to_next) const {
+    while (from != from_end && to != to_end) {  // Stop when the input is exhausted or the output buffer is full.
+      ++from;  // Consume one input char without looking at its value.
+      *to++ = L'z';  // Produce exactly one L'z' per consumed input char.
+    }
+    from_next = from;  // Report how far the input was consumed...
+    to_next = to;  // ...and how far the output was filled.
+    return ok;  // Always reports success, even if input remains unconverted.
+  }
+};
+
+int main(int, char**) {  // Reads one word from wcin through a locale carrying custom_codecvt.
+    std::locale loc(std::locale::classic(), new custom_codecvt);  // Classic locale plus the custom_codecvt facet.
+    std::wcin.imbue(loc);  // From here on, wcin input is expected to be converted by custom_codecvt.
+    std::wstring str;
+    std::wcin >> str;  // The RUN line supplies "1234" on stdin.
+    assert(str == L"zzzz");  // Each of the four input chars must have been turned into L'z'.
+    return 0;
+}
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcin-wide-mode.sh.cpp
@ -0,0 +1,35 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// wistream wcin;
+
+// UNSUPPORTED: no-wide-characters
+// REQUIRES: target={{.+}}-windows-{{.+}}
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: send-stdin.sh, test.dat
+// RUN: %{build}
+// RUN: %{exec} bash send-stdin.sh "%t.exe" "test.dat"
+
+// Check that wcin works, preserving the unicode characters, after switching
+// stdin to wide mode.
+
+#include <iostream>
+#include <cassert>
+#include <io.h>
+#include <fcntl.h>
+
+int main(int, char**) {  // Reads one word from wcin after switching stdin to wide mode.
+    _setmode(_fileno(stdin), _O_WTEXT);  // Put stdin's file descriptor into wide mode; the return value is not checked.
+    std::wstring str;
+    std::wcin >> str;  // The RUN line feeds the contents of test.dat on stdin.
+    assert(str == L"1234\u20ac\u00e5\u00e4\u00f6");  // The non-ASCII characters must arrive unchanged.
+    return 0;
+}
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-imbue.sh.cpp
@ -0,0 +1,42 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// wostream wcout;
+
+// UNSUPPORTED: no-wide-characters
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: ../check-stdout.sh
+// RUN: %{build}
+// RUN: %{exec} bash check-stdout.sh "%t.exe" "zzzz"
+
+#include <iostream>
+
+struct custom_codecvt : std::codecvt<wchar_t, char, std::mbstate_t> {  // Test facet: every wide character is written out as 'z'.
+  using base = std::codecvt<wchar_t, char, std::mbstate_t>;  // Alias for the base facet; not referenced elsewhere in this test.
+protected:
+  result do_out(std::mbstate_t&, const wchar_t *from, const wchar_t *from_end,
+                const wchar_t *&from_next, char *to, char *to_end, char *&to_next) const {
+    while (from != from_end && to != to_end) {  // Stop when the input is exhausted or the output buffer is full.
+      ++from;  // Consume one input character without looking at its value.
+      *to++ = 'z';  // Emit exactly one 'z' per consumed input character.
+    }
+    from_next = from;  // Report how far the input was consumed...
+    to_next = to;  // ...and how far the output was filled.
+    return ok;  // Always reports success, even if input remains unconverted.
+  }
+};
+
+int main(int, char**) {  // Writes L"1234" to wcout through a locale carrying custom_codecvt.
+    std::locale loc(std::locale::classic(), new custom_codecvt);  // Classic locale plus the custom_codecvt facet.
+    std::wcout.imbue(loc);  // From here on, wcout output is expected to be converted by custom_codecvt.
+    std::wcout << L"1234";  // Four wide characters in; the RUN line compares stdout against "zzzz".
+    return 0;
+}
--- a/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp
+++ b/libcxx/test/std/input.output/iostream.objects/wide.stream.objects/wcout-wide-mode.sh.cpp
@ -0,0 +1,32 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// <iostream>
+
+// wostream wcout;
+
+// UNSUPPORTED: no-wide-characters
+// REQUIRES: target={{.+}}-windows-{{.+}}
+
+// UNSUPPORTED: executor-has-no-bash
+// FILE_DEPENDENCIES: check-stdout.sh, test.dat
+// RUN: %{build}
+// RUN: %{exec} bash check-stdout.sh "%t.exe" "test.dat"
+
+// Check that wcout works, preserving the unicode characters, after switching
+// stdout to wide mode.
+
+#include <iostream>
+#include <io.h>
+#include <fcntl.h>
+
+int main(int, char**) {  // Writes a string with non-ASCII characters to wcout after switching stdout to wide mode.
+    _setmode(_fileno(stdout), _O_WTEXT);  // Put stdout's file descriptor into wide mode; the return value is not checked.
+    std::wcout << L"1234\u20ac\u00e5\u00e4\u00f6";  // The RUN line compares the resulting stdout bytes against test.dat.
+    return 0;
+}