Merge pull request #304 from easyaspi314/unicode-windows-fixes

Fix Unicode support on Windows, minor Windows tweaks
2024-11-24 06:59:40 +00:00 · 2020-02-24 09:55:47 -08:00 · 2020-02-24 09:55:47 -08:00 · 64f655a28e
commit 64f655a28e
parent 71f0f6ffd3 0197a2b5b0
6 changed files with 346 additions and 28 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,20 +1,21 @@
 # objects
 *.o
+*.obj
 *.s

 # libraries
 libxxhash.*

 # Executables
+*.exe
 xxh32sum
 xxh64sum
 xxh128sum
 xxhsum
-xxhsum.exe
 xxhsum32
 xxhsum_privateXXH
 xxhsum_inlinedXXH
-xxhsum_inlinedXXH.exe
+tests/generate_unicode_test

 # compilation chain
 .clang_complete
@ -40,3 +41,5 @@ infer-out
 # test artifacts
 .test*
 tmp*
+tests/*.unicode
+tests/unicode_test*
--- a/33
+++ b/33
@ -76,20 +76,22 @@ default: lib xxhsum_and_links
 all: lib xxhsum xxhsum_inlinedXXH

 xxhsum: xxhash.o xxhsum.o  ## generate command line interface (CLI)
+	$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)

 xxhsum32: CFLAGS += -m32  ## generate CLI in 32-bits mode
 xxhsum32: xxhash.c xxhsum.c  ## do not generate object (avoid mixing different ABI)
 	$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)

-xxhash.o: xxhash.h xxh3.h
-
-xxhsum.o: xxhash.h
+xxhash.o: xxhash.c xxhash.h xxh3.h
+	$(CC) $(FLAGS) -c $< -o $@
+xxhsum.o: xxhsum.c xxhash.h
+	$(CC) $(FLAGS) -c $< -o $@

 .PHONY: xxhsum_and_links
 xxhsum_and_links: xxhsum xxh32sum xxh64sum xxh128sum

 xxh32sum xxh64sum xxh128sum: xxhsum
-	ln -sf $^ $@
+	ln -sf $<$(EXT) $@$(EXT)

 xxhsum_inlinedXXH: CPPFLAGS += -DXXH_INLINE_ALL
 xxhsum_inlinedXXH: xxhsum.c
@ -141,7 +143,7 @@ clean:  ## remove all build artifacts
 	@$(RM) -r *.dSYM   # Mac OS-X specific
 	@$(RM) core *.o libxxhash.*
 	@$(RM) xxhsum$(EXT) xxhsum32$(EXT) xxhsum_inlinedXXH$(EXT)
-	@$(RM) xxh32sum xxh64sum xxh128sum
+	@$(RM) xxh32sum$(EXT) xxh64sum$(EXT) xxh128sum$(EXT)
 	@echo cleaning completed


@ -154,20 +156,23 @@ clean:  ## remove all build artifacts
 .PHONY: check
 check: xxhsum   ## basic tests for xxhsum CLI, set RUN_ENV for emulated environments
 	# stdin
-	$(RUN_ENV) ./xxhsum < xxhash.c
+	$(RUN_ENV) ./xxhsum$(EXT) < xxhash.c
 	# multiple files
-	$(RUN_ENV) ./xxhsum xxhash.* xxhsum.*
+	$(RUN_ENV) ./xxhsum$(EXT) xxhash.* xxhsum.*
 	# internal bench
-	$(RUN_ENV) ./xxhsum -bi1
+	$(RUN_ENV) ./xxhsum$(EXT) -bi1
 	# file bench
-	$(RUN_ENV) ./xxhsum -bi1 xxhash.c
+	$(RUN_ENV) ./xxhsum$(EXT) -bi1 xxhash.c
 	# 32-bit
-	$(RUN_ENV) ./xxhsum -H0 xxhash.c
+	$(RUN_ENV) ./xxhsum$(EXT) -H0 xxhash.c
 	# 128-bit
-	$(RUN_ENV) ./xxhsum -H2 xxhash.c
+	$(RUN_ENV) ./xxhsum$(EXT) -H2 xxhash.c
 	# request incorrect variant
-	$(RUN_ENV) ./xxhsum -H9 xxhash.c ; test $$? -eq 1
+	$(RUN_ENV) ./xxhsum$(EXT) -H9 xxhash.c ; test $$? -eq 1

+.PHONY: test-unicode
+test-unicode:
+	$(MAKE) -C tests test_unicode

 .PHONY: test-mem
 VALGRIND = valgrind --leak-check=yes --error-exitcode=1
@ -285,11 +290,11 @@ test: all namespaceTest check test-xxhsum-c c90test test-tools

 .PHONY: test-inline
 test-inline:
-	$(MAKE) -C tests test
+	$(MAKE) -C tests test_multiInclude

 .PHONY: test-all
 test-all: CFLAGS += -Werror
-test-all: test test32 clangtest cxxtest usan test-inline listL120 trailingWhitespace staticAnalyze
+test-all: test test32 clangtest cxxtest usan test-inline listL120 trailingWhitespace staticAnalyze test-unicode

 .PHONY: test-tools
 test-tools:
--- a/tests/Makefile
+++ b/tests/Makefile
@ -3,6 +3,19 @@ CFLAGS += -Wall -Wextra -g
 NM = nm
 GREP = grep

+# Define *.exe as extension for Windows systems
+ifneq (,$(filter Windows%,$(OS)))
+EXT =.exe
+else
+EXT =
+endif
+
+# Enable the Unicode filename test by default only when the environment
+# advertises a UTF-8 locale. Any of LC_ALL, LC_CTYPE or LANG may carry the
+# codeset, and it is commonly spelled "UTF-8", "utf-8", "UTF8" or "utf8"
+# (e.g. en_US.utf8), so accept all of them.
+# ENABLE_UNICODE can always be forced from the command line or environment.
+ifneq (,$(filter %UTF-8 %utf-8 %UTF8 %utf8,$(LC_ALL) $(LC_CTYPE) $(LANG)))
+ENABLE_UNICODE ?= 1
+else
+ENABLE_UNICODE ?= 0
+endif
+
 .PHONY: default
 default: all

@ -10,10 +23,10 @@ default: all
 all: test

 .PHONY: test
-test: test_multiinclude
+test: test_multiInclude test_unicode

-.PHONY: test_multiinclude
-test_multiinclude:
+.PHONY: test_multiInclude
+test_multiInclude:
 	@$(MAKE) clean
 	# compile without xxhash.o, ensure symbols exist within target
 	# note : built using only default rules
@ -34,6 +47,25 @@ test_multiinclude:
 	# ! $(NM) multiInclude | $(GREP) TESTN_
 	#@$(MAKE) clean

+# Build a private copy of the xxhsum CLI from the parent directory's sources.
+# Headers are listed as prerequisites only to trigger rebuilds;
+# $(filter %.c,$^) keeps them off the compile line.
+# $(CPPFLAGS) is honoured like in the xxhash.o rule of this Makefile.
+xxhsum$(EXT): ../xxhash.c ../xxhash.h ../xxh3.h ../xxhsum.c
+	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(filter %.c,$^) -o $@
+
+# Regression test: xxhsum must be able to open files with Unicode names.
+# https://github.com/Cyan4973/xxHash/issues/293
+# The real test only runs when ENABLE_UNICODE is anything but 0; otherwise
+# the target just prints a hint on how to force it.
+.PHONY: test_unicode
+ifneq (0,$(ENABLE_UNICODE))
+test_unicode: xxhsum$(EXT) generate_unicode_test.c
+	# Generate a Unicode filename test dynamically
+	# to keep UTF-8 out of the source tree.
+	$(CC) $(CFLAGS) $(LDFLAGS) generate_unicode_test.c -o generate_unicode_test$(EXT)
+	./generate_unicode_test$(EXT)
+	$(SHELL) ./unicode_test.sh
+else
+test_unicode:
+	@echo "Skipping Unicode test, your terminal doesn't appear to support UTF-8."
+	@echo "Try with ENABLE_UNICODE=1"
+endif
+
 xxhash.o: ../xxhash.c ../xxhash.h
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -c -o $@ $<

@ -43,3 +75,4 @@ multiInclude_withxxhash: multiInclude.o xxhash.o
 clean:
 	@$(RM) *.o
 	@$(RM) multiInclude multiInclude_withxxhash
+	@$(RM) *.unicode generate_unicode_test$(EXT) unicode_test.* xxhsum$(EXT)
--- a/tests/generate_unicode_test.c
+++ b/tests/generate_unicode_test.c
@ -0,0 +1,126 @@
+/*
+ * Generates a Unicode test without using Unicode in the source
+ * files.
+ *
+ * Certain terminals don't properly handle UTF-8 (e.g. rxvt, or the Windows
+ * command prompt in its default codepage), which can cause issues when editing text.
+ *
+ * We use this C file to generate a file with a Unicode filename, a
+ * file with a checksum of said file, and both a Windows batch script
+ * and a Unix shell script to test the file.
+ */
+
+#define _CRT_SECURE_NO_WARNINGS /* Silence warnings on MSVC */
+#include <stdio.h>
+
+/* Use a Japanese filename, something that can't be cheated with ANSI.
+ * yuniko-do.unicode (literally unicode.unicode) */
+
+/* Use raw hex values to ensure that the output is well-formed UTF-8. It is also more C90 compliant. */
+static const char FILENAME[] = {
+    (char)0xe3, (char)0x83, (char)0xa6,  /* U+30e6: Katakana letter yu */
+    (char)0xe3, (char)0x83, (char)0x8b,  /* U+30cb: Katakana letter ni */
+    (char)0xe3, (char)0x82, (char)0xb3,  /* U+30b3: Katakana letter ko */
+    (char)0xe3, (char)0x83, (char)0xbc,  /* U+30fc: Katakana-Hiragana prolonged sound mark (dash) */
+    (char)0xe3, (char)0x83, (char)0x89,  /* U+30c9: Katakana letter do */
+    '.','u','n','i','c','o','d','e','\0' /* ".unicode" (so we can glob in make clean and .gitignore) */
+};
+
+#ifdef _WIN32
+/* The same text as above, but encoded in Windows UTF-16. */
+static const wchar_t WFILENAME[] = { 0x30e6, 0x30cb, 0x30b3, 0x30fc, 0x30c9, L'.', L'u', L'n', L'i', L'c', L'o', L'd', L'e', L'\0' };
+#endif
+
+/*
+ * Finishes writing to `fp`.
+ * Returns 0 only if no earlier operation on the stream reported an error and
+ * fclose itself succeeded; returns 1 otherwise. The stream is always closed.
+ */
+static int close_checked(FILE *fp)
+{
+    int const writeFailed = ferror(fp);
+    int const closeFailed = fclose(fp);
+    return (writeFailed || closeFailed) ? 1 : 0;
+}
+
+/*
+ * Writes, into the current directory:
+ *   - a small file whose name is the Unicode FILENAME,
+ *   - unicode_test.xxh64, a checksum file referring to that file,
+ *   - unicode_test.bat and unicode_test.sh, which run xxhsum on both.
+ *
+ * Returns 0 on success, and 1 if any file could not be created or if writing
+ * or closing it failed, so that a truncated test is never run.
+ */
+int main(void)
+{
+    FILE *f, *script, *checksum;
+
+    /* Create our Unicode file. Use _wfopen on Windows as fopen doesn't support Unicode filenames. */
+#ifdef _WIN32
+    if (!(f = _wfopen(WFILENAME, L"wb"))) return 1;
+#else
+    if (!(f = fopen(FILENAME, "wb"))) return 1;
+#endif
+    fprintf(f, "test\n");
+    if (close_checked(f)) return 1;
+
+    /* XXH64 checksum file with the precalculated checksum for said file. */
+    if (!(checksum = fopen("unicode_test.xxh64", "wb")))
+        return 1;
+    fprintf(checksum, "2d7f1808da1fa63c  %s\n", FILENAME);
+    if (close_checked(checksum)) return 1;
+
+
+    /* Create two scripts for both Windows and Unix. */
+
+    /* Generate a Windows batch script. Always insert CRLF manually. */
+    if (!(script = fopen("unicode_test.bat", "wb")))
+        return 1;
+
+    /* Disable echoing the commands. We do that ourselves the naive way. */
+    fprintf(script, "@echo off\r\n");
+
+    /* Change to codepage 65001 to enable UTF-8 support. */
+    fprintf(script, "chcp 65001 >NUL 2>&1\r\n");
+
+    /* First test a Unicode filename */
+    fprintf(script, "echo Testing filename provided on command line...\r\n");
+    fprintf(script, "echo xxhsum.exe \"%s\"\r\n", FILENAME);
+    fprintf(script, "xxhsum.exe \"%s\"\r\n", FILENAME);
+
+    /* Bail on error */
+    fprintf(script, "if %%ERRORLEVEL%% neq 0 (\r\n");
+    fprintf(script, "    exit /B %%ERRORLEVEL%%\r\n");
+    fprintf(script, ")\r\n");
+
+    /* Then test a checksum file. */
+    fprintf(script, "echo Testing a checksum file...\r\n");
+    fprintf(script, "echo xxhsum.exe -c unicode_test.xxh64\r\n");
+    fprintf(script, "xxhsum.exe -c unicode_test.xxh64\r\n");
+
+    fprintf(script, "exit /B %%ERRORLEVEL%%\r\n");
+
+    if (close_checked(script)) return 1;
+
+    /* Generate a Unix shell script */
+    if (!(script = fopen("unicode_test.sh", "wb")))
+        return 1;
+
+    fprintf(script, "#!/bin/sh\n");
+    /*
+     * Some versions of MSYS, MinGW and Cygwin do not support UTF-8, and the ones that
+     * don't may error with something like this:
+     *
+     *    Error: Could not open '<mojibake>.unicode': No such file or directory.
+     *
+     * which is an internal error that happens when it tries to convert MinGW/Cygwin
+     * paths to Windows paths.
+     *
+     * In that case, we bail to cmd.exe and the batch script, which supports UTF-8
+     * on Windows 7 and later.
+     */
+    fprintf(script, "case $(uname) in\n");
+    /* MinGW/MSYS converts /c to C:\ unless you have a double slash,
+     * Cygwin does not. */
+    fprintf(script, "    *CYGWIN*)\n");
+    fprintf(script, "        exec cmd.exe /c unicode_test.bat\n");
+    fprintf(script, "        ;;\n");
+    fprintf(script, "    *MINGW*|*MSYS*)\n");
+    fprintf(script, "        exec cmd.exe //c unicode_test.bat\n");
+    fprintf(script, "        ;;\n");
+    fprintf(script, "esac\n");
+
+    /* First test a Unicode filename */
+    fprintf(script, "echo Testing filename provided on command line...\n");
+    fprintf(script, "echo './xxhsum \"%s\" || exit $?'\n", FILENAME);
+    fprintf(script, "./xxhsum \"%s\" || exit $?\n", FILENAME);
+
+    /* Then test a checksum file. */
+    fprintf(script, "echo Testing a checksum file...\n");
+    fprintf(script, "echo './xxhsum -c unicode_test.xxh64 || exit $?'\n");
+    fprintf(script, "./xxhsum -c unicode_test.xxh64 || exit $?\n");
+
+    if (close_checked(script)) return 1;
+
+    return 0;
+}
--- a/tests/multiInclude.c
+++ b/tests/multiInclude.c
@ -55,7 +55,7 @@ int main(void)
    XXH3_64bits_update(&state, input, sizeof(input));

    XXH64_hash_t const h = XXH3_64bits_digest(&state);
-    printf("hash '%s' : %0llx \n", input, (unsigned long long)h);
+    printf("hash '%s' : %08x%08x \n", input, (unsigned)(h >> 32), (unsigned)h);

    return 0;
 }
--- a/xxhsum.c
+++ b/xxhsum.c
@ -126,6 +126,69 @@ static __inline int IS_CONSOLE(FILE* stdStream) {
 #  define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
 #endif

+/* Unicode helpers for Windows */
+#if defined(_WIN32)
+/*
+ * Returns a freshly malloc'ed UTF-16 copy of the UTF-8 string `str`, strdup-style.
+ * Returns NULL if the conversion or the allocation fails.
+ * The caller owns the result and must free() it.
+ */
+static wchar_t *utf8_to_utf16(const char *str)
+{
+    wchar_t *wide;
+    /* Size query: no output buffer is passed on this first call. */
+    int const nbUnits = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);
+
+    if (nbUnits == 0) return NULL;
+
+    wide = (wchar_t *)malloc((size_t)nbUnits * sizeof(wchar_t));
+    if (wide == NULL) return NULL;
+
+    /* Actual conversion into the buffer sized above. */
+    if (MultiByteToWideChar(CP_UTF8, 0, str, -1, wide, nbUnits) == 0) {
+        free(wide);
+        return NULL;
+    }
+    return wide;
+}
+/*
+ * Returns a freshly malloc'ed UTF-8 copy of the UTF-16 string `str`, strdup-style.
+ * Returns NULL if the conversion or the allocation fails.
+ * The caller owns the result and must free() it.
+ */
+static char *utf16_to_utf8(const wchar_t *str)
+{
+    char *narrow;
+    /* Size query: no output buffer is passed on this first call. */
+    int const nbBytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);
+
+    if (nbBytes == 0) return NULL;
+
+    narrow = (char *)malloc((size_t)nbBytes * sizeof(char));
+    if (narrow == NULL) return NULL;
+
+    /* Actual conversion into the buffer sized above. */
+    if (WideCharToMultiByte(CP_UTF8, 0, str, -1, narrow, nbBytes, NULL, NULL) == 0) {
+        free(narrow);
+        return NULL;
+    }
+    return narrow;
+}
+
+/*
+ * fopen on Windows, like main's argv, is useless.
+ *
+ * fopen will only accept ANSI filenames, which means that we can't open Unicode filenames.
+ *
+ * In order to open a Unicode filename, we need to convert filenames to UTF-16 and use _wfopen.
+ */
+static FILE *XXH_fopen_wrapped(const char *filename, const wchar_t *mode)
+{
+    FILE *stream;
+    /* _wfopen wants a UTF-16 name; this temporary copy is released below. */
+    wchar_t *const wname = utf8_to_utf16(filename);
+
+    if (wname == NULL) {
+        /* Conversion failed: report it the same way as a failed open. */
+        return NULL;
+    }
+    stream = _wfopen(wname, mode);
+    free(wname);
+    return stream;
+}
+
+/*
+ * Since we always use literals in the "mode" argument, it is just easier to prepend "L" to
+ * the string literal to make it a wide (UTF-16) string and avoid the hassle of a second manual conversion.
+ */
+#  define XXH_fopen(filename, mode) XXH_fopen_wrapped(filename, L##mode)
+#else
+#  define XXH_fopen(filename, mode) fopen(filename, mode)
+#endif

 /* ************************************
 *  Basic Types
@ -489,7 +552,7 @@ static size_t BMK_selectBenchedSize(const char* fileName)
 }


-static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specificTest)
+static int BMK_benchFiles(char** fileNamesTable, int nbFiles, U32 specificTest)
 {
    int result = 0;
    int fileIdx;
@ -498,7 +561,7 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
        const char* const inFileName = fileNamesTable[fileIdx];
        assert(inFileName != NULL);
        {
-            FILE* const inFile = fopen( inFileName, "rb" );
+            FILE* const inFile = XXH_fopen( inFileName, "rb" );
            size_t const benchedSize = BMK_selectBenchedSize(inFileName);
            char* const buffer = (char*)calloc(benchedSize+16+3, 1);
            void* const alignedBuffer = (buffer+15) - (((size_t)(buffer+15)) & 0xF);  /* align on next 16 bytes */
@ -1020,7 +1083,7 @@ static int BMK_hash(const char* fileName,
        fileName = "stdin";
        SET_BINARY_MODE(stdin);
    } else {
-        inFile = fopen( fileName, "rb" );
+        inFile = XXH_fopen( fileName, "rb" );
    }
    if (inFile==NULL) {
        DISPLAY("Error: Could not open '%s': %s. \n", fileName, strerror(errno));
@ -1090,7 +1153,7 @@ static int BMK_hash(const char* fileName,
 /* BMK_hashFiles:
 * if fnTotal==0, read from stdin insteal
 */
-static int BMK_hashFiles(const char** fnList, int fnTotal,
+static int BMK_hashFiles(char** fnList, int fnTotal,
                         algoType hashType, endianess displayEndianess)
 {
    int fnNb;
@ -1408,7 +1471,7 @@ static void parseFile1(ParseFileArg* parseFileArg)
        }

        do {
-            FILE* const fp = fopen(parsedLine.filename, "rb");
+            FILE* const fp = XXH_fopen(parsedLine.filename, "rb");
            if (fp == NULL) {
                lineStatus = LineStatus_failedToOpen;
                break;
@ -1519,7 +1582,7 @@ static int checkFile(const char* inFileName,
        inFileName = "stdin";
        inFile = stdin;
    } else {
-        inFile = fopen( inFileName, "rt" );
+        inFile = XXH_fopen( inFileName, "rt" );
    }

    if (inFile == NULL) {
@ -1577,7 +1640,7 @@ static int checkFile(const char* inFileName,
 }


-static int checkFiles(const char** fnList, int fnTotal,
+static int checkFiles(char** fnList, int fnTotal,
                      const endianess displayEndianess,
                      U32 strictMode,
                      U32 statusOnly,
@ -1693,7 +1756,7 @@ static unsigned readU32FromChar(const char** stringPtr) {
    return result;
 }

-int main(int argc, const char** argv)
+static int XXH_main(int argc, char** argv)
 {
    int i, filenamesStart = 0;
    const char* const exename = argv[0];
@ -1815,3 +1878,91 @@ int main(int argc, const char** argv)
        return BMK_hashFiles(argv+filenamesStart, argc-filenamesStart, algo, displayEndianess);
    }
 }
+
+#if defined(_WIN32)
+/*
+ * Converts a UTF-16 argv to UTF-8.
+ *
+ * Returns a malloc'ed, NULL-terminated array of `argc` malloc'ed UTF-8
+ * strings, to be released with free_argv().
+ *
+ * Returns NULL, with nothing left allocated, if `argv` is NULL, `argc` is
+ * negative, an allocation fails, or any single argument cannot be converted.
+ * On success the caller therefore never sees a NULL entry before index argc.
+ */
+static char **convert_argv(int argc, wchar_t **argv)
+{
+    char **buf;
+    int i;
+
+    /* The caller may hand over a failed (NULL) argument vector unchecked. */
+    if (argv == NULL || argc < 0) {
+        return NULL;
+    }
+    buf = (char **)malloc((size_t)(argc + 1) * sizeof(char *));
+    if (buf == NULL) {
+        return NULL;
+    }
+    for (i = 0; i < argc; i++) {
+        buf[i] = utf16_to_utf8(argv[i]);
+        if (buf[i] == NULL) {
+            /* Undo the partial conversion rather than returning a vector
+             * with a hole in it that the argument parser would dereference. */
+            while (i-- > 0) {
+                free(buf[i]);
+            }
+            free(buf);
+            return NULL;
+        }
+    }
+    buf[argc] = NULL;
+    return buf;
+}
+/*
+ * Releases an argument vector obtained from convert_argv:
+ * the first `argc` strings, then the array itself.
+ * A NULL `argv` is accepted and ignored.
+ */
+static void free_argv(int argc, char **argv)
+{
+    if (argv != NULL) {
+        int idx = 0;
+        while (idx < argc) {
+            free(argv[idx]);
+            idx++;
+        }
+        free(argv);
+    }
+}
+
+/*
+ * On Windows, main's argv parameter is useless. Instead of UTF-8, you get ANSI
+ * encoding, and unknown characters will show up as mojibake.
+ *
+ * While this doesn't affect most programs, what does happen is that we can't
+ * open any files with Unicode filenames.
+ *
+ * On MSVC or when -municode is used in MSYS2, we can just use wmain to get
+ * UTF-16 command line arguments and convert them to UTF-8.
+ *
+ * However, without the -municode flag (which isn't even available on the
+ * original MinGW), we will get a linker error.
+ *
+ * To fix this, we can combine main with GetCommandLineW and
+ * CommandLineToArgvW to get the real UTF-16 arguments.
+ */
+#if defined(_MSC_VER) || defined(_UNICODE) || defined(UNICODE)
+
+#if defined(__cplusplus)
+extern "C"
+#endif
+int wmain(int argc, wchar_t **utf16_argv)
+{
+    char **argv;
+#else
+int main(int argc, char **argv)
+{
+    wchar_t **utf16_argv = CommandLineToArgvW(GetCommandLineW(), &argc);
+#endif
+    int ret;
+    /* Convert the UTF-16 arguments to UTF-8. */
+    argv = convert_argv(argc, utf16_argv);
+
+    if (argv == NULL) {
+        fprintf(stderr, "Error converting command line arguments!\n");
+        /* return 1; */
+        ret = 1;
+    } else {
+        /* While we're here, we will set stderr to unbuffered mode to make text
+         * display instantly on MinGW. */
+        setvbuf(stderr, NULL, _IONBF, 0);
+
+        /* Call our real main function */
+        ret = XXH_main(argc, argv);
+
+        free_argv(argc, argv);
+    }
+#if !(defined(_MSC_VER) || defined(_UNICODE) || defined(UNICODE))
+    /* CommandLineToArgvW needs to be freed with LocalFree. */
+    LocalFree(utf16_argv);
+#endif
+    return ret;
+}
+
+#else
+int main(int argc, char **argv)
+{
+    return XXH_main(argc, argv);
+}
+#endif