Implement a safer Unicode test

This new test doesn't use any Unicode in the source files, instead encoding all UTF-8 and UTF-16 as hex. The test is generated by a C program, which writes a shell script, a batch script, and the Unicode-named file to test. On Cygwin, MinGW, and MSYS, the shell script automatically bails out to the batch script, as cmd.exe has more reliable Unicode support, at least on Windows 7 and later. When the make rule is called, it first checks whether `$LANG` ends in UTF-8 and sets the (overridable) ENABLE_UNICODE flag accordingly. If it does not, the test is skipped with a warning. Also fixed an issue with printf in multiInclude.c causing warnings on old MinGW versions which expect %I64, and updated the .gitignore.
2025-02-17 05:20:22 +00:00 · 2020-02-14 19:08:09 -05:00 · 2020-02-14 19:08:09 -05:00 · cac3ca4d5d
commit cac3ca4d5d
parent 9bd98b0b45
5 changed files with 169 additions and 13 deletions
--- a/.gitignore
+++ b/.gitignore
@ -1,20 +1,21 @@
 # objects
 *.o
+*.obj
 *.s

 # libraries
 libxxhash.*

 # Executables
+*.exe
 xxh32sum
 xxh64sum
 xxh128sum
 xxhsum
-xxhsum.exe
 xxhsum32
 xxhsum_privateXXH
 xxhsum_inlinedXXH
-xxhsum_inlinedXXH.exe
+tests/generate_unicode_test

 # compilation chain
 .clang_complete
@ -40,3 +41,5 @@ infer-out
 # test artifacts
 .test*
 tmp*
+tests/*.unicode
+tests/unicode_test*
--- a/Makefile
+++ b/Makefile
@ -169,15 +169,9 @@ check: xxhsum   ## basic tests for xxhsum CLI, set RUN_ENV for emulated environm
 	# request incorrect variant
 	$(RUN_ENV) ./xxhsum$(EXT) -H9 xxhash.c ; test $$? -eq 1

-# Make sure that Unicode works.
-# https://github.com/Cyan4973/xxHash/issues/293
-# Japanese: echo "This filename is Unicode." > "Unicode.txt"
 .PHONY: test-unicode
-test-unicode: xxhsum check
-	# Test Unicode filenames.
-	echo "このファイル名はユニコードです。" > "ユニコード.txt"
-	$(RUN_ENV) ./xxhsum$(EXT) "ユニコード.txt"
-	@$(RM) "ユニコード.txt"
+test-unicode:
+	$(MAKE) -C tests test_unicode

 .PHONY: test-mem
 VALGRIND = valgrind --leak-check=yes --error-exitcode=1
@ -295,7 +289,7 @@ test: all namespaceTest check test-xxhsum-c c90test test-tools

 .PHONY: test-inline
+# Delegates to tests/Makefile; make target names are case-sensitive, so this must match the test_multiinclude rule defined there.
 test-inline:
-	$(MAKE) -C tests test
+	$(MAKE) -C tests test_multiinclude

 .PHONY: test-all
 test-all: CFLAGS += -Werror
--- a/tests/Makefile
+++ b/tests/Makefile
@ -3,6 +3,19 @@ CFLAGS += -Wall -Wextra -g
 NM = nm
 GREP = grep

+# Define *.exe as extension for Windows systems
+ifneq (,$(filter Windows%,$(OS)))
+EXT =.exe
+else
+EXT =
+endif
+
+ifneq (,$(filter %UTF-8,$(LANG)))
+ENABLE_UNICODE ?= 1
+else
+ENABLE_UNICODE ?= 0
+endif
+
 .PHONY: default
 default: all

@ -10,7 +23,7 @@ default: all
 all: test

 .PHONY: test
-test: test_multiinclude
+test: test_multiinclude test_unicode

 .PHONY: test_multiinclude
 test_multiinclude:
@ -34,6 +47,25 @@ test_multiinclude:
 	# ! $(NM) multiInclude | $(GREP) TESTN_
 	#@$(MAKE) clean

+xxhsum$(EXT): ../xxhash.c ../xxhash.h ../xxh3.h ../xxhsum.c
+	$(CC) $(CFLAGS) $(LDFLAGS) ../xxhash.c ../xxhsum.c -o $@
+
+# Make sure that Unicode filenames work. When ENABLE_UNICODE is 0 the test is skipped and only a hint is printed.
+# https://github.com/Cyan4973/xxHash/issues/293
+.PHONY: test_unicode
+ifeq (0,$(ENABLE_UNICODE))
+test_unicode:
+	@echo "Skipping Unicode test, your terminal doesn't appear to support UTF-8."
+	@echo "Try with ENABLE_UNICODE=1"
+else
+test_unicode: xxhsum$(EXT) generate_unicode_test.c
+	# Generate a Unicode filename test dynamically
+	# to keep UTF-8 out of the source tree.
+	$(CC) $(CFLAGS) $(LDFLAGS) generate_unicode_test.c -o generate_unicode_test$(EXT)
+	./generate_unicode_test$(EXT)
+	$(SHELL) ./unicode_test.sh
+endif
+
 xxhash.o: ../xxhash.c ../xxhash.h
 	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -c -o $@ $<

@ -43,3 +75,4 @@ multiInclude_withxxhash: multiInclude.o xxhash.o
 clean:
 	@$(RM) *.o
 	@$(RM) multiInclude multiInclude_withxxhash
+	@$(RM) *.unicode generate_unicode_test$(EXT) unicode_test.* xxhsum$(EXT)
--- a/tests/generate_unicode_test.c
+++ b/tests/generate_unicode_test.c
@ -0,0 +1,126 @@
+/*
+ * Generates a Unicode test without using Unicode in the source
+ * files.
+ *
+ * Certain terminals don't properly handle UTF-8 (e.g. rxvt and command prompt
+ * in the default codepage), and that can cause issues when editing text.
+ *
+ * We use this C file to generate a file with a Unicode filename, a
+ * file with a checksum of said file, and both a Windows batch script
+ * and a Unix shell script to test the file.
+ */
+
+#define _CRT_SECURE_NO_WARNINGS /* Silence warnings on MSVC */
+#include <stdio.h>
+
+/* Use a Japanese filename, something that can't be cheated with ANSI.
+ * Five katakana (yu, ni, ko, prolonged sound mark, do) plus the .unicode extension (literally unicode.unicode) */
+
+/* Spell the name as raw hex byte values so this source stays pure ASCII and the output is well-formed UTF-8. It is also more C90 compliant. */
+static const char FILENAME[] = {
+    (char)0xe3, (char)0x83, (char)0xa6,  /* U+30e6: Katakana letter yu */
+    (char)0xe3, (char)0x83, (char)0x8b,  /* U+30cb: Katakana letter ni */
+    (char)0xe3, (char)0x82, (char)0xb3,  /* U+30b3: Katakana letter ko */
+    (char)0xe3, (char)0x83, (char)0xbc,  /* U+30fc: Katakana-Hiragana prolonged sound mark (dash) */
+    (char)0xe3, (char)0x83, (char)0x89,  /* U+30c9: Katakana letter do */
+    '.','u','n','i','c','o','d','e','\0' /* ".unicode" (so we can glob in make clean and .gitignore), NUL-terminated so it can be used as a C string */
+};
+
+#ifdef _WIN32
+/* The same name as FILENAME, but as NUL-terminated UTF-16 code units (one per character) for the wide-character call _wfopen. */
+static const wchar_t WFILENAME[] = { 0x30e6, 0x30cb, 0x30b3, 0x30fc, 0x30c9, L'.', L'u', L'n', L'i', L'c', L'o', L'd', L'e', L'\0' };
+#endif
+/* Writes the Unicode-named test file, its checksum list, and the batch and shell test scripts using paths relative to the current directory. Returns 1 as soon as one of them cannot be opened, 0 otherwise. */
+int main(void)
+{
+    FILE *f, *script, *checksum;
+
+    /* Create our Unicode file. Use _wfopen on Windows as fopen doesn't support Unicode filenames. */
+#ifdef _WIN32
+    if (!(f = _wfopen(WFILENAME, L"wb"))) return 1;
+#else
+    if (!(f = fopen(FILENAME, "wb"))) return 1;
+#endif
+    fprintf(f, "test\n"); /* the whole file content; the checksum below is precalculated for it */
+    fclose(f);
+
+    /* XXH64 checksum file with the precalculated checksum for said file, followed by two spaces and the UTF-8 filename. */
+    if (!(checksum = fopen("unicode_test.xxh64", "wb")))
+        return 1;
+    fprintf(checksum, "2d7f1808da1fa63c  %s\n", FILENAME);
+    fclose(checksum);
+
+
+    /* Create two scripts for both Windows and Unix. */
+
+    /* Generate a Windows batch script. Always insert CRLF manually: the file is opened in binary mode, so no newline translation happens. */
+    if (!(script = fopen("unicode_test.bat", "wb")))
+        return 1;
+
+    /* Disable echoing the commands. We do that ourselves the naive way, with an explicit echo line before each command. */
+    fprintf(script, "@echo off\r\n");
+
+    /* Change to codepage 65001 to enable UTF-8 support. Anything chcp prints is discarded. */
+    fprintf(script, "chcp 65001 >NUL 2>&1\r\n");
+
+    /* First test a Unicode filename */
+    fprintf(script, "echo Testing filename provided on command line...\r\n");
+    fprintf(script, "echo xxhsum.exe \"%s\"\r\n", FILENAME);
+    fprintf(script, "xxhsum.exe \"%s\"\r\n", FILENAME);
+
+    /* Bail on error: leave the batch script with the exit code of the failed xxhsum run. */
+    fprintf(script, "if %%ERRORLEVEL%% neq 0 (\r\n");
+    fprintf(script, "    exit /B %%ERRORLEVEL%%\r\n");
+    fprintf(script, ")\r\n");
+
+    /* Then test a checksum file. */
+    fprintf(script, "echo Testing a checksum file...\r\n");
+    fprintf(script, "echo xxhsum.exe -c unicode_test.xxh64\r\n");
+    fprintf(script, "xxhsum.exe -c unicode_test.xxh64\r\n");
+
+    fprintf(script, "exit /B %%ERRORLEVEL%%\r\n"); /* the result of the -c run becomes the exit code of the script */
+
+    fclose(script);
+
+    /* Generate a Unix shell script */
+    if (!(script = fopen("unicode_test.sh", "wb")))
+        return 1;
+
+    fprintf(script, "#!/bin/sh\n");
+    /*
+     * Some versions of MSYS, MinGW and Cygwin do not support UTF-8, and the ones that
+     * don't may error with something like this:
+     *
+     *    Error: Could not open '<mojibake>.unicode': No such file or directory.
+     *
+     * which is an internal error that happens when it tries to convert MinGW/Cygwin
+     * paths to Windows paths.
+     *
+     * In that case, we bail to cmd.exe and the batch script, which supports UTF-8
+     * on Windows 7 and later.
+     */
+    fprintf(script, "case $(uname) in\n");
+    /* MinGW/MSYS converts /c to C:\ unless you have a double slash,
+     * Cygwin does not. Hence the two separate branches below. */
+    fprintf(script, "    *CYGWIN*)\n");
+    fprintf(script, "        exec cmd.exe /c unicode_test.bat\n");
+    fprintf(script, "        ;;\n");
+    fprintf(script, "    *MINGW*|*MSYS*)\n");
+    fprintf(script, "        exec cmd.exe //c unicode_test.bat\n");
+    fprintf(script, "        ;;\n");
+    fprintf(script, "esac\n");
+
+    /* First test a Unicode filename. The command is echoed, then run; a failure exits the script with the status of xxhsum. */
+    fprintf(script, "echo Testing filename provided on command line...\n");
+    fprintf(script, "echo './xxhsum \"%s\" || exit $?'\n", FILENAME);
+    fprintf(script, "./xxhsum \"%s\" || exit $?\n", FILENAME);
+
+    /* Then test a checksum file, echoed and run the same way. */
+    fprintf(script, "echo Testing a checksum file...\n");
+    fprintf(script, "echo './xxhsum -c unicode_test.xxh64 || exit $?'\n");
+    fprintf(script, "./xxhsum -c unicode_test.xxh64 || exit $?\n");
+
+    fclose(script);
+
+    return 0;
+}
--- a/tests/multiInclude.c
+++ b/tests/multiInclude.c
@ -55,7 +55,7 @@ int main(void)
    XXH3_64bits_update(&state, input, sizeof(input));

    XXH64_hash_t const h = XXH3_64bits_digest(&state);
-    printf("hash '%s' : %0llx \n", input, (unsigned long long)h);
+    printf("hash '%s' : %08x%08x \n", input, (unsigned)(h >> 32), (unsigned)h);

    return 0;
 }