Implement a safer Unicode test

This new test doesn't use any Unicode in the source files, instead
encoding all UTF-8 and UTF-16 as hex.

The test scripts are generated by a C program, which writes both a shell
script and a batch script, as well as the Unicode-named file to test.

On Cygwin, MinGW, and MSYS, we will automatically bail from the shell
script to the batch script, as cmd.exe has more reliable Unicode
support, at least on Windows 7 and later.

When the make rule is called, it first checks whether `$LANG` specifies
UTF-8 and sets the (overridable) ENABLE_UNICODE flag accordingly. If UTF-8
is not detected, it will skip the test with a warning.

Also fixed an issue with printf in multiInclude.c causing warnings on
old MinGW versions which expect %I64, and updated the .gitignore.
This commit is contained in:
easyaspi314 (Devin) 2020-02-14 19:08:09 -05:00
parent 9bd98b0b45
commit cac3ca4d5d
5 changed files with 169 additions and 13 deletions

7
.gitignore vendored
View File

@ -1,20 +1,21 @@
# objects
*.o
*.obj
*.s
# libraries
libxxhash.*
# Executables
*.exe
xxh32sum
xxh64sum
xxh128sum
xxhsum
xxhsum.exe
xxhsum32
xxhsum_privateXXH
xxhsum_inlinedXXH
xxhsum_inlinedXXH.exe
tests/generate_unicode_test
# compilation chain
.clang_complete
@ -40,3 +41,5 @@ infer-out
# test artifacts
.test*
tmp*
tests/*.unicode
tests/unicode_test*

View File

@ -169,15 +169,9 @@ check: xxhsum ## basic tests for xxhsum CLI, set RUN_ENV for emulated environm
# request incorrect variant
$(RUN_ENV) ./xxhsum$(EXT) -H9 xxhash.c ; test $$? -eq 1
# Make sure that Unicode works.
# https://github.com/Cyan4973/xxHash/issues/293
# Japanese: echo "This filename is Unicode." > "Unicode.txt"
.PHONY: test-unicode
test-unicode: xxhsum check
# Test Unicode filenames.
echo "このファイル名はユニコードです。" > "ユニコード.txt"
$(RUN_ENV) ./xxhsum$(EXT) "ユニコード.txt"
@$(RM) "ユニコード.txt"
test-unicode:
$(MAKE) -C tests test_unicode
.PHONY: test-mem
VALGRIND = valgrind --leak-check=yes --error-exitcode=1
@ -295,7 +289,7 @@ test: all namespaceTest check test-xxhsum-c c90test test-tools
.PHONY: test-inline
test-inline:
$(MAKE) -C tests test
$(MAKE) -C tests test_multiInclude
.PHONY: test-all
test-all: CFLAGS += -Werror

View File

@ -3,6 +3,19 @@ CFLAGS += -Wall -Wextra -g
NM = nm
GREP = grep
# Executable suffix: empty by default, ".exe" when $(OS) starts with "Windows".
EXT =
ifneq (,$(filter Windows%,$(OS)))
EXT = .exe
endif
# Decide whether the Unicode filename test runs by default.
# The test is enabled when $(LANG) ends in a UTF-8 codeset name. Platforms
# spell the codeset differently (en_US.UTF-8, en_US.utf8, ...), so all common
# spellings are accepted instead of only "UTF-8".
# ?= keeps the flag overridable: `make ENABLE_UNICODE=1` (or =0) forces the choice.
ifneq (,$(filter %UTF-8 %utf-8 %UTF8 %utf8,$(LANG)))
ENABLE_UNICODE ?= 1
else
ENABLE_UNICODE ?= 0
endif
.PHONY: default
default: all
@ -10,7 +23,7 @@ default: all
all: test
.PHONY: test
test: test_multiinclude
test: test_multiinclude test_unicode
.PHONY: test_multiinclude
test_multiinclude:
@ -34,6 +47,25 @@ test_multiinclude:
# ! $(NM) multiInclude | $(GREP) TESTN_
#@$(MAKE) clean
# Build a local xxhsum binary for the tests from the library and CLI sources
# in the parent directory. The headers are prerequisites so the binary is
# rebuilt when they change, but only the .c files are handed to the compiler.
# $(CPPFLAGS) is passed, as in the xxhash.o rule, so that user-supplied
# preprocessor settings apply to every object built from xxhash.c.
xxhsum$(EXT): ../xxhash.c ../xxhash.h ../xxh3.h ../xxhsum.c
	$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) $(filter %.c,$^) -o $@
# Make sure that Unicode filenames work.
# https://github.com/Cyan4973/xxHash/issues/293
#
# The rule is selected at parse time from ENABLE_UNICODE:
#   - when it is 0, test_unicode only prints a notice and a hint to rerun
#     with ENABLE_UNICODE=1;
#   - otherwise it compiles generate_unicode_test.c, runs the resulting
#     program, and then runs unicode_test.sh with $(SHELL).
# NOTE(review): unicode_test.sh is presumably written by the generator
# program in the current directory -- confirm against generate_unicode_test.c.
.PHONY: test_unicode
ifeq (0,$(ENABLE_UNICODE))
test_unicode:
@echo "Skipping Unicode test, your terminal doesn't appear to support UTF-8."
@echo "Try with ENABLE_UNICODE=1"
else
test_unicode: xxhsum$(EXT) generate_unicode_test.c
# Generate a Unicode filename test dynamically
# to keep UTF-8 out of the source tree.
$(CC) $(CFLAGS) $(LDFLAGS) generate_unicode_test.c -o generate_unicode_test$(EXT)
./generate_unicode_test$(EXT)
$(SHELL) ./unicode_test.sh
endif
xxhash.o: ../xxhash.c ../xxhash.h
$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -c -o $@ $<
@ -43,3 +75,4 @@ multiInclude_withxxhash: multiInclude.o xxhash.o
clean:
@$(RM) *.o
@$(RM) multiInclude multiInclude_withxxhash
@$(RM) *.unicode generate_unicode_test$(EXT) unicode_test.* xxhsum$(EXT)

View File

@ -0,0 +1,126 @@
/*
* Generates a Unicode test without using Unicode in the source
* files.
*
* Certain terminals don't properly handle UTF-8 (i.e. rxvt and command prompt
* in the default codepage), and that can cause issues when editing text
*
* We use this C file to generate a file with a Unicode filename, a
* file with a checksum of said file, and both a Windows batch script
* and a Unix shell script to test the file.
*/
#define _CRT_SECURE_NO_WARNINGS /* Silence warnings on MSVC */
#include <stdio.h>
/* Use a Japanese filename, something that can't be cheated with ANSI.
* yuniko-do.unicode (literally unicode.unicode) */
/* Use raw hex values to ensure that the output is well-formed UTF-8. It is also more C90 compliant. */
static const char FILENAME[] = {
(char)0xe3, (char)0x83, (char)0xa6, /* U+30e6: Katakana letter yu */
(char)0xe3, (char)0x83, (char)0x8b, /* U+30cb: Katakana letter ni */
(char)0xe3, (char)0x82, (char)0xb3, /* U+30b3: Katakana letter ko */
(char)0xe3, (char)0x83, (char)0xbc, /* U+30fc: Katakana-Hiragana prolonged sound mark (dash) */
(char)0xe3, (char)0x83, (char)0x89, /* U+30c9: Katakana letter do */
'.','u','n','i','c','o','d','e','\0' /* ".unicode" (so we can glob in make clean and .gitignore) */
};
#ifdef _WIN32
/* The same text as above, but encoded in Windows UTF-16. */
static const wchar_t WFILENAME[] = { 0x30e6, 0x30cb, 0x30b3, 0x30fc, 0x30c9, L'.', L'u', L'n', L'i', L'c', L'o', L'd', L'e', L'\0' };
#endif
/*
 * Finishes a generated file: reports whether any earlier write on the
 * stream failed and closes it. The stream is always closed.
 * Returns 0 on success, nonzero if a write or the close failed.
 */
static int finish_file(FILE *fp)
{
    int const write_failed = ferror(fp);
    int const close_failed = fclose(fp);
    return write_failed || close_failed;
}

/*
 * Generates, in the current directory:
 *   - a small file whose name is the Unicode FILENAME,
 *   - unicode_test.xxh64, a checksum file referring to it,
 *   - unicode_test.bat and unicode_test.sh, which run xxhsum on both.
 *
 * Returns 0 on success and 1 if any file cannot be created or fully written,
 * so the calling make rule stops instead of running a truncated script.
 */
int main(void)
{
    FILE *f, *script, *checksum;

    /* Create our Unicode file. Use _wfopen on Windows as fopen doesn't support Unicode filenames. */
#ifdef _WIN32
    if (!(f = _wfopen(WFILENAME, L"wb"))) return 1;
#else
    if (!(f = fopen(FILENAME, "wb"))) return 1;
#endif
    fprintf(f, "test\n");
    if (finish_file(f)) return 1;

    /* XXH64 checksum file with the precalculated checksum for said file.
     * The line uses the md5sum-style layout: hash, a two-character
     * separator (two spaces), then the filename. */
    if (!(checksum = fopen("unicode_test.xxh64", "wb")))
        return 1;
    fprintf(checksum, "2d7f1808da1fa63c  %s\n", FILENAME);
    if (finish_file(checksum)) return 1;

    /* Create two scripts for both Windows and Unix. */

    /* Generate a Windows batch script. Always insert CRLF manually. */
    if (!(script = fopen("unicode_test.bat", "wb")))
        return 1;

    /* Disable echoing the commands. We do that ourselves the naive way. */
    fprintf(script, "@echo off\r\n");

    /* Change to codepage 65001 to enable UTF-8 support. */
    fprintf(script, "chcp 65001 >NUL 2>&1\r\n");

    /* First test a Unicode filename */
    fprintf(script, "echo Testing filename provided on command line...\r\n");
    fprintf(script, "echo xxhsum.exe \"%s\"\r\n", FILENAME);
    fprintf(script, "xxhsum.exe \"%s\"\r\n", FILENAME);

    /* Bail on error */
    fprintf(script, "if %%ERRORLEVEL%% neq 0 (\r\n");
    fprintf(script, " exit /B %%ERRORLEVEL%%\r\n");
    fprintf(script, ")\r\n");

    /* Then test a checksum file. */
    fprintf(script, "echo Testing a checksum file...\r\n");
    fprintf(script, "echo xxhsum.exe -c unicode_test.xxh64\r\n");
    fprintf(script, "xxhsum.exe -c unicode_test.xxh64\r\n");
    fprintf(script, "exit /B %%ERRORLEVEL%%\r\n");

    if (finish_file(script)) return 1;

    /* Generate a Unix shell script */
    if (!(script = fopen("unicode_test.sh", "wb")))
        return 1;

    fprintf(script, "#!/bin/sh\n");
    /*
     * Some versions of MSYS, MinGW and Cygwin do not support UTF-8, and the ones that
     * don't may error with something like this:
     *
     *    Error: Could not open '<mojibake>.unicode': No such file or directory.
     *
     * which is an internal error that happens when it tries to convert MinGW/Cygwin
     * paths to Windows paths.
     *
     * In that case, we bail to cmd.exe and the batch script, which supports UTF-8
     * on Windows 7 and later.
     */
    fprintf(script, "case $(uname) in\n");
    /* MinGW/MSYS converts /c to C:\ unless you have a double slash,
     * Cygwin does not. */
    fprintf(script, " *CYGWIN*)\n");
    fprintf(script, " exec cmd.exe /c unicode_test.bat\n");
    fprintf(script, " ;;\n");
    fprintf(script, " *MINGW*|*MSYS*)\n");
    fprintf(script, " exec cmd.exe //c unicode_test.bat\n");
    fprintf(script, " ;;\n");
    fprintf(script, "esac\n");

    /* First test a Unicode filename */
    fprintf(script, "echo Testing filename provided on command line...\n");
    fprintf(script, "echo './xxhsum \"%s\" || exit $?'\n", FILENAME);
    fprintf(script, "./xxhsum \"%s\" || exit $?\n", FILENAME);

    /* Then test a checksum file. */
    fprintf(script, "echo Testing a checksum file...\n");
    fprintf(script, "echo './xxhsum -c unicode_test.xxh64 || exit $?'\n");
    fprintf(script, "./xxhsum -c unicode_test.xxh64 || exit $?\n");

    if (finish_file(script)) return 1;

    return 0;
}

View File

@ -55,7 +55,7 @@ int main(void)
XXH3_64bits_update(&state, input, sizeof(input));
XXH64_hash_t const h = XXH3_64bits_digest(&state);
printf("hash '%s' : %0llx \n", input, (unsigned long long)h);
printf("hash '%s' : %08x%08x \n", input, (unsigned)(h >> 32), (unsigned)h);
return 0;
}