Merge pull request #304 from easyaspi314/unicode-windows-fixes

Fix Unicode support on Windows, minor Windows tweaks
This commit is contained in:
Yann Collet 2020-02-24 09:55:47 -08:00 committed by GitHub
commit 64f655a28e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
6 changed files with 346 additions and 28 deletions

7
.gitignore vendored
View File

@ -1,20 +1,21 @@
# objects
*.o
*.obj
*.s
# libraries
libxxhash.*
# Executables
*.exe
xxh32sum
xxh64sum
xxh128sum
xxhsum
xxhsum.exe
xxhsum32
xxhsum_privateXXH
xxhsum_inlinedXXH
xxhsum_inlinedXXH.exe
tests/generate_unicode_test
# compilation chain
.clang_complete
@ -40,3 +41,5 @@ infer-out
# test artifacts
.test*
tmp*
tests/*.unicode
tests/unicode_test*

View File

@ -76,20 +76,22 @@ default: lib xxhsum_and_links
all: lib xxhsum xxhsum_inlinedXXH
xxhsum: xxhash.o xxhsum.o ## generate command line interface (CLI)
$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)
xxhsum32: CFLAGS += -m32 ## generate CLI in 32-bits mode
xxhsum32: xxhash.c xxhsum.c ## do not generate object (avoid mixing different ABI)
$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)
xxhash.o: xxhash.h xxh3.h
xxhsum.o: xxhash.h
xxhash.o: xxhash.c xxhash.h xxh3.h
$(CC) $(FLAGS) -c $< -o $@
xxhsum.o: xxhsum.c xxhash.h
$(CC) $(FLAGS) -c $< -o $@
.PHONY: xxhsum_and_links
xxhsum_and_links: xxhsum xxh32sum xxh64sum xxh128sum
xxh32sum xxh64sum xxh128sum: xxhsum
ln -sf $^ $@
ln -sf $<$(EXT) $@$(EXT)
xxhsum_inlinedXXH: CPPFLAGS += -DXXH_INLINE_ALL
xxhsum_inlinedXXH: xxhsum.c
@ -141,7 +143,7 @@ clean: ## remove all build artifacts
@$(RM) -r *.dSYM # Mac OS-X specific
@$(RM) core *.o libxxhash.*
@$(RM) xxhsum$(EXT) xxhsum32$(EXT) xxhsum_inlinedXXH$(EXT)
@$(RM) xxh32sum xxh64sum xxh128sum
@$(RM) xxh32sum$(EXT) xxh64sum$(EXT) xxh128sum$(EXT)
@echo cleaning completed
@ -154,20 +156,23 @@ clean: ## remove all build artifacts
.PHONY: check
check: xxhsum ## basic tests for xxhsum CLI, set RUN_ENV for emulated environments
# stdin
$(RUN_ENV) ./xxhsum < xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) < xxhash.c
# multiple files
$(RUN_ENV) ./xxhsum xxhash.* xxhsum.*
$(RUN_ENV) ./xxhsum$(EXT) xxhash.* xxhsum.*
# internal bench
$(RUN_ENV) ./xxhsum -bi1
$(RUN_ENV) ./xxhsum$(EXT) -bi1
# file bench
$(RUN_ENV) ./xxhsum -bi1 xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) -bi1 xxhash.c
# 32-bit
$(RUN_ENV) ./xxhsum -H0 xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) -H0 xxhash.c
# 128-bit
$(RUN_ENV) ./xxhsum -H2 xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) -H2 xxhash.c
# request incorrect variant
$(RUN_ENV) ./xxhsum -H9 xxhash.c ; test $$? -eq 1
$(RUN_ENV) ./xxhsum$(EXT) -H9 xxhash.c ; test $$? -eq 1
# Delegate the Unicode filename test to the makefile in the tests/ directory.
.PHONY: test-unicode
test-unicode:
	$(MAKE) --directory=tests test_unicode
.PHONY: test-mem
VALGRIND = valgrind --leak-check=yes --error-exitcode=1
@ -285,11 +290,11 @@ test: all namespaceTest check test-xxhsum-c c90test test-tools
.PHONY: test-inline
test-inline:
$(MAKE) -C tests test
$(MAKE) -C tests test_multiInclude
.PHONY: test-all
test-all: CFLAGS += -Werror
test-all: test test32 clangtest cxxtest usan test-inline listL120 trailingWhitespace staticAnalyze
test-all: test test32 clangtest cxxtest usan test-inline listL120 trailingWhitespace staticAnalyze test-unicode
.PHONY: test-tools
test-tools:

View File

@ -3,6 +3,19 @@ CFLAGS += -Wall -Wextra -g
NM = nm
GREP = grep
# Executables get a .exe suffix when $(OS) starts with "Windows",
# and no suffix at all everywhere else.
ifeq (,$(filter Windows%,$(OS)))
EXT =
else
EXT =.exe
endif
# Enable the Unicode test by default only when the locale advertises UTF-8.
# LC_ALL and LC_CTYPE are consulted in addition to LANG, and the usual
# spellings of the codeset suffix are accepted (en_US.UTF-8, en_US.utf8, ...),
# so that a UTF-8 environment is not skipped just because of how it is spelled.
# The default can always be overridden with ENABLE_UNICODE=0 or ENABLE_UNICODE=1.
ifneq (,$(filter %UTF-8 %utf-8 %UTF8 %utf8,$(LC_ALL) $(LC_CTYPE) $(LANG)))
ENABLE_UNICODE ?= 1
else
ENABLE_UNICODE ?= 0
endif
.PHONY: default
default: all
@ -10,10 +23,10 @@ default: all
all: test
.PHONY: test
test: test_multiinclude
test: test_multiInclude test_unicode
.PHONY: test_multiinclude
test_multiinclude:
.PHONY: test_multiInclude
test_multiInclude:
@$(MAKE) clean
# compile without xxhash.o, ensure symbols exist within target
# note : built using only default rules
@ -34,6 +47,25 @@ test_multiinclude:
# ! $(NM) multiInclude | $(GREP) TESTN_
#@$(MAKE) clean
# Build a local xxhsum directly from the sources in the parent directory.
# Headers are prerequisites only; just the .c files are handed to the compiler.
xxhsum$(EXT): ../xxhash.c ../xxhash.h ../xxh3.h ../xxhsum.c
	$(CC) $(CFLAGS) $(LDFLAGS) $(filter %.c,$^) -o $@
# Make sure that Unicode filenames work.
# https://github.com/Cyan4973/xxHash/issues/293
#
# Two variants of the target exist, selected by ENABLE_UNICODE:
#  - ENABLE_UNICODE == 0 : only print a notice explaining how to force the test.
#  - anything else       : build and run generate_unicode_test, then run the
#                          unicode_test.sh script with $(SHELL).
.PHONY: test_unicode
ifeq (0,$(ENABLE_UNICODE))
test_unicode:
@echo "Skipping Unicode test, your terminal doesn't appear to support UTF-8."
@echo "Try with ENABLE_UNICODE=1"
else
# Needs the locally built xxhsum$(EXT) as well as the generator source.
test_unicode: xxhsum$(EXT) generate_unicode_test.c
# Generate a Unicode filename test dynamically
# to keep UTF-8 out of the source tree.
$(CC) $(CFLAGS) $(LDFLAGS) generate_unicode_test.c -o generate_unicode_test$(EXT)
./generate_unicode_test$(EXT)
$(SHELL) ./unicode_test.sh
endif
xxhash.o: ../xxhash.c ../xxhash.h
$(CC) $(CFLAGS) $(CPPFLAGS) $(LDFLAGS) -c -o $@ $<
@ -43,3 +75,4 @@ multiInclude_withxxhash: multiInclude.o xxhash.o
# Remove objects, test binaries and every artifact of the Unicode test.
clean:
	@$(RM) *.o \
		multiInclude multiInclude_withxxhash \
		*.unicode generate_unicode_test$(EXT) unicode_test.* xxhsum$(EXT)

View File

@ -0,0 +1,126 @@
/*
* Generates a Unicode test without using Unicode in the source
* files.
*
* Certain terminals don't properly handle UTF-8 (e.g. rxvt and the command prompt
* in the default codepage), and that can cause issues when editing text.
*
* We use this C file to generate a file with a Unicode filename, a
* file with a checksum of said file, and both a Windows batch script
* and a Unix shell script to test the file.
*/
#define _CRT_SECURE_NO_WARNINGS /* Silence warnings on MSVC */
#include <stdio.h>
/* Use a Japanese filename, something that can't be cheated with ANSI.
* yuniko-do.unicode (literally unicode.unicode) */
/* Use raw hex values to ensure that the output is well-formed UTF-8. It is also more C90 compliant. */
/* The array is NUL-terminated, so it can be passed to fopen() and printed with %s. */
static const char FILENAME[] = {
(char)0xe3, (char)0x83, (char)0xa6, /* U+30e6: Katakana letter yu */
(char)0xe3, (char)0x83, (char)0x8b, /* U+30cb: Katakana letter ni */
(char)0xe3, (char)0x82, (char)0xb3, /* U+30b3: Katakana letter ko */
(char)0xe3, (char)0x83, (char)0xbc, /* U+30fc: Katakana-Hiragana prolonged sound mark (dash) */
(char)0xe3, (char)0x83, (char)0x89, /* U+30c9: Katakana letter do */
'.','u','n','i','c','o','d','e','\0' /* ".unicode" (so we can glob in make clean and .gitignore) */
};
#ifdef _WIN32
/* The same text as above, but encoded in Windows UTF-16. */
/* Only compiled on Windows, where it is the name handed to _wfopen(); also NUL-terminated. */
static const wchar_t WFILENAME[] = { 0x30e6, 0x30cb, 0x30b3, 0x30fc, 0x30c9, L'.', L'u', L'n', L'i', L'c', L'o', L'd', L'e', L'\0' };
#endif
/*
 * Writes the Unicode test fixtures into the current directory:
 *   - a file named FILENAME (WFILENAME on Windows) containing "test\n",
 *   - unicode_test.xxh64, a checksum file referring to that file,
 *   - unicode_test.bat and unicode_test.sh, which run xxhsum on both.
 *
 * Returns 0 on success, or 1 as soon as one of the files cannot be opened.
 */
int main(void)
{
    FILE *f, *script, *checksum;

    /* Create our Unicode file. Use _wfopen on Windows as fopen doesn't support Unicode filenames. */
#ifdef _WIN32
    if (!(f = _wfopen(WFILENAME, L"wb"))) return 1;
#else
    if (!(f = fopen(FILENAME, "wb"))) return 1;
#endif
    fprintf(f, "test\n");
    fclose(f);

    /*
     * XXH64 checksum file with the precalculated checksum for said file.
     * The hash and the filename are separated by TWO characters (two spaces),
     * as in md5sum-style checksum lines; with a single separator the checker
     * would not find the filename where it expects it.
     */
    if (!(checksum = fopen("unicode_test.xxh64", "wb")))
        return 1;
    fprintf(checksum, "2d7f1808da1fa63c  %s\n", FILENAME);
    fclose(checksum);

    /* Create two scripts for both Windows and Unix. */

    /* Generate a Windows batch script. Always insert CRLF manually. */
    if (!(script = fopen("unicode_test.bat", "wb")))
        return 1;

    /* Disable echoing the commands. We do that ourselves the naive way. */
    fprintf(script, "@echo off\r\n");

    /* Change to codepage 65001 to enable UTF-8 support. */
    fprintf(script, "chcp 65001 >NUL 2>&1\r\n");

    /* First test a Unicode filename */
    fprintf(script, "echo Testing filename provided on command line...\r\n");
    fprintf(script, "echo xxhsum.exe \"%s\"\r\n", FILENAME);
    fprintf(script, "xxhsum.exe \"%s\"\r\n", FILENAME);

    /* Bail on error */
    fprintf(script, "if %%ERRORLEVEL%% neq 0 (\r\n");
    fprintf(script, " exit /B %%ERRORLEVEL%%\r\n");
    fprintf(script, ")\r\n");

    /* Then test a checksum file. */
    fprintf(script, "echo Testing a checksum file...\r\n");
    fprintf(script, "echo xxhsum.exe -c unicode_test.xxh64\r\n");
    fprintf(script, "xxhsum.exe -c unicode_test.xxh64\r\n");

    fprintf(script, "exit /B %%ERRORLEVEL%%\r\n");

    fclose(script);

    /* Generate a Unix shell script */
    if (!(script = fopen("unicode_test.sh", "wb")))
        return 1;

    fprintf(script, "#!/bin/sh\n");
    /*
     * Some versions of MSYS, MinGW and Cygwin do not support UTF-8, and the ones that
     * don't may error with something like this:
     *
     *    Error: Could not open '<mojibake>.unicode': No such file or directory.
     *
     * which is an internal error that happens when it tries to convert MinGW/Cygwin
     * paths to Windows paths.
     *
     * In that case, we bail to cmd.exe and the batch script, which supports UTF-8
     * on Windows 7 and later.
     */
    fprintf(script, "case $(uname) in\n");
    /* MinGW/MSYS converts /c to C:\ unless you have a double slash,
     * Cygwin does not. */
    fprintf(script, " *CYGWIN*)\n");
    fprintf(script, " exec cmd.exe /c unicode_test.bat\n");
    fprintf(script, " ;;\n");
    fprintf(script, " *MINGW*|*MSYS*)\n");
    fprintf(script, " exec cmd.exe //c unicode_test.bat\n");
    fprintf(script, " ;;\n");
    fprintf(script, "esac\n");

    /* First test a Unicode filename */
    fprintf(script, "echo Testing filename provided on command line...\n");
    fprintf(script, "echo './xxhsum \"%s\" || exit $?'\n", FILENAME);
    fprintf(script, "./xxhsum \"%s\" || exit $?\n", FILENAME);

    /* Then test a checksum file. */
    fprintf(script, "echo Testing a checksum file...\n");
    fprintf(script, "echo './xxhsum -c unicode_test.xxh64 || exit $?'\n");
    fprintf(script, "./xxhsum -c unicode_test.xxh64 || exit $?\n");

    fclose(script);

    return 0;
}

View File

@ -55,7 +55,7 @@ int main(void)
XXH3_64bits_update(&state, input, sizeof(input));
XXH64_hash_t const h = XXH3_64bits_digest(&state);
printf("hash '%s' : %0llx \n", input, (unsigned long long)h);
printf("hash '%s' : %08x%08x \n", input, (unsigned)(h >> 32), (unsigned)h);
return 0;
}

167
xxhsum.c
View File

@ -126,6 +126,69 @@ static __inline int IS_CONSOLE(FILE* stdStream) {
# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
#endif
/* Unicode helpers for Windows */
#if defined(_WIN32)
/*
 * Returns a freshly malloc'ed UTF-16 copy of the UTF-8 string `str`,
 * or NULL when either the conversion or the allocation fails.
 * Like strdup, the caller owns the result and must free() it.
 */
static wchar_t *utf8_to_utf16(const char *str)
{
    wchar_t *wide = NULL;
    /* First pass: no output buffer, only ask how many wchar_t are needed. */
    int const nbWideChars = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);

    if (nbWideChars != 0) {
        wide = (wchar_t *)malloc((size_t)nbWideChars * sizeof(wchar_t));
        /* Second pass: do the conversion; drop the buffer if it fails. */
        if (wide != NULL
         && MultiByteToWideChar(CP_UTF8, 0, str, -1, wide, nbWideChars) == 0) {
            free(wide);
            wide = NULL;
        }
    }
    return wide;
}
/*
 * Returns a freshly malloc'ed UTF-8 copy of the UTF-16 string `str`,
 * or NULL when either the conversion or the allocation fails.
 * Like strdup, the caller owns the result and must free() it.
 */
static char *utf16_to_utf8(const wchar_t *str)
{
    char *utf8 = NULL;
    /* First pass: no output buffer, only ask how many bytes are needed. */
    int const nbBytes = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);

    if (nbBytes != 0) {
        utf8 = (char *)malloc((size_t)nbBytes * sizeof(char));
        /* Second pass: do the conversion; drop the buffer if it fails. */
        if (utf8 != NULL
         && WideCharToMultiByte(CP_UTF8, 0, str, -1, utf8, nbBytes, NULL, NULL) == 0) {
            free(utf8);
            utf8 = NULL;
        }
    }
    return utf8;
}
/*
 * Windows replacement for fopen that accepts UTF-8 filenames.
 *
 * The filename is converted to UTF-16 and handed to _wfopen together with
 * the (already wide) mode string. Returns the opened stream, or NULL when
 * the filename cannot be converted or _wfopen itself fails.
 */
static FILE *XXH_fopen_wrapped(const char *filename, const wchar_t *mode)
{
    FILE *stream;
    wchar_t *const wname = utf8_to_utf16(filename);

    if (wname == NULL) {
        return NULL;
    }
    stream = _wfopen(wname, mode);
    free(wname);   /* the temporary wide name is no longer needed once the file is open */
    return stream;
}
/*
* Since we always use literals in the "mode" argument, it is just easier to prepend "L" to
* the string literal (via token pasting, so "rb" becomes L"rb") to make it a wide string
* and avoid the hassle of a second manual conversion.
* Consequently, `mode` must be a string literal, never a variable.
*/
# define XXH_fopen(filename, mode) XXH_fopen_wrapped(filename, L##mode)
#else
# define XXH_fopen(filename, mode) fopen(filename, mode)
#endif
/* ************************************
* Basic Types
@ -489,7 +552,7 @@ static size_t BMK_selectBenchedSize(const char* fileName)
}
static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specificTest)
static int BMK_benchFiles(char** fileNamesTable, int nbFiles, U32 specificTest)
{
int result = 0;
int fileIdx;
@ -498,7 +561,7 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
const char* const inFileName = fileNamesTable[fileIdx];
assert(inFileName != NULL);
{
FILE* const inFile = fopen( inFileName, "rb" );
FILE* const inFile = XXH_fopen( inFileName, "rb" );
size_t const benchedSize = BMK_selectBenchedSize(inFileName);
char* const buffer = (char*)calloc(benchedSize+16+3, 1);
void* const alignedBuffer = (buffer+15) - (((size_t)(buffer+15)) & 0xF); /* align on next 16 bytes */
@ -1020,7 +1083,7 @@ static int BMK_hash(const char* fileName,
fileName = "stdin";
SET_BINARY_MODE(stdin);
} else {
inFile = fopen( fileName, "rb" );
inFile = XXH_fopen( fileName, "rb" );
}
if (inFile==NULL) {
DISPLAY("Error: Could not open '%s': %s. \n", fileName, strerror(errno));
@ -1090,7 +1153,7 @@ static int BMK_hash(const char* fileName,
/* BMK_hashFiles:
* if fnTotal==0, read from stdin instead
*/
static int BMK_hashFiles(const char** fnList, int fnTotal,
static int BMK_hashFiles(char** fnList, int fnTotal,
algoType hashType, endianess displayEndianess)
{
int fnNb;
@ -1408,7 +1471,7 @@ static void parseFile1(ParseFileArg* parseFileArg)
}
do {
FILE* const fp = fopen(parsedLine.filename, "rb");
FILE* const fp = XXH_fopen(parsedLine.filename, "rb");
if (fp == NULL) {
lineStatus = LineStatus_failedToOpen;
break;
@ -1519,7 +1582,7 @@ static int checkFile(const char* inFileName,
inFileName = "stdin";
inFile = stdin;
} else {
inFile = fopen( inFileName, "rt" );
inFile = XXH_fopen( inFileName, "rt" );
}
if (inFile == NULL) {
@ -1577,7 +1640,7 @@ static int checkFile(const char* inFileName,
}
static int checkFiles(const char** fnList, int fnTotal,
static int checkFiles(char** fnList, int fnTotal,
const endianess displayEndianess,
U32 strictMode,
U32 statusOnly,
@ -1693,7 +1756,7 @@ static unsigned readU32FromChar(const char** stringPtr) {
return result;
}
int main(int argc, const char** argv)
static int XXH_main(int argc, char** argv)
{
int i, filenamesStart = 0;
const char* const exename = argv[0];
@ -1815,3 +1878,91 @@ int main(int argc, const char** argv)
return BMK_hashFiles(argv+filenamesStart, argc-filenamesStart, algo, displayEndianess);
}
}
#if defined(_WIN32)
/*
 * Converts a UTF-16 argv to UTF-8.
 *
 * Returns a malloc'ed, NULL-terminated array of `argc` malloc'ed UTF-8
 * strings, to be released with free_argv().
 * Returns NULL if `argv` is NULL, `argc` is negative, an allocation fails,
 * or any single argument cannot be converted. In that case nothing is
 * leaked and the caller never sees an array with NULL holes in it.
 */
static char **convert_argv(int argc, wchar_t **argv)
{
    char **buf;
    int i;

    if (argc < 0 || argv == NULL) {
        return NULL;
    }
    buf = (char **)malloc(((size_t)argc + 1) * sizeof(char *));
    if (buf == NULL) {
        return NULL;
    }
    for (i = 0; i < argc; i++) {
        buf[i] = utf16_to_utf8(argv[i]);
        if (buf[i] == NULL) {
            /* Roll back everything converted so far and report the failure. */
            while (i > 0) {
                free(buf[--i]);
            }
            free(buf);
            return NULL;
        }
    }
    buf[argc] = NULL;
    return buf;
}
/*
 * Releases an argument array produced by convert_argv:
 * the first `argc` strings, then the array itself.
 * A NULL `argv` is accepted and ignored.
 */
static void free_argv(int argc, char **argv)
{
    int remaining = argc;

    if (argv != NULL) {
        while (remaining > 0) {
            remaining--;
            free(argv[remaining]);
        }
        free(argv);
    }
}
/*
* On Windows, main's argv parameter is useless. Instead of UTF-8, you get ANSI
* encoding, and unknown characters will show up as mojibake.
*
* While this doesn't affect most programs, what does happen is that we can't
* open any files with Unicode filenames.
*
* On MSVC or when -municode is used in MSYS2, we can just use wmain to get
* UTF-16 command line arguments and convert them to UTF-8.
*
* However, without the -municode flag (which isn't even available on the
* original MinGW), we will get a linker error.
*
* To fix this, we can combine main with GetCommandLineW and
* CommandLineToArgvW to get the real UTF-16 arguments.
*/
#if defined(_MSC_VER) || defined(_UNICODE) || defined(UNICODE)
#if defined(__cplusplus)
extern "C"
#endif
int wmain(int argc, wchar_t **utf16_argv)
{
    char **argv;
#else
int main(int argc, char **argv)
{
    /* Ignore main's own argv and fetch the real UTF-16 command line instead.
     * CommandLineToArgvW can fail and return NULL; this is checked below. */
    wchar_t **utf16_argv = CommandLineToArgvW(GetCommandLineW(), &argc);
#endif
    int ret;

    /* Convert the UTF-16 arguments to UTF-8.
     * A missing UTF-16 argument vector is treated like a failed conversion. */
    argv = (utf16_argv != NULL) ? convert_argv(argc, utf16_argv) : NULL;

    if (argv == NULL) {
        fprintf(stderr, "Error converting command line arguments!\n");
        ret = 1;
    } else {
        /* While we're here, we will set stderr to unbuffered mode to make text
         * display instantly on MinGW. */
        setvbuf(stderr, NULL, _IONBF, 0);

        /* Call our real main function */
        ret = XXH_main(argc, argv);

        free_argv(argc, argv);
    }
#if !(defined(_MSC_VER) || defined(_UNICODE) || defined(UNICODE))
    /* CommandLineToArgvW needs to be freed with LocalFree.
     * Passing NULL (failed call above) is harmless: LocalFree ignores it. */
    LocalFree(utf16_argv);
#endif
    return ret;
}
#else
/* Outside of Windows, argv needs no conversion: forward it to XXH_main as is. */
int main(int argc, char **argv)
{
    int const exitCode = XXH_main(argc, argv);
    return exitCode;
}
#endif