Fix Unicode support on Windows, minor Windows tweaks

- Unicode filenames should now work, with a method that works with
and without Unicode mode on Windows.
   - Added a test in the Makefile
 - Use unbuffered stderr output on Windows, fixes output not updating
immediately on MinGW.
 - Fix some missing $(EXT)s in the Makefile, causing Clang to emit
xxhsum instead of xxhsum.exe on Windows, as well as xxhsum's rule
ignoring $(FLAGS).
This commit is contained in:
easyaspi314 (Devin) 2020-02-12 20:37:34 -05:00
parent aee51d5e7b
commit 261c28b676
2 changed files with 156 additions and 20 deletions

View File

@ -76,9 +76,10 @@ default: lib xxhsum_and_links
all: lib xxhsum xxhsum_inlinedXXH
xxhsum: xxhash.o xxhsum.o ## generate command line interface (CLI)
$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@$(EXT)
xxhsum32: CFLAGS += -m32 ## generate CLI in 32-bits mode
xxhsum32: xxhash.c xxhsum.c ## do not generate object (avoid mixing different ABI)
# 32-bit CLI build. The target name already carries $(EXT), so the output is
# simply $@ (appending $(EXT) again would produce e.g. xxhsum32.exe.exe).
xxhsum32$(EXT): CFLAGS += -m32 ## generate CLI in 32-bits mode
xxhsum32$(EXT): xxhash.c xxhsum.c ## do not generate object (avoid mixing different ABI)
	$(CC) $(FLAGS) $^ $(LDFLAGS) -o $@
xxhash.o: xxhash.h xxh3.h
@ -89,7 +90,7 @@ xxhsum.o: xxhash.h
xxhsum_and_links: xxhsum xxh32sum xxh64sum xxh128sum
xxh32sum xxh64sum xxh128sum: xxhsum
ln -sf $^ $@
ln -sf $<$(EXT) $@$(EXT)
xxhsum_inlinedXXH: CPPFLAGS += -DXXH_INLINE_ALL
xxhsum_inlinedXXH: xxhsum.c
@ -141,7 +142,7 @@ clean: ## remove all build artifacts
@$(RM) -r *.dSYM # Mac OS-X specific
@$(RM) core *.o libxxhash.*
@$(RM) xxhsum$(EXT) xxhsum32$(EXT) xxhsum_inlinedXXH$(EXT)
@$(RM) xxh32sum xxh64sum xxh128sum
@$(RM) xxh32sum$(EXT) xxh64sum$(EXT) xxh128sum$(EXT)
@echo cleaning completed
@ -154,20 +155,29 @@ clean: ## remove all build artifacts
.PHONY: check
check: xxhsum ## basic tests for xxhsum CLI, set RUN_ENV for emulated environments
# stdin
$(RUN_ENV) ./xxhsum < xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) < xxhash.c
# multiple files
$(RUN_ENV) ./xxhsum xxhash.* xxhsum.*
$(RUN_ENV) ./xxhsum$(EXT) xxhash.* xxhsum.*
# internal bench
$(RUN_ENV) ./xxhsum -bi1
$(RUN_ENV) ./xxhsum$(EXT) -bi1
# file bench
$(RUN_ENV) ./xxhsum -bi1 xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) -bi1 xxhash.c
# 32-bit
$(RUN_ENV) ./xxhsum -H0 xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) -H0 xxhash.c
# 128-bit
$(RUN_ENV) ./xxhsum -H2 xxhash.c
$(RUN_ENV) ./xxhsum$(EXT) -H2 xxhash.c
# request incorrect variant
$(RUN_ENV) ./xxhsum -H9 xxhash.c ; test $$? -eq 1
$(RUN_ENV) ./xxhsum$(EXT) -H9 xxhash.c ; test $$? -eq 1
# Make sure that Unicode filenames work.
# https://github.com/Cyan4973/xxHash/issues/293
# The recipe is the Japanese equivalent of:
#   echo "This filename is Unicode." > "Unicode.txt"
.PHONY: test-unicode
test-unicode: xxhsum check
	# Test Unicode filenames.
	echo "このファイル名はユニコードです。" > "ユニコード.txt"
	$(RUN_ENV) ./xxhsum$(EXT) "ユニコード.txt"
	@$(RM) "ユニコード.txt"
.PHONY: test-mem
VALGRIND = valgrind --leak-check=yes --error-exitcode=1
@ -281,7 +291,7 @@ preview-man: man
.PHONY: test
test: DEBUGFLAGS += -DDEBUGLEVEL=1
test: all namespaceTest check test-xxhsum-c c90test test-tools
test: all namespaceTest check test-xxhsum-c c90test test-tools test-unicode
.PHONY: test-inline
test-inline:

142
xxhsum.c
View File

@ -126,6 +126,60 @@ static __inline int IS_CONSOLE(FILE* stdStream) {
# define S_ISREG(x) (((x) & S_IFMT) == S_IFREG)
#endif
#if defined(_WIN32)
/*
 * Returns a newly allocated UTF-16 copy of the UTF-8 string `str`
 * (strdup-like: the caller owns the result and must free() it).
 * Returns NULL if either conversion call reports 0 or if malloc() fails.
 */
static wchar_t *utf8_to_utf16(const char *str)
{
    wchar_t *wide;
    /* Sizing pass: no output buffer, the return value is the required length. */
    int const needed = MultiByteToWideChar(CP_UTF8, 0, str, -1, NULL, 0);

    if (needed == 0) {
        return NULL;
    }

    wide = (wchar_t *)malloc((size_t)needed * sizeof(wchar_t));
    if (wide == NULL) {
        return NULL;
    }

    /* Conversion pass into the freshly allocated buffer. */
    if (MultiByteToWideChar(CP_UTF8, 0, str, -1, wide, needed) == 0) {
        free(wide);
        return NULL;
    }
    return wide;
}
/*
 * Returns a newly allocated UTF-8 copy of the UTF-16 string `str`
 * (strdup-like: the caller owns the result and must free() it).
 * Returns NULL if either conversion call reports 0 or if malloc() fails.
 */
static char *utf16_to_utf8(const wchar_t *str)
{
    char *utf8;
    /* Sizing pass: no output buffer, the return value is the required length. */
    int const needed = WideCharToMultiByte(CP_UTF8, 0, str, -1, NULL, 0, NULL, NULL);

    if (needed == 0) {
        return NULL;
    }

    utf8 = (char *)malloc((size_t)needed * sizeof(char));
    if (utf8 == NULL) {
        return NULL;
    }

    /* Conversion pass into the freshly allocated buffer. */
    if (WideCharToMultiByte(CP_UTF8, 0, str, -1, utf8, needed, NULL, NULL) == 0) {
        free(utf8);
        return NULL;
    }
    return utf8;
}
/*
 * We need to use UTF-16 and _wfopen_s on Windows, otherwise we can't open
 * Unicode filenames.
 *
 * filename : UTF-8 encoded path.
 * mode     : wide-character fopen mode string.
 * Returns the opened stream, or NULL on failure. On failure errno is always
 * left non-zero so that callers can report it with strerror(errno).
 */
static FILE *XXH_fopen_wrapped(const char *filename, const wchar_t *mode)
{
    FILE *f = NULL;
    wchar_t *wide_filename;

    errno = 0;
    wide_filename = utf8_to_utf16(filename);
    if (wide_filename == NULL) {
        /* Keep any errno a failed allocation may have stored; otherwise the
         * name itself could not be converted, so report it as invalid. */
        if (errno == 0) {
            errno = EINVAL;
        }
        return NULL;
    }

    {   int const open_error = (int)_wfopen_s(&f, wide_filename, mode);
        free(wide_filename);
        /* Publish the error code only after free(), so that the cleanup
         * cannot overwrite it. */
        errno = open_error;
        if (open_error != 0) {
            f = NULL;
        }
    }
    return f;
}
/* Since we always use literals in the "mode" argument, it is just easier to prepend "L" to the string
* literal (turning it into a wide string) and avoid the hassle of a *second* conversion. */
# define XXH_fopen(filename, mode) XXH_fopen_wrapped(filename, L##mode)
#else
# define XXH_fopen(filename, mode) fopen(filename, mode)
#endif
/* ************************************
* Basic Types
@ -489,7 +543,7 @@ static size_t BMK_selectBenchedSize(const char* fileName)
}
static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specificTest)
static int BMK_benchFiles(char** fileNamesTable, int nbFiles, U32 specificTest)
{
int result = 0;
int fileIdx;
@ -498,7 +552,7 @@ static int BMK_benchFiles(const char** fileNamesTable, int nbFiles, U32 specific
const char* const inFileName = fileNamesTable[fileIdx];
assert(inFileName != NULL);
{
FILE* const inFile = fopen( inFileName, "rb" );
FILE* const inFile = XXH_fopen( inFileName, "rb" );
size_t const benchedSize = BMK_selectBenchedSize(inFileName);
char* const buffer = (char*)calloc(benchedSize+16+3, 1);
void* const alignedBuffer = (buffer+15) - (((size_t)(buffer+15)) & 0xF); /* align on next 16 bytes */
@ -1033,7 +1087,7 @@ static int BMK_hash(const char* fileName,
fileName = "stdin";
SET_BINARY_MODE(stdin);
} else {
inFile = fopen( fileName, "rb" );
inFile = XXH_fopen( fileName, "rb" );
}
if (inFile==NULL) {
DISPLAY("Error: Could not open '%s': %s. \n", fileName, strerror(errno));
@ -1103,7 +1157,7 @@ static int BMK_hash(const char* fileName,
/* BMK_hashFiles:
* if fnTotal==0, read from stdin instead
*/
static int BMK_hashFiles(const char** fnList, int fnTotal,
static int BMK_hashFiles(char** fnList, int fnTotal,
algoType hashType, endianess displayEndianess)
{
int fnNb;
@ -1421,7 +1475,7 @@ static void parseFile1(ParseFileArg* parseFileArg)
}
do {
FILE* const fp = fopen(parsedLine.filename, "rb");
FILE* const fp = XXH_fopen(parsedLine.filename, "rb");
if (fp == NULL) {
lineStatus = LineStatus_failedToOpen;
break;
@ -1532,7 +1586,7 @@ static int checkFile(const char* inFileName,
inFileName = "stdin";
inFile = stdin;
} else {
inFile = fopen( inFileName, "rt" );
inFile = XXH_fopen( inFileName, "rt" );
}
if (inFile == NULL) {
@ -1590,7 +1644,7 @@ static int checkFile(const char* inFileName,
}
static int checkFiles(const char** fnList, int fnTotal,
static int checkFiles(char** fnList, int fnTotal,
const endianess displayEndianess,
U32 strictMode,
U32 statusOnly,
@ -1706,7 +1760,7 @@ static unsigned readU32FromChar(const char** stringPtr) {
return result;
}
int main(int argc, const char** argv)
static int XXH_main(int argc, char** argv)
{
int i, filenamesStart = 0;
const char* const exename = argv[0];
@ -1828,3 +1882,75 @@ int main(int argc, const char** argv)
return BMK_hashFiles(argv+filenamesStart, argc-filenamesStart, algo, displayEndianess);
}
}
#if defined(_WIN32)
/*
 * Builds a UTF-8 copy of a UTF-16 argument vector.
 *
 * Returns a malloc'ed array of argc + 1 pointers: entry i holds the result of
 * utf16_to_utf8(argv[i]) (which is NULL if that conversion failed) and the
 * last entry is NULL. Returns NULL if the array itself cannot be allocated.
 */
static char **convert_argv(int argc, wchar_t **argv)
{
    char **const utf8_args = (char **)malloc((size_t)(argc + 1) * sizeof(char *));

    if (utf8_args != NULL) {
        int n = 0;
        while (n < argc) {
            utf8_args[n] = utf16_to_utf8(argv[n]);
            n++;
        }
        utf8_args[argc] = NULL;
    }
    return utf8_args;
}
/*
 * Releases an argument vector: frees the first argc entries of argv, then the
 * array itself. Individual entries may be NULL. A NULL argv is accepted and
 * ignored, so the function is safe to call after a failed allocation.
 */
static void free_argv(int argc, char **argv)
{
    int i;
    if (argv == NULL) {
        return;
    }
    for (i = 0; i < argc; i++) {
        free(argv[i]);
    }
    free(argv);
}
/*
 * On Windows, main's argv parameter is useless. Instead of UTF-8, you get ANSI
 * encoding, and unknown characters will show up as mojibake.
 *
 * While this doesn't affect most programs, what does happen is that we can't
 * open any files with Unicode filenames.
 *
 * Another possible method is to use wmain, but in order for that to link on
 * MinGW, an extra flag is required.
 *
 * Instead, we can use GetCommandLineW and CommandLineToArgvW to get a fully
 * functional wmain equivalent.
 *
 * We still use wmain if UNICODE is defined, though.
 *
 * Either way, the UTF-16 arguments are converted to UTF-8 and handed to
 * XXH_main. If the command line cannot be retrieved or converted, an error is
 * printed and 1 is returned without calling XXH_main.
 */
#if defined(_UNICODE) || defined(UNICODE)
int wmain(int argc, wchar_t **utf16_argv)
{
    char **argv;
#else
int main(int argc, char **argv)
{
    wchar_t **utf16_argv = CommandLineToArgvW(GetCommandLineW(), &argc);
#endif
    int ret = 1;
    int args_ok;
    int i;

    /* While we're here, we will set stderr to unbuffered mode to make text
     * display instantly on MinGW. */
    setvbuf(stderr, NULL, _IONBF, 0);

    /* Never hand a NULL vector to convert_argv: it reads every entry. */
    argv = (utf16_argv != NULL) ? convert_argv(argc, utf16_argv) : NULL;

    /* XXH_main reads its arguments unconditionally, so every entry has to
     * have been converted successfully. */
    args_ok = (argv != NULL);
    for (i = 0; args_ok && i < argc; i++) {
        if (argv[i] == NULL) {
            args_ok = 0;
        }
    }

    if (args_ok) {
        ret = XXH_main(argc, argv);
    } else {
        fprintf(stderr, "Error: could not convert the command line arguments to UTF-8.\n");
    }

    if (argv != NULL) {
        free_argv(argc, argv);
    }
#if !defined(_UNICODE) && !defined(UNICODE)
    /* Microsoft says to use LocalFree here. */
    if (utf16_argv != NULL) {
        LocalFree(utf16_argv);
    }
#endif
    return ret;
}
#else
/* Non-Windows entry point: the arguments are passed to XXH_main unchanged
 * and its result becomes the process exit status. */
int main(int argc, char **argv)
{
    int const exit_status = XXH_main(argc, argv);
    return exit_status;
}
#endif