diff --git a/include/llvm/Support/CommandLine.h b/include/llvm/Support/CommandLine.h index 531760b01d0..8fb4630667a 100644 --- a/include/llvm/Support/CommandLine.h +++ b/include/llvm/Support/CommandLine.h @@ -1745,6 +1745,59 @@ void PrintHelpMessage(bool Hidden=false, bool Categorized=false); /// llvm::cl::ParseCommandLineOptions(). void getRegisteredOptions(StringMap<Option*> &Map); +//===----------------------------------------------------------------------===// +// Standalone command line processing utilities. +// + +/// \brief Saves strings in the inheritor's stable storage and returns a stable +/// raw character pointer. +class StringSaver { +public: + virtual const char *SaveString(const char *Str) = 0; + virtual ~StringSaver() {}; // Pacify -Wnon-virtual-dtor. +}; + +/// \brief Tokenizes a command line that can contain escapes and quotes. +// +/// The quoting rules match those used by GCC and other tools that use +/// libiberty's buildargv() or expandargv() utilities, and do not match bash. +/// They differ from buildargv() on treatment of backslashes that do not escape +/// a special character to make it possible to accept most Windows file paths. +/// +/// \param [in] Source The string to be split on whitespace with quotes. +/// \param [in] Saver Delegates back to the caller for saving parsed strings. +/// \param [out] NewArgv All parsed strings are appended to NewArgv. +void TokenizeGNUCommandLine(StringRef Source, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv); + +/// \brief Tokenizes a Windows command line which may contain quotes and escaped +/// quotes. +/// +/// See MSDN docs for CommandLineToArgvW for information on the quoting rules. +/// http://msdn.microsoft.com/en-us/library/windows/desktop/17w5ykft(v=vs.85).aspx +/// +/// \param [in] Source The string to be split on whitespace with quotes. +/// \param [in] Saver Delegates back to the caller for saving parsed strings. +/// \param [out] NewArgv All parsed strings are appended to NewArgv. +void TokenizeWindowsCommandLine(StringRef Source, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv); + +/// \brief String tokenization function type. Should be compatible with either +/// Windows or Unix command line tokenizers. +typedef void (*TokenizerCallback)(StringRef Source, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv); + +/// \brief Expand response files on a command line recursively using the given +/// StringSaver and tokenization strategy. Argv should contain the command line +/// before expansion and will be modified in place. +/// +/// \param [in] Saver Delegates back to the caller for saving parsed strings. +/// \param [in] Tokenize Tokenization strategy. Typically Unix or Windows. +/// \param [in,out] Argv Command line into which to expand response files. +/// \return true if all @files were expanded successfully or there were none. +bool ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, + SmallVectorImpl<const char *> &Argv); + } // End namespace cl } // End namespace llvm diff --git a/lib/Support/CommandLine.cpp b/lib/Support/CommandLine.cpp index 6ab03dc675d..c2a25a18cd9 100644 --- a/lib/Support/CommandLine.cpp +++ b/lib/Support/CommandLine.cpp @@ -17,12 +17,14 @@ //===----------------------------------------------------------------------===// #include "llvm/Support/CommandLine.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallString.h" #include "llvm/ADT/StringMap.h" #include "llvm/ADT/Twine.h" #include "llvm/Config/config.h" +#include "llvm/Support/ConvertUTF.h" #include "llvm/Support/Debug.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Host.h" @@ -434,39 +436,137 @@ static bool EatsUnboundedNumberOfValues(const Option *O) { O->getNumOccurrencesFlag() == cl::OneOrMore; } -/// ParseCStringVector - Break INPUT up wherever one or more -/// whitespace characters are found, and store the resulting tokens in -/// OUTPUT. The tokens stored in OUTPUT are dynamically allocated -/// using strdup(), so it is the caller's responsibility to free() -/// them later. -/// -static void ParseCStringVector(std::vector<char *> &OutputVector, - const char *Input) { - // Characters which will be treated as token separators: - StringRef Delims = " \v\f\t\r\n"; +static bool isWhitespace(char C) { + return strchr(" \t\n\r\f\v", C); +} - StringRef WorkStr(Input); - while (!WorkStr.empty()) { - // If the first character is a delimiter, strip them off. - if (Delims.find(WorkStr[0]) != StringRef::npos) { - size_t Pos = WorkStr.find_first_not_of(Delims); - if (Pos == StringRef::npos) Pos = WorkStr.size(); - WorkStr = WorkStr.substr(Pos); +static bool isQuote(char C) { + return C == '\"' || C == '\''; +} + +static bool isGNUSpecial(char C) { + return strchr("\\\"\' ", C); +} + +void cl::TokenizeGNUCommandLine(StringRef Src, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv) { + SmallString<128> Token; + for (size_t I = 0, E = Src.size(); I != E; ++I) { + // Consume runs of whitespace. + if (Token.empty()) { + while (I != E && isWhitespace(Src[I])) + ++I; + if (I == E) break; + } + + // Backslashes can escape backslashes, spaces, and other quotes. Otherwise + // they are literal. This makes it much easier to read Windows file paths. + if (I + 1 < E && Src[I] == '\\' && isGNUSpecial(Src[I + 1])) { + ++I; // Skip the escape. + Token.push_back(Src[I]); continue; } - // Find position of first delimiter. - size_t Pos = WorkStr.find_first_of(Delims); - if (Pos == StringRef::npos) Pos = WorkStr.size(); + // Consume a quoted string. + if (isQuote(Src[I])) { + char Quote = Src[I++]; + while (I != E && Src[I] != Quote) { + // Backslashes are literal, unless they escape a special character. + if (Src[I] == '\\' && I + 1 != E && isGNUSpecial(Src[I + 1])) + ++I; + Token.push_back(Src[I]); + ++I; + } + if (I == E) break; + continue; + } - // Everything from 0 to Pos is the next word to copy. - char *NewStr = (char*)malloc(Pos+1); - memcpy(NewStr, WorkStr.data(), Pos); - NewStr[Pos] = 0; - OutputVector.push_back(NewStr); + // End the token if this is whitespace. + if (isWhitespace(Src[I])) { + if (!Token.empty()) + NewArgv.push_back(Saver.SaveString(Token.c_str())); + Token.clear(); + continue; + } - WorkStr = WorkStr.substr(Pos); + // This is a normal character. Append it. + Token.push_back(Src[I]); } + + // Append the last token after hitting EOF with no whitespace. + if (!Token.empty()) + NewArgv.push_back(Saver.SaveString(Token.c_str())); +} + +void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver, + SmallVectorImpl<const char *> &NewArgv) { + llvm_unreachable("FIXME not implemented"); +} + +static bool ExpandResponseFile(const char *FName, StringSaver &Saver, + TokenizerCallback Tokenizer, + SmallVectorImpl<const char *> &NewArgv) { + OwningPtr<MemoryBuffer> MemBuf; + if (MemoryBuffer::getFile(FName, MemBuf)) + return false; + StringRef Str(MemBuf->getBufferStart(), MemBuf->getBufferSize()); + + // If we have a UTF-16 byte order mark, convert to UTF-8 for parsing. + ArrayRef<char> BufRef(MemBuf->getBufferStart(), MemBuf->getBufferEnd()); + std::string UTF8Buf; + if (hasUTF16ByteOrderMark(BufRef)) { + if (!convertUTF16ToUTF8String(BufRef, UTF8Buf)) + return false; + Str = StringRef(UTF8Buf); + } + + // Tokenize the contents into NewArgv. + Tokenizer(Str, Saver, NewArgv); + + return true; +} + +/// \brief Expand response files on a command line recursively using the given +/// StringSaver and tokenization strategy. +bool cl::ExpandResponseFiles(StringSaver &Saver, TokenizerCallback Tokenizer, + SmallVectorImpl<const char *> &Argv) { + unsigned RspFiles = 0; + bool AllExpanded = false; + + // Don't cache Argv.size() because it can change. + for (unsigned I = 0; I != Argv.size(); ) { + const char *Arg = Argv[I]; + if (Arg[0] != '@') { + ++I; + continue; + } + + // If we have too many response files, leave some unexpanded. This avoids + // crashing on self-referential response files. + if (RspFiles++ > 20) + return false; + + // Replace this response file argument with the tokenization of its + // contents. Nested response files are expanded in subsequent iterations. + // FIXME: If a nested response file uses a relative path, is it relative to + // the cwd of the process or the response file? + SmallVector<const char *, 0> ExpandedArgv; + if (!ExpandResponseFile(Arg + 1, Saver, Tokenizer, ExpandedArgv)) { + AllExpanded = false; + continue; + } + Argv.erase(Argv.begin() + I); + Argv.insert(Argv.begin() + I, ExpandedArgv.begin(), ExpandedArgv.end()); + } + return AllExpanded; +} + +namespace { + class StrDupSaver : public StringSaver { + const char *SaveString(const char *Str) LLVM_OVERRIDE { + return strdup(Str); + } + }; } /// ParseEnvironmentOptions - An alternative entry point to the @@ -487,45 +587,21 @@ void cl::ParseEnvironmentOptions(const char *progName, const char *envVar, // Get program's "name", which we wouldn't know without the caller // telling us. - std::vector<char*> newArgv; + SmallVector<const char *, 20> newArgv; newArgv.push_back(strdup(progName)); // Parse the value of the environment variable into a "command line" // and hand it off to ParseCommandLineOptions(). - ParseCStringVector(newArgv, envValue); + StrDupSaver Saver; + TokenizeGNUCommandLine(envValue, Saver, newArgv); int newArgc = static_cast<int>(newArgv.size()); ParseCommandLineOptions(newArgc, &newArgv[0], Overview); // Free all the strdup()ed strings. - for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end(); + for (SmallVectorImpl<const char *>::iterator i = newArgv.begin(), + e = newArgv.end(); i != e; ++i) - free(*i); -} - - -/// ExpandResponseFiles - Copy the contents of argv into newArgv, -/// substituting the contents of the response files for the arguments -/// of type @file. -static void ExpandResponseFiles(unsigned argc, const char*const* argv, - std::vector<char*>& newArgv) { - for (unsigned i = 1; i != argc; ++i) { - const char *arg = argv[i]; - - if (arg[0] == '@') { - // TODO: we should also support recursive loading of response files, - // since this is how gcc behaves. (From their man page: "The file may - // itself contain additional @file options; any such options will be - // processed recursively.") - - // Mmap the response file into memory. - OwningPtr<MemoryBuffer> respFilePtr; - if (!MemoryBuffer::getFile(arg + 1, respFilePtr)) { - ParseCStringVector(newArgv, respFilePtr->getBufferStart()); - continue; - } - } - newArgv.push_back(strdup(arg)); - } + free(const_cast<char *>(*i)); } void cl::ParseCommandLineOptions(int argc, const char * const *argv, @@ -540,9 +616,11 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, "No options specified!"); // Expand response files. - std::vector<char*> newArgv; - newArgv.push_back(strdup(argv[0])); - ExpandResponseFiles(argc, argv, newArgv); + SmallVector<const char *, 20> newArgv; + for (int i = 0; i != argc; ++i) + newArgv.push_back(strdup(argv[i])); + StrDupSaver Saver; + ExpandResponseFiles(Saver, TokenizeGNUCommandLine, newArgv); argv = &newArgv[0]; argc = static_cast<int>(newArgv.size()); @@ -838,9 +916,10 @@ void cl::ParseCommandLineOptions(int argc, const char * const *argv, // Free the memory allocated by ExpandResponseFiles. // Free all the strdup()ed strings. - for (std::vector<char*>::iterator i = newArgv.begin(), e = newArgv.end(); + for (SmallVectorImpl<const char *>::iterator i = newArgv.begin(), + e = newArgv.end(); i != e; ++i) - free(*i); + free(const_cast<char *>(*i)); // If we had an error processing our arguments, don't let the program execute if (ErrorParsing) exit(1); diff --git a/test/Other/ResponseFile.ll b/test/Other/ResponseFile.ll index b8b3d0a9023..914e5480f20 100644 --- a/test/Other/ResponseFile.ll +++ b/test/Other/ResponseFile.ll @@ -1,5 +1,9 @@ -; RUN: echo %s > %t.list -; RUN: llvm-as @%t.list -o %t.bc +; Test that we can recurse, at least a little bit. The -time-passes flag here +; is a hack to make sure that neither echo nor the shell expands the response +; file for us. Tokenization with quotes is tested in unittests. +; RUN: echo %s > %t.list1 +; RUN: echo "-time-passes @%t.list1" > %t.list2 +; RUN: llvm-as @%t.list2 -o %t.bc ; RUN: llvm-nm %t.bc 2>&1 | FileCheck %s ; CHECK: T foobar diff --git a/unittests/Support/CommandLineTest.cpp b/unittests/Support/CommandLineTest.cpp index cd235d274e6..7a1c3821d7d 100644 --- a/unittests/Support/CommandLineTest.cpp +++ b/unittests/Support/CommandLineTest.cpp @@ -7,6 +7,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/ADT/STLExtras.h" #include "llvm/Support/CommandLine.h" #include "llvm/Config/config.h" #include "gtest/gtest.h" @@ -118,4 +119,27 @@ TEST(CommandLineTest, UseOptionCategory) { "Category."; } +class StrDupSaver : public cl::StringSaver { + const char *SaveString(const char *Str) LLVM_OVERRIDE { + return strdup(Str); + } +}; + +TEST(CommandLineTest, TokenizeGNUCommandLine) { + const char *Input = "foo\\ bar \"foo bar\" \'foo bar\' 'foo\\\\bar' " + "foo\"bar\"baz C:\\src\\foo.cpp \"C:\\src\\foo.cpp\""; + const char *const Output[] = { "foo bar", "foo bar", "foo bar", "foo\\bar", + "foobarbaz", "C:\\src\\foo.cpp", + "C:\\src\\foo.cpp" }; + SmallVector<const char *, 0> Actual; + StrDupSaver Saver; + cl::TokenizeGNUCommandLine(Input, Saver, Actual); + EXPECT_EQ(array_lengthof(Output), Actual.size()); + for (unsigned I = 0, E = Actual.size(); I != E; ++I) { + if (I < array_lengthof(Output)) + EXPECT_STREQ(Output[I], Actual[I]); + free(const_cast<char *>(Actual[I])); + } +} + } // anonymous namespace