llvm-mc: Support escaped characters in string literals (for .ascii and .asciz)

git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@79010 91177308-0d34-0410-b5e6-96231b3b80d8
2025-01-19 08:24:12 +00:00 · 2009-08-14 18:19:52 +00:00 · 2009-08-14 18:19:52 +00:00 · 1ab7594946
commit 1ab7594946
parent 2247276c6f
3 changed files with 93 additions and 7 deletions
--- a/test/MC/AsmParser/directive_ascii.s
+++ b/test/MC/AsmParser/directive_ascii.s
@ -1,5 +1,6 @@
 # RUN: llvm-mc -triple i386-unknown-unknown %s | FileCheck %s

+        .data
 # CHECK: TEST0:
 TEST0:  
        .ascii
@ -20,5 +21,28 @@ TEST2:
 # CHECK: .byte 0
 TEST3:  
        .asciz "B", "C"
-
-       
+        
+# CHECK: TEST4:
+# CHECK: .byte 1
+# CHECK: .byte 1
+# CHECK: .byte 7
+# CHECK: .byte 0
+# CHECK: .byte 56
+# CHECK: .byte 1
+# CHECK: .byte 0
+# CHECK: .byte 49
+# CHECK: .byte 0
+TEST4:  
+        .ascii "\1\01\07\08\001\0001\b\0"
+        
+# CHECK: TEST5:
+# CHECK: .byte 8
+# CHECK: .byte 12
+# CHECK: .byte 10
+# CHECK: .byte 13
+# CHECK: .byte 9
+# CHECK: .byte 92
+# CHECK: .byte 34
+TEST5:
+        .ascii "\b\f\n\r\t\\\""
+        
--- a/tools/llvm-mc/AsmParser.cpp
+++ b/tools/llvm-mc/AsmParser.cpp
@ -765,6 +765,64 @@ bool AsmParser::ParseDirectiveSectionSwitch(const char *Segment,
  return false;
 }

+bool AsmParser::ParseEscapedString(std::string &Data) {
+  assert(Lexer.is(AsmToken::String) && "Unexpected current token!");
+
+  Data = "";
+  StringRef Str = Lexer.getTok().getStringContents();
+  for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+    if (Str[i] != '\\') {
+      Data += Str[i];
+      continue;
+    }
+
+    // Recognize escaped characters. Note that this escape semantics currently
+    // loosely follows Darwin 'as'. Notably, it doesn't support hex escapes.
+    ++i;
+    if (i == e)
+      return TokError("unexpected backslash at end of string");
+
+    // Recognize octal sequences.
+    if ((unsigned) (Str[i] - '0') <= 7) {
+      // Consume up to three octal characters.
+      unsigned Value = Str[i] - '0';
+
+      if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+        ++i;
+        Value = Value * 8 + (Str[i] - '0');
+
+        if (i + 1 != e && ((unsigned) (Str[i + 1] - '0')) <= 7) {
+          ++i;
+          Value = Value * 8 + (Str[i] - '0');
+        }
+      }
+
+      if (Value > 255)
+        return TokError("invalid octal escape sequence (out of range)");
+
+      Data += (unsigned char) Value;
+      continue;
+    }
+
+    // Otherwise recognize individual escapes.
+    switch (Str[i]) {
+    default:
+      // Just reject invalid escape sequences for now.
+      return TokError("invalid escape sequence (unrecognized character)");
+
+    case 'b': Data += '\b'; break;
+    case 'f': Data += '\f'; break;
+    case 'n': Data += '\n'; break;
+    case 'r': Data += '\r'; break;
+    case 't': Data += '\t'; break;
+    case '"': Data += '"'; break;
+    case '\\': Data += '\\'; break;
+    }
+  }
+
+  return false;
+}
+
 /// ParseDirectiveAscii:
 ///   ::= ( .ascii | .asciz ) [ "string" ( , "string" )* ]
 bool AsmParser::ParseDirectiveAscii(bool ZeroTerminated) {
@ -773,11 +831,11 @@ bool AsmParser::ParseDirectiveAscii(bool ZeroTerminated) {
      if (Lexer.isNot(AsmToken::String))
        return TokError("expected string in '.ascii' or '.asciz' directive");
      
-      // FIXME: This shouldn't use a const char* + strlen, the string could have
-      // embedded nulls.
-      // FIXME: Should have accessor for getting string contents.
-      StringRef Str = Lexer.getTok().getString();
-      Out.EmitBytes(Str.substr(1, Str.size() - 2));
+      std::string Data;
+      if (ParseEscapedString(Data))
+        return true;
+      
+      Out.EmitBytes(Data);
      if (ZeroTerminated)
        Out.EmitBytes(StringRef("\0", 1));
      
--- a/tools/llvm-mc/AsmParser.h
+++ b/tools/llvm-mc/AsmParser.h
@ -135,6 +135,10 @@ private:
  bool ParseDirectiveFile(SMLoc DirectiveLoc); // ".file"
  bool ParseDirectiveLine(SMLoc DirectiveLoc); // ".line"
  bool ParseDirectiveLoc(SMLoc DirectiveLoc); // ".loc"
+
+  /// ParseEscapedString - Parse the current token as a string which may include
+  /// escaped characters and return the string contents.
+  bool ParseEscapedString(std::string &Data);
 };

 } // end namespace llvm