From f66a3e2474a0e4fab50745f352fe060c8b026891 Mon Sep 17 00:00:00 2001
From: Roja A M
Date: Sat, 26 Aug 2023 13:45:34 +0530
Subject: [PATCH] Fix - Invalid prettification of object with unicode as key

Add a TK_UNICODE token type and a _read_unicode_with_braces reader to both
the JavaScript and the Python tokenizer. The reader is tried after
_read_punctuation and before the TK_UNKNOWN fallback, and reads a braced
unicode escape (\u{...}) as one token.

The reader returns no token unless the escape pattern actually consumed
text, so malformed input still reaches the TK_UNKNOWN fallback.
---
 js/src/javascript/tokenizer.js              | 15 +++++++++++++++
 python/jsbeautifier/javascript/tokenizer.py | 13 +++++++++++++
 test/data/javascript/tests.js               |  9 +++++++++
 3 files changed, 37 insertions(+)

diff --git a/js/src/javascript/tokenizer.js b/js/src/javascript/tokenizer.js
index ee35c571..1abbc598 100644
--- a/js/src/javascript/tokenizer.js
+++ b/js/src/javascript/tokenizer.js
@@ -57,6 +57,7 @@ var TOKEN = {
   BLOCK_COMMENT: 'TK_BLOCK_COMMENT',
   COMMENT: 'TK_COMMENT',
   DOT: 'TK_DOT',
+  UNICODE: 'TK_UNICODE',
   UNKNOWN: 'TK_UNKNOWN',
   START: BASETOKEN.START,
   RAW: BASETOKEN.RAW,
@@ -129,6 +130,7 @@ var Tokenizer = function(input_string, options) {
     xml: pattern_reader.matching(/[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[^}]+?}|!\[CDATA\[[^\]]*?\]\]|)(\s*{[^}]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{([^{}]|{[^}]+?})+?}))*\s*(\/?)\s*>/),
     single_quote: templatable.until(/['\\\n\r\u2028\u2029]/),
     double_quote: templatable.until(/["\\\n\r\u2028\u2029]/),
+    unicode: pattern_reader.matching(/\\u\{[0-9a-fA-F]+\}/),
     template_text: templatable.until(/[`\\$]/),
     template_expression: templatable.until(/[`}\\]/)
   };
@@ -174,6 +176,7 @@ Tokenizer.prototype._get_next_token = function(previous_token, open_token) { //
   token = token || this._read_regexp(c, previous_token);
   token = token || this._read_xml(c, previous_token);
   token = token || this._read_punctuation();
+  token = token || this._read_unicode_with_braces(c);
   token = token || this._create_token(TOKEN.UNKNOWN, this._input.next());
 
   return token;
@@ -457,6 +460,18 @@ Tokenizer.prototype._read_xml = function(c, previous_token) {
   return null;
 };
 
+Tokenizer.prototype._read_unicode_with_braces = function(c) {
+  // Emit a TK_UNICODE token only when the \u{...} pattern consumed text;
+  // otherwise return null so the caller can fall through to TOKEN.UNKNOWN.
+  if (c === '\\' && this._input.peek(1) === 'u') {
+    var unicode = this.__patterns.unicode.read();
+    if (unicode !== '') {
+      return this._create_token(TOKEN.UNICODE, unicode);
+    }
+  }
+  return null;
+};
+
 function unescape_string(s) {
   // You think that a regex would work for this
   // return s.replace(/\\x([0-9a-f]{2})/gi, function(match, val) {
diff --git a/python/jsbeautifier/javascript/tokenizer.py b/python/jsbeautifier/javascript/tokenizer.py
index 0eeb8a07..1727cea9 100644
--- a/python/jsbeautifier/javascript/tokenizer.py
+++ b/python/jsbeautifier/javascript/tokenizer.py
@@ -51,6 +51,7 @@ class TokenTypes(BaseTokenTypes):
     BLOCK_COMMENT = "TK_BLOCK_COMMENT"
     COMMENT = "TK_COMMENT"
     DOT = "TK_DOT"
+    UNICODE = "TK_UNICODE"
     UNKNOWN = "TK_UNKNOWN"
 
     def __init__(self):
@@ -164,6 +165,8 @@ class TokenizerPatterns(BaseTokenizerPatterns):
         self.template_text = templatable.until(r"[`\\$]")
         self.template_expression = templatable.until(r"[`}\\]")
 
+        self.unicode = pattern.matching(r"\\u\{[0-9a-fA-F]+\}")
+
 
 class Tokenizer(BaseTokenizer):
     positionable_operators = positionable_operators
@@ -229,6 +232,7 @@ class Tokenizer(BaseTokenizer):
         token = token or self._read_regexp(c, previous_token)
         token = token or self._read_xml(c, previous_token)
         token = token or self._read_punctuation()
+        token = token or self._read_unicode_with_braces(c)
         token = token or self._create_token(TOKEN.UNKNOWN, self._input.next())
 
         return token
@@ -500,6 +504,15 @@ class Tokenizer(BaseTokenizer):
 
         return token
 
+    def _read_unicode_with_braces(self, c):
+        # Emit a TK_UNICODE token only when the \u{...} pattern consumed text;
+        # otherwise return None so the caller can fall through to TOKEN.UNKNOWN.
+        if c == "\\" and self._input.peek(1) == "u":
+            escape_text = self._patterns.unicode.read()
+            if escape_text != "":
+                return self._create_token(TOKEN.UNICODE, escape_text)
+        return None
+
     __regexTokens = {
         TOKEN.COMMENT,
         TOKEN.START_EXPR,
diff --git a/test/data/javascript/tests.js b/test/data/javascript/tests.js
index 25967eea..a1fa610a 100644
--- a/test/data/javascript/tests.js
+++ b/test/data/javascript/tests.js
@@ -1750,6 +1750,15 @@ exports.test_data = {
         {
           input: 'fn[0]`tagged`',
           output: 'fn[0] `tagged`'
+        },
+        {
+          comment: 'Issue #2159: Invalid prettification of object with unicode escape character as object key - test scenario: object with unicode as key',
+          input: '{\\\\u{1d4b6}:"ascr"}',
+          output: [
+            '{',
+            '    \\\\u{1d4b6}: "ascr"',
+            '}'
+          ]
         }
       ]
     }, {