Reorganize tokenizer

Liam Newman 2018-08-05 18:45:17 -07:00
parent 28e86b9af8
commit 00e5407b85
6 changed files with 209 additions and 220 deletions

View File: core/tokenizer.js

@@ -31,6 +31,7 @@ var Token = require('../core/token').Token;
var TokenStream = require('../core/tokenstream').TokenStream;
var TOKEN = {
START: 'TK_START',
RAW: 'TK_RAW',
EOF: 'TK_EOF'
};
@@ -53,16 +54,16 @@ function Tokenizer(input_string) { // jshint unused:false
this.reset();
var current;
var last = new Token(TOKEN.RAW, '');
var last = new Token(TOKEN.START, '');
var open_token = null;
var open_stack = [];
var comments = new TokenStream();
while (last.type !== TOKEN.EOF) {
current = this.get_next_token();
current = this.get_next_token(last);
while (this.is_comment(current)) {
comments.add(current);
current = this.get_next_token();
current = this.get_next_token(last);
}
if (!comments.isEmpty()) {
@@ -90,7 +91,7 @@ function Tokenizer(input_string) { // jshint unused:false
this.reset = function() {};
this.get_next_token = function() {
this.get_next_token = function(last_token) { // jshint unused:false
this.readWhitespace();
var resulting_string = this._input.read(/.+/g);
if (resulting_string) {

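In the hunks above, the loop is now seeded with a synthetic TK_START token instead of TK_RAW, and the previous token is threaded through get_next_token as a parameter rather than fetched from the token stream. A minimal sketch of the resulting loop, where tokenize and read_next are illustrative names, not part of this commit:

var TOKEN = { START: 'TK_START', RAW: 'TK_RAW', EOF: 'TK_EOF' };

function Token(type, text) {
  this.type = type;
  this.text = text;
}

// read_next(last) stands in for get_next_token: any reader that eventually
// returns a token of type TOKEN.EOF.
function tokenize(read_next) {
  var tokens = [];
  // Seed with a synthetic START token so even the first real token has a
  // well-defined previous token to inspect.
  var last = new Token(TOKEN.START, '');
  while (last.type !== TOKEN.EOF) {
    last = read_next(last); // context is passed in, not pulled from the stream
    tokens.push(last);
  }
  return tokens;
}

This is also why TokenStream.last() disappears in the JavaScript and Python TokenStream files below: once the previous token travels as an argument, nothing needs to ask the stream for its tail.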
View File: core/tokenstream.js

@@ -54,14 +54,6 @@ function TokenStream(parent_token) {
return val;
};
this.last = function() {
var val = null;
if (!this.isEmpty()) {
val = this._tokens[this._tokens_length - 1];
}
return val;
};
this.peek = function(index) {
var val = null;
index = index || 0;

View File: javascript/tokenizer.js

@@ -27,7 +27,6 @@
*/
var InputScanner = require('../core/inputscanner').InputScanner;
var Token = require('../core/token').Token;
var BaseTokenizer = require('../core/tokenizer').Tokenizer;
var BASETOKEN = require('../core/tokenizer').TOKEN;
var acorn = require('../core/acorn');
@@ -54,6 +53,8 @@ var TOKEN = {
COMMENT: 'TK_COMMENT',
DOT: 'TK_DOT',
UNKNOWN: 'TK_UNKNOWN',
START: BASETOKEN.START,
RAW: BASETOKEN.RAW,
EOF: BASETOKEN.EOF
};
@@ -66,10 +67,20 @@ function Tokenizer(input_string, opts) {
var digit = /[0-9]/;
this.positionable_operators = '!= !== % & && * ** + - / : < << <= == === > >= >> >>> ? ^ | ||'.split(' ');
var punct = this.positionable_operators.concat(
// non-positionable operators - these do not follow operator position settings
'! %= &= *= **= ++ += , -- -= /= :: <<= = => >>= >>>= ^= |= ~ ...'.split(' '));
this.positionable_operators = (
">>> === !== " +
"<< && >= ** != == <= >> || " +
"< / - + > : & % ? ^ | *").split(' ');
// IMPORTANT: this must be sorted longest to shortest or tokenizing may not work.
// Also, you must update positionable operators separately from punct
var punct =
">>>= " +
"... >>= <<= === >>> !== **= " +
"=> ^= :: /= << <= == && -= >= >> != -- += ** || ++ %= &= *= |= " +
"= ! , ? > < : / ^ - + * & % ~ | .";
var punct_pattern = new RegExp(punct.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&").replace(/ /g, '|'), 'g');
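The IMPORTANT comment above carries real reasoning: punct_pattern joins the operators with '|', and a JavaScript regex alternation takes the first alternative that matches, not the longest one. A small demonstration of why the list must stay sorted longest to shortest:

// Shorter operators listed first shadow the longer ones:
var short_first = />|>=|>>>=/g; // wrong order
var long_first = />>>=|>=|>/g;  // the ordering used above

console.log('a >>>= b'.match(short_first)); // [ '>', '>', '>' ]
console.log('a >>>= b'.match(long_first));  // [ '>>>=' ]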
// words which should always start on new line.
this.line_starters = 'continue,try,throw,return,var,let,const,if,switch,case,default,for,while,break,function,import,export'.split(',');
@@ -105,19 +116,26 @@ function Tokenizer(input_string, opts) {
in_html_comment = false;
};
this.get_next_token = function() {
var resulting_string;
var last_token;
if (this._tokens.isEmpty()) {
// For the sake of tokenizing we can pretend that there was an open brace to start
last_token = new Token(TOKEN.START_BLOCK, '{');
} else {
last_token = this._tokens.last();
}
this.get_next_token = function(last_token) {
this.readWhitespace();
var token = null;
var c = this._input.peek();
token = token || this._read_singles(c);
token = token || this._read_word(last_token);
token = token || this._read_comment(c);
token = token || this._read_string(c);
token = token || this._read_regexp(c, last_token);
token = token || this._read_xml(c, last_token);
token = token || this._read_non_javascript(c);
token = token || this._read_punctuation();
token = token || this.create_token(TOKEN.UNKNOWN, this._input.next());
return token;
};
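get_next_token is now a flat chain: each _read_* helper returns either a token or a falsy value, and || falls through to the next candidate, ending in the UNKNOWN fallback. A self-contained sketch of the same dispatch pattern, with hypothetical read_number and read_name readers standing in for the helpers above:

// Each reader consumes input and returns a token, or null to decline.
function read_number(input) {
  var m = /^[0-9]+/.exec(input.text.slice(input.pos));
  if (m) { input.pos += m[0].length; return { type: 'NUMBER', text: m[0] }; }
  return null;
}

function read_name(input) {
  var m = /^[a-zA-Z_$][a-zA-Z0-9_$]*/.exec(input.text.slice(input.pos));
  if (m) { input.pos += m[0].length; return { type: 'NAME', text: m[0] }; }
  return null;
}

function next_token(input) {
  var token = null;
  token = token || read_number(input);
  token = token || read_name(input);
  // Fallback mirrors create_token(TOKEN.UNKNOWN, this._input.next()).
  token = token || { type: 'UNKNOWN', text: input.text.charAt(input.pos++) };
  return token;
}

console.log(next_token({ text: 'foo42', pos: 0 })); // { type: 'NAME', text: 'foo42' }

Ordering encodes priority: earlier readers win whenever more than one could match at the current position.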
this._read_word = function(last_token) {
var resulting_string;
resulting_string = this._input.read(acorn.identifier);
if (resulting_string !== '') {
if (!(last_token.type === TOKEN.DOT ||
@@ -136,40 +154,111 @@ function Tokenizer(input_string, opts) {
if (resulting_string !== '') {
return this.create_token(TOKEN.WORD, resulting_string);
}
};
var c = this._input.next();
this._read_singles = function(c) {
var token = null;
if (c === null) {
return this.create_token(TOKEN.EOF, '');
token = this.create_token(TOKEN.EOF, '');
} else if (c === '(' || c === '[') {
token = this.create_token(TOKEN.START_EXPR, c);
} else if (c === ')' || c === ']') {
token = this.create_token(TOKEN.END_EXPR, c);
} else if (c === '{') {
token = this.create_token(TOKEN.START_BLOCK, c);
} else if (c === '}') {
token = this.create_token(TOKEN.END_BLOCK, c);
} else if (c === ';') {
token = this.create_token(TOKEN.SEMICOLON, c);
}
if (token) {
this._input.next();
}
return token;
};
this._read_punctuation = function() {
var resulting_string = this._input.read(punct_pattern);
if (resulting_string !== '') {
if (resulting_string === ',') {
return this.create_token(TOKEN.COMMA, resulting_string);
} else if (resulting_string === '=') {
return this.create_token(TOKEN.EQUALS, resulting_string);
} else if (resulting_string === '.') {
return this.create_token(TOKEN.DOT, resulting_string);
} else {
return this.create_token(TOKEN.OPERATOR, resulting_string);
}
}
};
this._read_non_javascript = function(c) {
var resulting_string = '';
if (c === '#') {
c = this._input.next();
if (this._tokens.isEmpty() && this._input.peek() === '!') {
// shebang
resulting_string = c;
while (this._input.hasNext() && c !== '\n') {
c = this._input.next();
resulting_string += c;
}
return this.create_token(TOKEN.UNKNOWN, resulting_string.trim() + '\n');
}
// Spidermonkey-specific sharp variables for circular references. Considered obsolete.
var sharp = '#';
if (this._input.hasNext() && this._input.testChar(digit)) {
do {
c = this._input.next();
sharp += c;
} while (this._input.hasNext() && c !== '#' && c !== '=');
if (c === '#') {
//
} else if (this._input.peek() === '[' && this._input.peek(1) === ']') {
sharp += '[]';
this._input.next();
this._input.next();
} else if (this._input.peek() === '{' && this._input.peek(1) === '}') {
sharp += '{}';
this._input.next();
this._input.next();
}
return this.create_token(TOKEN.WORD, sharp);
}
this._input.back();
} else if (c === '<' && (this._input.peek(1) === '?' || this._input.peek(1) === '%')) {
resulting_string = this._input.read(template_pattern);
if (resulting_string) {
resulting_string = resulting_string.replace(acorn.allLineBreaks, '\n');
return this.create_token(TOKEN.STRING, resulting_string);
}
} else if (c === '<' && this._input.match(/<\!--/g)) {
c = '<!--';
while (this._input.hasNext() && !this._input.testChar(acorn.newline)) {
c += this._input.next();
}
in_html_comment = true;
return this.create_token(TOKEN.COMMENT, c);
} else if (c === '-' && in_html_comment && this._input.match(/-->/g)) {
in_html_comment = false;
return this.create_token(TOKEN.COMMENT, '-->');
}
return null;
};
if (c === '(' || c === '[') {
return this.create_token(TOKEN.START_EXPR, c);
}
if (c === ')' || c === ']') {
return this.create_token(TOKEN.END_EXPR, c);
}
if (c === '{') {
return this.create_token(TOKEN.START_BLOCK, c);
}
if (c === '}') {
return this.create_token(TOKEN.END_BLOCK, c);
}
if (c === ';') {
return this.create_token(TOKEN.SEMICOLON, c);
}
this._read_comment = function(c) {
var token = null;
if (c === '/') {
var token;
var comment = '';
// peek for comment /* ... */
if (this._input.peek() === '*') {
this._input.back();
if (this._input.peek(1) === '*') {
// peek for comment /* ... */
comment = this._input.read(block_comment_pattern);
var directives = directives_core.get_directives(comment);
if (directives && directives.ignore === 'start') {
@@ -178,55 +267,60 @@ function Tokenizer(input_string, opts) {
comment = comment.replace(acorn.allLineBreaks, '\n');
token = this.create_token(TOKEN.BLOCK_COMMENT, comment);
token.directives = directives;
return token;
}
// peek for comment // ...
if (this._input.peek() === '/') {
this._input.back();
} else if (this._input.peek(1) === '/') {
// peek for comment // ...
comment = this._input.read(comment_pattern);
return this.create_token(TOKEN.COMMENT, comment);
token = this.create_token(TOKEN.COMMENT, comment);
}
}
return token;
};
var startXmlRegExp = /<()([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
this._read_string = function(c) {
if (c === '`' || c === "'" || c === '"') {
var resulting_string = this._input.next();
this.has_char_escapes = false;
var xmlRegExp = /[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
function allowRegExOrXML() {
// regex and xml can only appear in specific locations during parsing
return (last_token.type === TOKEN.RESERVED && in_array(last_token.text, ['return', 'case', 'throw', 'else', 'do', 'typeof', 'yield'])) ||
(last_token.type === TOKEN.END_EXPR && last_token.text === ')' &&
last_token.parent && last_token.parent.type === TOKEN.RESERVED && in_array(last_token.parent.text, ['if', 'while', 'for'])) ||
(in_array(last_token.type, [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK,
TOKEN.END_BLOCK, TOKEN.OPERATOR, TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA
]));
}
var isString = (c === '`' || c === "'" || c === '"');
var isRegExp = (c === '/') && allowRegExOrXML();
var isXML = (opts.e4x && c === "<" && this._input.test(startXmlRegExp, -1)) && allowRegExOrXML();
var sep = c,
esc = false;
this.has_char_escapes = false;
resulting_string = c;
if (isString) {
if (sep === '`') {
resulting_string += this._read_string('`', true, '${');
if (c === '`') {
resulting_string += this._read_string_recursive('`', true, '${');
} else {
resulting_string += this._read_string(sep);
resulting_string += this._read_string_recursive(c);
}
} else if (isRegExp) {
if (this.has_char_escapes && opts.unescape_strings) {
resulting_string = unescape_string(resulting_string);
}
if (this._input.peek() === c) {
resulting_string += this._input.next();
}
return this.create_token(TOKEN.STRING, resulting_string);
}
return null;
};
this._allow_regexp_or_xml = function(last_token) {
// regex and xml can only appear in specific locations during parsing
return (last_token.type === TOKEN.RESERVED && in_array(last_token.text, ['return', 'case', 'throw', 'else', 'do', 'typeof', 'yield'])) ||
(last_token.type === TOKEN.END_EXPR && last_token.text === ')' &&
last_token.parent && last_token.parent.type === TOKEN.RESERVED && in_array(last_token.parent.text, ['if', 'while', 'for'])) ||
(in_array(last_token.type, [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.START,
TOKEN.END_BLOCK, TOKEN.OPERATOR, TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA
]));
};
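_allow_regexp_or_xml, now with TOKEN.START in its list, encodes the classic '/' ambiguity: after a value, '/' is division; after an operator, keyword, comma, or opening bracket it begins a regexp literal. A cut-down, hypothetical version of the predicate to make the two cases concrete:

// Keeps only a few of the token types checked above.
function allow_regexp(last_type) {
  return ['TK_START', 'TK_START_EXPR', 'TK_OPERATOR',
    'TK_EQUALS', 'TK_COMMA', 'TK_SEMICOLON'].indexOf(last_type) !== -1;
}

console.log(allow_regexp('TK_EQUALS')); // true:  x = /foo/g  is a regexp
console.log(allow_regexp('TK_WORD'));   // false: x / y / z   is division

Including TOKEN.START replaces the old trick, removed above, of pretending an empty token stream began with an open brace: a regexp at the very start of a file is still recognized without synthesizing a fake START_BLOCK token.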
this._read_regexp = function(c, last_token) {
if (c === '/' && this._allow_regexp_or_xml(last_token)) {
// handle regexp
//
var resulting_string = this._input.next();
var esc = false;
var in_char_class = false;
while (this._input.hasNext() &&
((esc || in_char_class || this._input.peek() !== sep) &&
((esc || in_char_class || this._input.peek() !== c) &&
!this._input.testChar(acorn.newline))) {
resulting_string += this._input.peek();
if (!esc) {
@@ -242,10 +336,27 @@ function Tokenizer(input_string, opts) {
this._input.next();
}
} else if (isXML) {
if (this._input.peek() === c) {
resulting_string += this._input.next();
// regexps may have modifiers /regexp/MOD , so fetch those, too
// Only [gim] are valid, but if the user puts in garbage, do what we can to take it.
resulting_string += this._input.read(acorn.identifier);
}
return this.create_token(TOKEN.STRING, resulting_string);
}
return null;
};
var startXmlRegExp = /<()([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
var xmlRegExp = /[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
this._read_xml = function(c, last_token) {
if (opts.e4x && c === "<" && this._input.test(startXmlRegExp) && this._allow_regexp_or_xml(last_token)) {
// handle e4x xml literals
//
this._input.back();
var xmlStr = '';
var match = this._input.match(startXmlRegExp);
if (match) {
@@ -280,110 +391,9 @@ function Tokenizer(input_string, opts) {
}
}
if (isRegExp || isString) {
if (this.has_char_escapes && opts.unescape_strings) {
resulting_string = unescape_string(resulting_string);
}
if (this._input.peek() === sep) {
resulting_string += sep;
this._input.next();
if (sep === '/') {
// regexps may have modifiers /regexp/MOD , so fetch those, too
// Only [gim] are valid, but if the user puts in garbage, do what we can to take it.
resulting_string += this._input.read(acorn.identifier);
}
}
return this.create_token(TOKEN.STRING, resulting_string);
}
if (c === '#') {
if (this._tokens.isEmpty() && this._input.peek() === '!') {
// shebang
resulting_string = c;
while (this._input.hasNext() && c !== '\n') {
c = this._input.next();
resulting_string += c;
}
return this.create_token(TOKEN.UNKNOWN, resulting_string.trim() + '\n');
}
// Spidermonkey-specific sharp variables for circular references. Considered obsolete.
var sharp = '#';
if (this._input.hasNext() && this._input.testChar(digit)) {
do {
c = this._input.next();
sharp += c;
} while (this._input.hasNext() && c !== '#' && c !== '=');
if (c === '#') {
//
} else if (this._input.peek() === '[' && this._input.peek(1) === ']') {
sharp += '[]';
this._input.next();
this._input.next();
} else if (this._input.peek() === '{' && this._input.peek(1) === '}') {
sharp += '{}';
this._input.next();
this._input.next();
}
return this.create_token(TOKEN.WORD, sharp);
}
}
if (c === '<' && (this._input.peek() === '?' || this._input.peek() === '%')) {
this._input.back();
var template_match = this._input.match(template_pattern);
if (template_match) {
c = template_match[0];
c = c.replace(acorn.allLineBreaks, '\n');
return this.create_token(TOKEN.STRING, c);
}
}
if (c === '<' && this._input.match(/\!--/g)) {
c = '<!--';
while (this._input.hasNext() && !this._input.testChar(acorn.newline)) {
c += this._input.next();
}
in_html_comment = true;
return this.create_token(TOKEN.COMMENT, c);
}
if (c === '-' && in_html_comment && this._input.match(/->/g)) {
in_html_comment = false;
return this.create_token(TOKEN.COMMENT, '-->');
}
if (c === '.') {
if (this._input.peek() === '.' && this._input.peek(1) === '.') {
c += this._input.next() + this._input.next();
return this.create_token(TOKEN.OPERATOR, c);
}
return this.create_token(TOKEN.DOT, c);
}
if (in_array(c, punct)) {
while (this._input.hasNext() && in_array(c + this._input.peek(), punct)) {
c += this._input.next();
if (!this._input.hasNext()) {
break;
}
}
if (c === ',') {
return this.create_token(TOKEN.COMMA, c);
} else if (c === '=') {
return this.create_token(TOKEN.EQUALS, c);
} else {
return this.create_token(TOKEN.OPERATOR, c);
}
}
return this.create_token(TOKEN.UNKNOWN, c);
return null;
};
function unescape_string(s) {
// You think that a regex would work for this
// return s.replace(/\\x([0-9a-f]{2})/gi, function(match, val) {
@@ -449,7 +459,7 @@ function Tokenizer(input_string, opts) {
// handle string
//
this._read_string = function(delimiter, allow_unescaped_newlines, start_sub) {
this._read_string_recursive = function(delimiter, allow_unescaped_newlines, start_sub) {
// Template strings can traverse lines without escape characters.
// Other strings cannot
var current_char;
@@ -486,9 +496,9 @@ function Tokenizer(input_string, opts) {
if (start_sub && resulting_string.indexOf(start_sub, resulting_string.length - start_sub.length) !== -1) {
if (delimiter === '`') {
resulting_string += this._read_string('}', allow_unescaped_newlines, '`');
resulting_string += this._read_string_recursive('}', allow_unescaped_newlines, '`');
} else {
resulting_string += this._read_string('`', allow_unescaped_newlines, '${');
resulting_string += this._read_string_recursive('`', allow_unescaped_newlines, '${');
}
if (this._input.hasNext()) {

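One more rename in this file worth flagging: _read_string became _read_string_recursive because template literals are handled by mutual recursion. On '${' the terminator flips to '}', and a nested backtick inside the substitution flips it back. A simplified standalone sketch of that delimiter swap, where read_until and its pos cursor are illustrative and escape handling is omitted:

// Scan until `delimiter`; when `start_sub` is seen, recurse with the
// delimiters swapped so nested templates are consumed as one string token.
function read_until(s, pos, delimiter, start_sub) {
  var out = '';
  while (pos.i < s.length && s[pos.i] !== delimiter) {
    out += s[pos.i++];
    if (start_sub && out.slice(-start_sub.length) === start_sub) {
      out += delimiter === '`' ?
        read_until(s, pos, '}', '`') :  // entered a ${ ... } substitution
        read_until(s, pos, '`', '${');  // nested template inside it
    }
  }
  if (pos.i < s.length) { out += s[pos.i++]; } // consume the delimiter
  return out;
}

var pos = { i: 1 }; // start just past the opening backtick
console.log('`' + read_until('`a${ `b${c}` }d`', pos, '`', '${'));
// prints the whole literal: `a${ `b${c}` }d`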
View File: core/tokenizer.py

@@ -29,6 +29,7 @@ from ..core.tokenstream import TokenStream
class TokenTypes:
START = 'TK_START'
RAW = 'TK_RAW'
EOF = 'TK_EOF'
@@ -58,17 +59,17 @@ class Tokenizer:
self._tokens = TokenStream()
current = None
last = Token(TOKEN.RAW,'')
last = Token(TOKEN.START,'')
open_token = None
open_stack = []
comments = TokenStream()
while last.type != TOKEN.EOF:
current = self.get_next_token()
current = self.get_next_token(last)
while self.is_comment(current):
comments.add(current)
current = self.get_next_token()
current = self.get_next_token(last)
if not comments.isEmpty():
current.comments_before = comments
@@ -90,7 +91,7 @@ class Tokenizer:
def reset(self):
pass
def get_next_token(self):
def get_next_token(self, last_token):
self.readWhitespace()
resulting_string = self._input.read(re.compile(r'.+'))
if resulting_string:

View File: core/tokenstream.py

@@ -51,13 +51,6 @@ class TokenStream:
else:
raise StopIteration
def last(self):
val = None
if not self.isEmpty():
val = self._tokens[-1]
return val
def peek(self, index=0):
val = None
index += self._position

View File: javascript/tokenizer.py

@@ -24,7 +24,6 @@
import re
from ..core.inputscanner import InputScanner
from ..core.token import Token
from ..core.tokenizer import TokenTypes as BaseTokenTypes
from ..core.tokenizer import Tokenizer as BaseTokenizer
from ..core.directives import Directives
@@ -131,14 +130,7 @@ class Tokenizer(BaseTokenizer):
(current_token.text == ')' and open_token.text == '(') or
(current_token.text == '}' and open_token.text == '{')))
def get_next_token(self):
if self._tokens.isEmpty():
# For the sake of tokenizing we can pretend that there was an open
# brace to start
last_token = Token(TOKEN.START_BLOCK, '{')
else:
last_token = self._tokens.last()
def get_next_token(self, last_token):
self.readWhitespace()
resulting_string = self._input.read(self.acorn.identifier)
@@ -203,7 +195,7 @@ class Tokenizer(BaseTokenizer):
return (last_token.type == TOKEN.RESERVED and last_token.text in ['return', 'case', 'throw', 'else', 'do', 'typeof', 'yield']) or \
(last_token.type == TOKEN.END_EXPR and last_token.text == ')' and
last_token.parent and last_token.parent.type == TOKEN.RESERVED and last_token.parent.text in ['if', 'while', 'for']) or \
(last_token.type in [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.END_BLOCK, TOKEN.OPERATOR,
(last_token.type in [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.START, TOKEN.END_BLOCK, TOKEN.OPERATOR,
TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA])
self.has_char_escapes = False