Reorganize tokenizer

Liam Newman 2018-08-05 18:45:17 -07:00
parent 28e86b9af8
commit 00e5407b85
6 changed files with 209 additions and 220 deletions

View File

@@ -31,6 +31,7 @@ var Token = require('../core/token').Token;
var TokenStream = require('../core/tokenstream').TokenStream;
var TOKEN = {
START: 'TK_START',
RAW: 'TK_RAW',
EOF: 'TK_EOF'
};
@@ -53,16 +54,16 @@ function Tokenizer(input_string) { // jshint unused:false
this.reset();
var current;
var last = new Token(TOKEN.RAW, '');
var last = new Token(TOKEN.START, '');
var open_token = null;
var open_stack = [];
var comments = new TokenStream();
while (last.type !== TOKEN.EOF) {
current = this.get_next_token();
current = this.get_next_token(last);
while (this.is_comment(current)) {
comments.add(current);
current = this.get_next_token();
current = this.get_next_token(last);
}
if (!comments.isEmpty()) {
@@ -90,7 +91,7 @@ function Tokenizer(input_string) { // jshint unused:false
this.reset = function() {};
this.get_next_token = function() {
this.get_next_token = function(last_token) { // jshint unused:false
this.readWhitespace();
var resulting_string = this._input.read(/.+/g);
if (resulting_string) {
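// A minimal standalone sketch (not part of this diff) of the contract the
// base class sets up: it seeds the loop with a TK_START token and calls
// get_next_token(last_token) until TK_EOF, so subclasses only override the
// reader. SketchTokenizer below is hypothetical.
function SketchTokenizer(input) {
  var pos = 0;
  this.get_next_token = function(last_token) { // jshint unused:false
    if (pos >= input.length) {
      return new Token(TOKEN.EOF, '');
    }
    return new Token(TOKEN.RAW, input.charAt(pos++));
  };
}
// Usage: var last = new Token(TOKEN.START, '');
//        while (last.type !== TOKEN.EOF) { last = t.get_next_token(last); }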

View File

@@ -54,14 +54,6 @@ function TokenStream(parent_token) {
return val;
};
this.last = function() {
var val = null;
if (!this.isEmpty()) {
val = this._tokens[this._tokens_length - 1];
}
return val;
};
this.peek = function(index) {
var val = null;
index = index || 0;

View File

@@ -27,7 +27,6 @@
*/
var InputScanner = require('../core/inputscanner').InputScanner;
var Token = require('../core/token').Token;
var BaseTokenizer = require('../core/tokenizer').Tokenizer;
var BASETOKEN = require('../core/tokenizer').TOKEN;
var acorn = require('../core/acorn');
@@ -54,6 +53,8 @@ var TOKEN = {
COMMENT: 'TK_COMMENT',
DOT: 'TK_DOT',
UNKNOWN: 'TK_UNKNOWN',
START: BASETOKEN.START,
RAW: BASETOKEN.RAW,
EOF: BASETOKEN.EOF
};
@@ -66,10 +67,20 @@ function Tokenizer(input_string, opts) {
var digit = /[0-9]/;
this.positionable_operators = '!= !== % & && * ** + - / : < << <= == === > >= >> >>> ? ^ | ||'.split(' ');
var punct = this.positionable_operators.concat(
// non-positionable operators - these do not follow operator position settings
'! %= &= *= **= ++ += , -- -= /= :: <<= = => >>= >>>= ^= |= ~ ...'.split(' '));
this.positionable_operators = (
">>> === !== " +
"<< && >= ** != == <= >> || " +
"< / - + > : & % ? ^ | *").split(' ');
// IMPORTANT: this must be sorted longest to shortest or tokenizing may not work.
// Also, you must update positionable operators separately from punct
var punct =
">>>= " +
"... >>= <<= === >>> !== **= " +
"=> ^= :: /= << <= == && -= >= >> != -- += ** || ++ %= &= *= |= " +
"= ! , ? > < : / ^ - + * & % ~ | .";
var punct_pattern = new RegExp(punct.replace(/[-[\]{}()*+?.,\\^$|#]/g, "\\$&").replace(/ /g, '|'), 'g');
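// A minimal sketch (not part of this diff) of why the longest-to-shortest
// ordering above matters: JavaScript regex alternation takes the first
// branch that matches, not the longest one.
var short_first = /=|==|===/g; // wrong: '=' shadows the longer operators
var long_first = /===|==|=/g;  // right: longest alternative tried first
// 'a === b'.match(short_first) -> [ '=', '=', '=' ]  (three broken tokens)
// 'a === b'.match(long_first)  -> [ '===' ]          (one operator token)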
// words which should always start on new line.
this.line_starters = 'continue,try,throw,return,var,let,const,if,switch,case,default,for,while,break,function,import,export'.split(',');
@@ -105,19 +116,26 @@ function Tokenizer(input_string, opts) {
in_html_comment = false;
};
this.get_next_token = function() {
var resulting_string;
var last_token;
if (this._tokens.isEmpty()) {
// For the sake of tokenizing we can pretend that there was an open brace to start
last_token = new Token(TOKEN.START_BLOCK, '{');
} else {
last_token = this._tokens.last();
}
this.get_next_token = function(last_token) {
this.readWhitespace();
var token = null;
var c = this._input.peek();
token = token || this._read_singles(c);
token = token || this._read_word(last_token);
token = token || this._read_comment(c);
token = token || this._read_string(c);
token = token || this._read_regexp(c, last_token);
token = token || this._read_xml(c, last_token);
token = token || this._read_non_javascript(c);
token = token || this._read_punctuation();
token = token || this.create_token(TOKEN.UNKNOWN, this._input.next());
return token;
};
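// The chain above is a short-circuit dispatch: each _read_* helper returns
// a token on success and null/undefined otherwise, so the first reader that
// recognizes the input wins and the rest never run. A standalone sketch
// with hypothetical readers (not part of this diff):
function sketch_next_token(c) {
  var read_digit = function(ch) { return /[0-9]/.test(ch) ? { type: 'NUM', text: ch } : null; };
  var read_alpha = function(ch) { return /[a-z]/i.test(ch) ? { type: 'WORD', text: ch } : null; };
  var token = null;
  token = token || read_digit(c);
  token = token || read_alpha(c);
  token = token || { type: 'UNKNOWN', text: c }; // final fallback, like TOKEN.UNKNOWN above
  return token;
}
// sketch_next_token('7').type -> 'NUM'
// sketch_next_token('@').type -> 'UNKNOWN'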
this._read_word = function(last_token) {
var resulting_string;
resulting_string = this._input.read(acorn.identifier);
if (resulting_string !== '') {
if (!(last_token.type === TOKEN.DOT ||
@@ -136,40 +154,111 @@ function Tokenizer(input_string, opts) {
if (resulting_string !== '') {
return this.create_token(TOKEN.WORD, resulting_string);
}
};
var c = this._input.next();
this._read_singles = function(c) {
var token = null;
if (c === null) {
return this.create_token(TOKEN.EOF, '');
token = this.create_token(TOKEN.EOF, '');
} else if (c === '(' || c === '[') {
token = this.create_token(TOKEN.START_EXPR, c);
} else if (c === ')' || c === ']') {
token = this.create_token(TOKEN.END_EXPR, c);
} else if (c === '{') {
token = this.create_token(TOKEN.START_BLOCK, c);
} else if (c === '}') {
token = this.create_token(TOKEN.END_BLOCK, c);
} else if (c === ';') {
token = this.create_token(TOKEN.SEMICOLON, c);
}
if (token) {
this._input.next();
}
return token;
};
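// Note the peek/consume discipline above: _read_singles inspects the input
// with peek() and only consumes via this._input.next() once a token was
// created, so a failed probe leaves the scanner position untouched for the
// next reader in the chain. A reduced sketch of that contract, with a
// hypothetical scanner standing in for InputScanner:
function SketchScanner(text) {
  var pos = 0;
  this.peek = function() { return pos < text.length ? text.charAt(pos) : null; };
  this.next = function() { return pos < text.length ? text.charAt(pos++) : null; };
}
// var s = new SketchScanner(';a');
// if (s.peek() === ';') { s.next(); } // consume only after a match
// s.peek() -> 'a'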
this._read_punctuation = function() {
var resulting_string = this._input.read(punct_pattern);
if (resulting_string !== '') {
if (resulting_string === ',') {
return this.create_token(TOKEN.COMMA, resulting_string);
} else if (resulting_string === '=') {
return this.create_token(TOKEN.EQUALS, resulting_string);
} else if (resulting_string === '.') {
return this.create_token(TOKEN.DOT, resulting_string);
} else {
return this.create_token(TOKEN.OPERATOR, resulting_string);
}
}
};
this._read_non_javascript = function(c) {
var resulting_string = '';
if (c === '#') {
c = this._input.next();
if (this._tokens.isEmpty() && this._input.peek() === '!') {
// shebang
resulting_string = c;
while (this._input.hasNext() && c !== '\n') {
c = this._input.next();
resulting_string += c;
}
return this.create_token(TOKEN.UNKNOWN, resulting_string.trim() + '\n');
}
if (c === '(' || c === '[') {
return this.create_token(TOKEN.START_EXPR, c);
// Spidermonkey-specific sharp variables for circular references. Considered obsolete.
var sharp = '#';
if (this._input.hasNext() && this._input.testChar(digit)) {
do {
c = this._input.next();
sharp += c;
} while (this._input.hasNext() && c !== '#' && c !== '=');
if (c === '#') {
//
} else if (this._input.peek() === '[' && this._input.peek(1) === ']') {
sharp += '[]';
this._input.next();
this._input.next();
} else if (this._input.peek() === '{' && this._input.peek(1) === '}') {
sharp += '{}';
this._input.next();
this._input.next();
}
return this.create_token(TOKEN.WORD, sharp);
}
if (c === ')' || c === ']') {
return this.create_token(TOKEN.END_EXPR, c);
}
if (c === '{') {
return this.create_token(TOKEN.START_BLOCK, c);
}
if (c === '}') {
return this.create_token(TOKEN.END_BLOCK, c);
}
if (c === ';') {
return this.create_token(TOKEN.SEMICOLON, c);
}
if (c === '/') {
var token;
var comment = '';
// peek for comment /* ... */
if (this._input.peek() === '*') {
this._input.back();
} else if (c === '<' && (this._input.peek(1) === '?' || this._input.peek(1) === '%')) {
resulting_string = this._input.read(template_pattern);
if (resulting_string) {
resulting_string = resulting_string.replace(acorn.allLineBreaks, '\n');
return this.create_token(TOKEN.STRING, resulting_string);
}
} else if (c === '<' && this._input.match(/<\!--/g)) {
c = '<!--';
while (this._input.hasNext() && !this._input.testChar(acorn.newline)) {
c += this._input.next();
}
in_html_comment = true;
return this.create_token(TOKEN.COMMENT, c);
} else if (c === '-' && in_html_comment && this._input.match(/-->/g)) {
in_html_comment = false;
return this.create_token(TOKEN.COMMENT, '-->');
}
return null;
};
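// The shebang branch above swallows the entire first line into a single
// TK_UNKNOWN token. A simplified restatement (not the exact loop):
function sketch_read_shebang(input) { // input is the source, starting at '#'
  if (input.charAt(1) !== '!') { return null; }
  var end = input.indexOf('\n');
  var text = end === -1 ? input : input.slice(0, end);
  return { type: 'TK_UNKNOWN', text: text.trim() + '\n' };
}
// sketch_read_shebang('#!/usr/bin/env node\nvar x;')
//   -> { type: 'TK_UNKNOWN', text: '#!/usr/bin/env node\n' }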
this._read_comment = function(c) {
var token = null;
if (c === '/') {
var comment = '';
if (this._input.peek(1) === '*') {
// peek for comment /* ... */
comment = this._input.read(block_comment_pattern);
var directives = directives_core.get_directives(comment);
if (directives && directives.ignore === 'start') {
@@ -178,55 +267,60 @@ function Tokenizer(input_string, opts) {
comment = comment.replace(acorn.allLineBreaks, '\n');
token = this.create_token(TOKEN.BLOCK_COMMENT, comment);
token.directives = directives;
return token;
}
} else if (this._input.peek(1) === '/') {
// peek for comment // ...
if (this._input.peek() === '/') {
this._input.back();
comment = this._input.read(comment_pattern);
return this.create_token(TOKEN.COMMENT, comment);
token = this.create_token(TOKEN.COMMENT, comment);
}
}
return token;
};
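// As the ignore-directive check above shows, the block-comment branch also
// attaches parsed directives to the token: a comment such as
//   /* beautify ignore:start */
// yields a BLOCK_COMMENT token whose .directives include ignore: 'start',
// which downstream phases use to pass source through unformatted.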
this._read_string = function(c) {
if (c === '`' || c === "'" || c === '"') {
var resulting_string = this._input.next();
this.has_char_escapes = false;
if (c === '`') {
resulting_string += this._read_string_recursive('`', true, '${');
} else {
resulting_string += this._read_string_recursive(c);
}
if (this.has_char_escapes && opts.unescape_strings) {
resulting_string = unescape_string(resulting_string);
}
if (this._input.peek() === c) {
resulting_string += this._input.next();
}
var startXmlRegExp = /<()([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
return this.create_token(TOKEN.STRING, resulting_string);
}
var xmlRegExp = /[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
return null;
};
function allowRegExOrXML() {
this._allow_regexp_or_xml = function(last_token) {
// regex and xml can only appear in specific locations during parsing
return (last_token.type === TOKEN.RESERVED && in_array(last_token.text, ['return', 'case', 'throw', 'else', 'do', 'typeof', 'yield'])) ||
(last_token.type === TOKEN.END_EXPR && last_token.text === ')' &&
last_token.parent && last_token.parent.type === TOKEN.RESERVED && in_array(last_token.parent.text, ['if', 'while', 'for'])) ||
(in_array(last_token.type, [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK,
(in_array(last_token.type, [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.START,
TOKEN.END_BLOCK, TOKEN.OPERATOR, TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA
]));
}
};
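// Passing last_token here matters because '/' is ambiguous in JavaScript:
// in some positions it begins a regex literal, in others it is the division
// operator, and only the previous token disambiguates:
//   return /ab+/g;  // '/' after the reserved word 'return' begins a regex
//   total / count;  // '/' after a word is division
// The TOKEN.START entry added above additionally allows a regex as the very
// first token of the input. A simplified sketch of the same idea (the real
// check whitelists positions; this hypothetical version blacklists
// expression-enders):
function sketch_slash_starts_regexp(last_type) {
  return ['TK_WORD', 'TK_STRING', 'TK_END_EXPR'].indexOf(last_type) === -1;
}
// sketch_slash_starts_regexp('TK_START') -> true   (regex at start of file)
// sketch_slash_starts_regexp('TK_WORD')  -> false  (a / b is division)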
var isString = (c === '`' || c === "'" || c === '"');
var isRegExp = (c === '/') && allowRegExOrXML();
var isXML = (opts.e4x && c === "<" && this._input.test(startXmlRegExp, -1)) && allowRegExOrXML();
var sep = c,
esc = false;
this._read_regexp = function(c, last_token) {
this.has_char_escapes = false;
resulting_string = c;
if (isString) {
if (sep === '`') {
resulting_string += this._read_string('`', true, '${');
} else {
resulting_string += this._read_string(sep);
}
} else if (isRegExp) {
if (c === '/' && this._allow_regexp_or_xml(last_token)) {
// handle regexp
//
var resulting_string = this._input.next();
var esc = false;
var in_char_class = false;
while (this._input.hasNext() &&
((esc || in_char_class || this._input.peek() !== sep) &&
((esc || in_char_class || this._input.peek() !== c) &&
!this._input.testChar(acorn.newline))) {
resulting_string += this._input.peek();
if (!esc) {
@@ -242,10 +336,27 @@ function Tokenizer(input_string, opts) {
this._input.next();
}
} else if (isXML) {
if (this._input.peek() === c) {
resulting_string += this._input.next();
// regexps may have modifiers /regexp/MOD , so fetch those, too
// Only [gim] are valid, but if the user puts in garbage, do what we can to take it.
resulting_string += this._input.read(acorn.identifier);
}
return this.create_token(TOKEN.STRING, resulting_string);
}
return null;
};
var startXmlRegExp = /<()([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
var xmlRegExp = /[\s\S]*?<(\/?)([-a-zA-Z:0-9_.]+|{[\s\S]+?}|!\[CDATA\[[\s\S]*?\]\])(\s+{[\s\S]+?}|\s+[-a-zA-Z:0-9_.]+|\s+[-a-zA-Z:0-9_.]+\s*=\s*('[^']*'|"[^"]*"|{[\s\S]+?}))*\s*(\/?)\s*>/g;
this._read_xml = function(c, last_token) {
if (opts.e4x && c === "<" && this._input.test(startXmlRegExp) && this._allow_regexp_or_xml(last_token)) {
// handle e4x xml literals
//
this._input.back();
var xmlStr = '';
var match = this._input.match(startXmlRegExp);
if (match) {
@@ -280,110 +391,9 @@ function Tokenizer(input_string, opts) {
}
}
if (isRegExp || isString) {
if (this.has_char_escapes && opts.unescape_strings) {
resulting_string = unescape_string(resulting_string);
}
if (this._input.peek() === sep) {
resulting_string += sep;
this._input.next();
if (sep === '/') {
// regexps may have modifiers /regexp/MOD , so fetch those, too
// Only [gim] are valid, but if the user puts in garbage, do what we can to take it.
resulting_string += this._input.read(acorn.identifier);
}
}
return this.create_token(TOKEN.STRING, resulting_string);
}
if (c === '#') {
if (this._tokens.isEmpty() && this._input.peek() === '!') {
// shebang
resulting_string = c;
while (this._input.hasNext() && c !== '\n') {
c = this._input.next();
resulting_string += c;
}
return this.create_token(TOKEN.UNKNOWN, resulting_string.trim() + '\n');
}
// Spidermonkey-specific sharp variables for circular references. Considered obsolete.
var sharp = '#';
if (this._input.hasNext() && this._input.testChar(digit)) {
do {
c = this._input.next();
sharp += c;
} while (this._input.hasNext() && c !== '#' && c !== '=');
if (c === '#') {
//
} else if (this._input.peek() === '[' && this._input.peek(1) === ']') {
sharp += '[]';
this._input.next();
this._input.next();
} else if (this._input.peek() === '{' && this._input.peek(1) === '}') {
sharp += '{}';
this._input.next();
this._input.next();
}
return this.create_token(TOKEN.WORD, sharp);
}
}
if (c === '<' && (this._input.peek() === '?' || this._input.peek() === '%')) {
this._input.back();
var template_match = this._input.match(template_pattern);
if (template_match) {
c = template_match[0];
c = c.replace(acorn.allLineBreaks, '\n');
return this.create_token(TOKEN.STRING, c);
}
}
if (c === '<' && this._input.match(/\!--/g)) {
c = '<!--';
while (this._input.hasNext() && !this._input.testChar(acorn.newline)) {
c += this._input.next();
}
in_html_comment = true;
return this.create_token(TOKEN.COMMENT, c);
}
if (c === '-' && in_html_comment && this._input.match(/->/g)) {
in_html_comment = false;
return this.create_token(TOKEN.COMMENT, '-->');
}
if (c === '.') {
if (this._input.peek() === '.' && this._input.peek(1) === '.') {
c += this._input.next() + this._input.next();
return this.create_token(TOKEN.OPERATOR, c);
}
return this.create_token(TOKEN.DOT, c);
}
if (in_array(c, punct)) {
while (this._input.hasNext() && in_array(c + this._input.peek(), punct)) {
c += this._input.next();
if (!this._input.hasNext()) {
break;
}
}
if (c === ',') {
return this.create_token(TOKEN.COMMA, c);
} else if (c === '=') {
return this.create_token(TOKEN.EQUALS, c);
} else {
return this.create_token(TOKEN.OPERATOR, c);
}
}
return this.create_token(TOKEN.UNKNOWN, c);
return null;
};
function unescape_string(s) {
// You think that a regex would work for this
// return s.replace(/\\x([0-9a-f]{2})/gi, function(match, val) {
@@ -449,7 +459,7 @@ function Tokenizer(input_string, opts) {
// handle string
//
this._read_string = function(delimiter, allow_unescaped_newlines, start_sub) {
this._read_string_recursive = function(delimiter, allow_unescaped_newlines, start_sub) {
// Template strings can traverse lines without escape characters.
// Other strings cannot
var current_char;
@@ -486,9 +496,9 @@ function Tokenizer(input_string, opts) {
if (start_sub && resulting_string.indexOf(start_sub, resulting_string.length - start_sub.length) !== -1) {
if (delimiter === '`') {
resulting_string += this._read_string('}', allow_unescaped_newlines, '`');
resulting_string += this._read_string_recursive('}', allow_unescaped_newlines, '`');
} else {
resulting_string += this._read_string('`', allow_unescaped_newlines, '${');
resulting_string += this._read_string_recursive('`', allow_unescaped_newlines, '${');
}
if (this._input.hasNext()) {
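// _read_string_recursive handles template nesting by flipping delimiters:
// inside a `...` string, '${' recurses with delimiter '}', and inside that
// substitution a backtick recurses back, so `a${ `b${c}` }d` is consumed as
// a single string token. A compact standalone sketch (no escapes or newline
// handling; pos is a one-element array used as a mutable cursor):
function sketch_read_template(text, pos, delimiter) {
  var out = '';
  while (pos[0] < text.length && text.charAt(pos[0]) !== delimiter) {
    var ch = text.charAt(pos[0]++);
    out += ch;
    if (delimiter === '`' && ch === '$' && text.charAt(pos[0]) === '{') {
      out += text.charAt(pos[0]++);                  // consume '{'
      out += sketch_read_template(text, pos, '}');   // read the ${ } body
      if (text.charAt(pos[0]) === '}') { out += text.charAt(pos[0]++); }
    } else if (delimiter === '}' && ch === '`') {
      out += sketch_read_template(text, pos, '`');   // nested template literal
      if (text.charAt(pos[0]) === '`') { out += text.charAt(pos[0]++); }
    }
  }
  return out;
}
// var src = '`a${ `b${c}` }d`';
// '`' + sketch_read_template(src, [1], '`') + '`' === src  // -> true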

View File

@@ -29,6 +29,7 @@ from ..core.tokenstream import TokenStream
class TokenTypes:
START = 'TK_START'
RAW = 'TK_RAW'
EOF = 'TK_EOF'
@@ -58,17 +59,17 @@ class Tokenizer:
self._tokens = TokenStream()
current = None
last = Token(TOKEN.RAW,'')
last = Token(TOKEN.START,'')
open_token = None
open_stack = []
comments = TokenStream()
while last.type != TOKEN.EOF:
current = self.get_next_token()
current = self.get_next_token(last)
while self.is_comment(current):
comments.add(current)
current = self.get_next_token()
current = self.get_next_token(last)
if not comments.isEmpty():
current.comments_before = comments
@@ -90,7 +91,7 @@ class Tokenizer:
def reset(self):
pass
def get_next_token(self):
def get_next_token(self, last_token):
self.readWhitespace()
resulting_string = self._input.read(re.compile(r'.+'))
if resulting_string:

View File

@@ -51,13 +51,6 @@ class TokenStream:
else:
raise StopIteration
def last(self):
val = None
if not self.isEmpty():
val = self._tokens[-1]
return val
def peek(self, index=0):
val = None
index += self._position

View File

@@ -24,7 +24,6 @@
import re
from ..core.inputscanner import InputScanner
from ..core.token import Token
from ..core.tokenizer import TokenTypes as BaseTokenTypes
from ..core.tokenizer import Tokenizer as BaseTokenizer
from ..core.directives import Directives
@@ -131,14 +130,7 @@ class Tokenizer(BaseTokenizer):
(current_token.text == ')' and open_token.text == '(') or
(current_token.text == '}' and open_token.text == '{')))
def get_next_token(self):
if self._tokens.isEmpty():
# For the sake of tokenizing we can pretend that there was an open
# brace to start
last_token = Token(TOKEN.START_BLOCK, '{')
else:
last_token = self._tokens.last()
def get_next_token(self, last_token):
self.readWhitespace()
resulting_string = self._input.read(self.acorn.identifier)
@@ -203,7 +195,7 @@ class Tokenizer(BaseTokenizer):
return (last_token.type == TOKEN.RESERVED and last_token.text in ['return', 'case', 'throw', 'else', 'do', 'typeof', 'yield']) or \
(last_token.type == TOKEN.END_EXPR and last_token.text == ')' and
last_token.parent and last_token.parent.type == TOKEN.RESERVED and last_token.parent.text in ['if', 'while', 'for']) or \
(last_token.type in [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.END_BLOCK, TOKEN.OPERATOR,
(last_token.type in [TOKEN.COMMENT, TOKEN.START_EXPR, TOKEN.START_BLOCK, TOKEN.START, TOKEN.END_BLOCK, TOKEN.OPERATOR,
TOKEN.EQUALS, TOKEN.EOF, TOKEN.SEMICOLON, TOKEN.COMMA])
self.has_char_escapes = False