You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
163 lines
5.2 KiB
163 lines
5.2 KiB
4 weeks ago
|
/**
 * The Lexer class handles tokenizing the input in various ways. Since our
 * parser expects us to be able to backtrack, the lexer allows lexing from any
 * given starting point.
 *
 * Its main exposed function is the `lex` function, which takes a position to
 * lex from and a type of token to lex. It defers to the appropriate `_innerLex`
 * function.
 *
 * The various `_innerLex` functions perform the actual lexing of different
 * kinds.
 */
|
||
|
|
||
|
var matchAt = require("match-at");
|
||
|
|
||
|
var ParseError = require("./ParseError");
|
||
|
|
||
|
/**
 * The main lexer class.
 *
 * The constructor only stores the input; tokens are produced on demand by the
 * prototype's lexing methods, each of which is given an explicit start
 * position.
 *
 * @param {string} input The complete text to be tokenized.
 */
function Lexer(input) {
    this._input = input;
}
|
||
|
|
||
|
/**
 * The resulting token returned from `lex`.
 *
 * @param {string} text The text of the token.
 * @param {?Object} data Extra data attached to the token, or null when there
 *     is none.
 * @param {number} position The position stored on the token; the lexing
 *     functions in this file pass the index just past the lexed text.
 */
function Token(text, data, position) {
    this.text = text;
    this.data = data;
    this.position = position;
}
|
||
|
|
||
|
/* The following tokenRegex
 * - matches typical whitespace (but not NBSP etc.) using its first group
 * - matches symbol combinations which result in a single output character
 * - does not match any control character \x00-\x1f except whitespace
 * - does not match a bare backslash
 * - matches any ASCII character except those just mentioned
 * - does not match the line/paragraph separators \u2028 and \u2029
 * - does not match the BMP private use area \uE000-\uF8FF
 * - does not match bare surrogate code units
 * - matches any BMP character except for those just described
 * - matches any valid Unicode surrogate pair
 * - matches a backslash followed by one or more letters
 * - matches a backslash followed by any BMP character, including newline
 * Just because the Lexer matches something doesn't mean it's valid input:
 * If there is no matching function or symbol definition, the Parser will
 * still reject the input.
 *
 * Group 1 captures a run of whitespace; group 2 captures any other token.
 * The pattern itself is not anchored.
 */
var tokenRegex = new RegExp(
    "([ \r\n\t]+)|(" +                            // whitespace
    "---?" +                                      // special combinations
    "|[!-\\[\\]-\u2027\u202A-\uD7FF\uF900-\uFFFF]" + // single codepoint
    "|[\uD800-\uDBFF][\uDC00-\uDFFF]" +           // surrogate pair
    "|\\\\(?:[a-zA-Z]+|[^\uD800-\uDFFF])" +       // function name
    ")"
);
|
||
|
|
||
|
// Matches a possibly empty run of whitespace, so it always succeeds.
// Note that `\s` is broader than the whitespace group of tokenRegex: it also
// covers characters such as NBSP.
var whitespaceRegex = /\s*/;
|
||
|
|
||
|
/**
 * This function lexes a single normal token. It takes a position and
 * whether it should completely ignore whitespace or not.
 *
 * @param {number} pos Index into the input at which to start lexing.
 * @param {boolean} ignoreWhitespace When truthy, a run of whitespace at
 *     `pos` is skipped and the token after it is returned instead; otherwise
 *     the run is returned as a single " " token.
 * @returns {Token} The lexed token; its position is the index just past the
 *     consumed text. At the exact end of the input an "EOF" token is returned.
 * @throws {ParseError} If tokenRegex does not match at `pos`.
 */
Lexer.prototype._innerLex = function(pos, ignoreWhitespace) {
    var input = this._input;
    if (pos === input.length) {
        return new Token("EOF", null, pos);
    }

    var match = matchAt(tokenRegex, input, pos);
    if (match === null) {
        throw new ParseError(
            "Unexpected character: '" + input[pos] + "'",
            this, pos);
    } else if (match[2]) { // matched non-whitespace
        return new Token(match[2], null, pos + match[2].length);
    } else if (ignoreWhitespace) {
        // Group 1 consumed the whole whitespace run, so re-lex right after it.
        return this._innerLex(pos + match[1].length, true);
    } else { // concatenate whitespace to a single space
        return new Token(" ", null, pos + match[1].length);
    }
};
|
||
|
|
||
|
// A regex to match a CSS color (like #ffffff or BlueViolet): either a `#`
// followed by letters/digits, or a run of letters, case-insensitively.
// It only checks this shape, not whether the color actually exists.
var cssColor = /#[a-z0-9]+|[a-z]+/i;
|
||
|
|
||
|
/**
 * This function lexes a CSS color, skipping any whitespace in front of it.
 *
 * @param {number} pos Index into the input at which to start lexing.
 * @returns {Token} Token whose text is the matched color and whose position
 *     is the index just past it.
 * @throws {ParseError} "Invalid color" if cssColor does not match right
 *     after the skipped whitespace.
 */
Lexer.prototype._innerLexColor = function(pos) {
    var input = this._input;

    // Ignore whitespace
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;

    var match = matchAt(cssColor, input, pos);
    if (!match) {
        throw new ParseError("Invalid color", this, pos);
    }

    // If we look like a color, return a color
    return new Token(match[0], null, pos + match[0].length);
};
|
||
|
|
||
|
// A regex to match a dimension. Dimensions look like
// "1.2em" or ".4pt" or "1 ex"
// Group 1 is the optional minus sign, group 2 the unsigned number and
// group 3 the two-letter lowercase unit; whitespace is allowed between them.
var sizeRegex = /(-?)\s*(\d+(?:\.\d*)?|\.\d+)\s*([a-z]{2})/;
|
||
|
|
||
|
/**
 * This function lexes a dimension, skipping any whitespace in front of it.
 *
 * @param {number} pos Index into the input at which to start lexing.
 * @returns {Token} Token whose text is the matched dimension, whose data is
 *     `{number, unit}` (the signed numeric value and the unit string), and
 *     whose position is the index just past the dimension.
 * @throws {ParseError} "Invalid size" if sizeRegex does not match right
 *     after the skipped whitespace, or "Invalid unit: '..'" if the unit is
 *     anything other than "em" or "ex".
 */
Lexer.prototype._innerLexSize = function(pos) {
    var input = this._input;

    // Ignore whitespace
    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;

    var match = matchAt(sizeRegex, input, pos);
    if (!match) {
        throw new ParseError("Invalid size", this, pos);
    }

    var unit = match[3];
    // We only currently handle "em" and "ex" units
    if (unit !== "em" && unit !== "ex") {
        throw new ParseError("Invalid unit: '" + unit + "'", this, pos);
    }

    return new Token(match[0], {
        // Sign and magnitude are captured separately; join, then convert.
        number: +(match[1] + match[2]),
        unit: unit,
    }, pos + match[0].length);
};
|
||
|
|
||
|
/**
 * This function lexes a string of whitespace.
 *
 * whitespaceRegex also matches the empty string, so this never fails: when
 * there is no whitespace at `pos`, the token has empty text and its position
 * equals `pos`.
 *
 * @param {number} pos Index into the input at which to start lexing.
 * @returns {Token} Token whose text is the whole whitespace run found at
 *     `pos` and whose position is the index just past that run.
 */
Lexer.prototype._innerLexWhitespace = function(pos) {
    var input = this._input;

    var whitespace = matchAt(whitespaceRegex, input, pos)[0];
    pos += whitespace.length;

    // `whitespace` is already the matched string; indexing it with [0] would
    // yield only its first character (or undefined for an empty match).
    return new Token(whitespace, null, pos);
};
|
||
|
|
||
|
/**
 * This function lexes a single token starting at `pos` and of the given mode.
 * Based on the mode, we defer to one of the `_innerLex` functions.
 *
 * @param {number} pos Index into the input at which to start lexing.
 * @param {string} mode One of "math" (normal token, whitespace ignored),
 *     "text" (normal token, whitespace kept), "color", "size" or
 *     "whitespace".
 * @returns {Token|undefined} The token produced by the selected function.
 *     Any other mode falls through and yields undefined.
 */
Lexer.prototype.lex = function(pos, mode) {
    switch (mode) {
        case "math":
            return this._innerLex(pos, true);
        case "text":
            return this._innerLex(pos, false);
        case "color":
            return this._innerLexColor(pos);
        case "size":
            return this._innerLexSize(pos);
        case "whitespace":
            return this._innerLexWhitespace(pos);
    }
};
|
||
|
|
||
|
// The Lexer constructor is this module's export.
module.exports = Lexer;
|