Fix scanning edge cases and optimize slightly

* Bug fixes
  * tokenValue() was wrong if an identifier ever came right after a
    string literal.
  * There was an incorrect and confusing end-of-file error if file
    ends with numeric literal. Generally hardened and cleaned up EOF
    checking throughout.
  * Digits weren't allowed in identifiers containing non-ASCII characters.

* Perf
  * Add more ASCII fast paths
  * Inline scanUntil everywhere, and simplify each use to what it
    actually needed
  * Avoid eager substring allocation and map lookup for keywords in
    more cases
This commit is contained in:
Nick Guerrera 2021-04-24 18:17:28 -07:00
Родитель 9063e1b63f
Коммит 5fb28d6378
7 изменённых файлов: 425 добавлений и 294 удалений

Просмотреть файл

@ -12,13 +12,13 @@ export const enum CharCode {
LineSeparator = 0x2028,
ParagraphSeparator = 0x2029,
// ASCII whitespace
// ASCII whitespace excluding line breaks
Space = 0x20,
FormFeed = 0x0c,
Tab = 0x09,
VerticalTab = 0x0b,
// Non-ASCII whitespace
// Non-ASCII whitespace excluding line breaks
ByteOrderMark = 0xfeff, // currently allowed anywhere
NextLine = 0x0085, // not considered a line break, mirroring ECMA-262
NonBreakingSpace = 0x00a0,
@ -144,15 +144,24 @@ export const enum CharCode {
Tilde = 0x7e,
}
/** Does not include line breaks. For that, see isWhiteSpaceLike. */
export function isWhiteSpaceSingleLine(ch: number): boolean {
// Note: nextLine is in the Zs space, and should be considered to be a whitespace.
// It is explicitly not a line-break as it isn't in the exact set specified by EcmaScript.
/** True for the ASCII line terminators: LF (\n) and CR (\r). */
export function isAsciiLineBreak(ch: number) {
  // Use strict equality consistently; the original mixed `===` and `==`.
  return ch === CharCode.LineFeed || ch === CharCode.CarriageReturn;
}
/** True for ASCII whitespace that is not a line break (space, tab, VT, FF). */
export function isAsciiWhiteSpaceSingleLine(ch: number): boolean {
  // The FormFeed comparison was listed twice; each character is checked once.
  return (
    ch === CharCode.Space ||
    ch === CharCode.Tab ||
    ch === CharCode.VerticalTab ||
    ch === CharCode.FormFeed
  );
}
export function isNonAsciiWhiteSpaceSingleLine(ch: number) {
// Note: nextLine is in the Zs space, and should be considered to be a
// whitespace. It is explicitly not a line-break as it isn't in the exact set
// inherited by ADL from JavaScript.
return (
ch === CharCode.NonBreakingSpace ||
ch === CharCode.NextLine ||
ch === CharCode.Ogham ||
@ -164,17 +173,23 @@ export function isWhiteSpaceSingleLine(ch: number): boolean {
);
}
export function isLineBreak(ch: number): boolean {
// Other new line or line
// breaking characters are treated as white space but not as line terminators.
/** True for the non-ASCII line terminators: U+2028 (LS) and U+2029 (PS). */
export function isNonAsciiLineBreak(ch: number) {
  // Other Unicode "new line"-like characters are treated as white space but
  // not as line terminators.
  switch (ch) {
    case CharCode.LineSeparator:
    case CharCode.ParagraphSeparator:
      return true;
    default:
      return false;
  }
}
export function isWhiteSpaceSingleLine(ch: number) {
return (
ch === CharCode.LineFeed ||
ch === CharCode.CarriageReturn ||
ch === CharCode.LineSeparator ||
ch === CharCode.ParagraphSeparator
isAsciiWhiteSpaceSingleLine(ch) ||
(ch > CharCode.MaxAscii && isNonAsciiWhiteSpaceSingleLine(ch))
);
}
/** True for any line terminator, ASCII or non-ASCII. */
export function isLineBreak(ch: number): boolean {
  // ASCII fast path first; only characters above MaxAscii take the slow path.
  if (isAsciiLineBreak(ch)) {
    return true;
  }
  return ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch);
}
/** True for the ASCII decimal digits '0' through '9'. */
export function isDigit(ch: number): boolean {
  return CharCode._0 <= ch && ch <= CharCode._9;
}
@ -210,7 +225,7 @@ export function isAsciiIdentifierContinue(ch: number): boolean {
/** True if `codePoint` may appear after the first character of an identifier. */
export function isIdentifierContinue(codePoint: number) {
  // NOTE(review): a stale isAsciiIdentifierStart(codePoint) check was left in
  // alongside this one; the continue set is assumed to cover the ASCII start
  // set (letters/underscore), making the start check redundant — confirm
  // against isAsciiIdentifierStart/isAsciiIdentifierContinue definitions.
  return (
    isAsciiIdentifierContinue(codePoint) ||
    (codePoint > CharCode.MaxAscii && isNonAsciiIdentifierContinue(codePoint))
  );
}

Просмотреть файл

@ -1,5 +1,5 @@
import { AssertionError } from "assert";
import { CharCode } from "./charcode.js";
import { CharCode, isNonAsciiLineBreak } from "./charcode.js";
import { Message } from "./messages.js";
import { Diagnostic, Node, SourceFile, SourceLocation, Sym, SyntaxKind, Type } from "./types.js";
@ -113,7 +113,7 @@ export function createSourceFile(text: string, path: string): SourceFile {
};
function getLineStarts() {
return (lineStarts = lineStarts ?? scanLineStarts());
return (lineStarts = lineStarts ?? scanLineStarts(text));
}
function getLineAndCharacterOfPosition(position: number) {
@ -136,57 +136,6 @@ export function createSourceFile(text: string, path: string): SourceFile {
character: position - starts[line],
};
}
function scanLineStarts() {
const starts = [];
let start = 0;
let pos = 0;
while (pos < text.length) {
const ch = text.charCodeAt(pos);
pos++;
switch (ch) {
case CharCode.CarriageReturn:
if (text.charCodeAt(pos) === CharCode.LineFeed) {
pos++;
}
// fallthrough
case CharCode.LineFeed:
case CharCode.LineSeparator:
case CharCode.ParagraphSeparator:
starts.push(start);
start = pos;
break;
}
}
starts.push(start);
return starts;
}
/**
* Search sorted array of numbers for the given value. If found, return index
* in array where value was found. If not found, return a negative number that
* is the bitwise complement of the index where value would need to be inserted
* to keep the array sorted.
*/
function binarySearch(array: readonly number[], value: number) {
let low = 0;
let high = array.length - 1;
while (low <= high) {
const middle = low + ((high - low) >> 1);
const v = array[middle];
if (v < value) {
low = middle + 1;
} else if (v > value) {
high = middle - 1;
} else {
return middle;
}
}
return ~low;
}
}
export function getSourceLocation(target: DiagnosticTarget): SourceLocation {
@ -328,3 +277,58 @@ function format(text: string, args?: (string | number)[]): [string, Error?] {
function isNotUndefined<T>(value: T | undefined): value is T {
return value !== undefined;
}
/**
 * Compute the start offset of every line in `text`. A new line begins after
 * each LF, CR, CRLF pair, or non-ASCII line break (LS/PS); the final line's
 * start is always recorded, even for empty text.
 */
function scanLineStarts(text: string): number[] {
  const starts: number[] = [];
  let lineStart = 0;
  let pos = 0;
  while (pos < text.length) {
    const ch = text.charCodeAt(pos);
    pos++;
    let endsLine = false;
    if (ch === CharCode.CarriageReturn) {
      // Treat CRLF as a single line terminator.
      if (text.charCodeAt(pos) === CharCode.LineFeed) {
        pos++;
      }
      endsLine = true;
    } else if (ch === CharCode.LineFeed) {
      endsLine = true;
    } else if (ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch)) {
      endsLine = true;
    }
    if (endsLine) {
      starts.push(lineStart);
      lineStart = pos;
    }
  }
  starts.push(lineStart);
  return starts;
}
/**
 * Binary search a sorted array of numbers for `value`. Returns the index at
 * which the value was found; when absent, returns the bitwise complement (~)
 * of the index where it would be inserted to keep the array sorted.
 */
function binarySearch(array: readonly number[], value: number) {
  let lo = 0;
  let hi = array.length - 1;
  while (lo <= hi) {
    // Midpoint computed without overflow risk for large indices.
    const mid = lo + ((hi - lo) >> 1);
    const probe = array[mid];
    if (probe === value) {
      return mid;
    }
    if (probe < value) {
      lo = mid + 1;
    } else {
      hi = mid - 1;
    }
  }
  return ~lo;
}

Просмотреть файл

@ -8,50 +8,50 @@ export const Message = {
DigitExpected: {
code: 1100,
severity: "error",
text: "Digit expected (0-9)",
text: "Digit expected.",
} as const,
HexDigitExpected: {
code: 1101,
severity: "error",
text: "Hex Digit expected (0-F)",
text: "Hexadecimal digit expected.",
} as const,
BinaryDigitExpected: {
code: 1102,
severity: "error",
text: "Binary Digit expected (0,1)",
text: "Binary digit expected.",
} as const,
UnexpectedEndOfFile: {
Unterminated: {
code: 1103,
severity: "error",
text: "Unexpected end of file while searching for '{0}'",
text: "Unterminated {0}.",
} as const,
InvalidEscapeSequence: {
code: 1104,
severity: "error",
text: "Invalid escape sequence",
text: "Invalid escape sequence.",
} as const,
NoNewLineAtStartOfTripleQuotedString: {
code: 1105,
severity: "error",
text: "String content in triple quotes must begin on a new line",
text: "String content in triple quotes must begin on a new line.",
} as const,
NoNewLineAtEndOfTripleQuotedString: {
code: 1106,
severity: "error",
text: "Closing triple quotes must begin on a new line",
text: "Closing triple quotes must begin on a new line.",
} as const,
InconsistentTripleQuoteIndentation: {
code: 1107,
severity: "error",
text:
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes",
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes.",
} as const,
InvalidCharacter: {

Просмотреть файл

@ -280,7 +280,7 @@ export function parse(code: string | SourceFile) {
stmts.push(parseUsingStatement());
break;
case Token.EndOfFile:
error("End of file reached without '}'.");
parseExpected(Token.CloseBrace);
return stmts;
case Token.Semicolon:
reportInvalidDecorators(decorators, "empty statement");
@ -990,9 +990,8 @@ export function parse(code: string | SourceFile) {
if (realPositionOfLastError === realPos) {
return;
}
realPositionOfLastError = realPos;
parseErrorInNextFinishedNode = true;
reportDiagnostic(message, location);
}
@ -1001,6 +1000,9 @@ export function parse(code: string | SourceFile) {
target: DiagnosticTarget,
args?: (string | number)[]
) {
if (typeof message === "string" || message.severity === "error") {
parseErrorInNextFinishedNode = true;
}
const diagnostic = createDiagnostic(message, target, args);
parseDiagnostics.push(diagnostic);
}

Просмотреть файл

@ -9,6 +9,8 @@ import {
isLineBreak,
isNonAsciiIdentifierContinue,
isNonAsciiIdentifierStart,
isNonAsciiLineBreak,
isNonAsciiWhiteSpaceSingleLine,
isWhiteSpaceSingleLine,
} from "./charcode.js";
import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
@ -82,6 +84,7 @@ const MaxPunctuation = Token.At;
const MinStatementKeyword = Token.ImportKeyword;
const MaxStatementKeyword = Token.OpKeyword;
/** @internal */
export const TokenDisplay: readonly string[] = [
"<none>",
"<invalid>",
@ -122,6 +125,7 @@ export const TokenDisplay: readonly string[] = [
"'false'",
];
/** @internal */
export const Keywords: ReadonlyMap<string, Token> = new Map([
["import", Token.ImportKeyword],
["model", Token.ModelKeyword],
@ -133,7 +137,13 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
["false", Token.FalseKeyword],
]);
export const maxKeywordLength = 9;
/** @internal */
export const enum KeywordLimit {
MinLength = 2,
MaxLength = 9,
MinStartChar = CharCode.e,
MaxStartChar = CharCode.u,
}
export interface Scanner {
/** The source code being scanned. */
@ -264,32 +274,12 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
}
// fallthrough
case CharCode.LineFeed:
case CharCode.LineSeparator:
case CharCode.ParagraphSeparator:
return next(Token.NewLine);
case CharCode.Space:
case CharCode.Tab:
case CharCode.VerticalTab:
case CharCode.FormFeed:
case CharCode.Space:
case CharCode.NonBreakingSpace:
case CharCode.Ogham:
case CharCode.EnQuad:
case CharCode.EmQuad:
case CharCode.EnSpace:
case CharCode.EmSpace:
case CharCode.ThreePerEmSpace:
case CharCode.FourPerEmSpace:
case CharCode.SixPerEmSpace:
case CharCode.FigureSpace:
case CharCode.PunctuationSpace:
case CharCode.ThinSpace:
case CharCode.HairSpace:
case CharCode.ZeroWidthSpace:
case CharCode.NarrowNoBreakSpace:
case CharCode.MathematicalSpace:
case CharCode.IdeographicSpace:
case CharCode.ByteOrderMark:
return scanWhitespace();
case CharCode.OpenParen:
@ -382,16 +372,45 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
: next(Token.Bar);
case CharCode.DoubleQuote:
return scanString();
return lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote
? scanTripleQuotedString()
: scanString();
default:
return scanIdentifierOrKeyword();
if (isAsciiIdentifierStart(ch)) {
return scanIdentifierOrKeyword();
}
if (ch <= CharCode.MaxAscii) {
return scanInvalidCharacter();
}
return scanNonAsciiToken();
}
}
return (token = Token.EndOfFile);
}
// Slow path for a character above MaxAscii: classify it as a line break,
// whitespace, identifier start, or invalid character.
function scanNonAsciiToken() {
  const ch = input.charCodeAt(position);
  if (isNonAsciiLineBreak(ch)) {
    return next(Token.NewLine);
  }
  if (isNonAsciiWhiteSpaceSingleLine(ch)) {
    return scanWhitespace();
  }
  // Identifier classification works on full code points so that surrogate
  // pairs are handled as single characters.
  const codePoint = input.codePointAt(position)!;
  if (isNonAsciiIdentifierStart(codePoint)) {
    return scanNonAsciiIdentifierContinue(codePoint);
  }
  return scanInvalidCharacter();
}
function scanInvalidCharacter() {
const codePoint = input.codePointAt(position)!;
token = next(Token.Invalid, utf16CodeUnits(codePoint));
@ -423,152 +442,184 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
// The current character is known to be whitespace; consume it and any run of
// single-line whitespace that follows, stopping at EOF.
function scanWhitespace(): Token {
  do {
    position++;
  } while (!eof() && isWhiteSpaceSingleLine(input.charCodeAt(position)));
  return (token = Token.Whitespace);
}
function scanDigits() {
while (isDigit(input.charCodeAt(position))) {
position++;
}
}
function scanNumber() {
scanDigits();
let ch = input.charCodeAt(position);
if (ch === CharCode.Dot) {
position++;
scanDigits();
}
ch = input.charCodeAt(position);
if (ch === CharCode.e) {
position++;
ch = input.charCodeAt(position);
if (ch === CharCode.Plus || ch == CharCode.Minus) {
position++;
ch = input.charCodeAt(position);
}
if (isDigit(ch)) {
position++;
scanDigits();
} else {
error(Message.DigitExpected);
scanKnownDigits();
if (!eof()) {
switch (input.charCodeAt(position)) {
case CharCode.Dot:
scanFractionAndExponent();
break;
case CharCode.e:
scanExponent();
break;
}
}
return (token = Token.NumericLiteral);
}
function scanHexNumber() {
if (!isHexDigit(lookAhead(2))) {
error(Message.HexDigitExpected);
return next(Token.NumericLiteral, 2);
}
// The caller has verified the current character is a digit; consume it and
// every following digit, stopping at EOF.
function scanKnownDigits() {
  do {
    position++;
  } while (!eof() && isDigit(input.charCodeAt(position)));
}
// Consume a possibly-empty run of digits; no error if none are present.
function scanOptionalDigits() {
  if (!eof() && isDigit(input.charCodeAt(position))) {
    scanKnownDigits();
  }
}
// Consume a run of digits, reporting DigitExpected when none is present —
// including when the input ends here.
function scanRequiredDigits() {
  if (eof() || !isDigit(input.charCodeAt(position))) {
    error(Message.DigitExpected);
    return;
  }
  scanKnownDigits();
}
// Scan the part of a numeric literal after the decimal point, plus an
// optional exponent. Digits after '.' are optional (e.g. `3.` scans cleanly).
function scanFractionAndExponent() {
  position++; // consume '.'
  scanOptionalDigits();
  if (!eof() && input.charCodeAt(position) === CharCode.e) {
    scanExponent();
  }
}
// Scan an exponent suffix: 'e', an optional sign, then required digits.
// An EOF right after 'e' is reported as DigitExpected.
function scanExponent() {
  position++; // consume 'e'
  if (eof()) {
    error(Message.DigitExpected);
    return;
  }
  const ch = input.charCodeAt(position);
  if (ch === CharCode.Plus || ch === CharCode.Minus) {
    position++; // consume optional sign
  }
  scanRequiredDigits();
}
// Scan a hexadecimal literal starting at '0x'. A bare `0x` reports
// HexDigitExpected but still produces a NumericLiteral token.
function scanHexNumber() {
  position += 2; // consume '0x'
  if (eof() || !isHexDigit(input.charCodeAt(position))) {
    error(Message.HexDigitExpected);
    return (token = Token.NumericLiteral);
  }
  // Removed stray leftover lines that advanced position by two extra
  // characters and re-scanned via scanUntil after the loop had already
  // consumed all hex digits.
  do {
    position++;
  } while (!eof() && isHexDigit(input.charCodeAt(position)));
  return (token = Token.NumericLiteral);
}
function scanBinaryNumber() {
if (!isBinaryDigit(lookAhead(2))) {
error(Message.BinaryDigitExpected);
return next(Token.NumericLiteral, 2);
}
position += 2; // consume '0b'
if (eof() || !isBinaryDigit(input.charCodeAt(position))) {
error(Message.BinaryDigitExpected);
return (token = Token.NumericLiteral);
}
do {
position++;
} while (!eof() && isBinaryDigit(input.charCodeAt(position)));
position += 2;
scanUntil((ch) => !isBinaryDigit(ch), "Binary Digit");
return (token = Token.NumericLiteral);
}
function scanUntil(
predicate: (char: number) => boolean,
expectedClose?: string,
consumeClose?: number
) {
let ch: number;
function scanSingleLineComment() {
position += 2; // consume '//'
do {
position++;
if (eof()) {
if (expectedClose) {
error(Message.UnexpectedEndOfFile, [expectedClose]);
}
while (!eof()) {
if (isLineBreak(input.charCodeAt(position))) {
break;
}
ch = input.charCodeAt(position);
} while (!predicate(ch));
if (consumeClose) {
position += consumeClose;
position++;
}
}
function scanSingleLineComment() {
scanUntil(isLineBreak);
return (token = Token.SingleLineComment);
}
// Scan a '/* ... */' comment. Reaching EOF before the closing '*/' reports
// an Unterminated error but still yields a MultiLineComment token.
function scanMultiLineComment() {
  // Removed a leftover scanUntil(...) call that duplicated the loop below.
  position += 2; // consume '/*'
  while (!eof()) {
    if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) {
      position += 2; // consume '*/'
      return (token = Token.MultiLineComment);
    }
    position++;
  }
  error(Message.Unterminated, ["comment"]);
  return (token = Token.MultiLineComment);
}
function scanString() {
let quoteLength = 1;
let closing = '"';
let isEscaping = false;
position++; // consume '"'
const tripleQuoted =
lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote;
if (tripleQuoted) {
tokenFlags |= TokenFlags.TripleQuoted;
quoteLength = 3;
position += 2;
closing = '"""';
loop: while (!eof()) {
const ch = input.charCodeAt(position);
switch (ch) {
case CharCode.CarriageReturn:
if (lookAhead(1) === CharCode.LineFeed) {
tokenFlags |= TokenFlags.HasCrlf;
position++;
}
break;
case CharCode.Backslash:
tokenFlags |= TokenFlags.Escaped;
position++;
if (eof()) {
break loop;
}
break;
case CharCode.DoubleQuote:
position++;
return (token = Token.StringLiteral);
}
position++;
}
scanUntil(
(ch) => {
if (isEscaping) {
isEscaping = false;
return false;
}
error(Message.Unterminated, ["string literal"]);
return (token = Token.StringLiteral);
}
switch (ch) {
case CharCode.CarriageReturn:
if (lookAhead(1) === CharCode.LineFeed) {
tokenFlags |= TokenFlags.HasCrlf;
}
return false;
function scanTripleQuotedString() {
tokenFlags |= TokenFlags.TripleQuoted;
position += 3; // consume '"""'
case CharCode.Backslash:
isEscaping = true;
tokenFlags |= TokenFlags.Escaped;
return false;
case CharCode.DoubleQuote:
if (tripleQuoted) {
return lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote;
}
return true;
default:
return false;
}
},
closing,
quoteLength
);
loop: while (!eof()) {
const ch = input.charCodeAt(position);
switch (ch) {
case CharCode.CarriageReturn:
if (lookAhead(1) === CharCode.LineFeed) {
tokenFlags |= TokenFlags.HasCrlf;
position++;
}
break;
case CharCode.Backslash:
tokenFlags |= TokenFlags.Escaped;
position++;
if (eof()) {
break loop;
}
break;
case CharCode.DoubleQuote:
if (lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote) {
position += 3;
return (token = Token.StringLiteral);
}
break;
}
position++;
}
error(Message.Unterminated, ["string literal"]);
return (token = Token.StringLiteral);
}
@ -576,11 +627,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
if (tokenValue !== undefined) {
return tokenValue;
}
return (tokenValue = token === Token.StringLiteral ? getStringTokenValue() : getTokenText());
}
if (token !== Token.StringLiteral) {
return (tokenValue = getTokenText());
}
function getStringTokenValue() {
// strip quotes
const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1;
let value = input.substring(tokenPosition + quoteLength, position - quoteLength);
@ -729,30 +779,28 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
}
function scanIdentifierOrKeyword() {
let ch = input.charCodeAt(position);
if (!isAsciiIdentifierStart(ch)) {
return scanNonAsciiIdentifier();
}
const startChar = input.charCodeAt(position);
let ch = startChar;
do {
position++;
if (eof()) {
break;
}
ch = input.charCodeAt(position);
} while (isAsciiIdentifierContinue(ch));
} while (!eof() && isAsciiIdentifierContinue((ch = input.charCodeAt(position))));
if (!eof() && ch > CharCode.MaxAscii) {
if (ch > CharCode.MaxAscii) {
const codePoint = input.codePointAt(position)!;
if (isNonAsciiIdentifierContinue(codePoint)) {
return scanNonAsciiIdentifierContinue(codePoint);
}
}
if (position - tokenPosition <= maxKeywordLength) {
const value = getTokenValue();
const keyword = Keywords.get(value);
const length = position - tokenPosition;
if (
length >= KeywordLimit.MinLength &&
length <= KeywordLimit.MaxLength &&
startChar >= KeywordLimit.MinStartChar &&
startChar <= KeywordLimit.MaxStartChar
) {
tokenValue = getTokenText();
const keyword = Keywords.get(tokenValue);
if (keyword) {
return (token = keyword);
}
@ -761,23 +809,11 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return (token = Token.Identifier);
}
// The current character is non-ASCII: read the full code point and decide
// whether it can begin an identifier; otherwise emit an invalid character.
function scanNonAsciiIdentifier() {
  // `const` — the code point is never reassigned.
  const codePoint = input.codePointAt(position)!;
  return isNonAsciiIdentifierStart(codePoint)
    ? scanNonAsciiIdentifierContinue(codePoint)
    : scanInvalidCharacter();
}
function scanNonAsciiIdentifierContinue(startCodePoint: number) {
let codePoint = startCodePoint;
do {
position += utf16CodeUnits(codePoint);
if (eof()) {
break;
}
codePoint = input.codePointAt(position)!;
} while (isIdentifierContinue(codePoint));
} while (!eof() && isIdentifierContinue((codePoint = input.codePointAt(position)!)));
return (token = Token.Identifier);
}

Просмотреть файл

@ -219,8 +219,38 @@ describe("syntax", () => {
]);
});
describe("unterminated tokens", () => {
parseErrorEach([
['model X = "banana', [/Unterminated string literal/]],
['model X = "banana\\', [/Unterminated string literal/]],
['model X = """\nbanana', [/Unterminated string literal/]],
['model X = """\nbanana\\', [/Unterminated string literal/]],
["/* Yada yada yada", [/Unterminated comment/]],
["123.0e", [/Digit expected/]],
["123.e", [/Digit expected/]],
["123e", [/Digit expected/]],
["0b", [/Binary digit expected/]],
["0x", [/Hexadecimal digit expected/]],
]);
});
describe("terminated tokens at EOF with missing semicolon", () => {
parseErrorEach([
["model X = 0x10101", [/';' expected/]],
["model X = 0xBEEF", [/';' expected/]],
["model X = 123", [/';' expected/]],
["model X = 123.", [/';' expected/]],
["model X = 123e45", [/';' expected/]],
["model X = 123.45", [/';' expected/]],
["model X = 123.45e2", [/';' expected/]],
["model X = Banana", [/';' expected/]],
['model X = "Banana"', [/';' expected/]],
['model X = """\nBanana\n"""', [/';' expected/]],
]);
});
describe("non-ascii identifiers", () => {
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲Banana {}"]);
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲42Banana {}"]);
parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
});
});

Просмотреть файл

@ -7,14 +7,22 @@ import {
isKeyword,
isPunctuation,
isStatementKeyword,
KeywordLimit,
Keywords,
maxKeywordLength,
Token,
TokenDisplay,
} from "../compiler/scanner.js";
import { LineAndCharacter } from "../compiler/types.js";
type TokenEntry = [Token, string?, number?, LineAndCharacter?];
type TokenEntry = [
Token,
string?,
{
pos?: number;
line?: number;
character?: number;
value?: string;
}?
];
function tokens(text: string, onError = throwOnError): TokenEntry[] {
const scanner = createScanner(text, onError);
@ -25,8 +33,11 @@ function tokens(text: string, onError = throwOnError): TokenEntry[] {
result.push([
scanner.token,
scanner.getTokenText(),
scanner.tokenPosition,
scanner.file.getLineAndCharacterOfPosition(scanner.tokenPosition),
{
pos: scanner.tokenPosition,
value: scanner.getTokenValue(),
...scanner.file.getLineAndCharacterOfPosition(scanner.tokenPosition),
},
]);
} while (!scanner.eof());
@ -38,26 +49,43 @@ function tokens(text: string, onError = throwOnError): TokenEntry[] {
}
function verify(tokens: TokenEntry[], expecting: TokenEntry[]) {
for (const [
index,
[expectedToken, expectedText, expectedPosition, expectedLineAndCharacter],
] of expecting.entries()) {
const [token, text, position, lineAndCharacter] = tokens[index];
for (const [index, [expectedToken, expectedText, expectedAdditional]] of expecting.entries()) {
const [token, text, additional] = tokens[index];
assert.strictEqual(Token[token], Token[expectedToken], `Token ${index} must match`);
if (expectedText) {
assert.strictEqual(text, expectedText, `Token ${index} test must match`);
}
if (expectedPosition) {
assert.strictEqual(position, expectedPosition, `Token ${index} position must match`);
if (expectedAdditional?.pos) {
assert.strictEqual(
additional!.pos,
expectedAdditional.pos,
`Token ${index} position must match`
);
}
if (expectedLineAndCharacter) {
assert.deepStrictEqual(
lineAndCharacter,
expectedLineAndCharacter,
`Token ${index} line and character must match`
if (expectedAdditional?.line) {
assert.strictEqual(
additional!.line,
expectedAdditional.line,
`Token ${index} line must match`
);
}
if (expectedAdditional?.character) {
assert.strictEqual(
additional!.character,
expectedAdditional?.character,
`Token ${index} character must match`
);
}
if (expectedAdditional?.value) {
assert.strictEqual(
additional!.value,
expectedAdditional.value,
`Token ${index} value must match`
);
}
}
@ -66,16 +94,16 @@ function verify(tokens: TokenEntry[], expecting: TokenEntry[]) {
describe("scanner", () => {
/** verifies that we can scan tokens and get back some output. */
it("smoketest", () => {
const all = tokens("\tthis is a test");
const all = tokens('\tthis is "a" test');
verify(all, [
[Token.Whitespace],
[Token.Identifier, "this"],
[Token.Identifier, "this", { value: "this" }],
[Token.Whitespace],
[Token.Identifier, "is"],
[Token.Identifier, "is", { value: "is" }],
[Token.Whitespace],
[Token.Identifier, "a"],
[Token.StringLiteral, '"a"', { value: "a" }],
[Token.Whitespace],
[Token.Identifier, "test"],
[Token.Identifier, "test", { value: "test" }],
]);
});
@ -130,7 +158,7 @@ describe("scanner", () => {
});
it("scans numeric literals", () => {
const all = tokens("42 0xBEEF 0b1010 1.5e4 314.0e-2 1e+1000");
const all = tokens("42 0xBEEF 0b1010 1.5e4 314.0e-2 1e+1000 3. 2.e3");
verify(all, [
[Token.NumericLiteral, "42"],
[Token.Whitespace],
@ -143,6 +171,11 @@ describe("scanner", () => {
[Token.NumericLiteral, "314.0e-2"],
[Token.Whitespace],
[Token.NumericLiteral, "1e+1000"],
[Token.Whitespace],
// https://github.com/Azure/adl/issues/488 - we may want to disallow these
[Token.NumericLiteral, "3."],
[Token.Whitespace],
[Token.NumericLiteral, "2.e3"],
]);
});
@ -184,34 +217,34 @@ describe("scanner", () => {
it("provides token position", () => {
const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x");
verify(all, [
[Token.Identifier, "a", 0, { line: 0, character: 0 }],
[Token.Whitespace, " ", 1, { line: 0, character: 1 }],
[Token.Identifier, "x", 2, { line: 0, character: 2 }],
[Token.NewLine, "\r", 3, { line: 0, character: 3 }],
[Token.Identifier, "a", { pos: 0, line: 0, character: 0 }],
[Token.Whitespace, " ", { pos: 1, line: 0, character: 1 }],
[Token.Identifier, "x", { pos: 2, line: 0, character: 2 }],
[Token.NewLine, "\r", { pos: 3, line: 0, character: 3 }],
[Token.Identifier, "aa", 4, { line: 1, character: 0 }],
[Token.Whitespace, " ", 6, { line: 1, character: 2 }],
[Token.Identifier, "x", 7, { line: 1, character: 3 }],
[Token.NewLine, "\r\n", 8, { line: 1, character: 4 }],
[Token.Identifier, "aa", { pos: 4, line: 1, character: 0 }],
[Token.Whitespace, " ", { pos: 6, line: 1, character: 2 }],
[Token.Identifier, "x", { pos: 7, line: 1, character: 3 }],
[Token.NewLine, "\r\n", { pos: 8, line: 1, character: 4 }],
[Token.Identifier, "aaa", 10, { line: 2, character: 0 }],
[Token.Whitespace, " ", 13, { line: 2, character: 3 }],
[Token.Identifier, "x", 14, { line: 2, character: 4 }],
[Token.NewLine, "\n", 15, { line: 2, character: 5 }],
[Token.Identifier, "aaa", { pos: 10, line: 2, character: 0 }],
[Token.Whitespace, " ", { pos: 13, line: 2, character: 3 }],
[Token.Identifier, "x", { pos: 14, line: 2, character: 4 }],
[Token.NewLine, "\n", { pos: 15, line: 2, character: 5 }],
[Token.Identifier, "aaaa", 16, { line: 3, character: 0 }],
[Token.Whitespace, " ", 20, { line: 3, character: 4 }],
[Token.Identifier, "x", 21, { line: 3, character: 5 }],
[Token.NewLine, "\u{2028}", 22, { line: 3, character: 6 }],
[Token.Identifier, "aaaa", { pos: 16, line: 3, character: 0 }],
[Token.Whitespace, " ", { pos: 20, line: 3, character: 4 }],
[Token.Identifier, "x", { pos: 21, line: 3, character: 5 }],
[Token.NewLine, "\u{2028}", { pos: 22, line: 3, character: 6 }],
[Token.Identifier, "aaaaa", 23, { line: 4, character: 0 }],
[Token.Whitespace, " ", 28, { line: 4, character: 5 }],
[Token.Identifier, "x", 29, { line: 4, character: 6 }],
[Token.NewLine, "\u{2029}", 30, { line: 4, character: 7 }],
[Token.Identifier, "aaaaa", { pos: 23, line: 4, character: 0 }],
[Token.Whitespace, " ", { pos: 28, line: 4, character: 5 }],
[Token.Identifier, "x", { pos: 29, line: 4, character: 6 }],
[Token.NewLine, "\u{2029}", { pos: 30, line: 4, character: 7 }],
[Token.Identifier, "aaaaaa", 31, { line: 5, character: 0 }],
[Token.Whitespace, " ", 37, { line: 5, character: 6 }],
[Token.Identifier, "x", 38, { line: 5, character: 7 }],
[Token.Identifier, "aaaaaa", { pos: 31, line: 5, character: 0 }],
[Token.Whitespace, " ", { pos: 37, line: 5, character: 6 }],
[Token.Identifier, "x", { pos: 38, line: 5, character: 7 }],
]);
});
@ -225,11 +258,19 @@ describe("scanner", () => {
`Token enum has ${tokenCount} elements but TokenDisplay array has ${tokenDisplayCount}.`
);
// check that keywords have appropriate display
// check that keywords have appropriate display and limits
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
let maxKeywordLengthFound = -1;
let minKeywordLengthFound = Number.MAX_SAFE_INTEGER;
let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER;
let minKeywordStartCharFound = Number.MAX_SAFE_INTEGER;
let maxKeywordStartCharFound = Number.MIN_SAFE_INTEGER;
for (const [name, token] of Keywords.entries()) {
minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length);
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
minKeywordStartCharFound = Math.min(minKeywordStartCharFound, name.charCodeAt(0));
maxKeywordStartCharFound = Math.max(maxKeywordStartCharFound, name.charCodeAt(0));
assert.strictEqual(TokenDisplay[token], `'${name}'`);
assert(isKeyword(token), `${name} should be classified as a keyword`);
if (!nonStatementKeywords.includes(token)) {
@ -237,7 +278,10 @@ describe("scanner", () => {
}
}
assert.strictEqual(maxKeywordLengthFound, maxKeywordLength);
assert.strictEqual(minKeywordLengthFound, KeywordLimit.MinLength);
assert.strictEqual(maxKeywordLengthFound, KeywordLimit.MaxLength);
assert.strictEqual(minKeywordStartCharFound, KeywordLimit.MinStartChar);
assert.strictEqual(maxKeywordStartCharFound, KeywordLimit.MaxStartChar);
// check single character punctuation
for (let i = 33; i <= 126; i++) {