Fix scanning edge cases and optimize slightly
* Bug fixes
  * tokenValue() was wrong if an identifier ever came right after a string literal.
  * There was an incorrect and confusing end-of-file error if the file ended with a numeric literal. Generally hardened and cleaned up EOF checking throughout.
  * Digits weren't allowed in identifiers containing non-ASCII characters.
* Perf
  * Add more ASCII fast paths
  * Inline scanUntil everywhere, and simplify each use to what it actually needed
  * Avoid eager substring allocation and map lookup for keywords in more cases
This commit is contained in:
Родитель
9063e1b63f
Коммит
5fb28d6378
|
@ -12,13 +12,13 @@ export const enum CharCode {
|
|||
LineSeparator = 0x2028,
|
||||
ParagraphSeparator = 0x2029,
|
||||
|
||||
// ASCII whitespace
|
||||
// ASCII whitespace excluding line breaks
|
||||
Space = 0x20,
|
||||
FormFeed = 0x0c,
|
||||
Tab = 0x09,
|
||||
VerticalTab = 0x0b,
|
||||
|
||||
// Non-ASCII whitespace
|
||||
// Non-ASCII whitespace excluding line breaks
|
||||
ByteOrderMark = 0xfeff, // currently allowed anywhere
|
||||
NextLine = 0x0085, // not considered a line break, mirroring ECMA-262
|
||||
NonBreakingSpace = 0x00a0,
|
||||
|
@ -144,15 +144,24 @@ export const enum CharCode {
|
|||
Tilde = 0x7e,
|
||||
}
|
||||
|
||||
/** Does not include line breaks. For that, see isWhiteSpaceLike. */
|
||||
export function isWhiteSpaceSingleLine(ch: number): boolean {
|
||||
// Note: nextLine is in the Zs space, and should be considered to be a whitespace.
|
||||
// It is explicitly not a line-break as it isn't in the exact set specified by EcmaScript.
|
||||
/**
 * Tests whether `ch` is one of the two ASCII line-break characters:
 * line feed or carriage return.
 *
 * @param ch UTF-16 code unit to classify.
 * @returns `true` for line feed or carriage return, `false` otherwise.
 */
export function isAsciiLineBreak(ch: number) {
  // Strict equality throughout, matching the other character predicates.
  return ch === CharCode.LineFeed || ch === CharCode.CarriageReturn;
}
|
||||
|
||||
/**
 * Tests whether `ch` is an ASCII whitespace character that is not a line
 * break: space, tab, vertical tab, or form feed.
 *
 * @param ch UTF-16 code unit to classify.
 * @returns `true` if `ch` is one of the four characters above.
 */
export function isAsciiWhiteSpaceSingleLine(ch: number): boolean {
  // Each character is tested exactly once; the form-feed comparison was
  // previously listed twice.
  return (
    ch === CharCode.Space ||
    ch === CharCode.Tab ||
    ch === CharCode.VerticalTab ||
    ch === CharCode.FormFeed
  );
}
|
||||
|
||||
export function isNonAsciiWhiteSpaceSingleLine(ch: number) {
|
||||
// Note: nextLine is in the Zs space, and should be considered to be a
|
||||
// whitespace. It is explicitly not a line-break as it isn't in the exact set
|
||||
// inherited by ADL from JavaScript.
|
||||
return (
|
||||
ch === CharCode.NonBreakingSpace ||
|
||||
ch === CharCode.NextLine ||
|
||||
ch === CharCode.Ogham ||
|
||||
|
@ -164,17 +173,23 @@ export function isWhiteSpaceSingleLine(ch: number): boolean {
|
|||
);
|
||||
}
|
||||
|
||||
export function isLineBreak(ch: number): boolean {
|
||||
// Other new line or line
|
||||
// breaking characters are treated as white space but not as line terminators.
|
||||
/**
 * Tests whether `ch` is a non-ASCII line terminator: the line separator or
 * the paragraph separator.
 */
export function isNonAsciiLineBreak(ch: number) {
  // Any other new-line or line-breaking character counts as white space,
  // not as a line terminator.
  const isLineSeparator = ch === CharCode.LineSeparator;
  const isParagraphSeparator = ch === CharCode.ParagraphSeparator;
  return isParagraphSeparator || isLineSeparator;
}
|
||||
|
||||
export function isWhiteSpaceSingleLine(ch: number) {
|
||||
return (
|
||||
ch === CharCode.LineFeed ||
|
||||
ch === CharCode.CarriageReturn ||
|
||||
ch === CharCode.LineSeparator ||
|
||||
ch === CharCode.ParagraphSeparator
|
||||
isAsciiWhiteSpaceSingleLine(ch) ||
|
||||
(ch > CharCode.MaxAscii && isNonAsciiWhiteSpaceSingleLine(ch))
|
||||
);
|
||||
}
|
||||
|
||||
/**
 * Tests whether `ch` is a line break, ASCII or non-ASCII.
 *
 * The ASCII check runs first; the non-ASCII predicate is only consulted for
 * values above `CharCode.MaxAscii`.
 */
export function isLineBreak(ch: number): boolean {
  if (isAsciiLineBreak(ch)) {
    return true;
  }
  return ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch);
}
|
||||
|
||||
/** Tests whether `ch` lies in the inclusive range `CharCode._0` to `CharCode._9`. */
export function isDigit(ch: number): boolean {
  return CharCode._0 <= ch && ch <= CharCode._9;
}
|
||||
|
@ -210,7 +225,7 @@ export function isAsciiIdentifierContinue(ch: number): boolean {
|
|||
|
||||
// Returns true when `codePoint` passes any of the checks below. The non-ASCII
// predicate is only consulted for values above CharCode.MaxAscii.
// NOTE(review): the isAsciiIdentifierStart and isAsciiIdentifierContinue
// operands look like the before/after versions of the same line in this
// change (the fix that allows digits inside identifiers); confirm that only
// the isAsciiIdentifierContinue check is meant to remain.
export function isIdentifierContinue(codePoint: number) {
|
||||
return (
|
||||
isAsciiIdentifierStart(codePoint) ||
|
||||
isAsciiIdentifierContinue(codePoint) ||
|
||||
(codePoint > CharCode.MaxAscii && isNonAsciiIdentifierContinue(codePoint))
|
||||
);
|
||||
}
|
||||
|
|
|
@ -1,5 +1,5 @@
|
|||
import { AssertionError } from "assert";
|
||||
import { CharCode } from "./charcode.js";
|
||||
import { CharCode, isNonAsciiLineBreak } from "./charcode.js";
|
||||
import { Message } from "./messages.js";
|
||||
import { Diagnostic, Node, SourceFile, SourceLocation, Sym, SyntaxKind, Type } from "./types.js";
|
||||
|
||||
|
@ -113,7 +113,7 @@ export function createSourceFile(text: string, path: string): SourceFile {
|
|||
};
|
||||
|
||||
function getLineStarts() {
|
||||
return (lineStarts = lineStarts ?? scanLineStarts());
|
||||
return (lineStarts = lineStarts ?? scanLineStarts(text));
|
||||
}
|
||||
|
||||
function getLineAndCharacterOfPosition(position: number) {
|
||||
|
@ -136,57 +136,6 @@ export function createSourceFile(text: string, path: string): SourceFile {
|
|||
character: position - starts[line],
|
||||
};
|
||||
}
|
||||
|
||||
function scanLineStarts() {
|
||||
const starts = [];
|
||||
let start = 0;
|
||||
let pos = 0;
|
||||
|
||||
while (pos < text.length) {
|
||||
const ch = text.charCodeAt(pos);
|
||||
pos++;
|
||||
switch (ch) {
|
||||
case CharCode.CarriageReturn:
|
||||
if (text.charCodeAt(pos) === CharCode.LineFeed) {
|
||||
pos++;
|
||||
}
|
||||
// fallthrough
|
||||
case CharCode.LineFeed:
|
||||
case CharCode.LineSeparator:
|
||||
case CharCode.ParagraphSeparator:
|
||||
starts.push(start);
|
||||
start = pos;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
starts.push(start);
|
||||
return starts;
|
||||
}
|
||||
|
||||
/**
|
||||
* Search sorted array of numbers for the given value. If found, return index
|
||||
* in array where value was found. If not found, return a negative number that
|
||||
* is the bitwise complement of the index where value would need to be inserted
|
||||
* to keep the array sorted.
|
||||
*/
|
||||
function binarySearch(array: readonly number[], value: number) {
|
||||
let low = 0;
|
||||
let high = array.length - 1;
|
||||
while (low <= high) {
|
||||
const middle = low + ((high - low) >> 1);
|
||||
const v = array[middle];
|
||||
if (v < value) {
|
||||
low = middle + 1;
|
||||
} else if (v > value) {
|
||||
high = middle - 1;
|
||||
} else {
|
||||
return middle;
|
||||
}
|
||||
}
|
||||
|
||||
return ~low;
|
||||
}
|
||||
}
|
||||
|
||||
export function getSourceLocation(target: DiagnosticTarget): SourceLocation {
|
||||
|
@ -328,3 +277,58 @@ function format(text: string, args?: (string | number)[]): [string, Error?] {
|
|||
function isNotUndefined<T>(value: T | undefined): value is T {
|
||||
return value !== undefined;
|
||||
}
|
||||
|
||||
/**
 * Computes the offset at which every line of `text` begins.
 *
 * A line ends at a carriage return (with an immediately following line feed
 * consumed as part of the same break), at a lone line feed, or at a
 * non-ASCII line break as reported by `isNonAsciiLineBreak`.
 *
 * @param text The source text to scan.
 * @returns Line-start offsets in ascending order. The first entry is always
 *   0, so the array is never empty.
 */
function scanLineStarts(text: string): number[] {
  const lineStarts: number[] = [];
  let lineStart = 0;
  let index = 0;

  while (index < text.length) {
    const code = text.charCodeAt(index);
    index++;

    let endsLine: boolean;
    if (code === CharCode.CarriageReturn) {
      // Treat "\r\n" as a single line break.
      if (text.charCodeAt(index) === CharCode.LineFeed) {
        index++;
      }
      endsLine = true;
    } else if (code === CharCode.LineFeed) {
      endsLine = true;
    } else {
      // Only pay for the non-ASCII predicate outside the ASCII range.
      endsLine = code > CharCode.MaxAscii && isNonAsciiLineBreak(code);
    }

    if (endsLine) {
      lineStarts.push(lineStart);
      lineStart = index;
    }
  }

  // The final line has no terminating break; record its start too.
  lineStarts.push(lineStart);
  return lineStarts;
}
|
||||
|
||||
/**
 * Looks up `value` in an ascending sorted array of numbers.
 *
 * @param array Sorted numbers to search.
 * @param value The number to locate.
 * @returns The index at which `value` was found. When it is absent, the
 *   bitwise complement (a negative number) of the index at which it would
 *   have to be inserted to keep the array sorted.
 */
function binarySearch(array: readonly number[], value: number) {
  let lo = 0;
  let hi = array.length - 1;

  for (; lo <= hi; ) {
    // Midpoint computed from the span so it stays within [lo, hi].
    const mid = lo + ((hi - lo) >> 1);
    const probe = array[mid];

    if (probe > value) {
      hi = mid - 1;
    } else if (probe < value) {
      lo = mid + 1;
    } else {
      return mid;
    }
  }

  return ~lo;
}
|
||||
|
|
|
@ -8,50 +8,50 @@ export const Message = {
|
|||
DigitExpected: {
|
||||
code: 1100,
|
||||
severity: "error",
|
||||
text: "Digit expected (0-9)",
|
||||
text: "Digit expected.",
|
||||
} as const,
|
||||
|
||||
HexDigitExpected: {
|
||||
code: 1101,
|
||||
severity: "error",
|
||||
text: "Hex Digit expected (0-F)",
|
||||
text: "Hexadecimal digit expected.",
|
||||
} as const,
|
||||
|
||||
BinaryDigitExpected: {
|
||||
code: 1102,
|
||||
severity: "error",
|
||||
text: "Binary Digit expected (0,1)",
|
||||
text: "Binary digit expected.",
|
||||
} as const,
|
||||
|
||||
UnexpectedEndOfFile: {
|
||||
Unterminated: {
|
||||
code: 1103,
|
||||
severity: "error",
|
||||
text: "Unexpected end of file while searching for '{0}'",
|
||||
text: "Unterminated {0}.",
|
||||
} as const,
|
||||
|
||||
InvalidEscapeSequence: {
|
||||
code: 1104,
|
||||
severity: "error",
|
||||
text: "Invalid escape sequence",
|
||||
text: "Invalid escape sequence.",
|
||||
} as const,
|
||||
|
||||
NoNewLineAtStartOfTripleQuotedString: {
|
||||
code: 1105,
|
||||
severity: "error",
|
||||
text: "String content in triple quotes must begin on a new line",
|
||||
text: "String content in triple quotes must begin on a new line.",
|
||||
} as const,
|
||||
|
||||
NoNewLineAtEndOfTripleQuotedString: {
|
||||
code: 1106,
|
||||
severity: "error",
|
||||
text: "Closing triple quotes must begin on a new line",
|
||||
text: "Closing triple quotes must begin on a new line.",
|
||||
} as const,
|
||||
|
||||
InconsistentTripleQuoteIndentation: {
|
||||
code: 1107,
|
||||
severity: "error",
|
||||
text:
|
||||
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes",
|
||||
"All lines in triple-quoted string lines must have the same indentation as closing triple quotes.",
|
||||
} as const,
|
||||
|
||||
InvalidCharacter: {
|
||||
|
|
|
@ -280,7 +280,7 @@ export function parse(code: string | SourceFile) {
|
|||
stmts.push(parseUsingStatement());
|
||||
break;
|
||||
case Token.EndOfFile:
|
||||
error("End of file reached without '}'.");
|
||||
parseExpected(Token.CloseBrace);
|
||||
return stmts;
|
||||
case Token.Semicolon:
|
||||
reportInvalidDecorators(decorators, "empty statement");
|
||||
|
@ -990,9 +990,8 @@ export function parse(code: string | SourceFile) {
|
|||
if (realPositionOfLastError === realPos) {
|
||||
return;
|
||||
}
|
||||
|
||||
realPositionOfLastError = realPos;
|
||||
parseErrorInNextFinishedNode = true;
|
||||
|
||||
reportDiagnostic(message, location);
|
||||
}
|
||||
|
||||
|
@ -1001,6 +1000,9 @@ export function parse(code: string | SourceFile) {
|
|||
target: DiagnosticTarget,
|
||||
args?: (string | number)[]
|
||||
) {
|
||||
if (typeof message === "string" || message.severity === "error") {
|
||||
parseErrorInNextFinishedNode = true;
|
||||
}
|
||||
const diagnostic = createDiagnostic(message, target, args);
|
||||
parseDiagnostics.push(diagnostic);
|
||||
}
|
||||
|
|
|
@ -9,6 +9,8 @@ import {
|
|||
isLineBreak,
|
||||
isNonAsciiIdentifierContinue,
|
||||
isNonAsciiIdentifierStart,
|
||||
isNonAsciiLineBreak,
|
||||
isNonAsciiWhiteSpaceSingleLine,
|
||||
isWhiteSpaceSingleLine,
|
||||
} from "./charcode.js";
|
||||
import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
|
||||
|
@ -82,6 +84,7 @@ const MaxPunctuation = Token.At;
|
|||
const MinStatementKeyword = Token.ImportKeyword;
|
||||
const MaxStatementKeyword = Token.OpKeyword;
|
||||
|
||||
/** @internal */
|
||||
export const TokenDisplay: readonly string[] = [
|
||||
"<none>",
|
||||
"<invalid>",
|
||||
|
@ -122,6 +125,7 @@ export const TokenDisplay: readonly string[] = [
|
|||
"'false'",
|
||||
];
|
||||
|
||||
/** @internal */
|
||||
export const Keywords: ReadonlyMap<string, Token> = new Map([
|
||||
["import", Token.ImportKeyword],
|
||||
["model", Token.ModelKeyword],
|
||||
|
@ -133,7 +137,13 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
|
|||
["false", Token.FalseKeyword],
|
||||
]);
|
||||
|
||||
export const maxKeywordLength = 9;
|
||||
/**
 * Bounds used by scanIdentifierOrKeyword to decide whether an identifier can
 * possibly be a keyword: the keyword map is only consulted when the token
 * length lies within [MinLength, MaxLength] and its first character lies
 * within [MinStartChar, MaxStartChar].
 *
 * NOTE(review): presumably these bounds must be kept in sync with the entries
 * of `Keywords` - confirm whenever a keyword is added or removed.
 *
 * @internal
 */
|
||||
export const enum KeywordLimit {
|
||||
MinLength = 2,
|
||||
MaxLength = 9,
|
||||
MinStartChar = CharCode.e,
|
||||
MaxStartChar = CharCode.u,
|
||||
}
|
||||
|
||||
export interface Scanner {
|
||||
/** The source code being scanned. */
|
||||
|
@ -264,32 +274,12 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
}
|
||||
// fallthrough
|
||||
case CharCode.LineFeed:
|
||||
case CharCode.LineSeparator:
|
||||
case CharCode.ParagraphSeparator:
|
||||
return next(Token.NewLine);
|
||||
|
||||
case CharCode.Space:
|
||||
case CharCode.Tab:
|
||||
case CharCode.VerticalTab:
|
||||
case CharCode.FormFeed:
|
||||
case CharCode.Space:
|
||||
case CharCode.NonBreakingSpace:
|
||||
case CharCode.Ogham:
|
||||
case CharCode.EnQuad:
|
||||
case CharCode.EmQuad:
|
||||
case CharCode.EnSpace:
|
||||
case CharCode.EmSpace:
|
||||
case CharCode.ThreePerEmSpace:
|
||||
case CharCode.FourPerEmSpace:
|
||||
case CharCode.SixPerEmSpace:
|
||||
case CharCode.FigureSpace:
|
||||
case CharCode.PunctuationSpace:
|
||||
case CharCode.ThinSpace:
|
||||
case CharCode.HairSpace:
|
||||
case CharCode.ZeroWidthSpace:
|
||||
case CharCode.NarrowNoBreakSpace:
|
||||
case CharCode.MathematicalSpace:
|
||||
case CharCode.IdeographicSpace:
|
||||
case CharCode.ByteOrderMark:
|
||||
return scanWhitespace();
|
||||
|
||||
case CharCode.OpenParen:
|
||||
|
@ -382,16 +372,45 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
: next(Token.Bar);
|
||||
|
||||
case CharCode.DoubleQuote:
|
||||
return scanString();
|
||||
return lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote
|
||||
? scanTripleQuotedString()
|
||||
: scanString();
|
||||
|
||||
default:
|
||||
return scanIdentifierOrKeyword();
|
||||
if (isAsciiIdentifierStart(ch)) {
|
||||
return scanIdentifierOrKeyword();
|
||||
}
|
||||
|
||||
if (ch <= CharCode.MaxAscii) {
|
||||
return scanInvalidCharacter();
|
||||
}
|
||||
|
||||
return scanNonAsciiToken();
|
||||
}
|
||||
}
|
||||
|
||||
return (token = Token.EndOfFile);
|
||||
}
|
||||
|
||||
/**
 * Scans the token starting at the current position when its first code unit
 * is outside the ASCII range.
 *
 * Tries, in order: non-ASCII line break, non-ASCII single-line whitespace,
 * non-ASCII identifier start. Anything else is scanned as an invalid
 * character.
 */
function scanNonAsciiToken() {
  const codeUnit = input.charCodeAt(position);

  if (isNonAsciiLineBreak(codeUnit)) {
    return next(Token.NewLine);
  }
  if (isNonAsciiWhiteSpaceSingleLine(codeUnit)) {
    return scanWhitespace();
  }

  // Identifier classification works on the full code point rather than the
  // single UTF-16 code unit used for the checks above.
  const fullCodePoint = input.codePointAt(position)!;
  return isNonAsciiIdentifierStart(fullCodePoint)
    ? scanNonAsciiIdentifierContinue(fullCodePoint)
    : scanInvalidCharacter();
}
|
||||
|
||||
function scanInvalidCharacter() {
|
||||
const codePoint = input.codePointAt(position)!;
|
||||
token = next(Token.Invalid, utf16CodeUnits(codePoint));
|
||||
|
@ -423,152 +442,184 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
function scanWhitespace(): Token {
|
||||
do {
|
||||
position++;
|
||||
} while (isWhiteSpaceSingleLine(input.charCodeAt(position)));
|
||||
} while (!eof() && isWhiteSpaceSingleLine(input.charCodeAt(position)));
|
||||
|
||||
return (token = Token.Whitespace);
|
||||
}
|
||||
|
||||
function scanDigits() {
|
||||
while (isDigit(input.charCodeAt(position))) {
|
||||
position++;
|
||||
}
|
||||
}
|
||||
|
||||
function scanNumber() {
|
||||
scanDigits();
|
||||
|
||||
let ch = input.charCodeAt(position);
|
||||
|
||||
if (ch === CharCode.Dot) {
|
||||
position++;
|
||||
scanDigits();
|
||||
}
|
||||
|
||||
ch = input.charCodeAt(position);
|
||||
if (ch === CharCode.e) {
|
||||
position++;
|
||||
ch = input.charCodeAt(position);
|
||||
if (ch === CharCode.Plus || ch == CharCode.Minus) {
|
||||
position++;
|
||||
ch = input.charCodeAt(position);
|
||||
}
|
||||
|
||||
if (isDigit(ch)) {
|
||||
position++;
|
||||
scanDigits();
|
||||
} else {
|
||||
error(Message.DigitExpected);
|
||||
scanKnownDigits();
|
||||
if (!eof()) {
|
||||
switch (input.charCodeAt(position)) {
|
||||
case CharCode.Dot:
|
||||
scanFractionAndExponent();
|
||||
break;
|
||||
case CharCode.e:
|
||||
scanExponent();
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
return (token = Token.NumericLiteral);
|
||||
}
|
||||
|
||||
function scanHexNumber() {
|
||||
if (!isHexDigit(lookAhead(2))) {
|
||||
error(Message.HexDigitExpected);
|
||||
return next(Token.NumericLiteral, 2);
|
||||
}
|
||||
/**
 * Advances past a run of decimal digits. The character at the current
 * position is consumed unconditionally, without being tested.
 */
function scanKnownDigits() {
  position++;
  while (!eof() && isDigit(input.charCodeAt(position))) {
    position++;
  }
}
|
||||
|
||||
/**
 * Consumes a run of decimal digits if one starts at the current position;
 * otherwise leaves the position untouched.
 */
function scanOptionalDigits() {
  if (eof() || !isDigit(input.charCodeAt(position))) {
    return;
  }
  scanKnownDigits();
}
|
||||
|
||||
/**
 * Consumes a run of decimal digits at the current position, reporting
 * `Message.DigitExpected` when there is none (including at end of input).
 */
function scanRequiredDigits() {
  if (!eof() && isDigit(input.charCodeAt(position))) {
    scanKnownDigits();
  } else {
    error(Message.DigitExpected);
  }
}
|
||||
|
||||
/**
 * Scans the part of a numeric literal that starts at the '.': the dot
 * itself, any fraction digits, and an exponent if an 'e' follows.
 */
function scanFractionAndExponent() {
  position++; // step over '.'
  scanOptionalDigits();

  if (eof()) {
    return;
  }
  if (input.charCodeAt(position) === CharCode.e) {
    scanExponent();
  }
}
|
||||
|
||||
/**
 * Scans the exponent of a numeric literal: the 'e', an optional '+' or '-'
 * sign, and the digits, which are required. Reports
 * `Message.DigitExpected` when input ends right after the 'e'.
 */
function scanExponent() {
  position++; // step over 'e'

  if (!eof()) {
    const sign = input.charCodeAt(position);
    if (sign === CharCode.Plus || sign === CharCode.Minus) {
      position++;
    }
    scanRequiredDigits();
    return;
  }

  error(Message.DigitExpected);
}
|
||||
|
||||
function scanHexNumber() {
|
||||
position += 2; // consume '0x'
|
||||
|
||||
if (eof() || !isHexDigit(input.charCodeAt(position))) {
|
||||
error(Message.HexDigitExpected);
|
||||
return (token = Token.NumericLiteral);
|
||||
}
|
||||
do {
|
||||
position++;
|
||||
} while (!eof() && isHexDigit(input.charCodeAt(position)));
|
||||
|
||||
position += 2;
|
||||
scanUntil((ch) => !isHexDigit(ch), "Hex Digit");
|
||||
return (token = Token.NumericLiteral);
|
||||
}
|
||||
|
||||
function scanBinaryNumber() {
|
||||
if (!isBinaryDigit(lookAhead(2))) {
|
||||
error(Message.BinaryDigitExpected);
|
||||
return next(Token.NumericLiteral, 2);
|
||||
}
|
||||
position += 2; // consume '0b'
|
||||
|
||||
if (eof() || !isBinaryDigit(input.charCodeAt(position))) {
|
||||
error(Message.BinaryDigitExpected);
|
||||
return (token = Token.NumericLiteral);
|
||||
}
|
||||
do {
|
||||
position++;
|
||||
} while (!eof() && isBinaryDigit(input.charCodeAt(position)));
|
||||
|
||||
position += 2;
|
||||
scanUntil((ch) => !isBinaryDigit(ch), "Binary Digit");
|
||||
return (token = Token.NumericLiteral);
|
||||
}
|
||||
|
||||
function scanUntil(
|
||||
predicate: (char: number) => boolean,
|
||||
expectedClose?: string,
|
||||
consumeClose?: number
|
||||
) {
|
||||
let ch: number;
|
||||
function scanSingleLineComment() {
|
||||
position += 2; // consume '//'
|
||||
|
||||
do {
|
||||
position++;
|
||||
|
||||
if (eof()) {
|
||||
if (expectedClose) {
|
||||
error(Message.UnexpectedEndOfFile, [expectedClose]);
|
||||
}
|
||||
while (!eof()) {
|
||||
if (isLineBreak(input.charCodeAt(position))) {
|
||||
break;
|
||||
}
|
||||
|
||||
ch = input.charCodeAt(position);
|
||||
} while (!predicate(ch));
|
||||
|
||||
if (consumeClose) {
|
||||
position += consumeClose;
|
||||
position++;
|
||||
}
|
||||
}
|
||||
|
||||
function scanSingleLineComment() {
|
||||
scanUntil(isLineBreak);
|
||||
return (token = Token.SingleLineComment);
|
||||
}
|
||||
|
||||
function scanMultiLineComment() {
|
||||
scanUntil((ch) => ch === CharCode.Asterisk && lookAhead(1) === CharCode.Slash, "*/", 2);
|
||||
position += 2; // consume '/*'
|
||||
|
||||
while (!eof()) {
|
||||
if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) {
|
||||
position += 2;
|
||||
return (token = Token.MultiLineComment);
|
||||
}
|
||||
position++;
|
||||
}
|
||||
|
||||
error(Message.Unterminated, ["comment"]);
|
||||
return (token = Token.MultiLineComment);
|
||||
}
|
||||
|
||||
function scanString() {
|
||||
let quoteLength = 1;
|
||||
let closing = '"';
|
||||
let isEscaping = false;
|
||||
position++; // consume '"'
|
||||
|
||||
const tripleQuoted =
|
||||
lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote;
|
||||
|
||||
if (tripleQuoted) {
|
||||
tokenFlags |= TokenFlags.TripleQuoted;
|
||||
quoteLength = 3;
|
||||
position += 2;
|
||||
closing = '"""';
|
||||
loop: while (!eof()) {
|
||||
const ch = input.charCodeAt(position);
|
||||
switch (ch) {
|
||||
case CharCode.CarriageReturn:
|
||||
if (lookAhead(1) === CharCode.LineFeed) {
|
||||
tokenFlags |= TokenFlags.HasCrlf;
|
||||
position++;
|
||||
}
|
||||
break;
|
||||
case CharCode.Backslash:
|
||||
tokenFlags |= TokenFlags.Escaped;
|
||||
position++;
|
||||
if (eof()) {
|
||||
break loop;
|
||||
}
|
||||
break;
|
||||
case CharCode.DoubleQuote:
|
||||
position++;
|
||||
return (token = Token.StringLiteral);
|
||||
}
|
||||
position++;
|
||||
}
|
||||
|
||||
scanUntil(
|
||||
(ch) => {
|
||||
if (isEscaping) {
|
||||
isEscaping = false;
|
||||
return false;
|
||||
}
|
||||
error(Message.Unterminated, ["string literal"]);
|
||||
return (token = Token.StringLiteral);
|
||||
}
|
||||
|
||||
switch (ch) {
|
||||
case CharCode.CarriageReturn:
|
||||
if (lookAhead(1) === CharCode.LineFeed) {
|
||||
tokenFlags |= TokenFlags.HasCrlf;
|
||||
}
|
||||
return false;
|
||||
function scanTripleQuotedString() {
|
||||
tokenFlags |= TokenFlags.TripleQuoted;
|
||||
position += 3; // consume '"""'
|
||||
|
||||
case CharCode.Backslash:
|
||||
isEscaping = true;
|
||||
tokenFlags |= TokenFlags.Escaped;
|
||||
return false;
|
||||
|
||||
case CharCode.DoubleQuote:
|
||||
if (tripleQuoted) {
|
||||
return lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote;
|
||||
}
|
||||
return true;
|
||||
|
||||
default:
|
||||
return false;
|
||||
}
|
||||
},
|
||||
closing,
|
||||
quoteLength
|
||||
);
|
||||
loop: while (!eof()) {
|
||||
const ch = input.charCodeAt(position);
|
||||
switch (ch) {
|
||||
case CharCode.CarriageReturn:
|
||||
if (lookAhead(1) === CharCode.LineFeed) {
|
||||
tokenFlags |= TokenFlags.HasCrlf;
|
||||
position++;
|
||||
}
|
||||
break;
|
||||
case CharCode.Backslash:
|
||||
tokenFlags |= TokenFlags.Escaped;
|
||||
position++;
|
||||
if (eof()) {
|
||||
break loop;
|
||||
}
|
||||
break;
|
||||
case CharCode.DoubleQuote:
|
||||
if (lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote) {
|
||||
position += 3;
|
||||
return (token = Token.StringLiteral);
|
||||
}
|
||||
break;
|
||||
}
|
||||
position++;
|
||||
}
|
||||
|
||||
error(Message.Unterminated, ["string literal"]);
|
||||
return (token = Token.StringLiteral);
|
||||
}
|
||||
|
||||
|
@ -576,11 +627,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
if (tokenValue !== undefined) {
|
||||
return tokenValue;
|
||||
}
|
||||
return (tokenValue = token === Token.StringLiteral ? getStringTokenValue() : getTokenText());
|
||||
}
|
||||
|
||||
if (token !== Token.StringLiteral) {
|
||||
return (tokenValue = getTokenText());
|
||||
}
|
||||
|
||||
function getStringTokenValue() {
|
||||
// strip quotes
|
||||
const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1;
|
||||
let value = input.substring(tokenPosition + quoteLength, position - quoteLength);
|
||||
|
@ -729,30 +779,28 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
}
|
||||
|
||||
function scanIdentifierOrKeyword() {
|
||||
let ch = input.charCodeAt(position);
|
||||
|
||||
if (!isAsciiIdentifierStart(ch)) {
|
||||
return scanNonAsciiIdentifier();
|
||||
}
|
||||
|
||||
const startChar = input.charCodeAt(position);
|
||||
let ch = startChar;
|
||||
do {
|
||||
position++;
|
||||
if (eof()) {
|
||||
break;
|
||||
}
|
||||
ch = input.charCodeAt(position);
|
||||
} while (isAsciiIdentifierContinue(ch));
|
||||
} while (!eof() && isAsciiIdentifierContinue((ch = input.charCodeAt(position))));
|
||||
|
||||
if (!eof() && ch > CharCode.MaxAscii) {
|
||||
if (ch > CharCode.MaxAscii) {
|
||||
const codePoint = input.codePointAt(position)!;
|
||||
if (isNonAsciiIdentifierContinue(codePoint)) {
|
||||
return scanNonAsciiIdentifierContinue(codePoint);
|
||||
}
|
||||
}
|
||||
|
||||
if (position - tokenPosition <= maxKeywordLength) {
|
||||
const value = getTokenValue();
|
||||
const keyword = Keywords.get(value);
|
||||
const length = position - tokenPosition;
|
||||
if (
|
||||
length >= KeywordLimit.MinLength &&
|
||||
length <= KeywordLimit.MaxLength &&
|
||||
startChar >= KeywordLimit.MinStartChar &&
|
||||
startChar <= KeywordLimit.MaxStartChar
|
||||
) {
|
||||
tokenValue = getTokenText();
|
||||
const keyword = Keywords.get(tokenValue);
|
||||
if (keyword) {
|
||||
return (token = keyword);
|
||||
}
|
||||
|
@ -761,23 +809,11 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
|
|||
return (token = Token.Identifier);
|
||||
}
|
||||
|
||||
function scanNonAsciiIdentifier() {
|
||||
let codePoint = input.codePointAt(position)!;
|
||||
return isNonAsciiIdentifierStart(codePoint)
|
||||
? scanNonAsciiIdentifierContinue(codePoint)
|
||||
: scanInvalidCharacter();
|
||||
}
|
||||
|
||||
function scanNonAsciiIdentifierContinue(startCodePoint: number) {
|
||||
let codePoint = startCodePoint;
|
||||
|
||||
do {
|
||||
position += utf16CodeUnits(codePoint);
|
||||
if (eof()) {
|
||||
break;
|
||||
}
|
||||
codePoint = input.codePointAt(position)!;
|
||||
} while (isIdentifierContinue(codePoint));
|
||||
} while (!eof() && isIdentifierContinue((codePoint = input.codePointAt(position)!)));
|
||||
|
||||
return (token = Token.Identifier);
|
||||
}
|
||||
|
|
|
@ -219,8 +219,38 @@ describe("syntax", () => {
|
|||
]);
|
||||
});
|
||||
|
||||
describe("unterminated tokens", () => {
|
||||
parseErrorEach([
|
||||
['model X = "banana', [/Unterminated string literal/]],
|
||||
['model X = "banana\\', [/Unterminated string literal/]],
|
||||
['model X = """\nbanana', [/Unterminated string literal/]],
|
||||
['model X = """\nbanana\\', [/Unterminated string literal/]],
|
||||
["/* Yada yada yada", [/Unterminated comment/]],
|
||||
["123.0e", [/Digit expected/]],
|
||||
["123.e", [/Digit expected/]],
|
||||
["123e", [/Digit expected/]],
|
||||
["0b", [/Binary digit expected/]],
|
||||
["0x", [/Hexadecimal digit expected/]],
|
||||
]);
|
||||
});
|
||||
|
||||
describe("terminated tokens at EOF with missing semicolon", () => {
|
||||
parseErrorEach([
|
||||
["model X = 0x10101", [/';' expected/]],
|
||||
["model X = 0xBEEF", [/';' expected/]],
|
||||
["model X = 123", [/';' expected/]],
|
||||
["model X = 123.", [/';' expected/]],
|
||||
["model X = 123e45", [/';' expected/]],
|
||||
["model X = 123.45", [/';' expected/]],
|
||||
["model X = 123.45e2", [/';' expected/]],
|
||||
["model X = Banana", [/';' expected/]],
|
||||
['model X = "Banana"', [/';' expected/]],
|
||||
['model X = """\nBanana\n"""', [/';' expected/]],
|
||||
]);
|
||||
});
|
||||
|
||||
describe("non-ascii identifiers", () => {
|
||||
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲Banana {}"]);
|
||||
parseEach(["model Incompréhensible {}", "model 𐌰𐌲 {}", "model Banana𐌰𐌲42Banana {}"]);
|
||||
parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
|
||||
});
|
||||
});
|
||||
|
|
|
@ -7,14 +7,22 @@ import {
|
|||
isKeyword,
|
||||
isPunctuation,
|
||||
isStatementKeyword,
|
||||
KeywordLimit,
|
||||
Keywords,
|
||||
maxKeywordLength,
|
||||
Token,
|
||||
TokenDisplay,
|
||||
} from "../compiler/scanner.js";
|
||||
import { LineAndCharacter } from "../compiler/types.js";
|
||||
|
||||
type TokenEntry = [Token, string?, number?, LineAndCharacter?];
|
||||
type TokenEntry = [
|
||||
Token,
|
||||
string?,
|
||||
{
|
||||
pos?: number;
|
||||
line?: number;
|
||||
character?: number;
|
||||
value?: string;
|
||||
}?
|
||||
];
|
||||
|
||||
function tokens(text: string, onError = throwOnError): TokenEntry[] {
|
||||
const scanner = createScanner(text, onError);
|
||||
|
@ -25,8 +33,11 @@ function tokens(text: string, onError = throwOnError): TokenEntry[] {
|
|||
result.push([
|
||||
scanner.token,
|
||||
scanner.getTokenText(),
|
||||
scanner.tokenPosition,
|
||||
scanner.file.getLineAndCharacterOfPosition(scanner.tokenPosition),
|
||||
{
|
||||
pos: scanner.tokenPosition,
|
||||
value: scanner.getTokenValue(),
|
||||
...scanner.file.getLineAndCharacterOfPosition(scanner.tokenPosition),
|
||||
},
|
||||
]);
|
||||
} while (!scanner.eof());
|
||||
|
||||
|
@ -38,26 +49,43 @@ function tokens(text: string, onError = throwOnError): TokenEntry[] {
|
|||
}
|
||||
|
||||
function verify(tokens: TokenEntry[], expecting: TokenEntry[]) {
|
||||
for (const [
|
||||
index,
|
||||
[expectedToken, expectedText, expectedPosition, expectedLineAndCharacter],
|
||||
] of expecting.entries()) {
|
||||
const [token, text, position, lineAndCharacter] = tokens[index];
|
||||
for (const [index, [expectedToken, expectedText, expectedAdditional]] of expecting.entries()) {
|
||||
const [token, text, additional] = tokens[index];
|
||||
assert.strictEqual(Token[token], Token[expectedToken], `Token ${index} must match`);
|
||||
|
||||
if (expectedText) {
|
||||
assert.strictEqual(text, expectedText, `Token ${index} test must match`);
|
||||
}
|
||||
|
||||
if (expectedPosition) {
|
||||
assert.strictEqual(position, expectedPosition, `Token ${index} position must match`);
|
||||
if (expectedAdditional?.pos) {
|
||||
assert.strictEqual(
|
||||
additional!.pos,
|
||||
expectedAdditional.pos,
|
||||
`Token ${index} position must match`
|
||||
);
|
||||
}
|
||||
|
||||
if (expectedLineAndCharacter) {
|
||||
assert.deepStrictEqual(
|
||||
lineAndCharacter,
|
||||
expectedLineAndCharacter,
|
||||
`Token ${index} line and character must match`
|
||||
if (expectedAdditional?.line) {
|
||||
assert.strictEqual(
|
||||
additional!.line,
|
||||
expectedAdditional.line,
|
||||
`Token ${index} line must match`
|
||||
);
|
||||
}
|
||||
|
||||
if (expectedAdditional?.character) {
|
||||
assert.strictEqual(
|
||||
additional!.character,
|
||||
expectedAdditional?.character,
|
||||
`Token ${index} character must match`
|
||||
);
|
||||
}
|
||||
|
||||
if (expectedAdditional?.value) {
|
||||
assert.strictEqual(
|
||||
additional!.value,
|
||||
expectedAdditional.value,
|
||||
`Token ${index} value must match`
|
||||
);
|
||||
}
|
||||
}
|
||||
|
@ -66,16 +94,16 @@ function verify(tokens: TokenEntry[], expecting: TokenEntry[]) {
|
|||
describe("scanner", () => {
|
||||
/** verifies that we can scan tokens and get back some output. */
|
||||
it("smoketest", () => {
|
||||
const all = tokens("\tthis is a test");
|
||||
const all = tokens('\tthis is "a" test');
|
||||
verify(all, [
|
||||
[Token.Whitespace],
|
||||
[Token.Identifier, "this"],
|
||||
[Token.Identifier, "this", { value: "this" }],
|
||||
[Token.Whitespace],
|
||||
[Token.Identifier, "is"],
|
||||
[Token.Identifier, "is", { value: "is" }],
|
||||
[Token.Whitespace],
|
||||
[Token.Identifier, "a"],
|
||||
[Token.StringLiteral, '"a"', { value: "a" }],
|
||||
[Token.Whitespace],
|
||||
[Token.Identifier, "test"],
|
||||
[Token.Identifier, "test", { value: "test" }],
|
||||
]);
|
||||
});
|
||||
|
||||
|
@ -130,7 +158,7 @@ describe("scanner", () => {
|
|||
});
|
||||
|
||||
it("scans numeric literals", () => {
|
||||
const all = tokens("42 0xBEEF 0b1010 1.5e4 314.0e-2 1e+1000");
|
||||
const all = tokens("42 0xBEEF 0b1010 1.5e4 314.0e-2 1e+1000 3. 2.e3");
|
||||
verify(all, [
|
||||
[Token.NumericLiteral, "42"],
|
||||
[Token.Whitespace],
|
||||
|
@ -143,6 +171,11 @@ describe("scanner", () => {
|
|||
[Token.NumericLiteral, "314.0e-2"],
|
||||
[Token.Whitespace],
|
||||
[Token.NumericLiteral, "1e+1000"],
|
||||
[Token.Whitespace],
|
||||
// https://github.com/Azure/adl/issues/488 - we may want to disallow these
|
||||
[Token.NumericLiteral, "3."],
|
||||
[Token.Whitespace],
|
||||
[Token.NumericLiteral, "2.e3"],
|
||||
]);
|
||||
});
|
||||
|
||||
|
@ -184,34 +217,34 @@ describe("scanner", () => {
|
|||
it("provides token position", () => {
|
||||
const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x");
|
||||
verify(all, [
|
||||
[Token.Identifier, "a", 0, { line: 0, character: 0 }],
|
||||
[Token.Whitespace, " ", 1, { line: 0, character: 1 }],
|
||||
[Token.Identifier, "x", 2, { line: 0, character: 2 }],
|
||||
[Token.NewLine, "\r", 3, { line: 0, character: 3 }],
|
||||
[Token.Identifier, "a", { pos: 0, line: 0, character: 0 }],
|
||||
[Token.Whitespace, " ", { pos: 1, line: 0, character: 1 }],
|
||||
[Token.Identifier, "x", { pos: 2, line: 0, character: 2 }],
|
||||
[Token.NewLine, "\r", { pos: 3, line: 0, character: 3 }],
|
||||
|
||||
[Token.Identifier, "aa", 4, { line: 1, character: 0 }],
|
||||
[Token.Whitespace, " ", 6, { line: 1, character: 2 }],
|
||||
[Token.Identifier, "x", 7, { line: 1, character: 3 }],
|
||||
[Token.NewLine, "\r\n", 8, { line: 1, character: 4 }],
|
||||
[Token.Identifier, "aa", { pos: 4, line: 1, character: 0 }],
|
||||
[Token.Whitespace, " ", { pos: 6, line: 1, character: 2 }],
|
||||
[Token.Identifier, "x", { pos: 7, line: 1, character: 3 }],
|
||||
[Token.NewLine, "\r\n", { pos: 8, line: 1, character: 4 }],
|
||||
|
||||
[Token.Identifier, "aaa", 10, { line: 2, character: 0 }],
|
||||
[Token.Whitespace, " ", 13, { line: 2, character: 3 }],
|
||||
[Token.Identifier, "x", 14, { line: 2, character: 4 }],
|
||||
[Token.NewLine, "\n", 15, { line: 2, character: 5 }],
|
||||
[Token.Identifier, "aaa", { pos: 10, line: 2, character: 0 }],
|
||||
[Token.Whitespace, " ", { pos: 13, line: 2, character: 3 }],
|
||||
[Token.Identifier, "x", { pos: 14, line: 2, character: 4 }],
|
||||
[Token.NewLine, "\n", { pos: 15, line: 2, character: 5 }],
|
||||
|
||||
[Token.Identifier, "aaaa", 16, { line: 3, character: 0 }],
|
||||
[Token.Whitespace, " ", 20, { line: 3, character: 4 }],
|
||||
[Token.Identifier, "x", 21, { line: 3, character: 5 }],
|
||||
[Token.NewLine, "\u{2028}", 22, { line: 3, character: 6 }],
|
||||
[Token.Identifier, "aaaa", { pos: 16, line: 3, character: 0 }],
|
||||
[Token.Whitespace, " ", { pos: 20, line: 3, character: 4 }],
|
||||
[Token.Identifier, "x", { pos: 21, line: 3, character: 5 }],
|
||||
[Token.NewLine, "\u{2028}", { pos: 22, line: 3, character: 6 }],
|
||||
|
||||
[Token.Identifier, "aaaaa", 23, { line: 4, character: 0 }],
|
||||
[Token.Whitespace, " ", 28, { line: 4, character: 5 }],
|
||||
[Token.Identifier, "x", 29, { line: 4, character: 6 }],
|
||||
[Token.NewLine, "\u{2029}", 30, { line: 4, character: 7 }],
|
||||
[Token.Identifier, "aaaaa", { pos: 23, line: 4, character: 0 }],
|
||||
[Token.Whitespace, " ", { pos: 28, line: 4, character: 5 }],
|
||||
[Token.Identifier, "x", { pos: 29, line: 4, character: 6 }],
|
||||
[Token.NewLine, "\u{2029}", { pos: 30, line: 4, character: 7 }],
|
||||
|
||||
[Token.Identifier, "aaaaaa", 31, { line: 5, character: 0 }],
|
||||
[Token.Whitespace, " ", 37, { line: 5, character: 6 }],
|
||||
[Token.Identifier, "x", 38, { line: 5, character: 7 }],
|
||||
[Token.Identifier, "aaaaaa", { pos: 31, line: 5, character: 0 }],
|
||||
[Token.Whitespace, " ", { pos: 37, line: 5, character: 6 }],
|
||||
[Token.Identifier, "x", { pos: 38, line: 5, character: 7 }],
|
||||
]);
|
||||
});
|
||||
|
||||
|
@ -225,11 +258,19 @@ describe("scanner", () => {
|
|||
`Token enum has ${tokenCount} elements but TokenDisplay array has ${tokenDisplayCount}.`
|
||||
);
|
||||
|
||||
// check that keywords have appropriate display
|
||||
// check that keywords have appropriate display and limits
|
||||
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
|
||||
let maxKeywordLengthFound = -1;
|
||||
let minKeywordLengthFound = Number.MAX_SAFE_INTEGER;
|
||||
let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER;
|
||||
let minKeywordStartCharFound = Number.MAX_SAFE_INTEGER;
|
||||
let maxKeywordStartCharFound = Number.MIN_SAFE_INTEGER;
|
||||
|
||||
for (const [name, token] of Keywords.entries()) {
|
||||
minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length);
|
||||
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
|
||||
minKeywordStartCharFound = Math.min(minKeywordStartCharFound, name.charCodeAt(0));
|
||||
maxKeywordStartCharFound = Math.max(maxKeywordStartCharFound, name.charCodeAt(0));
|
||||
|
||||
assert.strictEqual(TokenDisplay[token], `'${name}'`);
|
||||
assert(isKeyword(token), `${name} should be classified as a keyword`);
|
||||
if (!nonStatementKeywords.includes(token)) {
|
||||
|
@ -237,7 +278,10 @@ describe("scanner", () => {
|
|||
}
|
||||
}
|
||||
|
||||
assert.strictEqual(maxKeywordLengthFound, maxKeywordLength);
|
||||
assert.strictEqual(minKeywordLengthFound, KeywordLimit.MinLength);
|
||||
assert.strictEqual(maxKeywordLengthFound, KeywordLimit.MaxLength);
|
||||
assert.strictEqual(minKeywordStartCharFound, KeywordLimit.MinStartChar);
|
||||
assert.strictEqual(maxKeywordStartCharFound, KeywordLimit.MaxStartChar);
|
||||
|
||||
// check single character punctuation
|
||||
for (let i = 33; i <= 126; i++) {
|
||||
|
|
Загрузка…
Ссылка в новой задаче