More scanner optimization and fixes

* Don't allocate substrings to match keywords.
* Make at most one pass over string to get its unquoted/unescaped/unindented
  value. (**)
* Add test coverage for impacted code paths.
* Fix issue where the string value of an unterminated string literal was
  missing its final character(s).

(**) Actually, we still make two passes in the case of a non-triple-quoted,
multi-line string with \r\n, but that is about to be removed by the next
commit which will disallow non-triple-quoted, multi-line strings
altogether.
This commit is contained in:
Nick Guerrera 2021-05-02 12:14:10 -07:00
Родитель 158dd31eb2
Коммит 44faaadf01
4 изменённых файлов: 381 добавлений и 234 удалений

Просмотреть файл

@ -144,11 +144,15 @@ export const enum CharCode {
Tilde = 0x7e, Tilde = 0x7e,
} }
export function utf16CodeUnits(codePoint: number) {
return codePoint >= 0x10000 ? 2 : 1;
}
export function isAsciiLineBreak(ch: number) { export function isAsciiLineBreak(ch: number) {
return ch === CharCode.LineFeed || ch == CharCode.CarriageReturn; return ch === CharCode.LineFeed || ch == CharCode.CarriageReturn;
} }
export function isAsciiWhiteSpaceSingleLine(ch: number): boolean { export function isAsciiWhiteSpaceSingleLine(ch: number) {
return ( return (
ch === CharCode.Space || ch === CharCode.Space ||
ch === CharCode.Tab || ch === CharCode.Tab ||
@ -186,25 +190,29 @@ export function isWhiteSpaceSingleLine(ch: number) {
); );
} }
export function isLineBreak(ch: number): boolean { export function isLineBreak(ch: number) {
return isAsciiLineBreak(ch) || (ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch)); return isAsciiLineBreak(ch) || (ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch));
} }
export function isDigit(ch: number): boolean { export function isDigit(ch: number) {
return ch >= CharCode._0 && ch <= CharCode._9; return ch >= CharCode._0 && ch <= CharCode._9;
} }
export function isHexDigit(ch: number): boolean { export function isHexDigit(ch: number) {
return ( return (
isDigit(ch) || (ch >= CharCode.A && ch <= CharCode.F) || (ch >= CharCode.a && ch <= CharCode.f) isDigit(ch) || (ch >= CharCode.A && ch <= CharCode.F) || (ch >= CharCode.a && ch <= CharCode.f)
); );
} }
export function isBinaryDigit(ch: number): boolean { export function isBinaryDigit(ch: number) {
return ch === CharCode._0 || ch === CharCode._1; return ch === CharCode._0 || ch === CharCode._1;
} }
export function isAsciiIdentifierStart(ch: number): boolean { export function isLowercaseAsciiLetter(ch: number) {
return ch >= CharCode.a && ch <= CharCode.z;
}
export function isAsciiIdentifierStart(ch: number) {
return ( return (
(ch >= CharCode.A && ch <= CharCode.Z) || (ch >= CharCode.A && ch <= CharCode.Z) ||
(ch >= CharCode.a && ch <= CharCode.z) || (ch >= CharCode.a && ch <= CharCode.z) ||
@ -213,7 +221,7 @@ export function isAsciiIdentifierStart(ch: number): boolean {
); );
} }
export function isAsciiIdentifierContinue(ch: number): boolean { export function isAsciiIdentifierContinue(ch: number) {
return ( return (
(ch >= CharCode.A && ch <= CharCode.Z) || (ch >= CharCode.A && ch <= CharCode.Z) ||
(ch >= CharCode.a && ch <= CharCode.z) || (ch >= CharCode.a && ch <= CharCode.z) ||
@ -245,7 +253,7 @@ export function isNonAsciiIdentifierContinue(codePoint: number) {
return lookupInNonAsciiMap(codePoint, nonAsciiIdentifierContinueMap); return lookupInNonAsciiMap(codePoint, nonAsciiIdentifierContinueMap);
} }
function lookupInNonAsciiMap(codePoint: number, map: readonly number[]): boolean { function lookupInNonAsciiMap(codePoint: number, map: readonly number[]) {
// Bail out quickly if it couldn't possibly be in the map. // Bail out quickly if it couldn't possibly be in the map.
if (codePoint < map[0]) { if (codePoint < map[0]) {
return false; return false;

Просмотреть файл

@ -7,11 +7,13 @@ import {
isHexDigit, isHexDigit,
isIdentifierContinue, isIdentifierContinue,
isLineBreak, isLineBreak,
isLowercaseAsciiLetter,
isNonAsciiIdentifierContinue, isNonAsciiIdentifierContinue,
isNonAsciiIdentifierStart, isNonAsciiIdentifierStart,
isNonAsciiLineBreak, isNonAsciiLineBreak,
isNonAsciiWhiteSpaceSingleLine, isNonAsciiWhiteSpaceSingleLine,
isWhiteSpaceSingleLine, isWhiteSpaceSingleLine,
utf16CodeUnits,
} from "./charcode.js"; } from "./charcode.js";
import { createSourceFile, Message, throwOnError } from "./diagnostics.js"; import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
import { SourceFile } from "./types.js"; import { SourceFile } from "./types.js";
@ -91,16 +93,16 @@ const MaxStatementKeyword = Token.AliasKeyword;
/** @internal */ /** @internal */
export const TokenDisplay: readonly string[] = [ export const TokenDisplay: readonly string[] = [
"<none>", "none",
"<invalid>", "invalid",
"<end of file>", "end of file",
"<single-line comment>", "single-line comment",
"<multi-line comment>", "multi-line comment",
"<newline>", "newline",
"<whitespace>", "whitespace",
"<conflict marker>", "conflict marker",
"<numeric literal>", "numeric literal",
"<string literal>", "string literal",
"'{'", "'{'",
"'}'", "'}'",
"'('", "'('",
@ -119,7 +121,7 @@ export const TokenDisplay: readonly string[] = [
"'?'", "'?'",
"':'", "':'",
"'@'", "'@'",
"<identifier>", "identifier",
"'import'", "'import'",
"'model'", "'model'",
"'namespace'", "'namespace'",
@ -133,7 +135,7 @@ export const TokenDisplay: readonly string[] = [
]; ];
/** @internal */ /** @internal */
export const Keywords: ReadonlyMap<string, Token> = new Map([ export const Keywords: readonly [string, Token][] = [
["import", Token.ImportKeyword], ["import", Token.ImportKeyword],
["model", Token.ModelKeyword], ["model", Token.ModelKeyword],
["namespace", Token.NamespaceKeyword], ["namespace", Token.NamespaceKeyword],
@ -144,14 +146,30 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
["alias", Token.AliasKeyword], ["alias", Token.AliasKeyword],
["true", Token.TrueKeyword], ["true", Token.TrueKeyword],
["false", Token.FalseKeyword], ["false", Token.FalseKeyword],
]); ];
/** @internal */ /** @internal */
export const enum KeywordLimit { export const enum KeywordLimit {
MinLength = 2, MinLength = 2,
// If this ever exceeds 10, we will overflow the keyword map key, needing 11*5
// = 55 bits or more, exceeding the JavaScript safe integer range. We would
// have to change the keyword lookup algorithm in that case.
MaxLength = 9, MaxLength = 9,
MinStartChar = CharCode.a, }
MaxStartChar = CharCode.u,
const KeywordMap: ReadonlyMap<number, Token> = new Map(
Keywords.map((e) => [keywordKey(e[0]), e[1]])
);
// Since keywords are short and all lowercase, we can pack the whole string into
// a single number by using 5 bits for each letter, and use that as the map key.
// This lets us lookup keywords without making temporary substrings.
function keywordKey(keyword: string) {
let key = 0;
for (let i = 0; i < keyword.length; i++) {
key = (key << 5) | (keyword.charCodeAt(i) - CharCode.a);
}
return key;
} }
export interface Scanner { export interface Scanner {
@ -190,6 +208,7 @@ const enum TokenFlags {
HasCrlf = 1 << 0, HasCrlf = 1 << 0,
Escaped = 1 << 1, Escaped = 1 << 1,
TripleQuoted = 1 << 2, TripleQuoted = 1 << 2,
Unterminated = 1 << 3,
} }
export function isLiteral(token: Token) { export function isLiteral(token: Token) {
@ -226,9 +245,8 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
const file = typeof source === "string" ? createSourceFile(source, "<anonymous file>") : source; const file = typeof source === "string" ? createSourceFile(source, "<anonymous file>") : source;
const input = file.text; const input = file.text;
let position = 0; let position = 0;
let token = Token.Invalid; let token = Token.None;
let tokenPosition = -1; let tokenPosition = -1;
let tokenValue: string | undefined = undefined;
let tokenFlags = TokenFlags.None; let tokenFlags = TokenFlags.None;
return { return {
@ -252,26 +270,20 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return position >= input.length; return position >= input.length;
} }
function next(t: Token, count = 1) {
position += count;
return (token = t);
}
function utf16CodeUnits(codePoint: number) {
return codePoint >= 0x10000 ? 2 : 1;
}
function getTokenText() { function getTokenText() {
return input.substring(tokenPosition, position); return input.substring(tokenPosition, position);
} }
function getTokenValue() {
return token === Token.StringLiteral ? getStringTokenValue() : getTokenText();
}
function lookAhead(offset: number) { function lookAhead(offset: number) {
return input.charCodeAt(position + offset); return input.charCodeAt(position + offset);
} }
function scan(): Token { function scan(): Token {
tokenPosition = position; tokenPosition = position;
tokenValue = undefined;
tokenFlags = TokenFlags.None; tokenFlags = TokenFlags.None;
if (!eof()) { if (!eof()) {
@ -390,10 +402,14 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
: scanString(); : scanString();
default: default:
if (isAsciiIdentifierStart(ch)) { if (isLowercaseAsciiLetter(ch)) {
return scanIdentifierOrKeyword(); return scanIdentifierOrKeyword();
} }
if (isAsciiIdentifierStart(ch)) {
return scanIdentifier();
}
if (ch <= CharCode.MaxAscii) { if (ch <= CharCode.MaxAscii) {
return scanInvalidCharacter(); return scanInvalidCharacter();
} }
@ -405,6 +421,17 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return (token = Token.EndOfFile); return (token = Token.EndOfFile);
} }
function next(t: Token, count = 1) {
position += count;
return (token = t);
}
function unterminated(t: Token) {
tokenFlags |= TokenFlags.Unterminated;
error(Message.Unterminated, [TokenDisplay[t]]);
return (token = t);
}
function scanNonAsciiToken() { function scanNonAsciiToken() {
const ch = input.charCodeAt(position); const ch = input.charCodeAt(position);
@ -416,9 +443,9 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return scanWhitespace(); return scanWhitespace();
} }
const codePoint = input.codePointAt(position)!; let cp = input.codePointAt(position)!;
if (isNonAsciiIdentifierStart(codePoint)) { if (isNonAsciiIdentifierStart(cp)) {
return scanNonAsciiIdentifierContinue(codePoint); return scanNonAsciiIdentifier(cp);
} }
return scanInvalidCharacter(); return scanInvalidCharacter();
@ -527,11 +554,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
function scanSingleLineComment() { function scanSingleLineComment() {
position += 2; // consume '//' position += 2; // consume '//'
while (!eof()) { for (; !eof(); position++) {
if (isLineBreak(input.charCodeAt(position))) { if (isLineBreak(input.charCodeAt(position))) {
break; break;
} }
position++;
} }
return (token = Token.SingleLineComment); return (token = Token.SingleLineComment);
@ -540,22 +566,20 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
function scanMultiLineComment() { function scanMultiLineComment() {
position += 2; // consume '/*' position += 2; // consume '/*'
while (!eof()) { for (; !eof(); position++) {
if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) { if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) {
position += 2; position += 2;
return (token = Token.MultiLineComment); return (token = Token.MultiLineComment);
} }
position++;
} }
error(Message.Unterminated, ["comment"]); return unterminated(Token.MultiLineComment);
return (token = Token.MultiLineComment);
} }
function scanString() { function scanString() {
position++; // consume '"' position++; // consume '"'
loop: while (!eof()) { loop: for (; !eof(); position++) {
const ch = input.charCodeAt(position); const ch = input.charCodeAt(position);
switch (ch) { switch (ch) {
case CharCode.CarriageReturn: case CharCode.CarriageReturn:
@ -570,150 +594,157 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
if (eof()) { if (eof()) {
break loop; break loop;
} }
break; continue;
case CharCode.DoubleQuote: case CharCode.DoubleQuote:
position++; position++;
return (token = Token.StringLiteral); return (token = Token.StringLiteral);
} }
position++;
} }
error(Message.Unterminated, ["string literal"]); return unterminated(Token.StringLiteral);
return (token = Token.StringLiteral);
} }
function scanTripleQuotedString() { function scanTripleQuotedString() {
tokenFlags |= TokenFlags.TripleQuoted; tokenFlags |= TokenFlags.TripleQuoted;
position += 3; // consume '"""' position += 3; // consume '"""'
loop: while (!eof()) { for (; !eof(); position++) {
const ch = input.charCodeAt(position); if (
switch (ch) { input.charCodeAt(position) === CharCode.DoubleQuote &&
case CharCode.CarriageReturn: lookAhead(1) === CharCode.DoubleQuote &&
if (lookAhead(1) === CharCode.LineFeed) { lookAhead(2) === CharCode.DoubleQuote
tokenFlags |= TokenFlags.HasCrlf; ) {
position++; position += 3;
} return (token = Token.StringLiteral);
break;
case CharCode.Backslash:
tokenFlags |= TokenFlags.Escaped;
position++;
if (eof()) {
break loop;
}
break;
case CharCode.DoubleQuote:
if (lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote) {
position += 3;
return (token = Token.StringLiteral);
}
break;
} }
position++;
} }
error(Message.Unterminated, ["string literal"]); return unterminated(Token.StringLiteral);
return (token = Token.StringLiteral);
}
function getTokenValue() {
if (tokenValue !== undefined) {
return tokenValue;
}
return (tokenValue = token === Token.StringLiteral ? getStringTokenValue() : getTokenText());
} }
function getStringTokenValue() { function getStringTokenValue() {
// strip quotes
const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1; const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1;
let value = input.substring(tokenPosition + quoteLength, position - quoteLength); const start = tokenPosition + quoteLength;
const end = tokenFlags & TokenFlags.Unterminated ? position : position - quoteLength;
// Normalize CRLF to LF when interpreting value of multi-line string
// literals. Matches JavaScript behavior and ensures program behavior does
// not change due to line-ending conversion.
if (tokenFlags & TokenFlags.HasCrlf) {
value = value.replace(/\r\n/g, "\n");
}
if (tokenFlags & TokenFlags.TripleQuoted) { if (tokenFlags & TokenFlags.TripleQuoted) {
value = unindentTripleQuoteString(value); return unindentAndUnescapeTripleQuotedString(start, end);
} }
if (tokenFlags & TokenFlags.Escaped) { if (tokenFlags & TokenFlags.Escaped) {
value = unescapeString(value); return unescapeString(start, end);
} }
return (tokenValue = value); let value = input.substring(start, end);
if (tokenFlags & TokenFlags.HasCrlf) {
value = value.replace(/\r\n/g, "\n");
}
return value;
} }
function unindentTripleQuoteString(text: string) { function unindentAndUnescapeTripleQuotedString(start: number, end: number) {
let start = 0;
let end = text.length;
// ignore leading whitespace before required initial line break // ignore leading whitespace before required initial line break
while (start < end && isWhiteSpaceSingleLine(text.charCodeAt(start))) { while (start < end && isWhiteSpaceSingleLine(input.charCodeAt(start))) {
start++; start++;
} }
// remove required initial line break // remove required initial line break
if (isLineBreak(text.charCodeAt(start))) { if (isLineBreak(input.charCodeAt(start))) {
if (isCrlf(start, start, end)) {
start++;
}
start++; start++;
} else { } else {
error(Message.NoNewLineAtStartOfTripleQuotedString); error(Message.NoNewLineAtStartOfTripleQuotedString);
} }
// remove whitespace before closing delimiter and record it as // remove whitespace before closing delimiter and record it as required
// required indentation for all lines. // indentation for all lines
while (end > start && isWhiteSpaceSingleLine(text.charCodeAt(end - 1))) { const indentationEnd = end;
while (end > start && isWhiteSpaceSingleLine(input.charCodeAt(end - 1))) {
end--; end--;
} }
const indentation = text.substring(end, text.length); const indentationStart = end;
// remove required final line break // remove required final line break
if (isLineBreak(text.charCodeAt(end - 1))) { if (isLineBreak(input.charCodeAt(end - 1))) {
if (isCrlf(end - 2, start, end)) {
end--;
}
end--; end--;
} else { } else {
error(Message.NoNewLineAtEndOfTripleQuotedString); error(Message.NoNewLineAtEndOfTripleQuotedString);
} }
// remove required matching indentation from each line // remove required matching indentation from each line and unescape in the
return removeMatchingIndentation(text, start, end, indentation); // process of doing so
}
function removeMatchingIndentation(
text: string,
start: number,
end: number,
indentation: string
) {
let result = ""; let result = "";
let pos = start; let pos = start;
while (pos < end) { while (pos < end) {
start = skipMatchingIndentation(text, pos, end, indentation); // skip indentation at start of line
while (pos < end && !isLineBreak(text.charCodeAt(pos))) { start = skipMatchingIndentation(pos, end, indentationStart, indentationEnd);
pos++; let ch;
while (pos < end && !isLineBreak((ch = input.charCodeAt(pos)))) {
if (ch !== CharCode.Backslash) {
pos++;
continue;
}
result += input.substring(start, pos);
if (pos === end - 1) {
error(Message.InvalidEscapeSequence);
pos++;
} else {
result += unescapeOne(pos);
pos += 2;
}
start = pos;
} }
if (pos < end) { if (pos < end) {
pos++; // include line break if (isCrlf(pos, start, end)) {
// CRLF in multi-line string is normalized to LF in string value.
// This keeps program behavior unchanged by line-ending conversion.
result += input.substring(start, pos);
result += "\n";
pos += 2;
} else {
pos++; // include non-CRLF newline
result += input.substring(start, pos);
}
start = pos;
} }
result += text.substring(start, pos);
} }
result += input.substring(start, pos);
return result; return result;
} }
function skipMatchingIndentation(text: string, pos: number, end: number, indentation: string) { function isCrlf(pos: number, start: number, end: number) {
end = Math.min(end, pos + indentation.length); return (
pos >= start &&
pos < end - 1 &&
input.charCodeAt(pos) === CharCode.CarriageReturn &&
input.charCodeAt(pos + 1) === CharCode.LineFeed
);
}
function skipMatchingIndentation(
pos: number,
end: number,
indentationStart: number,
indentationEnd: number
) {
let indentationPos = indentationStart;
end = Math.min(end, pos + (indentationEnd - indentationStart));
let indentationPos = 0;
while (pos < end) { while (pos < end) {
const ch = text.charCodeAt(pos); const ch = input.charCodeAt(pos);
if (isLineBreak(ch)) { if (isLineBreak(ch)) {
// allow subset of indentation if line has only whitespace // allow subset of indentation if line has only whitespace
break; break;
} }
if (ch != indentation.charCodeAt(indentationPos)) { if (ch !== input.charCodeAt(indentationPos)) {
error(Message.InconsistentTripleQuoteIndentation); error(Message.InconsistentTripleQuoteIndentation);
break; break;
} }
@ -724,76 +755,86 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return pos; return pos;
} }
function unescapeString(text: string) { function unescapeString(start: number, end: number) {
let result = ""; let result = "";
let start = 0; let pos = start;
let pos = 0;
const end = text.length;
while (pos < end) { while (pos < end) {
let ch = text.charCodeAt(pos); let ch = input.charCodeAt(pos);
if (ch != CharCode.Backslash) { if (ch !== CharCode.Backslash) {
pos++; pos++;
continue; continue;
} }
result += text.substring(start, pos); if (pos === end - 1) {
pos++; error(Message.InvalidEscapeSequence);
ch = text.charCodeAt(pos); break;
switch (ch) {
case CharCode.r:
result += "\r";
break;
case CharCode.n:
result += "\n";
break;
case CharCode.t:
result += "\t";
break;
case CharCode.DoubleQuote:
result += '"';
break;
case CharCode.Backslash:
result += "\\";
break;
default:
error(Message.InvalidEscapeSequence);
result += String.fromCharCode(ch);
break;
} }
pos++; result += input.substring(start, pos);
result += unescapeOne(pos);
pos += 2;
start = pos; start = pos;
} }
result += text.substring(start, pos); result += input.substring(start, pos);
return result; return result;
} }
function scanIdentifierOrKeyword() { function unescapeOne(pos: number) {
const startChar = input.charCodeAt(position); const ch = input.charCodeAt(pos + 1);
let ch = startChar; switch (ch) {
do { case CharCode.r:
position++; return "\r";
} while (!eof() && isAsciiIdentifierContinue((ch = input.charCodeAt(position)))); case CharCode.n:
return "\n";
case CharCode.t:
return "\t";
case CharCode.DoubleQuote:
return '"';
case CharCode.Backslash:
return "\\";
default:
error(Message.InvalidEscapeSequence);
return String.fromCharCode(ch);
}
}
if (ch > CharCode.MaxAscii) { function scanIdentifierOrKeyword() {
const codePoint = input.codePointAt(position)!; let key = 0;
if (isNonAsciiIdentifierContinue(codePoint)) { let count = 0;
return scanNonAsciiIdentifierContinue(codePoint); let ch = input.charCodeAt(position);
while (true) {
position++;
count++;
key = (key << 5) | (ch - CharCode.a);
if (eof()) {
break;
} }
ch = input.charCodeAt(position);
if (count < KeywordLimit.MaxLength && isLowercaseAsciiLetter(ch)) {
continue;
}
if (isAsciiIdentifierContinue(ch)) {
return scanIdentifier();
}
if (ch > CharCode.MaxAscii) {
const cp = input.codePointAt(position)!;
if (isNonAsciiIdentifierContinue(cp)) {
return scanNonAsciiIdentifier(cp);
}
}
break;
} }
const length = position - tokenPosition; if (count >= KeywordLimit.MinLength && count <= KeywordLimit.MaxLength) {
if ( const keyword = KeywordMap.get(key);
length >= KeywordLimit.MinLength &&
length <= KeywordLimit.MaxLength &&
startChar >= KeywordLimit.MinStartChar &&
startChar <= KeywordLimit.MaxStartChar
) {
tokenValue = getTokenText();
const keyword = Keywords.get(tokenValue);
if (keyword) { if (keyword) {
return (token = keyword); return (token = keyword);
} }
@ -802,11 +843,31 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
return (token = Token.Identifier); return (token = Token.Identifier);
} }
function scanNonAsciiIdentifierContinue(startCodePoint: number) { function scanIdentifier() {
let codePoint = startCodePoint; let ch: number;
do { do {
position += utf16CodeUnits(codePoint); position++;
} while (!eof() && isIdentifierContinue((codePoint = input.codePointAt(position)!))); if (eof()) {
return (token = Token.Identifier);
}
} while (isAsciiIdentifierContinue((ch = input.charCodeAt(position))));
if (ch > CharCode.MaxAscii) {
let cp = input.codePointAt(position)!;
if (isNonAsciiIdentifierContinue(cp)) {
return scanNonAsciiIdentifier(cp);
}
}
return (token = Token.Identifier);
}
function scanNonAsciiIdentifier(startCodePoint: number) {
let cp = startCodePoint;
do {
position += utf16CodeUnits(cp);
} while (!eof() && isIdentifierContinue((cp = input.codePointAt(position)!)));
return (token = Token.Identifier); return (token = Token.Identifier);
} }

Просмотреть файл

@ -1,4 +1,5 @@
import assert from "assert"; import assert from "assert";
import { CharCode } from "../compiler/charcode.js";
import { logDiagnostics, logVerboseTestOutput } from "../compiler/diagnostics.js"; import { logDiagnostics, logVerboseTestOutput } from "../compiler/diagnostics.js";
import { hasParseError, NodeFlags, parse } from "../compiler/parser.js"; import { hasParseError, NodeFlags, parse } from "../compiler/parser.js";
import { ADLScriptNode, SyntaxKind } from "../compiler/types.js"; import { ADLScriptNode, SyntaxKind } from "../compiler/types.js";
@ -223,16 +224,25 @@ describe("syntax", () => {
}); });
describe("unterminated tokens", () => { describe("unterminated tokens", () => {
parseErrorEach([ parseErrorEach([["/* Yada yada yada", [/Unterminated multi-line comment/]]]);
['alias X = "banana', [/Unterminated string literal/]],
['alias X = "banana\\', [/Unterminated string literal/]], const strings = ['"banana', '"banana\\', '"""\nbanana', '"""\nbanana\\'];
['alias X = """\nbanana', [/Unterminated string literal/]], parseErrorEach(
['alias X = """\nbanana\\', [/Unterminated string literal/]], Array.from(strings.entries()).map((e) => [
["/* Yada yada yada", [/Unterminated comment/]], `alias ${String.fromCharCode(CharCode.A + e[0])} = ${e[1]}`,
]); [/Unterminated string literal/],
(node) => {
const statement = node.statements[0];
assert(statement.kind === SyntaxKind.AliasStatement, "alias statement expected");
const value = statement.value;
assert(value.kind === SyntaxKind.StringLiteral, "string literal expected");
assert.strictEqual(value.value, "banana");
},
])
);
}); });
describe("terminated tokens at EOF with missing semicolon", () => { describe("terminated tokens at EOF", () => {
parseErrorEach([ parseErrorEach([
["alias X = 0x10101", [/';' expected/]], ["alias X = 0x10101", [/';' expected/]],
["alias X = 0xBEEF", [/';' expected/]], ["alias X = 0xBEEF", [/';' expected/]],
@ -305,16 +315,68 @@ describe("syntax", () => {
} }
}); });
describe("non-ascii identifiers", () => { describe("identifiers", () => {
parseEach([ const good = [
"model Incompréhensible {}", "short",
"model 𐌰𐌲 {}", "short42",
"model Banana𐌰𐌲42Banana {}", "lowercaseandlong",
"model deaf\u{200c}ly {}", // ZWNJ "lowercaseandlong42",
"model क्‍ष {}", // ZWJ "camelCase",
]); "camelCase42",
parseErrorEach([["model 😢 {}", [/Invalid character/]]]); "PascalCase",
"PascalCase42",
"has_underscore",
"has_$dollar",
"_startsWithUnderscore",
"$startsWithDollar",
"Incompréhensible",
"incompréhensible",
"IncomprÉhensible",
"incomprÉhensible",
// leading astral character
"𐌰𐌲",
// continuing astral character
"Banana𐌰𐌲42Banana",
"banana𐌰𐌲42banana",
// ZWNJ
"deaf\u{200c}ly",
// ZWJ
"क्‍ष",
];
const bad: [string, RegExp][] = [
["😢", /Invalid character/],
["42", /Identifier expected/],
["true", /Keyword cannot be used as identifier/],
];
parseEach(
good.map((s) => [
`model ${s} {}`,
(node) => {
const statement = node.statements[0];
assert(statement.kind === SyntaxKind.ModelStatement, "Model statement expected.");
assert.strictEqual(statement.id.sv, s);
},
])
);
parseErrorEach(bad.map((e) => [`model ${e[0]} {}`, [e[1]]]));
}); });
});
// smaller repro of previous regen-samples baseline failures
describe("sample regressions", () => {
parseEach([
[
`/* \\n <-- before string! */ @format("\\\\w") model M {}`,
(node) => {
assert(node.statements[0].kind === SyntaxKind.ModelStatement);
assert(node.statements[0].decorators[0].arguments[0].kind === SyntaxKind.StringLiteral);
assert.strictEqual(node.statements[0].decorators[0].arguments[0].value, "\\w");
},
],
]);
describe("enum statements", () => { describe("enum statements", () => {
parseEach([ parseEach([
@ -344,7 +406,9 @@ describe("syntax", () => {
}); });
}); });
function parseEach(cases: (string | [string, (node: ADLScriptNode) => void])[]) { type Callback = (node: ADLScriptNode) => void;
function parseEach(cases: (string | [string, Callback])[]) {
for (const each of cases) { for (const each of cases) {
const code = typeof each === "string" ? each : each[0]; const code = typeof each === "string" ? each : each[0];
const callback = typeof each === "string" ? undefined : each[1]; const callback = typeof each === "string" ? undefined : each[1];
@ -377,13 +441,16 @@ function parseEach(cases: (string | [string, (node: ADLScriptNode) => void])[])
} }
} }
function parseErrorEach(cases: [string, RegExp[]][]) { function parseErrorEach(cases: [string, RegExp[], Callback?][], significantWhitespace = false) {
for (const [code, matches] of cases) { for (const [code, matches, callback] of cases) {
it(`doesn't parse ${shorten(code)}`, () => { it(`doesn't parse ${shorten(code)}`, () => {
logVerboseTestOutput("=== Source ==="); logVerboseTestOutput("=== Source ===");
logVerboseTestOutput(code); logVerboseTestOutput(code);
const astNode = parse(code); const astNode = parse(code);
if (callback) {
callback(astNode);
}
logVerboseTestOutput("\n=== Parse Result ==="); logVerboseTestOutput("\n=== Parse Result ===");
dumpAST(astNode); dumpAST(astNode);
@ -404,7 +471,7 @@ function parseErrorEach(cases: [string, RegExp[]][]) {
function dumpAST(astNode: ADLScriptNode) { function dumpAST(astNode: ADLScriptNode) {
logVerboseTestOutput((log) => { logVerboseTestOutput((log) => {
const hasErrors = hasParseError(astNode); // force flags to initialize hasParseError(astNode); // force flags to initialize
const json = JSON.stringify(astNode, replacer, 2); const json = JSON.stringify(astNode, replacer, 2);
log(json); log(json);
}); });

Просмотреть файл

@ -2,7 +2,7 @@ import assert from "assert";
import { readFile } from "fs/promises"; import { readFile } from "fs/promises";
import { URL } from "url"; import { URL } from "url";
import { isIdentifierContinue, isIdentifierStart } from "../compiler/charcode.js"; import { isIdentifierContinue, isIdentifierStart } from "../compiler/charcode.js";
import { throwOnError } from "../compiler/diagnostics.js"; import { createDiagnostic, formatDiagnostic, throwOnError } from "../compiler/diagnostics.js";
import { import {
createScanner, createScanner,
isKeyword, isKeyword,
@ -180,11 +180,21 @@ describe("scanner", () => {
]); ]);
}); });
function scanString(text: string, expectedValue: string) { function scanString(text: string, expectedValue: string, expectedDiagnostic?: RegExp) {
const scanner = createScanner(text); const scanner = createScanner(text, (message, target, args) => {
const diagnostic = createDiagnostic(message, target, args);
if (expectedDiagnostic) {
assert.match(diagnostic.message, expectedDiagnostic);
} else {
assert.fail("No diagnostic expected, but got " + formatDiagnostic(diagnostic));
}
});
assert.strictEqual(scanner.scan(), Token.StringLiteral); assert.strictEqual(scanner.scan(), Token.StringLiteral);
assert.strictEqual(scanner.token, Token.StringLiteral); assert.strictEqual(scanner.token, Token.StringLiteral);
assert.strictEqual(scanner.getTokenText(), text); if (!expectedDiagnostic) {
assert.strictEqual(scanner.getTokenText(), text);
}
assert.strictEqual(scanner.getTokenValue(), expectedValue); assert.strictEqual(scanner.getTokenValue(), expectedValue);
} }
@ -202,19 +212,24 @@ describe("scanner", () => {
it("scans triple-quoted strings", () => { it("scans triple-quoted strings", () => {
scanString( scanString(
// NOTE: sloppy blank line formatting and trailing whitespace after open
// quotes above is deliberate here and deliberately tolerated by
// the scanner.
`""" `"""
This is a triple-quoted string This is a triple-quoted string
"You do not need to escape lone quotes"
And this is another line You can use escape sequences: \\r \\n \\t \\\\ \\"
"""`, """`,
// NOTE: sloppy blank line formatting and trailing whitespace after open 'This is a triple-quoted string\n\n\n"You do not need to escape lone quotes"\nYou can use escape sequences: \r \n \t \\ "'
// quotes above is deliberately tolerated.
"This is a triple-quoted string\n\n\n\nAnd this is another line"
); );
}); });
it("normalizes CRLF to LF in multi-line string", () => {
scanString('"""\r\nThis\r\nis\r\na\r\ntest\r\n"""', "This\nis\na\ntest");
});
it("provides token position", () => { it("provides token position", () => {
const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x"); const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x");
verify(all, [ verify(all, [
@ -263,14 +278,15 @@ describe("scanner", () => {
const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword]; const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
let minKeywordLengthFound = Number.MAX_SAFE_INTEGER; let minKeywordLengthFound = Number.MAX_SAFE_INTEGER;
let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER; let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER;
let minKeywordStartCharFound = Number.MAX_SAFE_INTEGER;
let maxKeywordStartCharFound = Number.MIN_SAFE_INTEGER;
for (const [name, token] of Keywords.entries()) { for (const [name, token] of Keywords) {
assert.match(
name,
/^[a-z]+$/,
"We need to change the keyword lookup algorithm in the scanner if we ever add a keyword that is not all lowercase ascii letters."
);
minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length); minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length);
maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length); maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
minKeywordStartCharFound = Math.min(minKeywordStartCharFound, name.charCodeAt(0));
maxKeywordStartCharFound = Math.max(maxKeywordStartCharFound, name.charCodeAt(0));
assert.strictEqual(TokenDisplay[token], `'${name}'`); assert.strictEqual(TokenDisplay[token], `'${name}'`);
assert(isKeyword(token), `${name} should be classified as a keyword`); assert(isKeyword(token), `${name} should be classified as a keyword`);
@ -289,15 +305,10 @@ describe("scanner", () => {
KeywordLimit.MaxLength, KeywordLimit.MaxLength,
`max keyword length is incorrect, set KeywordLimit.MaxLength to ${maxKeywordLengthFound}` `max keyword length is incorrect, set KeywordLimit.MaxLength to ${maxKeywordLengthFound}`
); );
assert.strictEqual(
minKeywordStartCharFound, assert(
KeywordLimit.MinStartChar, maxKeywordLengthFound < 11,
`min keyword start char is incorrect, set KeywordLimit.MinStartChar to ${minKeywordStartCharFound}` "We need to change the keyword lookup algorithm in the scanner if we ever add a keyword with 11 characters or more."
);
assert.strictEqual(
maxKeywordStartCharFound,
KeywordLimit.MaxStartChar,
`max keyword start char is incorrect, set KeywordLimit.MaxStartChar to ${maxKeywordStartCharFound}`
); );
// check single character punctuation // check single character punctuation
@ -317,15 +328,15 @@ describe("scanner", () => {
// check the rest // check the rest
assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'"); assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'");
assert.strictEqual(TokenDisplay[Token.None], "<none>"); assert.strictEqual(TokenDisplay[Token.None], "none");
assert.strictEqual(TokenDisplay[Token.Invalid], "<invalid>"); assert.strictEqual(TokenDisplay[Token.Invalid], "invalid");
assert.strictEqual(TokenDisplay[Token.EndOfFile], "<end of file>"); assert.strictEqual(TokenDisplay[Token.EndOfFile], "end of file");
assert.strictEqual(TokenDisplay[Token.SingleLineComment], "<single-line comment>"); assert.strictEqual(TokenDisplay[Token.SingleLineComment], "single-line comment");
assert.strictEqual(TokenDisplay[Token.MultiLineComment], "<multi-line comment>"); assert.strictEqual(TokenDisplay[Token.MultiLineComment], "multi-line comment");
assert.strictEqual(TokenDisplay[Token.NewLine], "<newline>"); assert.strictEqual(TokenDisplay[Token.NewLine], "newline");
assert.strictEqual(TokenDisplay[Token.Whitespace], "<whitespace>"); assert.strictEqual(TokenDisplay[Token.Whitespace], "whitespace");
assert.strictEqual(TokenDisplay[Token.ConflictMarker], "<conflict marker>"); assert.strictEqual(TokenDisplay[Token.ConflictMarker], "conflict marker");
assert.strictEqual(TokenDisplay[Token.Identifier], "<identifier>"); assert.strictEqual(TokenDisplay[Token.Identifier], "identifier");
}); });
// Search for Other_ID_Start in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt // Search for Other_ID_Start in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt