From 44faaadf014a05b190ee1289f7de07781e49bedb Mon Sep 17 00:00:00 2001 From: Nick Guerrera Date: Sun, 2 May 2021 12:14:10 -0700 Subject: [PATCH] More scanner optimization and fixes * Don't allocate substrings to match keywords. * Make at most one pass over string to get its unquoted/unescaped/unindented value. (**) * Add test coverage for impacted code paths. * Fix issue where string value of an unterminated string was missing final character(s) in string value. (**) Actually, we still make two passes in the case of a non-triple-quoted, multi-line string with \r\n, but that is about to be removed by the next commit which will disallow non-triple-quoted, multi-line strings altogether. --- packages/adl/compiler/charcode.ts | 24 +- packages/adl/compiler/scanner.ts | 407 +++++++++++++++++------------- packages/adl/test/test-parser.ts | 109 ++++++-- packages/adl/test/test-scanner.ts | 75 +++--- 4 files changed, 381 insertions(+), 234 deletions(-) diff --git a/packages/adl/compiler/charcode.ts b/packages/adl/compiler/charcode.ts index 94b6e7d13..d373b54fa 100644 --- a/packages/adl/compiler/charcode.ts +++ b/packages/adl/compiler/charcode.ts @@ -144,11 +144,15 @@ export const enum CharCode { Tilde = 0x7e, } +export function utf16CodeUnits(codePoint: number) { + return codePoint >= 0x10000 ? 
2 : 1; +} + export function isAsciiLineBreak(ch: number) { return ch === CharCode.LineFeed || ch == CharCode.CarriageReturn; } -export function isAsciiWhiteSpaceSingleLine(ch: number): boolean { +export function isAsciiWhiteSpaceSingleLine(ch: number) { return ( ch === CharCode.Space || ch === CharCode.Tab || @@ -186,25 +190,29 @@ export function isWhiteSpaceSingleLine(ch: number) { ); } -export function isLineBreak(ch: number): boolean { +export function isLineBreak(ch: number) { return isAsciiLineBreak(ch) || (ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch)); } -export function isDigit(ch: number): boolean { +export function isDigit(ch: number) { return ch >= CharCode._0 && ch <= CharCode._9; } -export function isHexDigit(ch: number): boolean { +export function isHexDigit(ch: number) { return ( isDigit(ch) || (ch >= CharCode.A && ch <= CharCode.F) || (ch >= CharCode.a && ch <= CharCode.f) ); } -export function isBinaryDigit(ch: number): boolean { +export function isBinaryDigit(ch: number) { return ch === CharCode._0 || ch === CharCode._1; } -export function isAsciiIdentifierStart(ch: number): boolean { +export function isLowercaseAsciiLetter(ch: number) { + return ch >= CharCode.a && ch <= CharCode.z; +} + +export function isAsciiIdentifierStart(ch: number) { return ( (ch >= CharCode.A && ch <= CharCode.Z) || (ch >= CharCode.a && ch <= CharCode.z) || @@ -213,7 +221,7 @@ export function isAsciiIdentifierStart(ch: number): boolean { ); } -export function isAsciiIdentifierContinue(ch: number): boolean { +export function isAsciiIdentifierContinue(ch: number) { return ( (ch >= CharCode.A && ch <= CharCode.Z) || (ch >= CharCode.a && ch <= CharCode.z) || @@ -245,7 +253,7 @@ export function isNonAsciiIdentifierContinue(codePoint: number) { return lookupInNonAsciiMap(codePoint, nonAsciiIdentifierContinueMap); } -function lookupInNonAsciiMap(codePoint: number, map: readonly number[]): boolean { +function lookupInNonAsciiMap(codePoint: number, map: readonly number[]) { // 
Bail out quickly if it couldn't possibly be in the map. if (codePoint < map[0]) { return false; diff --git a/packages/adl/compiler/scanner.ts b/packages/adl/compiler/scanner.ts index c2cfcaaed..16c8a34f4 100644 --- a/packages/adl/compiler/scanner.ts +++ b/packages/adl/compiler/scanner.ts @@ -7,11 +7,13 @@ import { isHexDigit, isIdentifierContinue, isLineBreak, + isLowercaseAsciiLetter, isNonAsciiIdentifierContinue, isNonAsciiIdentifierStart, isNonAsciiLineBreak, isNonAsciiWhiteSpaceSingleLine, isWhiteSpaceSingleLine, + utf16CodeUnits, } from "./charcode.js"; import { createSourceFile, Message, throwOnError } from "./diagnostics.js"; import { SourceFile } from "./types.js"; @@ -91,16 +93,16 @@ const MaxStatementKeyword = Token.AliasKeyword; /** @internal */ export const TokenDisplay: readonly string[] = [ - "", - "", - "", - "", - "", - "", - "", - "", - "", - "", + "none", + "invalid", + "end of file", + "single-line comment", + "multi-line comment", + "newline", + "whitespace", + "conflict marker", + "numeric literal", + "string literal", "'{'", "'}'", "'('", @@ -119,7 +121,7 @@ export const TokenDisplay: readonly string[] = [ "'?'", "':'", "'@'", - "", + "identifier", "'import'", "'model'", "'namespace'", @@ -133,7 +135,7 @@ export const TokenDisplay: readonly string[] = [ ]; /** @internal */ -export const Keywords: ReadonlyMap = new Map([ +export const Keywords: readonly [string, Token][] = [ ["import", Token.ImportKeyword], ["model", Token.ModelKeyword], ["namespace", Token.NamespaceKeyword], @@ -144,14 +146,30 @@ export const Keywords: ReadonlyMap = new Map([ ["alias", Token.AliasKeyword], ["true", Token.TrueKeyword], ["false", Token.FalseKeyword], -]); +]; /** @internal */ export const enum KeywordLimit { MinLength = 2, + // If this ever exceeds 10, we will overflow the keyword map key, needing 11*5 + // = 55 bits or more, exceeding the JavaScript safe integer range. We would + // have to change the keyword lookup algorithm in that case. 
MaxLength = 9, - MinStartChar = CharCode.a, - MaxStartChar = CharCode.u, +} + +const KeywordMap: ReadonlyMap = new Map( + Keywords.map((e) => [keywordKey(e[0]), e[1]]) +); + +// Since keywords are short and all lowercase, we can pack the whole string into +// a single number by using 5 bits for each letter, and use that as the map key. +// This lets us lookup keywords without making temporary substrings. +function keywordKey(keyword: string) { + let key = 0; + for (let i = 0; i < keyword.length; i++) { + key = (key << 5) | (keyword.charCodeAt(i) - CharCode.a); + } + return key; } export interface Scanner { @@ -190,6 +208,7 @@ const enum TokenFlags { HasCrlf = 1 << 0, Escaped = 1 << 1, TripleQuoted = 1 << 2, + Unterminated = 1 << 3, } export function isLiteral(token: Token) { @@ -226,9 +245,8 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro const file = typeof source === "string" ? createSourceFile(source, "") : source; const input = file.text; let position = 0; - let token = Token.Invalid; + let token = Token.None; let tokenPosition = -1; - let tokenValue: string | undefined = undefined; let tokenFlags = TokenFlags.None; return { @@ -252,26 +270,20 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro return position >= input.length; } - function next(t: Token, count = 1) { - position += count; - return (token = t); - } - - function utf16CodeUnits(codePoint: number) { - return codePoint >= 0x10000 ? 2 : 1; - } - function getTokenText() { return input.substring(tokenPosition, position); } + function getTokenValue() { + return token === Token.StringLiteral ? 
getStringTokenValue() : getTokenText(); + } + function lookAhead(offset: number) { return input.charCodeAt(position + offset); } function scan(): Token { tokenPosition = position; - tokenValue = undefined; tokenFlags = TokenFlags.None; if (!eof()) { @@ -390,10 +402,14 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro : scanString(); default: - if (isAsciiIdentifierStart(ch)) { + if (isLowercaseAsciiLetter(ch)) { return scanIdentifierOrKeyword(); } + if (isAsciiIdentifierStart(ch)) { + return scanIdentifier(); + } + if (ch <= CharCode.MaxAscii) { return scanInvalidCharacter(); } @@ -405,6 +421,17 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro return (token = Token.EndOfFile); } + function next(t: Token, count = 1) { + position += count; + return (token = t); + } + + function unterminated(t: Token) { + tokenFlags |= TokenFlags.Unterminated; + error(Message.Unterminated, [TokenDisplay[t]]); + return (token = t); + } + function scanNonAsciiToken() { const ch = input.charCodeAt(position); @@ -416,9 +443,9 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro return scanWhitespace(); } - const codePoint = input.codePointAt(position)!; - if (isNonAsciiIdentifierStart(codePoint)) { - return scanNonAsciiIdentifierContinue(codePoint); + let cp = input.codePointAt(position)!; + if (isNonAsciiIdentifierStart(cp)) { + return scanNonAsciiIdentifier(cp); } return scanInvalidCharacter(); @@ -527,11 +554,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro function scanSingleLineComment() { position += 2; // consume '//' - while (!eof()) { + for (; !eof(); position++) { if (isLineBreak(input.charCodeAt(position))) { break; } - position++; } return (token = Token.SingleLineComment); @@ -540,22 +566,20 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro function scanMultiLineComment() { position += 2; // consume '/*' - 
while (!eof()) { + for (; !eof(); position++) { if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) { position += 2; return (token = Token.MultiLineComment); } - position++; } - error(Message.Unterminated, ["comment"]); - return (token = Token.MultiLineComment); + return unterminated(Token.MultiLineComment); } function scanString() { position++; // consume '"' - loop: while (!eof()) { + loop: for (; !eof(); position++) { const ch = input.charCodeAt(position); switch (ch) { case CharCode.CarriageReturn: @@ -570,150 +594,157 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro if (eof()) { break loop; } - break; + continue; case CharCode.DoubleQuote: position++; return (token = Token.StringLiteral); } - position++; } - error(Message.Unterminated, ["string literal"]); - return (token = Token.StringLiteral); + return unterminated(Token.StringLiteral); } function scanTripleQuotedString() { tokenFlags |= TokenFlags.TripleQuoted; position += 3; // consume '"""' - loop: while (!eof()) { - const ch = input.charCodeAt(position); - switch (ch) { - case CharCode.CarriageReturn: - if (lookAhead(1) === CharCode.LineFeed) { - tokenFlags |= TokenFlags.HasCrlf; - position++; - } - break; - case CharCode.Backslash: - tokenFlags |= TokenFlags.Escaped; - position++; - if (eof()) { - break loop; - } - break; - case CharCode.DoubleQuote: - if (lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote) { - position += 3; - return (token = Token.StringLiteral); - } - break; + for (; !eof(); position++) { + if ( + input.charCodeAt(position) === CharCode.DoubleQuote && + lookAhead(1) === CharCode.DoubleQuote && + lookAhead(2) === CharCode.DoubleQuote + ) { + position += 3; + return (token = Token.StringLiteral); } - position++; } - error(Message.Unterminated, ["string literal"]); - return (token = Token.StringLiteral); - } - - function getTokenValue() { - if (tokenValue !== undefined) { - return 
tokenValue; - } - return (tokenValue = token === Token.StringLiteral ? getStringTokenValue() : getTokenText()); + return unterminated(Token.StringLiteral); } function getStringTokenValue() { - // strip quotes const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1; - let value = input.substring(tokenPosition + quoteLength, position - quoteLength); - - // Normalize CRLF to LF when interpreting value of multi-line string - // literals. Matches JavaScript behavior and ensures program behavior does - // not change due to line-ending conversion. - if (tokenFlags & TokenFlags.HasCrlf) { - value = value.replace(/\r\n/g, "\n"); - } + const start = tokenPosition + quoteLength; + const end = tokenFlags & TokenFlags.Unterminated ? position : position - quoteLength; if (tokenFlags & TokenFlags.TripleQuoted) { - value = unindentTripleQuoteString(value); + return unindentAndUnescapeTripleQuotedString(start, end); } if (tokenFlags & TokenFlags.Escaped) { - value = unescapeString(value); + return unescapeString(start, end); } - return (tokenValue = value); + let value = input.substring(start, end); + if (tokenFlags & TokenFlags.HasCrlf) { + value = value.replace(/\r\n/g, "\n"); + } + return value; } - function unindentTripleQuoteString(text: string) { - let start = 0; - let end = text.length; - + function unindentAndUnescapeTripleQuotedString(start: number, end: number) { // ignore leading whitespace before required initial line break - while (start < end && isWhiteSpaceSingleLine(text.charCodeAt(start))) { + while (start < end && isWhiteSpaceSingleLine(input.charCodeAt(start))) { start++; } // remove required initial line break - if (isLineBreak(text.charCodeAt(start))) { + if (isLineBreak(input.charCodeAt(start))) { + if (isCrlf(start, start, end)) { + start++; + } start++; } else { error(Message.NoNewLineAtStartOfTripleQuotedString); } - // remove whitespace before closing delimiter and record it as - // required indentation for all lines. 
- while (end > start && isWhiteSpaceSingleLine(text.charCodeAt(end - 1))) { + // remove whitespace before closing delimiter and record it as required + // indentation for all lines + const indentationEnd = end; + while (end > start && isWhiteSpaceSingleLine(input.charCodeAt(end - 1))) { end--; } - const indentation = text.substring(end, text.length); + const indentationStart = end; // remove required final line break - if (isLineBreak(text.charCodeAt(end - 1))) { + if (isLineBreak(input.charCodeAt(end - 1))) { + if (isCrlf(end - 2, start, end)) { + end--; + } end--; } else { error(Message.NoNewLineAtEndOfTripleQuotedString); } - // remove required matching indentation from each line - return removeMatchingIndentation(text, start, end, indentation); - } - - function removeMatchingIndentation( - text: string, - start: number, - end: number, - indentation: string - ) { + // remove required matching indentation from each line and unescape in the + // process of doing so let result = ""; let pos = start; - while (pos < end) { - start = skipMatchingIndentation(text, pos, end, indentation); - while (pos < end && !isLineBreak(text.charCodeAt(pos))) { - pos++; + // skip indentation at start of line + start = skipMatchingIndentation(pos, end, indentationStart, indentationEnd); + let ch; + + while (pos < end && !isLineBreak((ch = input.charCodeAt(pos)))) { + if (ch !== CharCode.Backslash) { + pos++; + continue; + } + result += input.substring(start, pos); + if (pos === end - 1) { + error(Message.InvalidEscapeSequence); + pos++; + } else { + result += unescapeOne(pos); + pos += 2; + } + start = pos; } + if (pos < end) { + if (isCrlf(pos, start, end)) { + // CRLF in multi-line string is normalized to LF in string value. + // This keeps program behavior unchanged by line-ending conversion. 
+ result += input.substring(start, pos); + result += "\n"; + pos += 2; + } else { + pos++; // include non-CRLF newline + result += input.substring(start, pos); + } + start = pos; } - result += text.substring(start, pos); } + result += input.substring(start, pos); return result; } - function skipMatchingIndentation(text: string, pos: number, end: number, indentation: string) { - end = Math.min(end, pos + indentation.length); + function isCrlf(pos: number, start: number, end: number) { + return ( + pos >= start && + pos < end - 1 && + input.charCodeAt(pos) === CharCode.CarriageReturn && + input.charCodeAt(pos + 1) === CharCode.LineFeed + ); + } + + function skipMatchingIndentation( + pos: number, + end: number, + indentationStart: number, + indentationEnd: number + ) { + let indentationPos = indentationStart; + end = Math.min(end, pos + (indentationEnd - indentationStart)); - let indentationPos = 0; while (pos < end) { - const ch = text.charCodeAt(pos); + const ch = input.charCodeAt(pos); if (isLineBreak(ch)) { // allow subset of indentation if line has only whitespace break; } - if (ch != indentation.charCodeAt(indentationPos)) { + if (ch !== input.charCodeAt(indentationPos)) { error(Message.InconsistentTripleQuoteIndentation); break; } @@ -724,76 +755,86 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro return pos; } - function unescapeString(text: string) { + function unescapeString(start: number, end: number) { let result = ""; - let start = 0; - let pos = 0; - const end = text.length; + let pos = start; while (pos < end) { - let ch = text.charCodeAt(pos); - if (ch != CharCode.Backslash) { + let ch = input.charCodeAt(pos); + if (ch !== CharCode.Backslash) { pos++; continue; } - result += text.substring(start, pos); - pos++; - ch = text.charCodeAt(pos); - - switch (ch) { - case CharCode.r: - result += "\r"; - break; - case CharCode.n: - result += "\n"; - break; - case CharCode.t: - result += "\t"; - break; - case 
CharCode.DoubleQuote: - result += '"'; - break; - case CharCode.Backslash: - result += "\\"; - break; - default: - error(Message.InvalidEscapeSequence); - result += String.fromCharCode(ch); - break; + if (pos === end - 1) { + error(Message.InvalidEscapeSequence); + break; } - pos++; + result += input.substring(start, pos); + result += unescapeOne(pos); + pos += 2; start = pos; } - result += text.substring(start, pos); + result += input.substring(start, pos); return result; } - function scanIdentifierOrKeyword() { - const startChar = input.charCodeAt(position); - let ch = startChar; - do { - position++; - } while (!eof() && isAsciiIdentifierContinue((ch = input.charCodeAt(position)))); + function unescapeOne(pos: number) { + const ch = input.charCodeAt(pos + 1); + switch (ch) { + case CharCode.r: + return "\r"; + case CharCode.n: + return "\n"; + case CharCode.t: + return "\t"; + case CharCode.DoubleQuote: + return '"'; + case CharCode.Backslash: + return "\\"; + default: + error(Message.InvalidEscapeSequence); + return String.fromCharCode(ch); + } + } - if (ch > CharCode.MaxAscii) { - const codePoint = input.codePointAt(position)!; - if (isNonAsciiIdentifierContinue(codePoint)) { - return scanNonAsciiIdentifierContinue(codePoint); + function scanIdentifierOrKeyword() { + let key = 0; + let count = 0; + let ch = input.charCodeAt(position); + + while (true) { + position++; + count++; + key = (key << 5) | (ch - CharCode.a); + + if (eof()) { + break; } + + ch = input.charCodeAt(position); + if (count < KeywordLimit.MaxLength && isLowercaseAsciiLetter(ch)) { + continue; + } + + if (isAsciiIdentifierContinue(ch)) { + return scanIdentifier(); + } + + if (ch > CharCode.MaxAscii) { + const cp = input.codePointAt(position)!; + if (isNonAsciiIdentifierContinue(cp)) { + return scanNonAsciiIdentifier(cp); + } + } + + break; } - const length = position - tokenPosition; - if ( - length >= KeywordLimit.MinLength && - length <= KeywordLimit.MaxLength && - startChar >= 
KeywordLimit.MinStartChar && - startChar <= KeywordLimit.MaxStartChar - ) { - tokenValue = getTokenText(); - const keyword = Keywords.get(tokenValue); + if (count >= KeywordLimit.MinLength && count <= KeywordLimit.MaxLength) { + const keyword = KeywordMap.get(key); if (keyword) { return (token = keyword); } @@ -802,11 +843,31 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro return (token = Token.Identifier); } - function scanNonAsciiIdentifierContinue(startCodePoint: number) { - let codePoint = startCodePoint; + function scanIdentifier() { + let ch: number; + do { - position += utf16CodeUnits(codePoint); - } while (!eof() && isIdentifierContinue((codePoint = input.codePointAt(position)!))); + position++; + if (eof()) { + return (token = Token.Identifier); + } + } while (isAsciiIdentifierContinue((ch = input.charCodeAt(position)))); + + if (ch > CharCode.MaxAscii) { + let cp = input.codePointAt(position)!; + if (isNonAsciiIdentifierContinue(cp)) { + return scanNonAsciiIdentifier(cp); + } + } + + return (token = Token.Identifier); + } + + function scanNonAsciiIdentifier(startCodePoint: number) { + let cp = startCodePoint; + do { + position += utf16CodeUnits(cp); + } while (!eof() && isIdentifierContinue((cp = input.codePointAt(position)!))); return (token = Token.Identifier); } diff --git a/packages/adl/test/test-parser.ts b/packages/adl/test/test-parser.ts index 5186d1a43..0dec9e7cf 100644 --- a/packages/adl/test/test-parser.ts +++ b/packages/adl/test/test-parser.ts @@ -1,4 +1,5 @@ import assert from "assert"; +import { CharCode } from "../compiler/charcode.js"; import { logDiagnostics, logVerboseTestOutput } from "../compiler/diagnostics.js"; import { hasParseError, NodeFlags, parse } from "../compiler/parser.js"; import { ADLScriptNode, SyntaxKind } from "../compiler/types.js"; @@ -223,16 +224,25 @@ describe("syntax", () => { }); describe("unterminated tokens", () => { - parseErrorEach([ - ['alias X = "banana', [/Unterminated 
string literal/]], - ['alias X = "banana\\', [/Unterminated string literal/]], - ['alias X = """\nbanana', [/Unterminated string literal/]], - ['alias X = """\nbanana\\', [/Unterminated string literal/]], - ["/* Yada yada yada", [/Unterminated comment/]], - ]); + parseErrorEach([["/* Yada yada yada", [/Unterminated multi-line comment/]]]); + + const strings = ['"banana', '"banana\\', '"""\nbanana', '"""\nbanana\\']; + parseErrorEach( + Array.from(strings.entries()).map((e) => [ + `alias ${String.fromCharCode(CharCode.A + e[0])} = ${e[1]}`, + [/Unterminated string literal/], + (node) => { + const statement = node.statements[0]; + assert(statement.kind === SyntaxKind.AliasStatement, "alias statement expected"); + const value = statement.value; + assert(value.kind === SyntaxKind.StringLiteral, "string literal expected"); + assert.strictEqual(value.value, "banana"); + }, + ]) + ); }); - describe("terminated tokens at EOF with missing semicolon", () => { + describe("terminated tokens at EOF", () => { parseErrorEach([ ["alias X = 0x10101", [/';' expected/]], ["alias X = 0xBEEF", [/';' expected/]], @@ -305,16 +315,68 @@ describe("syntax", () => { } }); - describe("non-ascii identifiers", () => { - parseEach([ - "model Incompréhensible {}", - "model 𐌰𐌲 {}", - "model Banana𐌰𐌲42Banana {}", - "model deaf\u{200c}ly {}", // ZWNJ - "model क्‍ष {}", // ZWJ - ]); - parseErrorEach([["model 😢 {}", [/Invalid character/]]]); + describe("identifiers", () => { + const good = [ + "short", + "short42", + "lowercaseandlong", + "lowercaseandlong42", + "camelCase", + "camelCase42", + "PascalCase", + "PascalCase42", + "has_underscore", + "has_$dollar", + "_startsWithUnderscore", + "$startsWithDollar", + "Incompréhensible", + "incompréhensible", + "IncomprÉhensible", + "incomprÉhensible", + // leading astral character + "𐌰𐌲", + // continuing astral character + "Banana𐌰𐌲42Banana", + "banana𐌰𐌲42banana", + // ZWNJ + "deaf\u{200c}ly", + // ZWJ + "क्‍ष", + ]; + + const bad: [string, RegExp][] = [ + 
["😢", /Invalid character/], + ["42", /Identifier expected/], + ["true", /Keyword cannot be used as identifier/], + ]; + + parseEach( + good.map((s) => [ + `model ${s} {}`, + (node) => { + const statement = node.statements[0]; + assert(statement.kind === SyntaxKind.ModelStatement, "Model statement expected."); + assert.strictEqual(statement.id.sv, s); + }, + ]) + ); + + parseErrorEach(bad.map((e) => [`model ${e[0]} {}`, [e[1]]])); }); +}); + +// smaller repro of previous regen-samples baseline failures +describe("sample regressions", () => { + parseEach([ + [ + `/* \\n <-- before string! */ @format("\\\\w") model M {}`, + (node) => { + assert(node.statements[0].kind === SyntaxKind.ModelStatement); + assert(node.statements[0].decorators[0].arguments[0].kind === SyntaxKind.StringLiteral); + assert.strictEqual(node.statements[0].decorators[0].arguments[0].value, "\\w"); + }, + ], + ]); describe("enum statements", () => { parseEach([ @@ -344,7 +406,9 @@ describe("syntax", () => { }); }); -function parseEach(cases: (string | [string, (node: ADLScriptNode) => void])[]) { +type Callback = (node: ADLScriptNode) => void; + +function parseEach(cases: (string | [string, Callback])[]) { for (const each of cases) { const code = typeof each === "string" ? each : each[0]; const callback = typeof each === "string" ? 
undefined : each[1]; @@ -377,13 +441,16 @@ function parseEach(cases: (string | [string, (node: ADLScriptNode) => void])[]) } } -function parseErrorEach(cases: [string, RegExp[]][]) { - for (const [code, matches] of cases) { +function parseErrorEach(cases: [string, RegExp[], Callback?][], significantWhitespace = false) { + for (const [code, matches, callback] of cases) { it(`doesn't parse ${shorten(code)}`, () => { logVerboseTestOutput("=== Source ==="); logVerboseTestOutput(code); const astNode = parse(code); + if (callback) { + callback(astNode); + } logVerboseTestOutput("\n=== Parse Result ==="); dumpAST(astNode); @@ -404,7 +471,7 @@ function parseErrorEach(cases: [string, RegExp[]][]) { function dumpAST(astNode: ADLScriptNode) { logVerboseTestOutput((log) => { - const hasErrors = hasParseError(astNode); // force flags to initialize + hasParseError(astNode); // force flags to initialize const json = JSON.stringify(astNode, replacer, 2); log(json); }); diff --git a/packages/adl/test/test-scanner.ts b/packages/adl/test/test-scanner.ts index cb293de01..573e2e17b 100644 --- a/packages/adl/test/test-scanner.ts +++ b/packages/adl/test/test-scanner.ts @@ -2,7 +2,7 @@ import assert from "assert"; import { readFile } from "fs/promises"; import { URL } from "url"; import { isIdentifierContinue, isIdentifierStart } from "../compiler/charcode.js"; -import { throwOnError } from "../compiler/diagnostics.js"; +import { createDiagnostic, formatDiagnostic, throwOnError } from "../compiler/diagnostics.js"; import { createScanner, isKeyword, @@ -180,11 +180,21 @@ describe("scanner", () => { ]); }); - function scanString(text: string, expectedValue: string) { - const scanner = createScanner(text); + function scanString(text: string, expectedValue: string, expectedDiagnostic?: RegExp) { + const scanner = createScanner(text, (message, target, args) => { + const diagnostic = createDiagnostic(message, target, args); + if (expectedDiagnostic) { + assert.match(diagnostic.message, 
expectedDiagnostic); + } else { + assert.fail("No diagnostic expected, but got " + formatDiagnostic(diagnostic)); + } + }); + assert.strictEqual(scanner.scan(), Token.StringLiteral); assert.strictEqual(scanner.token, Token.StringLiteral); - assert.strictEqual(scanner.getTokenText(), text); + if (!expectedDiagnostic) { + assert.strictEqual(scanner.getTokenText(), text); + } assert.strictEqual(scanner.getTokenValue(), expectedValue); } @@ -202,19 +212,24 @@ describe("scanner", () => { it("scans triple-quoted strings", () => { scanString( + // NOTE: sloppy blank line formatting and trailing whitespace after open + // quotes above is deliberate here and deliberately tolerated by + // the scanner. `""" This is a triple-quoted string - - And this is another line + "You do not need to escape lone quotes" + You can use escape sequences: \\r \\n \\t \\\\ \\" """`, - // NOTE: sloppy blank line formatting and trailing whitespace after open - // quotes above is deliberately tolerated. - "This is a triple-quoted string\n\n\n\nAnd this is another line" + 'This is a triple-quoted string\n\n\n"You do not need to escape lone quotes"\nYou can use escape sequences: \r \n \t \\ "' ); }); + it("normalizes CRLF to LF in multi-line string", () => { + scanString('"""\r\nThis\r\nis\r\na\r\ntest\r\n"""', "This\nis\na\ntest"); + }); + it("provides token position", () => { const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x"); verify(all, [ @@ -263,14 +278,15 @@ describe("scanner", () => { const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword]; let minKeywordLengthFound = Number.MAX_SAFE_INTEGER; let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER; - let minKeywordStartCharFound = Number.MAX_SAFE_INTEGER; - let maxKeywordStartCharFound = Number.MIN_SAFE_INTEGER; - for (const [name, token] of Keywords.entries()) { + for (const [name, token] of Keywords) { + assert.match( + name, + /^[a-z]+$/, + "We need to change the keyword lookup 
algorithm in the scanner if we ever add a keyword that is not all lowercase ascii letters." + ); minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length); maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length); - minKeywordStartCharFound = Math.min(minKeywordStartCharFound, name.charCodeAt(0)); - maxKeywordStartCharFound = Math.max(maxKeywordStartCharFound, name.charCodeAt(0)); assert.strictEqual(TokenDisplay[token], `'${name}'`); assert(isKeyword(token), `${name} should be classified as a keyword`); @@ -289,15 +305,10 @@ describe("scanner", () => { KeywordLimit.MaxLength, `max keyword length is incorrect, set KeywordLimit.MaxLength to ${maxKeywordLengthFound}` ); - assert.strictEqual( - minKeywordStartCharFound, - KeywordLimit.MinStartChar, - `min keyword start char is incorrect, set KeywordLimit.MinStartChar to ${minKeywordStartCharFound}` - ); - assert.strictEqual( - maxKeywordStartCharFound, - KeywordLimit.MaxStartChar, - `max keyword start char is incorrect, set KeywordLimit.MaxStartChar to ${maxKeywordStartCharFound}` + + assert( + maxKeywordLengthFound < 11, + "We need to change the keyword lookup algorithm in the scanner if we ever add a keyword with 11 characters or more." 
); // check single character punctuation @@ -317,15 +328,15 @@ describe("scanner", () => { // check the rest assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'"); - assert.strictEqual(TokenDisplay[Token.None], ""); - assert.strictEqual(TokenDisplay[Token.Invalid], ""); - assert.strictEqual(TokenDisplay[Token.EndOfFile], ""); - assert.strictEqual(TokenDisplay[Token.SingleLineComment], ""); - assert.strictEqual(TokenDisplay[Token.MultiLineComment], ""); - assert.strictEqual(TokenDisplay[Token.NewLine], ""); - assert.strictEqual(TokenDisplay[Token.Whitespace], ""); - assert.strictEqual(TokenDisplay[Token.ConflictMarker], ""); - assert.strictEqual(TokenDisplay[Token.Identifier], ""); + assert.strictEqual(TokenDisplay[Token.None], "none"); + assert.strictEqual(TokenDisplay[Token.Invalid], "invalid"); + assert.strictEqual(TokenDisplay[Token.EndOfFile], "end of file"); + assert.strictEqual(TokenDisplay[Token.SingleLineComment], "single-line comment"); + assert.strictEqual(TokenDisplay[Token.MultiLineComment], "multi-line comment"); + assert.strictEqual(TokenDisplay[Token.NewLine], "newline"); + assert.strictEqual(TokenDisplay[Token.Whitespace], "whitespace"); + assert.strictEqual(TokenDisplay[Token.ConflictMarker], "conflict marker"); + assert.strictEqual(TokenDisplay[Token.Identifier], "identifier"); }); // Search for Other_ID_Start in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt