From 44faaadf014a05b190ee1289f7de07781e49bedb Mon Sep 17 00:00:00 2001
From: Nick Guerrera <nicholg@microsoft.com>
Date: Sun, 2 May 2021 12:14:10 -0700
Subject: [PATCH] More scanner optimization and fixes

* Don't allocate substrings to match keywords.
* Make at most one pass over string to get its unquoted/unescaped/unindented
  value. (**)
* Add test coverage for impacted code paths.
* Fix issue where the value of an unterminated string was missing its final
  character(s).

(**) Actually, we still make two passes in the case of a non-triple-quoted,
multi-line string with \r\n, but that is about to be removed by the next
commit which will disallow non-triple-quoted, multi-line strings
altogether.
---
 packages/adl/compiler/charcode.ts |  24 +-
 packages/adl/compiler/scanner.ts  | 407 +++++++++++++++++-------------
 packages/adl/test/test-parser.ts  | 109 ++++++--
 packages/adl/test/test-scanner.ts |  75 +++---
 4 files changed, 381 insertions(+), 234 deletions(-)

diff --git a/packages/adl/compiler/charcode.ts b/packages/adl/compiler/charcode.ts
index 94b6e7d13..d373b54fa 100644
--- a/packages/adl/compiler/charcode.ts
+++ b/packages/adl/compiler/charcode.ts
@@ -144,11 +144,15 @@ export const enum CharCode {
   Tilde = 0x7e,
 }
 
+export function utf16CodeUnits(codePoint: number) {
+  return codePoint >= 0x10000 ? 2 : 1;
+}
+
 export function isAsciiLineBreak(ch: number) {
   return ch === CharCode.LineFeed || ch == CharCode.CarriageReturn;
 }
 
-export function isAsciiWhiteSpaceSingleLine(ch: number): boolean {
+export function isAsciiWhiteSpaceSingleLine(ch: number) {
   return (
     ch === CharCode.Space ||
     ch === CharCode.Tab ||
@@ -186,25 +190,29 @@ export function isWhiteSpaceSingleLine(ch: number) {
   );
 }
 
-export function isLineBreak(ch: number): boolean {
+export function isLineBreak(ch: number) {
   return isAsciiLineBreak(ch) || (ch > CharCode.MaxAscii && isNonAsciiLineBreak(ch));
 }
 
-export function isDigit(ch: number): boolean {
+export function isDigit(ch: number) {
   return ch >= CharCode._0 && ch <= CharCode._9;
 }
 
-export function isHexDigit(ch: number): boolean {
+export function isHexDigit(ch: number) {
   return (
     isDigit(ch) || (ch >= CharCode.A && ch <= CharCode.F) || (ch >= CharCode.a && ch <= CharCode.f)
   );
 }
 
-export function isBinaryDigit(ch: number): boolean {
+export function isBinaryDigit(ch: number) {
   return ch === CharCode._0 || ch === CharCode._1;
 }
 
-export function isAsciiIdentifierStart(ch: number): boolean {
+export function isLowercaseAsciiLetter(ch: number) {
+  return ch >= CharCode.a && ch <= CharCode.z;
+}
+
+export function isAsciiIdentifierStart(ch: number) {
   return (
     (ch >= CharCode.A && ch <= CharCode.Z) ||
     (ch >= CharCode.a && ch <= CharCode.z) ||
@@ -213,7 +221,7 @@ export function isAsciiIdentifierStart(ch: number): boolean {
   );
 }
 
-export function isAsciiIdentifierContinue(ch: number): boolean {
+export function isAsciiIdentifierContinue(ch: number) {
   return (
     (ch >= CharCode.A && ch <= CharCode.Z) ||
     (ch >= CharCode.a && ch <= CharCode.z) ||
@@ -245,7 +253,7 @@ export function isNonAsciiIdentifierContinue(codePoint: number) {
   return lookupInNonAsciiMap(codePoint, nonAsciiIdentifierContinueMap);
 }
 
-function lookupInNonAsciiMap(codePoint: number, map: readonly number[]): boolean {
+function lookupInNonAsciiMap(codePoint: number, map: readonly number[]) {
   // Bail out quickly if it couldn't possibly be in the map.
   if (codePoint < map[0]) {
     return false;
diff --git a/packages/adl/compiler/scanner.ts b/packages/adl/compiler/scanner.ts
index c2cfcaaed..16c8a34f4 100644
--- a/packages/adl/compiler/scanner.ts
+++ b/packages/adl/compiler/scanner.ts
@@ -7,11 +7,13 @@ import {
   isHexDigit,
   isIdentifierContinue,
   isLineBreak,
+  isLowercaseAsciiLetter,
   isNonAsciiIdentifierContinue,
   isNonAsciiIdentifierStart,
   isNonAsciiLineBreak,
   isNonAsciiWhiteSpaceSingleLine,
   isWhiteSpaceSingleLine,
+  utf16CodeUnits,
 } from "./charcode.js";
 import { createSourceFile, Message, throwOnError } from "./diagnostics.js";
 import { SourceFile } from "./types.js";
@@ -91,16 +93,16 @@ const MaxStatementKeyword = Token.AliasKeyword;
 
 /** @internal */
 export const TokenDisplay: readonly string[] = [
-  "<none>",
-  "<invalid>",
-  "<end of file>",
-  "<single-line comment>",
-  "<multi-line comment>",
-  "<newline>",
-  "<whitespace>",
-  "<conflict marker>",
-  "<numeric literal>",
-  "<string literal>",
+  "none",
+  "invalid",
+  "end of file",
+  "single-line comment",
+  "multi-line comment",
+  "newline",
+  "whitespace",
+  "conflict marker",
+  "numeric literal",
+  "string literal",
   "'{'",
   "'}'",
   "'('",
@@ -119,7 +121,7 @@ export const TokenDisplay: readonly string[] = [
   "'?'",
   "':'",
   "'@'",
-  "<identifier>",
+  "identifier",
   "'import'",
   "'model'",
   "'namespace'",
@@ -133,7 +135,7 @@ export const TokenDisplay: readonly string[] = [
 ];
 
 /** @internal */
-export const Keywords: ReadonlyMap<string, Token> = new Map([
+export const Keywords: readonly [string, Token][] = [
   ["import", Token.ImportKeyword],
   ["model", Token.ModelKeyword],
   ["namespace", Token.NamespaceKeyword],
@@ -144,14 +146,30 @@ export const Keywords: ReadonlyMap<string, Token> = new Map([
   ["alias", Token.AliasKeyword],
   ["true", Token.TrueKeyword],
   ["false", Token.FalseKeyword],
-]);
+];
 
 /** @internal */
 export const enum KeywordLimit {
   MinLength = 2,
+  // NOTE(review): the key is built with `<<`/`|`, which operate on 32-bit integers
+  // in JavaScript, so only 6 letters (6*5 = 30 bits) fit without truncation, not
+  // 10. Keywords longer than that need a different keyword lookup algorithm.
   MaxLength = 9,
-  MinStartChar = CharCode.a,
-  MaxStartChar = CharCode.u,
+}
+
+const KeywordMap: ReadonlyMap<number, Token> = new Map(
+  Keywords.map((e) => [keywordKey(e[0]), e[1]])
+);
+
+// Since keywords are short and all lowercase, we can pack the whole string into
+// a single number by using 5 bits for each letter, and use that as the map key.
+// This lets us lookup keywords without making temporary substrings.
+function keywordKey(keyword: string) {
+  let key = 0;
+  for (let i = 0; i < keyword.length; i++) {
+    key = (key << 5) | (keyword.charCodeAt(i) - CharCode.a);
+  }
+  return key;
 }
 
 export interface Scanner {
@@ -190,6 +208,7 @@ const enum TokenFlags {
   HasCrlf = 1 << 0,
   Escaped = 1 << 1,
   TripleQuoted = 1 << 2,
+  Unterminated = 1 << 3,
 }
 
 export function isLiteral(token: Token) {
@@ -226,9 +245,8 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
   const file = typeof source === "string" ? createSourceFile(source, "<anonymous file>") : source;
   const input = file.text;
   let position = 0;
-  let token = Token.Invalid;
+  let token = Token.None;
   let tokenPosition = -1;
-  let tokenValue: string | undefined = undefined;
   let tokenFlags = TokenFlags.None;
 
   return {
@@ -252,26 +270,20 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
     return position >= input.length;
   }
 
-  function next(t: Token, count = 1) {
-    position += count;
-    return (token = t);
-  }
-
-  function utf16CodeUnits(codePoint: number) {
-    return codePoint >= 0x10000 ? 2 : 1;
-  }
-
   function getTokenText() {
     return input.substring(tokenPosition, position);
   }
 
+  function getTokenValue() {
+    return token === Token.StringLiteral ? getStringTokenValue() : getTokenText();
+  }
+
   function lookAhead(offset: number) {
     return input.charCodeAt(position + offset);
   }
 
   function scan(): Token {
     tokenPosition = position;
-    tokenValue = undefined;
     tokenFlags = TokenFlags.None;
 
     if (!eof()) {
@@ -390,10 +402,14 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
             : scanString();
 
         default:
-          if (isAsciiIdentifierStart(ch)) {
+          if (isLowercaseAsciiLetter(ch)) {
             return scanIdentifierOrKeyword();
           }
 
+          if (isAsciiIdentifierStart(ch)) {
+            return scanIdentifier();
+          }
+
           if (ch <= CharCode.MaxAscii) {
             return scanInvalidCharacter();
           }
@@ -405,6 +421,17 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
     return (token = Token.EndOfFile);
   }
 
+  function next(t: Token, count = 1) {
+    position += count;
+    return (token = t);
+  }
+
+  function unterminated(t: Token) {
+    tokenFlags |= TokenFlags.Unterminated;
+    error(Message.Unterminated, [TokenDisplay[t]]);
+    return (token = t);
+  }
+
   function scanNonAsciiToken() {
     const ch = input.charCodeAt(position);
 
@@ -416,9 +443,9 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
       return scanWhitespace();
     }
 
-    const codePoint = input.codePointAt(position)!;
-    if (isNonAsciiIdentifierStart(codePoint)) {
-      return scanNonAsciiIdentifierContinue(codePoint);
+    let cp = input.codePointAt(position)!;
+    if (isNonAsciiIdentifierStart(cp)) {
+      return scanNonAsciiIdentifier(cp);
     }
 
     return scanInvalidCharacter();
@@ -527,11 +554,10 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
   function scanSingleLineComment() {
     position += 2; // consume '//'
 
-    while (!eof()) {
+    for (; !eof(); position++) {
       if (isLineBreak(input.charCodeAt(position))) {
         break;
       }
-      position++;
     }
 
     return (token = Token.SingleLineComment);
@@ -540,22 +566,20 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
   function scanMultiLineComment() {
     position += 2; // consume '/*'
 
-    while (!eof()) {
+    for (; !eof(); position++) {
       if (input.charCodeAt(position) === CharCode.Asterisk && lookAhead(1) === CharCode.Slash) {
         position += 2;
         return (token = Token.MultiLineComment);
       }
-      position++;
     }
 
-    error(Message.Unterminated, ["comment"]);
-    return (token = Token.MultiLineComment);
+    return unterminated(Token.MultiLineComment);
   }
 
   function scanString() {
     position++; // consume '"'
 
-    loop: while (!eof()) {
+    loop: for (; !eof(); position++) {
       const ch = input.charCodeAt(position);
       switch (ch) {
         case CharCode.CarriageReturn:
@@ -570,150 +594,157 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
           if (eof()) {
             break loop;
           }
-          break;
+          continue;
         case CharCode.DoubleQuote:
           position++;
           return (token = Token.StringLiteral);
       }
-      position++;
     }
 
-    error(Message.Unterminated, ["string literal"]);
-    return (token = Token.StringLiteral);
+    return unterminated(Token.StringLiteral);
   }
 
   function scanTripleQuotedString() {
     tokenFlags |= TokenFlags.TripleQuoted;
     position += 3; // consume '"""'
 
-    loop: while (!eof()) {
-      const ch = input.charCodeAt(position);
-      switch (ch) {
-        case CharCode.CarriageReturn:
-          if (lookAhead(1) === CharCode.LineFeed) {
-            tokenFlags |= TokenFlags.HasCrlf;
-            position++;
-          }
-          break;
-        case CharCode.Backslash:
-          tokenFlags |= TokenFlags.Escaped;
-          position++;
-          if (eof()) {
-            break loop;
-          }
-          break;
-        case CharCode.DoubleQuote:
-          if (lookAhead(1) === CharCode.DoubleQuote && lookAhead(2) === CharCode.DoubleQuote) {
-            position += 3;
-            return (token = Token.StringLiteral);
-          }
-          break;
+    for (; !eof(); position++) {
+      if (
+        input.charCodeAt(position) === CharCode.DoubleQuote &&
+        lookAhead(1) === CharCode.DoubleQuote &&
+        lookAhead(2) === CharCode.DoubleQuote
+      ) {
+        position += 3;
+        return (token = Token.StringLiteral);
       }
-      position++;
     }
 
-    error(Message.Unterminated, ["string literal"]);
-    return (token = Token.StringLiteral);
-  }
-
-  function getTokenValue() {
-    if (tokenValue !== undefined) {
-      return tokenValue;
-    }
-    return (tokenValue = token === Token.StringLiteral ? getStringTokenValue() : getTokenText());
+    return unterminated(Token.StringLiteral);
   }
 
   function getStringTokenValue() {
-    // strip quotes
     const quoteLength = tokenFlags & TokenFlags.TripleQuoted ? 3 : 1;
-    let value = input.substring(tokenPosition + quoteLength, position - quoteLength);
-
-    // Normalize CRLF to LF when interpreting value of multi-line string
-    // literals. Matches JavaScript behavior and ensures program behavior does
-    // not change due to line-ending conversion.
-    if (tokenFlags & TokenFlags.HasCrlf) {
-      value = value.replace(/\r\n/g, "\n");
-    }
+    const start = tokenPosition + quoteLength;
+    const end = tokenFlags & TokenFlags.Unterminated ? position : position - quoteLength;
 
     if (tokenFlags & TokenFlags.TripleQuoted) {
-      value = unindentTripleQuoteString(value);
+      return unindentAndUnescapeTripleQuotedString(start, end);
     }
 
     if (tokenFlags & TokenFlags.Escaped) {
-      value = unescapeString(value);
+      return unescapeString(start, end);
     }
 
-    return (tokenValue = value);
+    let value = input.substring(start, end);
+    if (tokenFlags & TokenFlags.HasCrlf) {
+      value = value.replace(/\r\n/g, "\n");
+    }
+    return value;
   }
 
-  function unindentTripleQuoteString(text: string) {
-    let start = 0;
-    let end = text.length;
-
+  function unindentAndUnescapeTripleQuotedString(start: number, end: number) {
     // ignore leading whitespace before required initial line break
-    while (start < end && isWhiteSpaceSingleLine(text.charCodeAt(start))) {
+    while (start < end && isWhiteSpaceSingleLine(input.charCodeAt(start))) {
       start++;
     }
 
     // remove required initial line break
-    if (isLineBreak(text.charCodeAt(start))) {
+    if (isLineBreak(input.charCodeAt(start))) {
+      if (isCrlf(start, start, end)) {
+        start++;
+      }
       start++;
     } else {
       error(Message.NoNewLineAtStartOfTripleQuotedString);
     }
 
-    // remove whitespace before closing delimiter and record it as
-    // required indentation for all lines.
-    while (end > start && isWhiteSpaceSingleLine(text.charCodeAt(end - 1))) {
+    // remove whitespace before closing delimiter and record it as required
+    // indentation for all lines
+    const indentationEnd = end;
+    while (end > start && isWhiteSpaceSingleLine(input.charCodeAt(end - 1))) {
       end--;
     }
-    const indentation = text.substring(end, text.length);
+    const indentationStart = end;
 
     // remove required final line break
-    if (isLineBreak(text.charCodeAt(end - 1))) {
+    if (isLineBreak(input.charCodeAt(end - 1))) {
+      if (isCrlf(end - 2, start, end)) {
+        end--;
+      }
       end--;
     } else {
       error(Message.NoNewLineAtEndOfTripleQuotedString);
     }
 
-    // remove required matching indentation from each line
-    return removeMatchingIndentation(text, start, end, indentation);
-  }
-
-  function removeMatchingIndentation(
-    text: string,
-    start: number,
-    end: number,
-    indentation: string
-  ) {
+    // remove required matching indentation from each line and unescape in the
+    // process of doing so
     let result = "";
     let pos = start;
-
     while (pos < end) {
-      start = skipMatchingIndentation(text, pos, end, indentation);
-      while (pos < end && !isLineBreak(text.charCodeAt(pos))) {
-        pos++;
+      // skip indentation at start of line
+      start = skipMatchingIndentation(pos, end, indentationStart, indentationEnd);
+      let ch;
+
+      while (pos < end && !isLineBreak((ch = input.charCodeAt(pos)))) {
+        if (ch !== CharCode.Backslash) {
+          pos++;
+          continue;
+        }
+        result += input.substring(start, pos);
+        if (pos === end - 1) {
+          error(Message.InvalidEscapeSequence);
+          pos++;
+        } else {
+          result += unescapeOne(pos);
+          pos += 2;
+        }
+        start = pos;
       }
+
       if (pos < end) {
-        pos++; // include line break
+        if (isCrlf(pos, start, end)) {
+          // CRLF in multi-line string is normalized to LF in string value.
+          // This keeps program behavior unchanged by line-ending conversion.
+          result += input.substring(start, pos);
+          result += "\n";
+          pos += 2;
+        } else {
+          pos++; // include non-CRLF newline
+          result += input.substring(start, pos);
+        }
+        start = pos;
       }
-      result += text.substring(start, pos);
     }
 
+    result += input.substring(start, pos);
     return result;
   }
 
-  function skipMatchingIndentation(text: string, pos: number, end: number, indentation: string) {
-    end = Math.min(end, pos + indentation.length);
+  function isCrlf(pos: number, start: number, end: number) {
+    return (
+      pos >= start &&
+      pos < end - 1 &&
+      input.charCodeAt(pos) === CharCode.CarriageReturn &&
+      input.charCodeAt(pos + 1) === CharCode.LineFeed
+    );
+  }
+
+  function skipMatchingIndentation(
+    pos: number,
+    end: number,
+    indentationStart: number,
+    indentationEnd: number
+  ) {
+    let indentationPos = indentationStart;
+    end = Math.min(end, pos + (indentationEnd - indentationStart));
 
-    let indentationPos = 0;
     while (pos < end) {
-      const ch = text.charCodeAt(pos);
+      const ch = input.charCodeAt(pos);
       if (isLineBreak(ch)) {
         // allow subset of indentation if line has only whitespace
         break;
       }
-      if (ch != indentation.charCodeAt(indentationPos)) {
+      if (ch !== input.charCodeAt(indentationPos)) {
         error(Message.InconsistentTripleQuoteIndentation);
         break;
       }
@@ -724,76 +755,86 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
     return pos;
   }
 
-  function unescapeString(text: string) {
+  function unescapeString(start: number, end: number) {
     let result = "";
-    let start = 0;
-    let pos = 0;
-    const end = text.length;
+    let pos = start;
 
     while (pos < end) {
-      let ch = text.charCodeAt(pos);
-      if (ch != CharCode.Backslash) {
+      let ch = input.charCodeAt(pos);
+      if (ch !== CharCode.Backslash) {
         pos++;
         continue;
       }
 
-      result += text.substring(start, pos);
-      pos++;
-      ch = text.charCodeAt(pos);
-
-      switch (ch) {
-        case CharCode.r:
-          result += "\r";
-          break;
-        case CharCode.n:
-          result += "\n";
-          break;
-        case CharCode.t:
-          result += "\t";
-          break;
-        case CharCode.DoubleQuote:
-          result += '"';
-          break;
-        case CharCode.Backslash:
-          result += "\\";
-          break;
-        default:
-          error(Message.InvalidEscapeSequence);
-          result += String.fromCharCode(ch);
-          break;
+      if (pos === end - 1) {
+        error(Message.InvalidEscapeSequence);
+        break;
       }
 
-      pos++;
+      result += input.substring(start, pos);
+      result += unescapeOne(pos);
+      pos += 2;
       start = pos;
     }
 
-    result += text.substring(start, pos);
+    result += input.substring(start, pos);
     return result;
   }
 
-  function scanIdentifierOrKeyword() {
-    const startChar = input.charCodeAt(position);
-    let ch = startChar;
-    do {
-      position++;
-    } while (!eof() && isAsciiIdentifierContinue((ch = input.charCodeAt(position))));
+  function unescapeOne(pos: number) {
+    const ch = input.charCodeAt(pos + 1);
+    switch (ch) {
+      case CharCode.r:
+        return "\r";
+      case CharCode.n:
+        return "\n";
+      case CharCode.t:
+        return "\t";
+      case CharCode.DoubleQuote:
+        return '"';
+      case CharCode.Backslash:
+        return "\\";
+      default:
+        error(Message.InvalidEscapeSequence);
+        return String.fromCharCode(ch);
+    }
+  }
 
-    if (ch > CharCode.MaxAscii) {
-      const codePoint = input.codePointAt(position)!;
-      if (isNonAsciiIdentifierContinue(codePoint)) {
-        return scanNonAsciiIdentifierContinue(codePoint);
+  function scanIdentifierOrKeyword() {
+    let key = 0;
+    let count = 0;
+    let ch = input.charCodeAt(position);
+
+    while (true) {
+      position++;
+      count++;
+      key = (key << 5) | (ch - CharCode.a);
+
+      if (eof()) {
+        break;
       }
+
+      ch = input.charCodeAt(position);
+      if (count < KeywordLimit.MaxLength && isLowercaseAsciiLetter(ch)) {
+        continue;
+      }
+
+      if (isAsciiIdentifierContinue(ch)) {
+        return scanIdentifier();
+      }
+
+      if (ch > CharCode.MaxAscii) {
+        const cp = input.codePointAt(position)!;
+        if (isNonAsciiIdentifierContinue(cp)) {
+          return scanNonAsciiIdentifier(cp);
+        }
+      }
+
+      break;
     }
 
-    const length = position - tokenPosition;
-    if (
-      length >= KeywordLimit.MinLength &&
-      length <= KeywordLimit.MaxLength &&
-      startChar >= KeywordLimit.MinStartChar &&
-      startChar <= KeywordLimit.MaxStartChar
-    ) {
-      tokenValue = getTokenText();
-      const keyword = Keywords.get(tokenValue);
+    if (count >= KeywordLimit.MinLength && count <= KeywordLimit.MaxLength) {
+      const keyword = KeywordMap.get(key);
       if (keyword) {
         return (token = keyword);
       }
@@ -802,11 +843,31 @@ export function createScanner(source: string | SourceFile, onError = throwOnErro
     return (token = Token.Identifier);
   }
 
-  function scanNonAsciiIdentifierContinue(startCodePoint: number) {
-    let codePoint = startCodePoint;
+  function scanIdentifier() {
+    let ch: number;
+
     do {
-      position += utf16CodeUnits(codePoint);
-    } while (!eof() && isIdentifierContinue((codePoint = input.codePointAt(position)!)));
+      position++;
+      if (eof()) {
+        return (token = Token.Identifier);
+      }
+    } while (isAsciiIdentifierContinue((ch = input.charCodeAt(position))));
+
+    if (ch > CharCode.MaxAscii) {
+      let cp = input.codePointAt(position)!;
+      if (isNonAsciiIdentifierContinue(cp)) {
+        return scanNonAsciiIdentifier(cp);
+      }
+    }
+
+    return (token = Token.Identifier);
+  }
+
+  function scanNonAsciiIdentifier(startCodePoint: number) {
+    let cp = startCodePoint;
+    do {
+      position += utf16CodeUnits(cp);
+    } while (!eof() && isIdentifierContinue((cp = input.codePointAt(position)!)));
 
     return (token = Token.Identifier);
   }
diff --git a/packages/adl/test/test-parser.ts b/packages/adl/test/test-parser.ts
index 5186d1a43..0dec9e7cf 100644
--- a/packages/adl/test/test-parser.ts
+++ b/packages/adl/test/test-parser.ts
@@ -1,4 +1,5 @@
 import assert from "assert";
+import { CharCode } from "../compiler/charcode.js";
 import { logDiagnostics, logVerboseTestOutput } from "../compiler/diagnostics.js";
 import { hasParseError, NodeFlags, parse } from "../compiler/parser.js";
 import { ADLScriptNode, SyntaxKind } from "../compiler/types.js";
@@ -223,16 +224,25 @@ describe("syntax", () => {
   });
 
   describe("unterminated tokens", () => {
-    parseErrorEach([
-      ['alias X = "banana', [/Unterminated string literal/]],
-      ['alias X = "banana\\', [/Unterminated string literal/]],
-      ['alias X = """\nbanana', [/Unterminated string literal/]],
-      ['alias X = """\nbanana\\', [/Unterminated string literal/]],
-      ["/* Yada yada yada", [/Unterminated comment/]],
-    ]);
+    parseErrorEach([["/* Yada yada yada", [/Unterminated multi-line comment/]]]);
+
+    const strings = ['"banana', '"banana\\', '"""\nbanana', '"""\nbanana\\'];
+    parseErrorEach(
+      Array.from(strings.entries()).map((e) => [
+        `alias ${String.fromCharCode(CharCode.A + e[0])} = ${e[1]}`,
+        [/Unterminated string literal/],
+        (node) => {
+          const statement = node.statements[0];
+          assert(statement.kind === SyntaxKind.AliasStatement, "alias statement expected");
+          const value = statement.value;
+          assert(value.kind === SyntaxKind.StringLiteral, "string literal expected");
+          assert.strictEqual(value.value, "banana");
+        },
+      ])
+    );
   });
 
-  describe("terminated tokens at EOF with missing semicolon", () => {
+  describe("terminated tokens at EOF", () => {
     parseErrorEach([
       ["alias X = 0x10101", [/';' expected/]],
       ["alias X = 0xBEEF", [/';' expected/]],
@@ -305,16 +315,68 @@ describe("syntax", () => {
     }
   });
 
-  describe("non-ascii identifiers", () => {
-    parseEach([
-      "model Incompréhensible {}",
-      "model 𐌰𐌲 {}",
-      "model Banana𐌰𐌲42Banana {}",
-      "model deaf\u{200c}ly {}", // ZWNJ
-      "model क्‍ष {}", // ZWJ
-    ]);
-    parseErrorEach([["model 😢 {}", [/Invalid character/]]]);
+  describe("identifiers", () => {
+    const good = [
+      "short",
+      "short42",
+      "lowercaseandlong",
+      "lowercaseandlong42",
+      "camelCase",
+      "camelCase42",
+      "PascalCase",
+      "PascalCase42",
+      "has_underscore",
+      "has_$dollar",
+      "_startsWithUnderscore",
+      "$startsWithDollar",
+      "Incompréhensible",
+      "incompréhensible",
+      "IncomprÉhensible",
+      "incomprÉhensible",
+      // leading astral character
+      "𐌰𐌲",
+      // continuing astral character
+      "Banana𐌰𐌲42Banana",
+      "banana𐌰𐌲42banana",
+      // ZWNJ
+      "deaf\u{200c}ly",
+      // ZWJ
+      "क्‍ष",
+    ];
+
+    const bad: [string, RegExp][] = [
+      ["😢", /Invalid character/],
+      ["42", /Identifier expected/],
+      ["true", /Keyword cannot be used as identifier/],
+    ];
+
+    parseEach(
+      good.map((s) => [
+        `model ${s} {}`,
+        (node) => {
+          const statement = node.statements[0];
+          assert(statement.kind === SyntaxKind.ModelStatement, "Model statement expected.");
+          assert.strictEqual(statement.id.sv, s);
+        },
+      ])
+    );
+
+    parseErrorEach(bad.map((e) => [`model ${e[0]} {}`, [e[1]]]));
   });
+});
+
+// smaller repro of previous regen-samples baseline failures
+describe("sample regressions", () => {
+  parseEach([
+    [
+      `/* \\n <-- before string! */ @format("\\\\w") model M {}`,
+      (node) => {
+        assert(node.statements[0].kind === SyntaxKind.ModelStatement);
+        assert(node.statements[0].decorators[0].arguments[0].kind === SyntaxKind.StringLiteral);
+        assert.strictEqual(node.statements[0].decorators[0].arguments[0].value, "\\w");
+      },
+    ],
+  ]);
 
   describe("enum statements", () => {
     parseEach([
@@ -344,7 +406,9 @@ describe("syntax", () => {
   });
 });
 
-function parseEach(cases: (string | [string, (node: ADLScriptNode) => void])[]) {
+type Callback = (node: ADLScriptNode) => void;
+
+function parseEach(cases: (string | [string, Callback])[]) {
   for (const each of cases) {
     const code = typeof each === "string" ? each : each[0];
     const callback = typeof each === "string" ? undefined : each[1];
@@ -377,13 +441,16 @@ function parseEach(cases: (string | [string, (node: ADLScriptNode) => void])[])
   }
 }
 
-function parseErrorEach(cases: [string, RegExp[]][]) {
-  for (const [code, matches] of cases) {
+function parseErrorEach(cases: [string, RegExp[], Callback?][], significantWhitespace = false) {
+  for (const [code, matches, callback] of cases) {
     it(`doesn't parse ${shorten(code)}`, () => {
       logVerboseTestOutput("=== Source ===");
       logVerboseTestOutput(code);
 
       const astNode = parse(code);
+      if (callback) {
+        callback(astNode);
+      }
       logVerboseTestOutput("\n=== Parse Result ===");
       dumpAST(astNode);
 
@@ -404,7 +471,7 @@ function parseErrorEach(cases: [string, RegExp[]][]) {
 
 function dumpAST(astNode: ADLScriptNode) {
   logVerboseTestOutput((log) => {
-    const hasErrors = hasParseError(astNode); // force flags to initialize
+    hasParseError(astNode); // force flags to initialize
     const json = JSON.stringify(astNode, replacer, 2);
     log(json);
   });
diff --git a/packages/adl/test/test-scanner.ts b/packages/adl/test/test-scanner.ts
index cb293de01..573e2e17b 100644
--- a/packages/adl/test/test-scanner.ts
+++ b/packages/adl/test/test-scanner.ts
@@ -2,7 +2,7 @@ import assert from "assert";
 import { readFile } from "fs/promises";
 import { URL } from "url";
 import { isIdentifierContinue, isIdentifierStart } from "../compiler/charcode.js";
-import { throwOnError } from "../compiler/diagnostics.js";
+import { createDiagnostic, formatDiagnostic, throwOnError } from "../compiler/diagnostics.js";
 import {
   createScanner,
   isKeyword,
@@ -180,11 +180,21 @@ describe("scanner", () => {
     ]);
   });
 
-  function scanString(text: string, expectedValue: string) {
-    const scanner = createScanner(text);
+  function scanString(text: string, expectedValue: string, expectedDiagnostic?: RegExp) {
+    const scanner = createScanner(text, (message, target, args) => {
+      const diagnostic = createDiagnostic(message, target, args);
+      if (expectedDiagnostic) {
+        assert.match(diagnostic.message, expectedDiagnostic);
+      } else {
+        assert.fail("No diagnostic expected, but got " + formatDiagnostic(diagnostic));
+      }
+    });
+
     assert.strictEqual(scanner.scan(), Token.StringLiteral);
     assert.strictEqual(scanner.token, Token.StringLiteral);
-    assert.strictEqual(scanner.getTokenText(), text);
+    if (!expectedDiagnostic) {
+      assert.strictEqual(scanner.getTokenText(), text);
+    }
     assert.strictEqual(scanner.getTokenValue(), expectedValue);
   }
 
@@ -202,19 +212,24 @@ describe("scanner", () => {
 
   it("scans triple-quoted strings", () => {
     scanString(
+      // NOTE: sloppy blank line formatting and trailing whitespace after open
+      //       quotes below are deliberate here and deliberately tolerated by
+      //       the scanner.
       `"""   
       This is a triple-quoted string
 
   
-      
-      And this is another line
+      "You do not need to escape lone quotes"
+      You can use escape sequences: \\r \\n \\t \\\\ \\"
       """`,
-      // NOTE: sloppy blank line formatting and trailing whitespace after open
-      //       quotes above is deliberately tolerated.
-      "This is a triple-quoted string\n\n\n\nAnd this is another line"
+      'This is a triple-quoted string\n\n\n"You do not need to escape lone quotes"\nYou can use escape sequences: \r \n \t \\ "'
     );
   });
 
+  it("normalizes CRLF to LF in multi-line string", () => {
+    scanString('"""\r\nThis\r\nis\r\na\r\ntest\r\n"""', "This\nis\na\ntest");
+  });
+
   it("provides token position", () => {
     const all = tokens("a x\raa x\r\naaa x\naaaa x\u{2028}aaaaa x\u{2029}aaaaaa x");
     verify(all, [
@@ -263,14 +278,15 @@ describe("scanner", () => {
     const nonStatementKeywords = [Token.ExtendsKeyword, Token.TrueKeyword, Token.FalseKeyword];
     let minKeywordLengthFound = Number.MAX_SAFE_INTEGER;
     let maxKeywordLengthFound = Number.MIN_SAFE_INTEGER;
-    let minKeywordStartCharFound = Number.MAX_SAFE_INTEGER;
-    let maxKeywordStartCharFound = Number.MIN_SAFE_INTEGER;
 
-    for (const [name, token] of Keywords.entries()) {
+    for (const [name, token] of Keywords) {
+      assert.match(
+        name,
+        /^[a-z]+$/,
+        "We need to change the keyword lookup algorithm in the scanner if we ever add a keyword that is not all lowercase ascii letters."
+      );
       minKeywordLengthFound = Math.min(minKeywordLengthFound, name.length);
       maxKeywordLengthFound = Math.max(maxKeywordLengthFound, name.length);
-      minKeywordStartCharFound = Math.min(minKeywordStartCharFound, name.charCodeAt(0));
-      maxKeywordStartCharFound = Math.max(maxKeywordStartCharFound, name.charCodeAt(0));
 
       assert.strictEqual(TokenDisplay[token], `'${name}'`);
       assert(isKeyword(token), `${name} should be classified as a keyword`);
@@ -289,15 +305,10 @@ describe("scanner", () => {
       KeywordLimit.MaxLength,
       `max keyword length is incorrect, set KeywordLimit.MaxLength to ${maxKeywordLengthFound}`
     );
-    assert.strictEqual(
-      minKeywordStartCharFound,
-      KeywordLimit.MinStartChar,
-      `min keyword start char is incorrect, set KeywordLimit.MinStartChar to ${minKeywordStartCharFound}`
-    );
-    assert.strictEqual(
-      maxKeywordStartCharFound,
-      KeywordLimit.MaxStartChar,
-      `max keyword start char is incorrect, set KeywordLimit.MaxStartChar to ${maxKeywordStartCharFound}`
+
+    assert(
+      maxKeywordLengthFound < 11,
+      "We need to change the keyword lookup algorithm in the scanner if we ever add a keyword with 11 characters or more."
     );
 
     // check single character punctuation
@@ -317,15 +328,15 @@ describe("scanner", () => {
 
     // check the rest
     assert.strictEqual(TokenDisplay[Token.Elipsis], "'...'");
-    assert.strictEqual(TokenDisplay[Token.None], "<none>");
-    assert.strictEqual(TokenDisplay[Token.Invalid], "<invalid>");
-    assert.strictEqual(TokenDisplay[Token.EndOfFile], "<end of file>");
-    assert.strictEqual(TokenDisplay[Token.SingleLineComment], "<single-line comment>");
-    assert.strictEqual(TokenDisplay[Token.MultiLineComment], "<multi-line comment>");
-    assert.strictEqual(TokenDisplay[Token.NewLine], "<newline>");
-    assert.strictEqual(TokenDisplay[Token.Whitespace], "<whitespace>");
-    assert.strictEqual(TokenDisplay[Token.ConflictMarker], "<conflict marker>");
-    assert.strictEqual(TokenDisplay[Token.Identifier], "<identifier>");
+    assert.strictEqual(TokenDisplay[Token.None], "none");
+    assert.strictEqual(TokenDisplay[Token.Invalid], "invalid");
+    assert.strictEqual(TokenDisplay[Token.EndOfFile], "end of file");
+    assert.strictEqual(TokenDisplay[Token.SingleLineComment], "single-line comment");
+    assert.strictEqual(TokenDisplay[Token.MultiLineComment], "multi-line comment");
+    assert.strictEqual(TokenDisplay[Token.NewLine], "newline");
+    assert.strictEqual(TokenDisplay[Token.Whitespace], "whitespace");
+    assert.strictEqual(TokenDisplay[Token.ConflictMarker], "conflict marker");
+    assert.strictEqual(TokenDisplay[Token.Identifier], "identifier");
   });
 
   // Search for Other_ID_Start in https://www.unicode.org/Public/UCD/latest/ucd/PropList.txt